utility/topGene_step1_cnt_abstracts.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113

#!/bin/env python3 
import os
import sys
import re
import time
from ratspub_keywords import *

def undic(dic):
    """Flatten a dict into a single string of its values separated by ``|``.

    Keys are ignored; values are joined in the dict's iteration order.
    """
    separator = "|"
    return separator.join(value for value in dic.values())

def gene_addiction_cnt(gene):
    """Return the PubMed hit count for addiction AND drug AND *gene*.

    The query is built from the module-level ``addiction`` and ``drug``
    strings (``|``-separated keywords) plus the caller-supplied *gene*
    fragment, and is run through the ``esearch | xtract`` command-line
    pipeline.

    gene: query fragment that is placed between ``("`` and ``")``; the caller
        is responsible for any field tags inside it.

    Returns the raw text printed by ``xtract`` (not converted to int and not
    stripped).  If the pipeline prints nothing, the call waits 15 seconds and
    tries again, for as long as it takes.
    """
    # Local import keeps this block self-contained; the file header does not
    # import shlex.
    import shlex

    # NOTE(review): the "[tiab]" tag is inserted only *between* keywords, so
    # the last addiction keyword and the last drug keyword carry no tag --
    # confirm this is intended.
    query = (
        '("' + addiction.replace("|", '"[tiab] OR "')
        + '") AND ("' + drug.replace("|", '"[tiab] OR "')
        + '") AND ("' + gene + '")'
    )
    # shlex.quote yields the same '...'-wrapped text as the previous
    # hand-written quoting whenever the query has no single quote, and stays
    # a single shell word when it does.
    command = (
        'esearch -db pubmed  -query ' + shlex.quote(query)
        + ' | xtract -pattern ENTREZ_DIRECT -element Count '
    )

    # Loop rather than recurse so a long outage cannot exhaust the recursion
    # limit.
    while True:
        # Short pause before every request.
        time.sleep(0.2)
        with os.popen(command) as pipe:
            count = pipe.read()
        if count:
            return count
        # Empty output: back off, then retry.
        print("pause")
        time.sleep(15)

def removeStopWords(terms):
    """Upper-case *terms* and drop every ``|``-separated piece found in ``stopWords``.

    terms: string of ``|``-separated terms.

    Returns the surviving pieces, upper-cased and re-joined with ``|``.
    Pieces are compared whole against the module-level ``stopWords`` list.
    """
    pieces = terms.upper().split("|")
    survivors = [piece for piece in pieces if piece not in stopWords]
    return "|".join(survivors)

def saveStopWord(w):
    """Append *w*, followed by a newline, to the file named by ``stopword_f``."""
    with open(stopword_f, "a") as handle:
        handle.writelines((w + "\n",))

# Usage: with no argument, start fresh counts from
# ./ncbi_gene_symb_syno_name_txid9606.txt; with one argument, recount that
# results file (e.g. after marking additional stopwords in it).

# Pick the input file: the single command-line argument if given, otherwise
# the default gene/synonym list.
# NOTE(review): mincnt is assigned here but no use of it is visible in this
# file -- confirm whether the rerun threshold was meant to read it.
if len(sys.argv)==2:
    input_f=sys.argv[1]
    mincnt=10
else:
    input_f="./ncbi_gene_symb_syno_name_txid9606.txt"
    mincnt=0

# Flatten the keyword dictionaries into "|"-separated strings for the query.
addiction=undic(addiction_d)
drug=undic(drug_d)

output_f=input_f.replace(".txt","_absCnt.txt")
if output_f==input_f:
    # No ".txt" in the input name: the replace was a no-op, and opening the
    # same path with "w+" would truncate the input before it is read.
    output_f=input_f+"_absCnt.txt"
# Left open on purpose: the processing code below writes to this handle.
out=open(output_f, "w+")

# Load the stop words, upper-cased, one entry per "\n"-separated piece of the
# file (a trailing newline therefore yields an empty-string entry).
stopword_f="./stop_words_addiction_gene_search.txt"
with open (stopword_f, "r") as swf:
    stopWords=swf.read().upper().split("\n")

# Walk the input file line by line.  Each line is either copied to the output
# unchanged or re-queried and written as "<gene terms>\t<count>".
with open(input_f, "r") as f:
    for line in f:
        do_search = 0
        skip = 0
        inputline = line
        # Replace hyphens with backslash+space.  "\\ " is the same two
        # characters the former "\ " literal produced, without relying on an
        # invalid escape sequence.
        line = line.replace("-", "\\ ")
        if line.startswith("xx"):
            # Lines annotated with a leading "xx" are never searched.
            print ("skip this line\n")
            skip = 1
        elif "'" in line:
            # A ' marks a word as a stop word in a previous result file:
            # strip the mark, remember the word, and force a new search.
            # NOTE(review): a marked word in the last "|" field still carries
            # whatever follows it on the line (tab, count, newline) when it
            # is recorded -- confirm this is acceptable.
            do_search = 1
            cleaned = []
            for word in line.split("|"):
                if "'" in word:
                    word = word.replace("'", "")
                    stopWords.append(word)
                    saveStopWord(word)
                cleaned.append(word)
            line = "|".join(cleaned)
        line = removeStopWords(line)
        if "\t" in line:
            # A tab means the line already carries an abstract count.
            (gene, count) = line.split("\t")
            # Re-query only when the stored count is 20 or more, to save time
            # on reruns.
            # NOTE(review): the threshold is hard-coded; the module-level
            # mincnt is not consulted -- confirm.
            if int(count) >= 20:
                do_search = 1
        else:
            # No count yet: always search.
            gene = line.strip()
            do_search = 1
        if do_search == 1 and skip == 0:
            if "|" in gene:
                synos = gene.split("|")
                # The first entry (gene name) is kept regardless of length;
                # later entries are kept only when longer than 3 characters.
                # NOTE(review): the original comments described the cutoff as
                # "at least 3 characters", but this test keeps only 4+ --
                # confirm the intended cutoff.
                kept = [synos[0]] + [syno for syno in synos[1:] if len(syno) > 3]
                gene = "|".join(kept)
            gene_q = gene.replace("|", "\"[tiab] OR \"")
            gene_q += "[tiab]"
            count = gene_addiction_cnt(gene_q)
            print("original line->\t"+inputline.strip())
            print("stopword rmed->\t"+line.strip())
            print("final  result->\t"+gene+"\t"+count)
            # count is written as returned; no newline is added here.
            out.write(gene+"\t"+count)
        else:
            print("original resl->\t"+inputline.strip())
            out.write(inputline)

# Flush and close the counts file before the external sort reads it;
# otherwise buffered records may be missing from the sorted output.
out.close()

sorted_f = output_f.replace(".txt","_sorted.txt")
# Numeric, descending sort on the tab-separated second field.  The delimiter
# is passed as a plain single-quoted literal tab because os.system runs
# /bin/sh, which is not guaranteed to understand the $'\t' form.
os.system("sort -k2 -t'\t' -rn " + output_f + " > " + sorted_f )