# topGene_step1_cnt_abstracts.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95

#!/bin/env python3 
import os
import sys
import re
import time
from ratspub_keywords import *

def undic(dic):
    """Collapse the values of *dic* into one "|"-separated string.

    The values are joined in the mapping's iteration order; an empty
    mapping gives an empty string.
    """
    separator = "|"
    all_values = dic.values()
    return separator.join(all_values)

def gene_addiction_cnt(gene):
    """Return the PubMed hit count for *gene* combined with the keyword lists.

    The query has the form
    ``(<addiction terms>) AND (<drug terms>) AND ("<gene>")``, where the
    module-level ``addiction`` and ``drug`` strings supply the "|"-separated
    keyword lists.  It is run through the ``esearch | xtract`` command-line
    pipeline (presumably NCBI EDirect; both tools must be on PATH).

    Args:
        gene: Gene part of the query, already formatted by the caller.

    Returns:
        The raw text printed by ``xtract`` (the count, normally followed by a
        newline).  It is returned unstripped so the return value is unchanged
        for existing callers.

    An empty reply is treated as a transient failure: "pause" is printed, the
    function waits 15 seconds and asks again, without any retry limit.
    """
    import shlex  # local import: only this function needs it

    tiab_or = "\"[tiab] OR \""
    query = (
        "(\"" + addiction.replace("|", tiab_or)
        + "\") AND (\"" + drug.replace("|", tiab_or)
        + "\") AND (\"" + gene + "\")"
    )
    # shlex.quote yields the same single-quoted argument as before for queries
    # without a single quote, and stays safe if one ever slips through.
    command = (
        'esearch -db pubmed  -query ' + shlex.quote(query)
        + ' | xtract -pattern ENTREZ_DIRECT -element Count '
    )
    # Retry in a loop rather than by recursion, so a long outage cannot
    # exhaust the interpreter's call stack.
    while True:
        time.sleep(0.2)  # short delay before every request (presumably rate limiting)
        with os.popen(command) as pipe:
            count = pipe.read()
        if count:
            return count
        print("pause")
        time.sleep(15)

def removeStopWords(terms):
    """Upper-case a "|"-separated term list and drop the stop words.

    Every term of *terms* (after upper-casing the whole string) that is
    present in the module-level ``stopWords`` collection is removed; the
    remaining terms are re-joined with "|" in their original order.
    """
    candidates = terms.upper().split("|")
    survivors = [term for term in candidates if term not in stopWords]
    return "|".join(survivors)

def saveStopWord(w):
    """Append *w*, followed by a newline, to the stop-word file.

    The file path is taken from the module-level ``stopword_f``.  Nothing is
    returned.
    """
    handle = open(stopword_f, "a")
    try:
        handle.writelines([w + "\n"])
    finally:
        handle.close()


# Input is either the fresh ncbi_gene_symb_syno_name_txid9606 list (brand-new
# counts) or an earlier result file that is recounted after additional stop
# words were marked in it with a single quote (').
#
# Each input line is "SYMBOL|synonym|..." optionally followed by a tab and an
# abstract count.  Lines with a non-zero count are written to the output file
# as "GENES<tab>COUNT".

if len(sys.argv)==2:
    input_f=sys.argv[1]
else:
    input_f="./ncbi_gene_symb_syno_name_txid9606.txt"

addiction=undic(addiction_d)
drug=undic(drug_d)
# Build the output name from the input's root.  A plain str.replace(".txt", ...)
# leaves the name unchanged when the input has no ".txt", and opening that for
# writing would truncate the input file before it is read.
output_f=os.path.splitext(input_f)[0]+"_absCnt.txt"

stopword_f="./stop_words_addiction_gene_search.txt"
with open(stopword_f, "r") as swf:
    stopWords=swf.read().upper().split("\n")

# Both files are managed by the with-statement so the output is flushed and
# closed even if a query or a parse step raises.
with open(input_f, "r") as f, open(output_f, "w") as out:
    for inputline in f:
        # Split off the optional count first, so that the last synonym is not
        # glued to "\t<count>\n" during stop-word handling.
        names, tab, file_count = inputline.rstrip("\r\n").partition("\t")
        # None means "no usable count on this line" -> a query is needed.
        count = file_count.strip() or None
        rerun = False
        if "'" in names:
            words = []
            for word in names.split("|"):
                # ' is used to mark/annotate that a word is a stop word in the results
                if "'" in word:
                    # remove the ' mark
                    word = word.replace("'", "")
                    stop = word.strip()
                    # removeStopWords compares upper-cased terms, so the
                    # in-memory entry must be upper case to take effect now.
                    stopWords.append(stop.upper())
                    saveStopWord(stop)
                    rerun = True
                words.append(word)
            names = "|".join(words)
        names = removeStopWords(names)
        cleaned_line = (names + tab + file_count).strip()
        gene = names.strip()
        # remove synonyms with only one or two letters (a lone symbol is kept)
        if "|" in gene:
            gene = "|".join(syno for syno in gene.split("|") if len(syno) > 2)
        if not gene:
            # Blank line, or every name was filtered out: nothing to query.
            continue
        # NOTE(review): gene_addiction_cnt wraps this string in ("...") so the
        # final "[tiab]" ends up inside the closing quote -- confirm that the
        # resulting PubMed query is what is intended.
        gene_q = gene.replace("|", "\"[tiab] OR \"") + "[tiab]"
        if rerun or count is None:
            # The query result carries a trailing newline; normalise it here.
            count = gene_addiction_cnt(gene_q).strip()
        print("original line->\t"+inputline.strip())
        print("stopword rmed->\t"+cleaned_line)
        print("final  result->\t"+gene+"\t"+count)
        # only save the non_zero results
        if int(count) > 0:
            # Write the line terminator explicitly instead of relying on a
            # newline that happens to be embedded in the count.
            out.write(gene+"\t"+count+"\n")