about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Hao Chen, 2019-05-16 11:46:37 -0500
committer: Hao Chen, 2019-05-16 11:46:37 -0500
commit: eeee0e3daba8dacc7301b137f131f8673c963077 (patch)
tree: ef79156f446729e73d439306b8d9264c24c61ba3
parent: cef45f7e5fbf89e759bd5a4fcaaadec5065bbadd (diff)
download: genecup-eeee0e3daba8dacc7301b137f131f8673c963077.tar.gz
combine initial search and annotation into one
-rwxr-xr-x topGene_step1_cnt_abstracts.py 92
1 files changed, 75 insertions, 17 deletions
diff --git a/topGene_step1_cnt_abstracts.py b/topGene_step1_cnt_abstracts.py
index 0880aff..780c314 100755
--- a/topGene_step1_cnt_abstracts.py
+++ b/topGene_step1_cnt_abstracts.py
@@ -1,11 +1,16 @@
#!/bin/env python3
import os
+import sys
import re
import time
-from ratspub import *
+from ratspub_keywords import *
+
+def undic(dic):
+ return "|".join(dic.values())
def gene_addiction_cnt(gene):
- q="\"(" + addiction.replace("|", "[tiab] OR ") + ") AND (" + drug.replace("|", "[tiab] OR ", ) + ") AND (" + gene + ")\""
+ time.sleep(0.2)
+ q="\'(\"" + addiction.replace("|", "\"[tiab] OR \"") + "\") AND (\"" + drug.replace("|", "\"[tiab] OR \"", ) + "\") AND (\"" + gene + "\")\'"
count=os.popen('esearch -db pubmed -query ' + q + ' | xtract -pattern ENTREZ_DIRECT -element Count ').read()
if (len(count)==0):
print("pause")
@@ -14,24 +19,77 @@ def gene_addiction_cnt(gene):
else:
return (count)
-out=open("gene_addiction_abstract_cnt_result.tab", "w+")
+def removeStopWords(terms):
+ out=str()
+ for one in terms.upper().split("|"):
+ if one not in stopWords:
+ out+="|"+one
+ return(out[1:])
+
+def saveStopWord(w):
+ with open (stopword_f,"a") as swf:
+ swf.write(w+"\n")
+ return
+
+
+# either start with ncbi_gene_symb_syno_name_txid9606 for fresh new counts
+# or recount the results after adding additional stopwords
+
+if len(sys.argv)==2:
+ input_f=sys.argv[1]
+else:
+ input_f="./ncbi_gene_symb_syno_name_txid9606.txt"
+
+addiction=undic(addiction_d)
+drug=undic(drug_d)
+output_f=input_f.replace(".txt","_absCnt.txt")
+out=open(output_f, "w+")
-with open ("./ncbi_gene_symb_syno_name_txid9606.txt", "r") as f:
+stopword_f="./stop_words_addiction_gene_search.txt"
+with open (stopword_f, "r") as swf:
+ stopWords=swf.read().upper().split("\n")
+ swf.close()
+
+with open (input_f, "r") as f:
for line in f:
- line=re.sub(r"\)|\(|\[|\]|\*|\'","",line.strip())
+ rerun=0
+ count=-1
+ inputline=line
+ if "'" in line:
+ words=line.split("|")
+ line=str()
+ for word in words:
+ # ' is used to mark/annotate a word is a stop word in the results
+ if "'" in word:
+ word=word.replace("'","")
+ stopWords.append(word)
+ saveStopWord(word)
+ rerun=1
+ # remove the ' mark
+ line+="|"+word
+ line=line[1:]
+ line=removeStopWords(line)
+ # tab is added if there are abstracts counts
if "\t" in line:
- (gene, synostring)=line.strip().split("\t")
- if "|" in synostring:
- synos=synostring.split("|")
- elif len(synostring)>3:
- synos=synostring
- for syno in synos:
- if len(syno)>3:
- gene+="|"+syno
+ (gene, count)=line.split("\t")
else:
gene=line.strip()
- gene_q=gene.replace("|", " [tiab] OR ")
+ # remove synonyms with only two letters
+ if "|" in gene:
+ synos=gene.split("|")
+ gene=str()
+ for syno in synos:
+ if len(syno)>2:
+ gene+="|"+syno
+ gene=gene[1:]
+ gene_q=gene.replace("|", "\"[tiab] OR \"")
gene_q+="[tiab]"
- count=gene_addiction_cnt(gene_q)
- print(gene+"\t"+count)
- out.write(gene+"\t"+count)
+ if rerun==1 or count== -1 :
+ count=gene_addiction_cnt(gene_q)
+ print("original line->\t"+inputline.strip())
+ print("stopword rmed->\t"+line.strip())
+ print("final result->\t"+gene+"\t"+count)
+ # only save the non_zero results
+ if (int(count)>0):
+ out.write(gene+"\t"+count)
+