diff options
Diffstat (limited to 'topGene_step1_cnt_abstracts.py')
-rwxr-xr-x | topGene_step1_cnt_abstracts.py | 92 |
1 files changed, 75 insertions, 17 deletions
diff --git a/topGene_step1_cnt_abstracts.py b/topGene_step1_cnt_abstracts.py index 0880aff..780c314 100755 --- a/topGene_step1_cnt_abstracts.py +++ b/topGene_step1_cnt_abstracts.py @@ -1,11 +1,16 @@ #!/bin/env python3 import os +import sys import re import time -from ratspub import * +from ratspub_keywords import * + +def undic(dic): + return "|".join(dic.values()) def gene_addiction_cnt(gene): - q="\"(" + addiction.replace("|", "[tiab] OR ") + ") AND (" + drug.replace("|", "[tiab] OR ", ) + ") AND (" + gene + ")\"" + time.sleep(0.2) + q="\'(\"" + addiction.replace("|", "\"[tiab] OR \"") + "\") AND (\"" + drug.replace("|", "\"[tiab] OR \"", ) + "\") AND (\"" + gene + "\")\'" count=os.popen('esearch -db pubmed -query ' + q + ' | xtract -pattern ENTREZ_DIRECT -element Count ').read() if (len(count)==0): print("pause") @@ -14,24 +19,77 @@ def gene_addiction_cnt(gene): else: return (count) -out=open("gene_addiction_abstract_cnt_result.tab", "w+") +def removeStopWords(terms): + out=str() + for one in terms.upper().split("|"): + if one not in stopWords: + out+="|"+one + return(out[1:]) + +def saveStopWord(w): + with open (stopword_f,"a") as swf: + swf.write(w+"\n") + return + + +# either start with ncbi_gene_symb_syno_name_txid9606 for fresh new counts +# or recount the results after adding additional stopwords + +if len(sys.argv)==2: + input_f=sys.argv[1] +else: + input_f="./ncbi_gene_symb_syno_name_txid9606.txt" + +addiction=undic(addiction_d) +drug=undic(drug_d) +output_f=input_f.replace(".txt","_absCnt.txt") +out=open(output_f, "w+") -with open ("./ncbi_gene_symb_syno_name_txid9606.txt", "r") as f: +stopword_f="./stop_words_addiction_gene_search.txt" +with open (stopword_f, "r") as swf: + stopWords=swf.read().upper().split("\n") + swf.close() + +with open (input_f, "r") as f: for line in f: - line=re.sub(r"\)|\(|\[|\]|\*|\'","",line.strip()) + rerun=0 + count=-1 + inputline=line + if "'" in line: + words=line.split("|") + line=str() + for word in words: + # ' is used to mark/annotate a word is a stop word in the results + if "'" in word: + word=word.replace("'","") + stopWords.append(word) + saveStopWord(word) + rerun=1 + # remove the ' mark + line+="|"+word + line=line[1:] + line=removeStopWords(line) + # tab is added if there are abstracts counts if "\t" in line: - (gene, synostring)=line.strip().split("\t") - if "|" in synostring: - synos=synostring.split("|") - elif len(synostring)>3: - synos=synostring - for syno in synos: - if len(syno)>3: - gene+="|"+syno + (gene, count)=line.split("\t") else: gene=line.strip() - gene_q=gene.replace("|", " [tiab] OR ") + # remove synonyms with only two letters + if "|" in gene: + synos=gene.split("|") + gene=str() + for syno in synos: + if len(syno)>2: + gene+="|"+syno + gene=gene[1:] + gene_q=gene.replace("|", "\"[tiab] OR \"") gene_q+="[tiab]" - count=gene_addiction_cnt(gene_q) - print(gene+"\t"+count) - out.write(gene+"\t"+count) + if rerun==1 or count== -1 : + count=gene_addiction_cnt(gene_q) + print("original line->\t"+inputline.strip()) + print("stopword rmed->\t"+line.strip()) + print("final result->\t"+gene+"\t"+count) + # only save the non_zero results + if (int(count)>0): + out.write(gene+"\t"+count) + |