From eb51746ce69fb43450b6896e4a71e5052df51115 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Fri, 17 May 2019 19:10:08 -0500 Subject: find top genes --- server.py | 2 +- topGene_step1_cnt_abstracts.py | 7 +++++ topGene_step2_cnt_sentences.py | 59 ++++++++++++++++++++++++++++++++++++++++++ topGene_step3_generate_html.py | 50 +++++++++++++++++++++++++++++++++++ 4 files changed, 117 insertions(+), 1 deletion(-) create mode 100755 topGene_step2_cnt_sentences.py create mode 100755 topGene_step3_generate_html.py diff --git a/server.py b/server.py index d9e3551..0a97413 100755 --- a/server.py +++ b/server.py @@ -127,7 +127,7 @@ def sentences(): @app.route("/showTopGene") def showTopGene(): topGene=request.args.get('topGene') - topGeneSentFile="gene_addiction_sentences_cnt.tab" + topGeneSentFile="gene_addiction_sentences.tab" with open(topGeneSentFile, "r") as sents: catCnt={} for sent in sents: diff --git a/topGene_step1_cnt_abstracts.py b/topGene_step1_cnt_abstracts.py index 780c314..a9dd23f 100755 --- a/topGene_step1_cnt_abstracts.py +++ b/topGene_step1_cnt_abstracts.py @@ -39,6 +39,7 @@ if len(sys.argv)==2: input_f=sys.argv[1] else: input_f="./ncbi_gene_symb_syno_name_txid9606.txt" + input_f="./ncbi_gene_symb_syno_name_txid9606_p2.txt" addiction=undic(addiction_d) drug=undic(drug_d) @@ -72,6 +73,8 @@ with open (input_f, "r") as f: # tab is added if there are abstracts counts if "\t" in line: (gene, count)=line.split("\t") + if int(count)<100: + rerun=1 else: gene=line.strip() # remove synonyms with only two letters @@ -93,3 +96,7 @@ with open (input_f, "r") as f: if (int(count)>0): out.write(gene+"\t"+count) +sorted_f=out_f.replace(".txt","_sorted.txt") +os.system("sort -k2 -t$'\t' -rn " + out_f + " > " + sorted_f ) + + diff --git a/topGene_step2_cnt_sentences.py b/topGene_step2_cnt_sentences.py new file mode 100755 index 0000000..fe97cdd --- /dev/null +++ b/topGene_step2_cnt_sentences.py @@ -0,0 +1,59 @@ +#!/bin/env python3 +import os +import re +import time +from nltk.tokenize import sent_tokenize +from ratspub_keywords import * + +def undic(dic): + return "|".join(dic.values()) + +def findWholeWord(w): + return re.compile(r'\b({0})\b'.format(w), flags=re.IGNORECASE).search + +def getSentences(query, genes): + abstracts = os.popen("esearch -db pubmed -query " + query + " | efetch -format uid |fetch-pubmed -path /run/media/hao/PubMed/Archive/ | xtract -pattern PubmedArticle -element MedlineCitation/PMID,ArticleTitle,AbstractText|sed \"s/-/ /g\"").read() + gene_syno=genes.split("|") + symb=gene_syno[0] + out=str() + for row in abstracts.split("\n"): + tiab=row.split("\t") + pmid = tiab.pop(0) + tiab= " ".join(tiab) + sentences = sent_tokenize(tiab) + ## keep the sentence only if it contains the gene + for sent in sentences: + for gene in gene_syno: + if findWholeWord(gene)(sent): + sent=re.sub(r'\b(%s)\b' % gene, r'\1', sent, flags=re.I) + for drug0 in drug_d: + if findWholeWord(drug_d[drug0])(sent) : + sent=sent.replace("","").replace("","") + sent=re.sub(r'\b(%s)\b' % drug_d[drug0], r'\1', sent, flags=re.I) + out+=symb+"\t"+"drug\t" + drug0+"\t"+pmid+"\t"+sent+"\n" + for add0 in addiction_d: + if findWholeWord(addiction_d[add0])(sent) : + sent=sent.replace("","").replace("","") + sent=re.sub(r'\b(%s)\b' % addiction_d[add0], r'\1', sent, flags=re.I) + out+=symb+"\t"+"addiction\t"+add0+"\t"+pmid+"\t"+sent+"\n" + return(out) + +addiction=undic(addiction_d) +drug=undic(drug_d) + + +out=open("gene_addiction_sentences.tab", "w+") +cnt=0 +with open ("./ncbi_gene_symb_syno_name_txid9606_absCnt_sorted_absCnt_sorted_absCnt_sorted.txt", "r") as f: + for line in f: + (genes, abstractCount)=line.strip().split("\t") + if int(abstractCount)>20: + symb=genes.split("|")[0] + print(symb+"-->"+genes) + q="\'(\"" + addiction.replace("|", "\"[tiab] OR \"") + "\") AND (\"" + drug.replace("|", "\"[tiab] OR \"", ) + "\") AND (\"" + genes.replace("|", "\"[tiab] OR \"", ) + "\")\'" + sentences=getSentences(q,genes) + out.write(sentences) +out.close() + +os.system("cut -f 1,4 gene_addiction_sentences.tab |uniq |cut -f 1 |uniq -c |sort -rn > topGeneAbstractCount.tab") + diff --git a/topGene_step3_generate_html.py b/topGene_step3_generate_html.py new file mode 100755 index 0000000..dfcd6fe --- /dev/null +++ b/topGene_step3_generate_html.py @@ -0,0 +1,50 @@ +import re + +## generate the html page for the top genes + +## put gene names and alias in a dictionary +geneNames={} +with open ("./ncbi_gene_symb_syno_name_txid9606_absCnt_sorted_absCnt_sorted.txt","r") as f: + for line in f: + (genes, count)=line.strip().split("\t") + gene=genes.split("|") + geneNames[gene[0]]=genes.strip() + +out=str() +html=str() +with open("./topGeneAbstractCount.tab" ,"r") as gc: + cnt=0 + for line in gc: + cnt+=1 + line=re.sub(r'^\s+','',line) + print (line) + pmid_cnt, symb=line.strip().split() + out+= symb+"\t"+geneNames[symb]+"\n" + html+="