diff options
-rwxr-xr-x | server.py | 2 | ||||
-rwxr-xr-x | topGene_step1_cnt_abstracts.py | 7 | ||||
-rwxr-xr-x | topGene_step2_cnt_sentences.py | 59 | ||||
-rwxr-xr-x | topGene_step3_generate_html.py | 50 |
4 files changed, 117 insertions, 1 deletions
@@ -127,7 +127,7 @@ def sentences(): @app.route("/showTopGene") def showTopGene(): topGene=request.args.get('topGene') - topGeneSentFile="gene_addiction_sentences_cnt.tab" + topGeneSentFile="gene_addiction_sentences.tab" with open(topGeneSentFile, "r") as sents: catCnt={} for sent in sents: diff --git a/topGene_step1_cnt_abstracts.py b/topGene_step1_cnt_abstracts.py index 780c314..a9dd23f 100755 --- a/topGene_step1_cnt_abstracts.py +++ b/topGene_step1_cnt_abstracts.py @@ -39,6 +39,7 @@ if len(sys.argv)==2: input_f=sys.argv[1] else: input_f="./ncbi_gene_symb_syno_name_txid9606.txt" + input_f="./ncbi_gene_symb_syno_name_txid9606_p2.txt" addiction=undic(addiction_d) drug=undic(drug_d) @@ -72,6 +73,8 @@ with open (input_f, "r") as f: # tab is added if there are abstracts counts if "\t" in line: (gene, count)=line.split("\t") + if int(count)<100: + rerun=1 else: gene=line.strip() # remove synonyms with only two letters @@ -93,3 +96,7 @@ with open (input_f, "r") as f: if (int(count)>0): out.write(gene+"\t"+count) +sorted_f=out_f.replace(".txt","_sorted.txt") +os.system("sort -k2 -t$'\t' -rn " + out_f + " > " + sorted_f ) + + diff --git a/topGene_step2_cnt_sentences.py b/topGene_step2_cnt_sentences.py new file mode 100755 index 0000000..fe97cdd --- /dev/null +++ b/topGene_step2_cnt_sentences.py @@ -0,0 +1,59 @@ +#!/bin/env python3 +import os +import re +import time +from nltk.tokenize import sent_tokenize +from ratspub_keywords import * + +def undic(dic): + return "|".join(dic.values()) + +def findWholeWord(w): + return re.compile(r'\b({0})\b'.format(w), flags=re.IGNORECASE).search + +def getSentences(query, genes): + abstracts = os.popen("esearch -db pubmed -query " + query + " | efetch -format uid |fetch-pubmed -path /run/media/hao/PubMed/Archive/ | xtract -pattern PubmedArticle -element MedlineCitation/PMID,ArticleTitle,AbstractText|sed \"s/-/ /g\"").read() + gene_syno=genes.split("|") + symb=gene_syno[0] + out=str() + for row in abstracts.split("\n"): + tiab=row.split("\t") + pmid = tiab.pop(0) + tiab= " ".join(tiab) + sentences = sent_tokenize(tiab) + ## keep the sentence only if it contains the gene + for sent in sentences: + for gene in gene_syno: + if findWholeWord(gene)(sent): + sent=re.sub(r'\b(%s)\b' % gene, r'<strong>\1</strong>', sent, flags=re.I) + for drug0 in drug_d: + if findWholeWord(drug_d[drug0])(sent) : + sent=sent.replace("<b>","").replace("</b>","") + sent=re.sub(r'\b(%s)\b' % drug_d[drug0], r'<b>\1</b>', sent, flags=re.I) + out+=symb+"\t"+"drug\t" + drug0+"\t"+pmid+"\t"+sent+"\n" + for add0 in addiction_d: + if findWholeWord(addiction_d[add0])(sent) : + sent=sent.replace("<b>","").replace("</b>","") + sent=re.sub(r'\b(%s)\b' % addiction_d[add0], r'<b>\1</b>', sent, flags=re.I) + out+=symb+"\t"+"addiction\t"+add0+"\t"+pmid+"\t"+sent+"\n" + return(out) + +addiction=undic(addiction_d) +drug=undic(drug_d) + + +out=open("gene_addiction_sentences.tab", "w+") +cnt=0 +with open ("./ncbi_gene_symb_syno_name_txid9606_absCnt_sorted_absCnt_sorted_absCnt_sorted.txt", "r") as f: + for line in f: + (genes, abstractCount)=line.strip().split("\t") + if int(abstractCount)>20: + symb=genes.split("|")[0] + print(symb+"-->"+genes) + q="\'(\"" + addiction.replace("|", "\"[tiab] OR \"") + "\") AND (\"" + drug.replace("|", "\"[tiab] OR \"", ) + "\") AND (\"" + genes.replace("|", "\"[tiab] OR \"", ) + "\")\'" + sentences=getSentences(q,genes) + out.write(sentences) +out.close() + +os.system("cut -f 1,4 gene_addiction_sentences.tab |uniq |cut -f 1 |uniq -c |sort -rn > topGeneAbstractCount.tab") + diff --git a/topGene_step3_generate_html.py b/topGene_step3_generate_html.py new file mode 100755 index 0000000..dfcd6fe --- /dev/null +++ b/topGene_step3_generate_html.py @@ -0,0 +1,50 @@ +import re + +## generate the html page for the top genes + +## put gene names and alias in a dictionary +geneNames={} +with open ("./ncbi_gene_symb_syno_name_txid9606_absCnt_sorted_absCnt_sorted.txt","r") as f: + for line in f: + (genes, count)=line.strip().split("\t") + gene=genes.split("|") + geneNames[gene[0]]=genes.strip() + +out=str() +html=str() +with open("./topGeneAbstractCount.tab" ,"r") as gc: + cnt=0 + for line in gc: + cnt+=1 + line=re.sub(r'^\s+','',line) + print (line) + pmid_cnt, symb=line.strip().split() + out+= symb+"\t"+geneNames[symb]+"\n" + html+="<li><a href=\"/showTopGene?topGene="+symb+"\">"+symb+"</a><br>\n" + if cnt==500: + break + +with open("topGene_symb_alias.txt", "w+") as tg: + tg.write(out) + tg.close() + + +htmlout=''' +{% extends "layout.html" %} +{% block content %} + +<h3> Top addiction related genes </h3> + +<br> +These genes are ranked by the number of PubMed abstracts that contain the name of the gene and one or more addiction related keyword. +<hr> + +<ol>''' + html + ''' +</ol> +{% endblock %} +''' + +with open("./templates/topAddictionGene.html", "w+") as html_f: + html_f.write(htmlout) + html_f.close() + |