#!/bin/env python3
"""Collect sentences in which a gene co-occurs with a drug or addiction keyword.

For every gene listed in the input table with more than 20 abstracts, PubMed
is queried through a shell pipeline (esearch / efetch / fetch-pubmed / xtract),
the returned titles and abstracts are split into sentences, and each sentence
that mentions the gene together with a drug or addiction keyword is written
to ``gene_addiction_sentences.tab``.  A per-gene abstract count is then
derived from that file with a second shell pipeline.

``drug_d`` and ``addiction_d`` are not defined in this file; they are
presumably supplied by the star import from ``ratspub_keywords``.
"""
import os
import re
import shlex
import time

from nltk.tokenize import sent_tokenize

from ratspub_keywords import *


def undic(dic):
    """Return all values of *dic* joined into one ``|``-separated string."""
    return "|".join(dic.values())


def _whole_word_pattern(w):
    """Compile a case-insensitive pattern matching *w* between word boundaries."""
    return re.compile(r'\b({0})\b'.format(w), flags=re.IGNORECASE)


def findWholeWord(w):
    """Return a search function for *w* as a whole word, ignoring case.

    *w* is inserted into the pattern verbatim, so it is treated as a regular
    expression fragment, not as a literal string.
    """
    return _whole_word_pattern(w).search


def _collect_hits(sent, patterns, category, symb, pmid, rows):
    """Append one output row per keyword pattern found in *sent*.

    Returns the (possibly rewritten) sentence so later keyword groups see the
    same text the original sequential code would have passed on.
    """
    for keyword, pattern in patterns.items():
        if pattern.search(sent):
            # NOTE(review): the replace() calls and the r'\1' substitution are
            # no-ops as written; the empty literals look like markup that was
            # lost at some point -- confirm before removing or restoring.
            sent = sent.replace("", "").replace("", "")
            sent = pattern.sub(r'\1', sent)
            rows.append(
                symb + "\t" + category + "\t" + keyword + "\t"
                + pmid + "\t" + sent + "\n"
            )
    return sent


def getSentences(query, genes):
    """Fetch abstracts for *query* and return gene/keyword co-mention rows.

    Args:
        query: PubMed query, already quoted for the shell; it is spliced
            into the command line unchanged.
        genes: ``|``-separated gene symbol and synonyms; the first entry is
            used as the symbol in the output.

    Returns:
        A string of tab-separated lines
        ``symbol, "drug"|"addiction", keyword, pmid, sentence``, each
        terminated by a newline.  Empty when nothing matched.
    """
    command = (
        "esearch -db pubmed -query " + query
        + " | efetch -format uid |fetch-pubmed -path /run/media/hao/PubMed/Archive/"
        + " | xtract -pattern PubmedArticle"
        + " -element MedlineCitation/PMID,ArticleTitle,AbstractText"
        + "|sed \"s/-/ /g\""
    )
    # Close the pipe deterministically instead of leaking it.
    with os.popen(command) as pipe:
        abstracts = pipe.read()

    gene_syno = genes.split("|")
    symb = gene_syno[0]

    # Compile every pattern once per call rather than once per sentence.
    # Gene synonyms are literal names, so they are escaped; an empty synonym
    # would match at every word boundary and is therefore skipped.
    gene_patterns = [
        _whole_word_pattern(re.escape(gene)) for gene in gene_syno if gene
    ]
    # Keyword values are inserted verbatim (as regex fragments), exactly as
    # the original code did.
    drug_patterns = {
        name: _whole_word_pattern(regex) for name, regex in drug_d.items()
    }
    addiction_patterns = {
        name: _whole_word_pattern(regex) for name, regex in addiction_d.items()
    }

    rows = []
    for row in abstracts.split("\n"):
        # First column is the PMID; the rest is title + abstract text.
        pmid, *fields = row.split("\t")
        text = " ".join(fields)
        # Keep a sentence only if it contains the gene.
        for sent in sent_tokenize(text):
            for gene_pattern in gene_patterns:
                if gene_pattern.search(sent):
                    sent = gene_pattern.sub(r'\1', sent)
                    sent = _collect_hits(
                        sent, drug_patterns, "drug", symb, pmid, rows)
                    sent = _collect_hits(
                        sent, addiction_patterns, "addiction", symb, pmid, rows)
    return "".join(rows)


def _tiab_clause(terms):
    """Turn ``a|b|c`` into the PubMed clause ``("a"[tiab] OR "b"[tiab] OR "c")``.

    NOTE(review): the final term carries no ``[tiab]`` tag, as in the
    original query -- confirm whether that is intended.
    """
    return "(\"" + terms.replace("|", "\"[tiab] OR \"") + "\")"


addiction = undic(addiction_d)
drug = undic(drug_d)


def main():
    """Run the extraction for every gene with more than 20 abstracts."""
    with open("gene_addiction_sentences.tab", "w+") as out, \
            open("./ncbi_gene_symb_syno_name_txid9606_absCnt_sorted_absCnt_sorted_absCnt_sorted.txt", "r") as f:
        for line in f:
            if not line.strip():
                # A blank line cannot be unpacked into two fields.
                continue
            (genes, abstractCount) = line.strip().split("\t")
            if int(abstractCount) > 20:
                symb = genes.split("|")[0]
                print(symb + "-->" + genes)
                raw_query = (
                    _tiab_clause(addiction)
                    + " AND " + _tiab_clause(drug)
                    + " AND " + _tiab_clause(genes)
                )
                # shlex.quote wraps the query in single quotes like the
                # original did, but also survives apostrophes in gene names.
                q = shlex.quote(raw_query)
                out.write(getSentences(q, genes))
    # The output file is closed here, so the shell pipeline sees all rows.
    os.system("cut -f 1,4 gene_addiction_sentences.tab |uniq |cut -f 1 |uniq -c |sort -rn > topGeneAbstractCount.tab")


if __name__ == "__main__":
    main()