#!/bin/env python3
from nltk.tokenize import sent_tokenize
import os
import re
from addiction_keywords import *
from gene_synonyms import *
import ast
# A `global` statement at module scope has no effect: names bound here are
# already module-level. pubmed_path itself is assigned by the archive check
# at the bottom of this file.
global pubmed_path
def undic(dic):
    """Flatten an iterable of iterables into a single ``|``-separated string.

    Every element of every inner iterable is converted with ``str`` and the
    results are joined, in order, by ``|``. An inner iterable without
    elements contributes an empty field; an empty ``dic`` gives ``""``.
    """
    groups = ["|".join(map(str, group)) for group in dic]
    return "|".join(groups)
def findWholeWord(w):
    """Return a case-insensitive whole-word search function for *w*.

    *w* is inserted unescaped between word-boundary anchors inside one
    capturing group, so it is interpreted as a regular expression. The
    value returned is the compiled pattern's bound ``search`` method: call
    it with a string to get a match object or ``None``.
    """
    whole_word = re.compile(r'\b({0})\b'.format(w), flags=re.IGNORECASE)
    return whole_word.search
def getabstracts(gene,query):
    """
    Return PMID, title and abstract text for articles matching gene + query.

    The work is done by two shell pipelines:

    1. esearch -db pubmed -query ... -- searches PubMed for the gene + keyword
       query; this only creates a search handle on NCBI's servers.
    2. efetch -format uid -- pulls back just the PMIDs (unique identifiers)
       from that handle. No abstracts or XML are fetched from NCBI.
    3. xfetch -db pubmed -- looks those PMIDs up in the local PubMed mirror
       (avoids hitting NCBI servers for the full abstracts).
    4. xtract -pattern PubmedArticle -element MedlineCitation/PMID,ArticleTitle,AbstractText
       -- extracts PMID, title and abstract text from the XML into
       tab-separated fields.
    5. sed "s/-/ /g" -- replaces hyphens with spaces (so hyphenated gene names
       match keyword searches later).

    Parameters:
        gene:  gene symbol; it is restricted to title/abstract with [tiab].
        query: PubMed keyword expression that is AND-ed with the gene.

    Returns:
        The raw text printed by the second pipeline, or "" when the search
        returned no PMIDs.
    """
    # Imported locally so this function does not depend on edits elsewhere.
    import shlex

    # gene and query are interpolated into a shell command line. Quote the
    # whole search term so characters such as ", $ or backticks reach esearch
    # literally instead of being interpreted by the shell.
    search_term = "(" + query + ") AND (" + gene + " [tiab])"

    # Step 1: fetch PMIDs from PubMed
    pmid_cmd = "esearch -db pubmed -query " + shlex.quote(search_term) + " | efetch -format uid"
    print(f" popen: {pmid_cmd}")
    # The with-block closes the pipe deterministically instead of leaving it
    # to the garbage collector.
    with os.popen(pmid_cmd) as pipe:
        pmids = pipe.read().strip()
    if not pmids:
        print(f" no PMIDs found for {gene}")
        return ""
    pmid_list = pmids.split("\n")
    print(f" PMIDs ({len(pmid_list)}): {' '.join(pmid_list)}")

    # Step 2: fetch abstracts from local mirror
    abs_cmd = "echo " + shlex.quote(pmids) + " | xfetch -db pubmed" \
        + " | xtract -pattern PubmedArticle -element MedlineCitation/PMID,ArticleTitle,AbstractText | sed \"s/-/ /g\""
    print(f" popen: {abs_cmd}")
    with os.popen(abs_cmd) as pipe:
        abstracts = pipe.read()
    return abstracts
def getSentences(gene, sentences_ls):
    """Keep only the sentences that mention *gene* as a whole word.

    Each sentence is expected to start with its PMID followed by a single
    space. Matching is case-insensitive and *gene* is used unescaped, i.e.
    as a regular expression.

    Parameters:
        gene:         gene symbol (regex fragment) to look for.
        sentences_ls: iterable of "<pmid> <sentence text>" strings.

    Returns:
        One "<pmid>\\t<sentence text>\\n" row per kept sentence, concatenated
        into a single string ("" when nothing matches).
    """
    rows = []
    for sent in sentences_ls:
        # Keep the sentence only if it contains the gene
        if not re.search(r'\b' + gene.lower() + r'\b', sent.lower()):
            continue
        pmid, sep, text = sent.partition(' ')
        if not sep:
            # No space means there is no PMID prefix to split off; the
            # previous split(' ', 1)[1] raised IndexError here.
            continue
        # NOTE(review): the replacement is just the captured text, so this
        # substitution leaves the sentence unchanged as written; confirm
        # whether highlight markup was meant to surround the match.
        text = re.sub(r'\b(%s)\b' % gene, r'\1', text, flags=re.I)
        rows.append(pmid + "\t" + text + "\n")
    return "".join(rows)
def _keyword_alternatives(keyword):
    """Expand a ``|``-separated keyword string into plural-tolerant regex alternatives.

    Every alternative ends in ``s*`` so that both the singular and the
    plural form match. An empty keyword has nothing to match and yields [].
    """
    if not keyword:
        return []
    if keyword.endswith('s'):
        expanded = keyword + "*"
    else:
        expanded = keyword + "s*"
    expanded = expanded.replace("s|", "s*|")
    expanded = expanded.replace("|", "s*|")
    # Alternatives that already ended in "s" received the suffix twice.
    expanded = expanded.replace("s*s*", "s*")
    return expanded.split('|')


def gene_category(gene, cat_d, cat, abstracts,addiction_flag,dictn):
    """Find gene sentences that also mention a keyword of category *cat*.

    The sentences are first filtered with getSentences(gene, abstracts) and
    then matched, case-insensitively and on word boundaries, against the
    plural-tolerant form of every keyword.

    Parameters:
        gene:           gene symbol, passed on to getSentences.
        cat_d:          when addiction_flag == 1, an iterable of keyword
                        strings (e.g. the keys of addiction_d); otherwise the
                        key under which the keywords are stored in *dictn*.
        cat:            category label copied into every output row.
        abstracts:      sentence list handed to getSentences.
        addiction_flag: 1 selects the *cat_d* iteration, anything else the
                        dictn[cat_d] lookup.
        dictn:          mapping used when addiction_flag != 1;
                        dictn[cat_d][key_1] is iterated for keyword strings.

    Returns:
        A string of "gene\\tcat\\tkeyword\\t<getSentences row>\\n" rows, one per
        (sentence, matching alternative) pair.
    """
    # e.g. BDNF, addiction_d, undic(addiction_d) "addiction"
    sentences = getSentences(gene, abstracts).split("\n")
    rows = []
    if addiction_flag == 1:
        # Expansion does not depend on the sentence, so do it once up front.
        # This branch used to test `key == 's'` instead of the last
        # character, unlike the branch below, so keys ending in "s" were
        # expanded to "...ss*" and no longer matched their singular form.
        expansions = [(key, _keyword_alternatives(key)) for key in cat_d]
        for sent in sentences:
            for key, alternatives in expansions:
                for alt in alternatives:
                    if re.search(r'\b{}\b'.format(alt), sent, re.IGNORECASE):
                        # NOTE(review): both replace() calls have empty
                        # arguments and the substitution re-inserts the match
                        # unchanged, so these two lines currently leave the
                        # sentence as it is; confirm whether highlight markup
                        # is missing here.
                        sent = sent.replace("","").replace("","") # remove other highlights
                        sent = re.sub(r'\b(%s)\b' % alt, r'\1', sent, flags=re.I) # highlight keyword
                        rows.append(gene + "\t" + cat + "\t" + key + "\t" + sent + "\n")
    else:
        for key_1 in dictn[cat_d].keys():
            for key_2 in dictn[cat_d][key_1]:
                alternatives = _keyword_alternatives(key_2)
                for sent in sentences:
                    for alt in alternatives:
                        if re.search(r'\b{}\b'.format(alt), sent, re.IGNORECASE):
                            # NOTE(review): as written these two lines do not
                            # alter the sentence (empty replace arguments,
                            # identity substitution); confirm intent.
                            sent = sent.replace("","").replace("","") # remove other highlights
                            sent = re.sub(r'\b(%s)\b' % alt, r'\1', sent, flags=re.I) # highlight keyword
                            rows.append(gene + "\t" + cat + "\t" + key_1 + "\t" + sent + "\n")
    return "".join(rows)
def generate_nodes(nodes_d, nodetype,nodecolor):
    """Render one ``{ data: {...} },`` node entry per item of *nodes_d*.

    Every entry carries the node id, the shared *nodecolor* and *nodetype*,
    and a ``/shownode`` URL for that node. The entries are returned
    concatenated, each one terminated by ``",\\n"``.
    """
    # Include all search terms even if there are no edges, just to show negative result
    entries = []
    for node in nodes_d:
        entries.append("".join((
            "{ data: { id: '", node,
            "', nodecolor: '", nodecolor,
            "', nodetype: '", nodetype,
            "', url:'/shownode?nodetype=", nodetype,
            "&node=", node,
            "' } },\n",
        )))
    return "".join(entries)
def generate_nodes_json(nodes_d, nodetype,nodecolor):
    """Render one double-quoted ``{ "id": ... },`` object per item of *nodes_d*.

    Each object holds the node id, *nodecolor*, *nodetype* and a
    ``/shownode`` URL. Objects are concatenated and each ends with
    ``",\\n"``, so the result is a fragment rather than a complete JSON
    document.
    """
    # Include all search terms even if there are no edges, just to show negative result
    return "".join(
        '{ "id": "' + node
        + '", "nodecolor": "' + nodecolor
        + '", "nodetype": "' + nodetype
        + '", "url":"/shownode?nodetype=' + nodetype
        + '&node=' + node
        + '" },\n'
        for node in nodes_d
    )
def generate_edges(data, filename):
    """Aggregate tab-separated sentence rows into ``{ data: {...} },`` edge entries.

    Parameters:
        data:     newline-separated rows of the form
                  "source\\tcategory\\ttarget\\tpmid\\tsentence"; blank lines
                  are ignored.
        filename: prefix of every edge id ("filename|source|target").

    Returns:
        One entry per distinct edge id, in first-seen order, concatenated
        into a string. ``sentCnt`` counts the rows of that edge whose
        (pmid, target) combination had not been seen on any earlier row.

    Raises:
        ValueError: if a non-blank row does not have exactly five fields.
    """
    seen = set()   # pmid+target keys already counted; a set keeps lookups O(1)
    edges = {}     # edge id -> [source, target, count]
    for line in data.split("\n"):
        if not line.strip():
            continue
        source, _cat, target, pmid, _sent = line.split("\t")
        edge_id = filename + "|" + source + "|" + target
        dedupe_key = pmid + target
        if dedupe_key in seen:
            continue
        seen.add(dedupe_key)
        if edge_id in edges:
            edges[edge_id][2] += 1
        else:
            # Keep source/target next to the count so the id never has to be
            # split again (that failed when a component contained "|").
            edges[edge_id] = [source, target, 1]
    parts = []
    for edge_id, (source, target, count) in edges.items():
        parts.append("{ data: { id: '" + edge_id + "', source: '" + source + "', target: '" + target + "', sentCnt: " + str(count) + ", url:'/sentences?edgeID=" + edge_id + "' } },\n")
    return "".join(parts)
def generate_edges_json(data, filename):
    """Aggregate tab-separated sentence rows into double-quoted edge objects.

    Parameters:
        data:     newline-separated rows of the form
                  "source\\tcategory\\ttarget\\tpmid\\tsentence"; blank lines
                  are ignored.
        filename: prefix of every edge id ("filename|source|target").

    Returns:
        One ``{ "id": ..., "sentCnt": "<n>", ... },`` object per distinct
        edge id, in first-seen order, concatenated into a string. The count
        is emitted as a quoted string and covers the rows of that edge whose
        (pmid, target) combination had not been seen on any earlier row.

    Raises:
        ValueError: if a non-blank row does not have exactly five fields.
    """
    seen = set()   # pmid+target keys already counted; a set keeps lookups O(1)
    edges = {}     # edge id -> [source, target, count]
    for line in data.split("\n"):
        if not line.strip():
            continue
        source, _cat, target, pmid, _sent = line.split("\t")
        edge_id = filename + "|" + source + "|" + target
        dedupe_key = pmid + target
        if dedupe_key in seen:
            continue
        seen.add(dedupe_key)
        if edge_id in edges:
            edges[edge_id][2] += 1
        else:
            # Keep source/target next to the count so the id never has to be
            # split again (that failed when a component contained "|").
            edges[edge_id] = [source, target, 1]
    parts = []
    for edge_id, (source, target, count) in edges.items():
        parts.append("{ \"id\": \"" + edge_id + "\", \"source\": \"" + source + "\", \"target\": \"" + target + "\", \"sentCnt\": \"" + str(count) + "\", \"url\":\"/sentences?edgeID=" + edge_id + "\" },\n")
    return "".join(parts)
def searchArchived(sets, query, filetype,sents, path_user):
    """Build graph fragments for *query* from pre-computed sentence rows.

    Parameters:
        sets:      data set name; only the value 'GWAS' changes the output
                   (it adds one node per category).
        query:     gene symbol; rows are selected by case-insensitive
                   comparison with their first field.
        filetype:  unused, kept for interface compatibility.
        sents:     iterable of "symbol\\tcat0\\tcat1\\tpmid\\tsentence" rows.
        path_user: prefix of the edge ids, which always end in
                   "gwas_results.tab|<query>|<cat1>".

    Returns:
        A tuple (graph, gwas_json, sn_file):
        graph     -- the query node, the per-category nodes (GWAS only) and
                     the edges as ``{ data: {...} },`` entries;
        gwas_json -- the same edges as double-quoted objects;
        sn_file   -- every selected row followed by a newline.

    Raises:
        ValueError: if a row does not have exactly five fields.
    """
    seen = set()      # pmid+cat1 keys already counted; a set keeps lookups O(1)
    cat_counts = {}   # cat1 -> number of distinct PMIDs, in first-seen order
    matched = []
    for sn in sents:
        symb, _cat0, cat1, pmid, _sent = sn.split("\t")
        if symb.upper() != query.upper():
            continue
        if cat1 not in cat_counts:
            cat_counts[cat1] = 1
            seen.add(pmid + cat1)
        elif pmid + cat1 not in seen:
            seen.add(pmid + cat1)
            cat_counts[cat1] += 1
        matched.append(sn + '\n')

    # The query node is emitted for every value of `sets`; the per-set
    # assignments that used to precede the loop were never read.
    nodes = ["{ data: { id: '" + query + "', nodecolor: '" + "#2471A3" + "', fontweight:700, url:'/progress?query="+query+"' } },\n"]
    edges = []
    gwas_json = []
    gwas_color = "hsl(0, 0%, 70%)"
    for key, count in cat_counts.items():
        if sets == 'GWAS':
            nodes.append("{ data: { id: '" + key + "', nodecolor: '" + gwas_color + "', url:'https://www.ebi.ac.uk/gwas/search?query="+key.replace("_GWAS","")+"' } },\n")
        edge_id = path_user+'gwas_results.tab'+"|"+query+"|"+key
        edges.append("{ data: { id: '" + edge_id + "', source: '" + query + "', target: '" + key + "', sentCnt: " + str(count) + ", url:'/sentences?edgeID=" + edge_id + "' } },\n")
        gwas_json.append("{ \"id\": \"" + edge_id + "\", \"source\": \"" + query + "\", \"target\": \"" + key + "\", \"sentCnt\": \"" + str(count) + "\", \"url\":\"/sentences?edgeID=" + edge_id + "\" },\n")
    return ("".join(nodes) + "".join(edges), "".join(gwas_json), "".join(matched))
# Import-time check of the local PubMed archive: the directory comes from
# EDIRECT_LOCAL_ARCHIVE (default ./minipubmed) and must contain
# pubmed/Archive/00. The process exits with status 1 if either is missing.
pubmed_path=os.environ.get("EDIRECT_LOCAL_ARCHIVE", "./minipubmed")
print(f" pubmed_path={pubmed_path}")
if not os.path.isdir(pubmed_path):
    print(f"ERROR: EDIRECT_LOCAL_ARCHIVE directory not found: {pubmed_path} - note this is a recent env variable that replaces the others")
    raise SystemExit(1)
testdir = os.path.join(pubmed_path, "pubmed", "Archive", "00")
if not os.path.isdir(testdir):
    print(f"ERROR: PubMed/Archive not found in {testdir} (EDIRECT_LOCAL_ARCHIVE={pubmed_path})")
    # A second, identical raise used to follow this one; it could never add
    # anything to the check and has been dropped.
    raise SystemExit(1)