#!/usr/bin/env python3
import os
import re

from nltk.tokenize import sent_tokenize
from ratspub_keywords import *
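
# The functions below search PubMed abstracts for sentences that mention both
# a gene and a keyword from one of the synonym dictionaries imported from
# ratspub_keywords (function_d, brain_d, drug_d, addiction_d, stress_d,
# psychiatric_d), and emit the results as Cytoscape.js elements or plain JSON
# for the web front end; pubmed_path is set at the bottom of this file.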
## turn a dictionary of synonyms into one regular expression
def undic(dic):
    return "|".join(dic.values())
def findWholeWord(w):
    return re.compile(r'\b({0})\b'.format(w), flags=re.IGNORECASE).search
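# findWholeWord returns a bound .search function: findWholeWord("bdnf")(text)
# yields a match object when "bdnf" occurs as a whole word in text
# (case-insensitive), and None otherwise.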
def getSentences(query, gene):
    # esearch/efetch/xtract are NCBI EDirect tools; fetch-pubmed pulls the
    # records from the local archive at pubmed_path. The sed call replaces
    # hyphens with spaces before tokenization.
    abstracts = os.popen("esearch -db pubmed -query " + query
                         + " | efetch -format uid | fetch-pubmed -path " + pubmed_path
                         + " | xtract -pattern PubmedArticle -element MedlineCitation/PMID,ArticleTitle,AbstractText"
                         + " | sed \"s/-/ /g\"").read()
    out = str()
    for row in abstracts.split("\n"):
        tiab = row.split("\t")
        pmid = tiab.pop(0)
        tiab = " ".join(tiab)
        sentences = sent_tokenize(tiab)
        ## keep a sentence only if it contains the gene
        for sent in sentences:
            if findWholeWord(gene)(sent):
                sent = re.sub(r'\b(%s)\b' % gene, r'<strong>\1</strong>', sent, flags=re.I)
                out += pmid + "\t" + sent + "\n"
    return out
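# getSentences returns one line per matching sentence in the form
# "PMID<tab>sentence", with the gene wrapped in <strong> tags; a made-up
# record: 12345678<tab>Chronic stress reduced <strong>Bdnf</strong> expression.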
def gene_category(gene, cat_d, query, cat):
    # e.g. gene_category("BDNF", addiction_d, undic(addiction_d), "addiction")
    q = "\"(" + query.replace("|", " OR ") + ") AND " + gene + "\""
    sents = getSentences(q, gene)
    out = str()
    for sent in sents.split("\n"):
        for key in cat_d:
            if findWholeWord(cat_d[key])(sent):
                sent = sent.replace("<b>", "").replace("</b>", "")  # remove other highlights
                sent = re.sub(r'\b(%s)\b' % cat_d[key], r'<b>\1</b>', sent, flags=re.I)  # highlight the keyword
                out += gene + "\t" + cat + "\t" + key + "\t" + sent + "\n"
    return out
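# Each output line is "gene<tab>category<tab>keyword<tab>PMID<tab>sentence";
# the PMID and sentence fields come straight from getSentences above.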
def generate_nodes(nodes_d, nodetype):
    # include all search terms even if there are no edges, to show negative results
    json0 = str()
    for node in nodes_d:
        json0 += "{ data: { id: '" + node + "', nodecolor: '" + nodecolor[nodetype] + "', nodetype: '" + nodetype + "', url:'/shownode?nodetype=" + nodetype + "&node=" + node + "' } },\n"
    return json0
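# Sample element produced (Cytoscape.js node syntax; 'memory' is a made-up id):
# { data: { id: 'memory', nodecolor: '#A9CCE3', nodetype: 'function',
#           url:'/shownode?nodetype=function&node=memory' } },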
def generate_nodes_json(nodes_d, nodetype):
    # same terms as generate_nodes, but emitted as plain JSON objects instead
    # of Cytoscape.js element syntax
    nodes_json0 = str()
    for node in nodes_d:
        nodes_json0 += "{ \"id\": \"" + node + "\", \"nodecolor\": \"" + nodecolor[nodetype] + "\", \"nodetype\": \"" + nodetype + "\", \"url\":\"/shownode?nodetype=" + nodetype + "&node=" + node + "\" },\n"
    return nodes_json0
def generate_edges(data, filename):
    pmid_temp = 0
    json0 = str()
    edgeCnts = {}
    for line in data.split("\n"):
        if len(line.strip()) != 0:
            (source, cat, target, pmid, sent) = line.split("\t")
            edgeID = filename + "|" + source + "|" + target
            if (edgeID in edgeCnts) and (pmid != pmid_temp):
                edgeCnts[edgeID] += 1
                pmid_temp = pmid
            elif edgeID not in edgeCnts:
                edgeCnts[edgeID] = 1
                pmid_temp = pmid
    for edgeID in edgeCnts:
        (_, source, target) = edgeID.split("|")
        json0 += "{ data: { id: '" + edgeID + "', source: '" + source + "', target: '" + target + "', sentCnt: " + str(edgeCnts[edgeID]) + ", url:'/sentences?edgeID=" + edgeID + "' } },\n"
    return json0
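# Note on sentCnt: the counter only increments when the PMID differs from the
# previously counted line for that edge, so sentCnt approximates the number of
# supporting abstracts rather than the raw number of sentences.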
def generate_edges_json(data, filename):
    # same counting logic as generate_edges, but emits plain JSON objects
    pmid_temp = 0
    edges_json0 = str()
    edgeCnts = {}
    for line in data.split("\n"):
        if len(line.strip()) != 0:
            (source, cat, target, pmid, sent) = line.split("\t")
            edgeID = filename + "|" + source + "|" + target
            if (edgeID in edgeCnts) and (pmid != pmid_temp):
                edgeCnts[edgeID] += 1
                pmid_temp = pmid
            elif edgeID not in edgeCnts:
                edgeCnts[edgeID] = 1
                pmid_temp = pmid
    for edgeID in edgeCnts:
        (_, source, target) = edgeID.split("|")
        edges_json0 += "{ \"id\": \"" + edgeID + "\", \"source\": \"" + source + "\", \"target\": \"" + target + "\", \"sentCnt\": \"" + str(edgeCnts[edgeID]) + "\", \"url\":\"/sentences?edgeID=" + edgeID + "\" },\n"
    return edges_json0
def searchArchived(sets, query, filetype):
    if sets == 'topGene':
        dataFile = "topGene_addiction_sentences.tab"
    elif sets == 'GWAS':
        dataFile = "gwas_addiction.tab"
    # count, per category, the distinct PMIDs whose sentences mention the query gene
    with open(dataFile, "r") as sents:
        pmid_list = []
        catCnt = {}
        for line in sents:
            (symb, cat0, cat1, pmid, sent) = line.split("\t")
            if symb.upper() == query.upper():
                if (cat1 in catCnt) and (pmid + cat1 not in pmid_list):
                    pmid_list.append(pmid + cat1)
                    catCnt[cat1] += 1
                elif cat1 not in catCnt:
                    catCnt[cat1] = 1
                    pmid_list.append(pmid + cat1)
    # the query gene itself is always a node
    nodes = "{ data: { id: '" + query + "', nodecolor: '" + "#2471A3" + "', fontweight:700, url:'/progress?query=" + query + "' } },\n"
    edges = str()
    gwas_json = str()
    for key in catCnt:
        if sets == 'GWAS':
            nc = nodecolor["GWAS"]
            nodes += "{ data: { id: '" + key + "', nodecolor: '" + nc + "', url:'https://www.ebi.ac.uk/gwas/search?query=" + key.replace("_GWAS", "") + "' } },\n"
        elif key in drug_d:
            nc = nodecolor["drug"]
            nodes += "{ data: { id: '" + key + "', nodecolor: '" + nc + "', url:'/shownode?node=" + key + "' } },\n"
        else:
            nc = nodecolor["addiction"]
            nodes += "{ data: { id: '" + key + "', nodecolor: '" + nc + "', url:'/shownode?node=" + key + "' } },\n"
        edgeID = dataFile + "|" + query + "|" + key
        edges += "{ data: { id: '" + edgeID + "', source: '" + query + "', target: '" + key + "', sentCnt: " + str(catCnt[key]) + ", url:'/sentences?edgeID=" + edgeID + "' } },\n"
        gwas_json += "{ \"id\": \"" + edgeID + "\", \"source\": \"" + query + "\", \"target\": \"" + key + "\", \"sentCnt\": \"" + str(catCnt[key]) + "\", \"url\":\"/sentences?edgeID=" + edgeID + "\" },\n"
    if filetype == 'cys':
        return nodes + edges
    else:
        return gwas_json
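# e.g. searchArchived('GWAS', 'Bdnf', 'cys') returns Cytoscape.js node and
# edge elements for the archived GWAS hits ('Bdnf' is just an illustrative
# query); any other filetype returns the plain-JSON edge list instead.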
# brain regions have too many short acronyms for the undic approach to work
# well, so PubMed is searched with the following terms instead
brain_query_term = "cortex|accumbens|striatum|amygdala|hippocampus|tegmental|mesolimbic|infralimbic|prelimbic|habenula"
function = undic(function_d)
addiction = undic(addiction_d)
drug = undic(drug_d)
# node colors from https://htmlcolorcodes.com/ (third column down)
nodecolor = {'function': "#A9CCE3", 'addiction': "#D7BDE2", 'drug': "#F9E79F", 'brain': "#A3E4D7", 'GWAS': "#AEB6BF", 'stress': "#EDBB99", 'psychiatric': "#F5B7B1"}
n0 = generate_nodes(function_d, 'function')
n1 = generate_nodes(addiction_d, 'addiction')
n2 = generate_nodes(drug_d, 'drug')
n3 = generate_nodes(brain_d, 'brain')
n4 = generate_nodes(stress_d, 'stress')
n5 = generate_nodes(psychiatric_d, 'psychiatric')
n6 = generate_nodes(psychiatric_d, 'GWAS')  # GWAS nodes reuse the psychiatric keyword set
nj0 = generate_nodes_json(function_d, 'function')
nj1 = generate_nodes_json(addiction_d, 'addiction')
nj2 = generate_nodes_json(drug_d, 'drug')
nj3 = generate_nodes_json(brain_d, 'brain')
nj4 = generate_nodes_json(stress_d, 'stress')
nj5 = generate_nodes_json(psychiatric_d, 'psychiatric')
nj6 = generate_nodes_json(psychiatric_d, 'GWAS')
# locate the local PubMed archive based on which machine we are running on;
# on any other host pubmed_path stays undefined and getSentences will fail
host = os.popen('hostname').read().strip()
if host == "x1":
    pubmed_path = "/run/media/hao/PubMed/Archive/"
elif host == "hchen3":
    pubmed_path = "/media/hao/2d554499-6c5b-462d-85f3-5c49b25f4ac8/PubMed/Archive"
elif host == "penguin2":
    pubmed_path = "/export2/PubMed/Archive"