8 files changed, 41 insertions, 109 deletions
diff --git a/ratspub.py b/ratspub.py
deleted file mode 100755
index 0cc5d8a..0000000
--- a/ratspub.py
+++ /dev/null
@@ -1,88 +0,0 @@
-#!/bin/env python3 
-from nltk.tokenize import sent_tokenize
-import os
-import re
-from ratspub_keywords import *
-
-global function_d, brain_d, drug_d, addiction_d, brain_query_term, pubmed_path
-
-
-## turn dictionary (synonyms) to regular expression
-def undic(dic):
-    return "|".join(dic.values())
-
-def findWholeWord(w):
-    return re.compile(r'\b({0})\b'.format(w), flags=re.IGNORECASE).search
-
-def getSentences(query, gene):
-    abstracts = os.popen("esearch -db pubmed -query " +  query + " | efetch -format uid |fetch-pubmed -path "+ pubmed_path + " | xtract -pattern PubmedArticle -element MedlineCitation/PMID,ArticleTitle,AbstractText|sed \"s/-/ /g\"").read()
-    out=str()
-    for row in abstracts.split("\n"):
-        tiab=row.split("\t")
-        pmid = tiab.pop(0)
-        tiab= " ".join(tiab)
-        sentences = sent_tokenize(tiab)
-        ## keep the sentence only if it contains the gene 
-        for sent in sentences:
-            if findWholeWord(gene)(sent):
-                sent=re.sub(r'\b(%s)\b' % gene, r'<strong>\1</strong>', sent, flags=re.I)
-                out+=pmid+"\t"+sent+"\n"
-    return(out)
-
-def gene_category(gene, cat_d, query, cat):
-    #e.g. BDNF, addiction_d, undic(addiction_d) "addiction"
-    q="\"(" + query.replace("|", " OR ")  + ") AND " + gene + "\""
-    sents=getSentences(q, gene)
-    out=str()
-    for sent in sents.split("\n"):
-        for key in cat_d:
-            if findWholeWord(cat_d[key])(sent) :
-                sent=sent.replace("<b>","").replace("</b>","") # remove other highlights
-                sent=re.sub(r'\b(%s)\b' % cat_d[key], r'<b>\1</b>', sent, flags=re.I) # highlight keyword
-                out+=gene+"\t"+ cat + "\t"+key+"\t"+sent+"\n"
-    return(out)
-
-def generate_nodes(nodes_d, nodetype):
-    # include all search terms even if there are no edges, just to show negative result 
-    json0 =str()
-    for node in nodes_d:
-        json0 += "{ data: { id: '" + node +  "', nodecolor: '" + nodecolor[nodetype] + "', nodetype: '"+nodetype + "', url:'/shownode?nodetype=" + nodetype + "&node="+node+"' } },\n"
-    return(json0)
-
-def generate_edges(data, filename):
-    json0=str()
-    edgeCnts={}
-    for line in  data.split("\n"):
-        if len(line.strip())!=0:
-            (source, cat, target, pmid, sent) = line.split("\t")
-            edgeID=filename+"|"+source+"|"+target
-            if edgeID in edgeCnts:
-                edgeCnts[edgeID]+=1
-            else:
-                edgeCnts[edgeID]=1
-    for edgeID in edgeCnts:
-        (filename, source,target)=edgeID.split("|")
-        json0+="{ data: { id: '" + edgeID + "', source: '" + source + "', target: '" + target + "', sentCnt: " + str(edgeCnts[edgeID]) + ",  url:'/sentences?edgeID=" + edgeID + "' } },\n"
-    return(json0)
-
-# brain region has too many short acronyms to just use the undic function, so search PubMed using the following 
-brain_query_term="cortex|accumbens|striatum|amygadala|hippocampus|tegmental|mesolimbic|infralimbic|prelimbic|habenula"
-function=undic(function_d)
-addiction=undic(addiction_d)
-drug=undic(drug_d)
-
-nodecolor={'function':"#A9CCE3", 'addiction': "#D7BDE2", 'drug': "#F9E79F", 'brain':"#A3E4D7"}
-#https://htmlcolorcodes.com/
-n0=generate_nodes(function_d, 'function')
-n1=generate_nodes(addiction_d, 'addiction')
-n2=generate_nodes(drug_d, 'drug')
-n3=generate_nodes(brain_d, 'brain')
-default_nodes=n0+n1+n2+n3
-
-
-host= os.popen('hostname').read().strip()
-if host=="x1":
-    pubmed_path="/run/media/hao/PubMed/Archive/"
-elif host=="hchen3":
-    pubmed_path="/media/hao/2d554499-6c5b-462d-85f3-5c49b25f4ac8/PubMed/Archive"
-
diff --git a/server.py b/server.py
index d947e1f..3dc7f9c 100755
--- a/server.py
+++ b/server.py
@@ -25,8 +25,8 @@ def progress():
     genes=genes.replace(",", " ")
     genes=genes.replace(";", " ")
     genes=genes.split()
-    if len(genes)>=160:
-        message="<span class='text-danger'>Up to five terms can be searched at a time</span>"
+    if len(genes)>=100:
+        message="<span class='text-danger'>Up to 100 terms can be searched at a time</span>"
         return render_template('index.html', message=message)
     elif len(genes)==0:
         message="<span class='text-danger'>Please enter a search term </span>"
@@ -100,14 +100,14 @@ def search():
 
 @app.route('/cytoscape')
 def cytoscape():
-    message2="This graph is interactive: <li>Click on a line to see the sentences <i>in a new window</i><li> Click on a gene to search its relations with top 200 addiction genes<li>Click on a keyword to see the terms included in the search <i> in a new window</i><p>"
+    message2="<h4> Gene vs Keywords</h4>This graph is interactive: <li>Click on a line to see the sentences <i>in a new window</i><li> Click on a gene to search its relations with top 200 addiction genes<li>Click on a keyword to see the terms included in the search <i> in a new window</i><p>"
     with open(session['path']+"_cy","r") as f:
         elements=f.read()
     with open(session['path']+"_0link","r") as z:
         zeroLink=z.read()
         if (len(zeroLink)>0):
             message2+="<span style=\"color:darkred;\">No result was found for these genes: " + zeroLink + "</span>"
-    return render_template('cytoscape.html', elements=elements, message="Gene vs Keywords", message2=message2)
+    return render_template('cytoscape.html', elements=elements, message2=message2)
 
 @app.route("/sentences")
 def sentences():
@@ -194,11 +194,11 @@ def gene_gene():
     with open(gg_file, "w+") as gg:
         gg.write(out)
         gg.close()
-    results="<h4>Gene vs top addiction genes</h4> Click on the number of sentences will show those sentences. Click on the top addiction gene will show an archived search for that gene.<hr>"
+    results="<h4>Gene vs top addiction genes</h4> Click on the number of sentences will show those sentences. Click on the <span style=\"background-color:#FcF3cf\">top addiction genes</span> will show an archived search for that gene.<hr>"
     topGeneHits={}
     for key in hitGenes.keys():
         url=gg_file+"|"+query+"|"+key
-        topGeneHits["<li>"+query+" and <a href=/showTopGene?topGene="+key+" target=_New>"+key+"</a> :  <a href=/sentences?edgeID=" + url+ " target=_new>" +  str(hitGenes[key]) + " sentences.</a> \n"]=hitGenes[key]
+        topGeneHits["<li>"+query+" and <a href=/showTopGene?topGene="+key+" target=_gene><span style=\"background-color:#FcF3cf\">"+key+"</span></a> :  <a href=/sentences?edgeID=" + url+ " target=_new>" +  str(hitGenes[key]) + " sentences.</a> \n"]=hitGenes[key]
     #yyps = [(k, d[k]) for k in sorted(d, key=d.get, reverse=True)]
     topSorted = [(k, topGeneHits[k]) for k in sorted(topGeneHits, key=topGeneHits.get, reverse=True)]
     for k,v in topSorted:
diff --git a/static/ratspub.png b/static/ratspub.png
new file mode 100644
index 0000000..a031bc0
--- /dev/null
+++ b/static/ratspub.png
diff --git a/static/style.css b/static/style.css
index f57b27c..aa1bbd2 100644
--- a/static/style.css
+++ b/static/style.css
@@ -52,4 +52,6 @@ a:active {
   width: 20%;
 }
 
-
+#searchform{
+	background:#F8F9F9;
+}
diff --git a/templates/about.html b/templates/about.html
index 4ea4112..ea5e4c7 100644
--- a/templates/about.html
+++ b/templates/about.html
@@ -2,15 +2,13 @@
 {% extends "layout.html" %}
 {% block content %}
 
-<h3> What does RatsPub do?</h3>
+<h3> About RatsPub ...</h3>
 
-<p>RatsPub searches PubMed to find abstracts containing genes of interest and a list of curated addiction-related keywords. The abstracts corresponding to these returned PMIDs are then retrieve from a <a href="https://dataguide.nlm.nih.gov/edirect/archive.html">local archive of the PubMed</a>. No limit on the date of publication is set. Each abstract is then broken down into sentences, which are then filtered by genes and keywords. 
+<p>RatsPub searches PubMed to find abstracts containing genes of interest and a list of curated addiction-related keywords. The abstracts corresponding to these returned PMIDs are then retrieved from a <a href="https://dataguide.nlm.nih.gov/edirect/archive.html">local archive of PubMed</a>. No limit on the date of publication is set. Each abstract is then broken down into sentences, which are then filtered by gene names and keywords. 
 
-<p>These gene-keyword relationships are presented as an interactive <a href="https://js.cytoscape.org" >cytoscape</a> graph. 
+<p>Clicking the links will bring up the corresponding sentences in a new browser window. Clicking the keywords will bring up the corresponding search terms. Clicking the genes will start a new search to find sentences containing the target gene and the top 200 addiction genes. 
 
-<p>Clicking  the links will bring up the corresponding sentences in a new browser window. Clicking the keyword nodes will bring up all the terms included. Clicking on the gene nodes will start a new search to find sentences containing that gene and the top 200 addiction genes. 
-
-<p> The top 200 addiction genes were obtained by searching All human genes against addiction related keywords. The archived results for these top addiction genes can be accessed via the <a href="/allTopGenes">Addiction Genes</a> link. 
+<p> The top 200 addiction genes were obtained by searching 29,761 human genes against addiction-related keywords. To ensure comprehensive coverage, gene aliases obtained from the NCBI Gene database were included in the search. The results were extensively curated to remove aliases that matched words that were not gene names or that matched the wrong genes. Some incorrect results remain because the same alias also produced correct results. The archived results for these top addiction genes can be accessed via the <a href="/allTopGenes">Addiction Genes</a> link. 
 	
 <hr>
 <a href="https://github.com/chen42/ratspub"> Source code </a>
diff --git a/templates/index.html b/templates/index.html
index b94853e..f5b6aea 100644
--- a/templates/index.html
+++ b/templates/index.html
@@ -4,16 +4,36 @@
 
 <h3> <u>R</u>elationship  with <u>A</u>ddiction <u>T</u>hrough <u>S</u>earches of <u>Pub</u>Med </span></a>
 </h3>
- 
 
-	This app searches PubMed to find <i>sentences</i> that contain the query terms (e.g., gene symbols) and  <a href="https://github.com/chen42/RatsPub/blob/master/ratspub.py#L95-L129">keywords</a> related to drug addiction.
-<br> 
+<table>
+	<tr><td >
+<p>
+	RatsPub searches PubMed to find <i>sentences</i> that contain the query terms (i.e., gene symbols) and  <a href="https://github.com/chen42/RatsPub/blob/master/ratspub_keywords.py">drug addiction-related keywords</a>.  These gene-keyword relationships are presented as an interactive graph that can efficiently  answer the question <b>"What do we know about these genes and addiction?" </b>
+
+<p> In addition, clicking gene names in the graph will launch a new search for sentences containing the target gene and 200 addiction-related genes. These results help to answer the question <b>"Are there  genes that can link my gene of interest to addiction?"  </b>
+		</td>
+		<td >
+			<div class='img'><img src="/static/ratspub.png" class="img-fluid"></div>
+		</td></tr>
+		<tr><td colspan=2>
+
+	Up to 100 gene symbols can be searched at a time. Gene symbols can be separated by either a space or a semicolon.  Gene aliases will <i>not</i> be automatically included because of the large number of false matches associated with gene synonyms retrieved from databases. 
+			</td></tr></table>
+
 	<p>
+	<b>Example</b>: Rgma Nrxn3; Chrna3
+
+<br> 
     <form action="/progress">
-      <input name="query" class="form-control form-control-lg" type="search" placeholder="Rgma Nrxn3" aria-label="search" value="">
-      <button class="btn btn-outline-success my-2 my-sm-0" type="submit">Search</button>
-    </form>
+	<textarea name="query" class="form-control form-control-lg" type="search" rows="3" id="searchform">
+	</textarea>
+  <div class="form-group row">
+    <div class="col-sm-10">
+      <button type="submit" class="btn btn-primary">Submit</button>
+    </div>
+  </div>
 
+    </form>
 {% endblock %}
 
 
diff --git a/templates/layout.html b/templates/layout.html
index 08977c3..6a85473 100644
--- a/templates/layout.html
+++ b/templates/layout.html
@@ -22,7 +22,7 @@
   <div class="collapse navbar-collapse" id="navbarSupportedContent">
     <ul class="navbar-nav mr-auto">
       <li class="nav-item active">
-        <a class="nav-link" href="/">Home <span class="sr-only">(current)</span></a>
+        <a class="nav-link" href="/">Search <span class="sr-only">(current)</span></a>
       </li>
       <li class="nav-item">
         <a class="nav-link" href="/allTopGenes">Addiction Genes</a>
diff --git a/templates/topAddictionGene.html b/templates/topAddictionGene.html
index e427560..e62103a 100644
--- a/templates/topAddictionGene.html
+++ b/templates/topAddictionGene.html
@@ -5,7 +5,7 @@
 <h3> Top addiction related genes </h3>
 
 <br>
-These genes are ranked by the number of PubMed abstracts that contain the name of the gene and one or more addiction related keyword.
+These genes are ranked by the number of PubMed abstracts that contain the name of the gene and one or more addiction-related keywords. Aliases were obtained from the NCBI Gene database and have been curated to remove most, but not all, false matches.
 <hr>
 
 <ol><li><a href="/showTopGene?topGene=CNR1">CNR1</a> <span style="font-size:small; color:grey">(CNRS1; PROTOCADHERIN ALPHA CLUSTER, COMPLEX LOCUS)</span><br>