standardized the search function

author: Hao Chen 2019-05-13 13:41:44 -0500
committer: Hao Chen 2019-05-13 13:41:44 -0500
commit: 1a9b0800d3e95d9cd2a1ec7597dd793cf6b6c62a (patch)
tree: ebfdbfe3c8778b393accbcf29d3e276347edff58
parent: 4931c70884c7aa7bb3d4fba927597a2f89b4c9b0 (diff)
download: genecup-1a9b0800d3e95d9cd2a1ec7597dd793cf6b6c62a.tar.gz
2 files changed, 28 insertions, 47 deletions
diff --git a/ratspub.py b/ratspub.py
index 79abc87..c82c9b8 100755
--- a/ratspub.py
+++ b/ratspub.py
@@ -3,7 +3,7 @@ from nltk.tokenize import sent_tokenize
 import os
 import re
 
-global function_d, brain_d, drug_d, addiction_d
+global function_d, brain_d, drug_d, addiction_d, brain_query_term
 
 ## turn dictionary (synonyms) to regular expression
 def undic(dic):
@@ -27,46 +27,17 @@ def getSentences(query, gene):
                 out+=pmid+"\t"+sent+"\n"
     return(out)
 
-def gene_addiction(gene):
-    # search gene name & drug name  in the context of addiction terms (i.e., exclude etoh affects cancer, or methods to extract cocaine) 
-    q="\"(" + addiction.replace("|", " OR ")  + ") AND (" + drug.replace("|", " OR ", ) + ") AND " + gene + "\""
+def gene_category(gene, cat_d, query, cat):
+    # e.g. gene="BDNF", cat_d=addiction_d, query=undic(addiction_d), cat="addiction"
+    q="\"(" + query.replace("|", " OR ")  + ") AND " + gene + "\""
     sents=getSentences(q, gene)
     out=str()
     for sent in sents.split("\n"):
-        for drug0 in drug_d:
-            if findWholeWord(drug_d[drug0])(sent) :
-                sent=sent.replace("<b>","").replace("</b>","")
-                sent=re.sub(r'\b(%s)\b' % drug_d[drug0], r'<b>\1</b>', sent, flags=re.I)
-                out+=gene+"\t"+"drug\t" + drug0+"\t"+sent+"\n"
-        for add0 in addiction_d:
-            if findWholeWord(addiction_d[add0])(sent) :
-                sent=sent.replace("<b>","").replace("</b>","")
-                sent=re.sub(r'\b(%s)\b' % addiction_d[add0], r'<b>\1</b>', sent, flags=re.I)
-                out+=gene+"\t"+"addiction\t"+add0+"\t"+sent+"\n"
-    return(out)
-
-def gene_anatomical(gene):
-    q="\"(" + brain.replace("|", " OR ")  + ") AND " + gene + "\""
-    sents=getSentences(q,gene)
-    out=str()
-    for sent in sents.split("\n"):
-        for brain0 in brain_d:
-            if findWholeWord(brain_d[brain0])(sent) :
-                sent=sent.replace("<b>","").replace("</b>","")
-                sent=re.sub(r'\b(%s)\b' % brain_d[brain0], r'<b>\1</b>', sent, flags=re.I)
-                out+=gene+"\t"+"brain\t"+brain0+"\t"+sent+"\n"
-    return(out)
-
-def gene_functional(gene):
-    q="\"(" + function.replace("|", " OR ")  + ") AND " + gene + "\""
-    sents=getSentences(q,gene)
-    out=str()
-    for sent in sents.split("\n"):
-        for bio0 in function_d:
-            if findWholeWord(function_d[bio0])(sent) :
-                sent=sent.replace("<b>","").replace("</b>","")
-                sent=re.sub(r'\b(%s)\b' % function_d[bio0], r'<b>\1</b>', sent, flags=re.I)
-                out+=gene+"\t"+"function\t"+bio0+"\t"+sent+"\n"
+        for key in cat_d:
+            if findWholeWord(cat_d[key])(sent) :
+                sent=sent.replace("<b>","").replace("</b>","") # remove other highlights
+                sent=re.sub(r'\b(%s)\b' % cat_d[key], r'<b>\1</b>', sent, flags=re.I) # highlight keyword
+                out+=gene+"\t"+ cat + "\t"+key+"\t"+sent+"\n"
     return(out)
 
 def generate_nodes(nodes_d, nodetype):
@@ -117,7 +88,7 @@ brain_d ={"cortex":"cortex|prefrontal|pfc|mPFC|vmpfc|corticostriatal|cortico lim
           "vta":"ventral tegmental|vta|pvta|mesolimbic|limbic|midbrain|mesoaccumbens"
           }
 # brain region has too many short acronyms to just use the undic function, so search PubMed using the following 
-brain="cortex|accumbens|striatum|amygadala|hippocampus|tegmental|mesolimbic|infralimbic|prelimbic"
+brain_query_term="cortex|accumbens|striatum|amygadala|hippocampus|tegmental|mesolimbic|infralimbic|prelimbic"
 function_d={"signalling":"signalling|signaling|phosphorylation|glycosylation",
             "transcription":"transcription|methylation|hypomethylation|hypermethylation|histone|ribosome",
             "neuroplasticity":"neuroplasticity|plasticity|long term potentiation|LTP|long term depression|LTD|synaptic|epsp|epsc|neurite|neurogenesis|boutons|mIPSC|IPSC|IPSP",
diff --git a/server.py b/server.py
index 2cff990..b71e6bb 100755
--- a/server.py
+++ b/server.py
@@ -18,8 +18,6 @@ def root():
 def about():
     return render_template('about.html')
 
-
-
 @app.route('/progress')
 def progress():
     # only 1-6 terms are allowed
@@ -43,7 +41,7 @@ def progress():
 @app.route("/search")
 def search():
     genes=session['query']
-    percent=round(100/(len(genes)*3),1)
+    percent=round(100/(len(genes)*4),1)
     snt_file=session['path']+"_snt"
     cysdata=open(session['path']+"_cy","w+")
     sntdata=open(snt_file,"w+")
@@ -53,20 +51,32 @@ def search():
         nodes=default_nodes
         progress=0
         for  gene in genes:
+            gene=gene.replace("-"," ")
             nodes+="{ data: { id: '" + gene +  "', nodecolor:'#E74C3C', fontweight:700, url:'/gene_gene?gene="+gene+"'} },\n"
+            # report progress immediately
             progress+=percent
             yield "data:"+str(progress)+"\n\n"
-            sent0=gene_addiction(gene)
+            addiction=undic(addiction_d)
+            sent0=gene_category(gene, addiction_d, addiction, "addiction")
             e0=generate_edges(sent0, tf_name)
-            sent1=gene_functional(gene)
+            #  
+            function=undic(function_d)
+            sent1=gene_category(gene, function_d, function, "function")
             progress+=percent
             yield "data:"+str(progress)+"\n\n"
             e1=generate_edges(sent1, tf_name)
-            sent2=gene_anatomical(gene)
+            #
+            drug=undic(drug_d)
+            sent2=gene_category(gene, drug_d, drug, "drug")
             progress+=percent
+            yield "data:"+str(progress)+"\n\n"
             e2=generate_edges(sent2, tf_name)
-            edges+=e0+e1+e2
-            sentences+=sent0+sent1+sent2
+            # brain has its own query terms that do not include the many short acronyms
+            sent3=gene_category(gene, brain_d, brain_query_term, "brain")
+            progress+=percent
+            e3=generate_edges(sent3, tf_name)
+            edges+=e0+e1+e2+e3
+            sentences+=sent0+sent1+sent2+sent3
             #save data before the last yield
             if (progress>99):
                 progress=100
author	Hao Chen	2019-05-13 13:41:44 -0500
committer	Hao Chen	2019-05-13 13:41:44 -0500
commit	1a9b0800d3e95d9cd2a1ec7597dd793cf6b6c62a (patch)
tree	ebfdbfe3c8778b393accbcf29d3e276347edff58
parent	4931c70884c7aa7bb3d4fba927597a2f89b4c9b0 (diff)
download	genecup-1a9b0800d3e95d9cd2a1ec7597dd793cf6b6c62a.tar.gz