diff options
author | Hao Chen | 2019-05-13 13:41:44 -0500 |
---|---|---|
committer | Hao Chen | 2019-05-13 13:41:44 -0500 |
commit | 1a9b0800d3e95d9cd2a1ec7597dd793cf6b6c62a (patch) | |
tree | ebfdbfe3c8778b393accbcf29d3e276347edff58 | |
parent | 4931c70884c7aa7bb3d4fba927597a2f89b4c9b0 (diff) | |
download | genecup-1a9b0800d3e95d9cd2a1ec7597dd793cf6b6c62a.tar.gz |
standardized the search function
-rwxr-xr-x | ratspub.py | 49 | ||||
-rwxr-xr-x | server.py | 26 |
2 files changed, 28 insertions, 47 deletions
```diff
@@ -3,7 +3,7 @@ from nltk.tokenize import sent_tokenize
 import os
 import re
 
-global function_d, brain_d, drug_d, addiction_d
+global function_d, brain_d, drug_d, addiction_d, brain_query_term
 
 ## turn dictionary (synonyms) to regular expression
 def undic(dic):
@@ -27,46 +27,17 @@ def getSentences(query, gene):
         out+=pmid+"\t"+sent+"\n"
     return(out)
 
-def gene_addiction(gene):
-    # search gene name & drug name in the context of addiction terms (i.e., exclude etoh affects cancer, or methods to extract cocaine)
-    q="\"(" + addiction.replace("|", " OR ") + ") AND (" + drug.replace("|", " OR ", ) + ") AND " + gene + "\""
+def gene_category(gene, cat_d, query, cat):
+    #e.g. BDNF, addiction_d, undic(addiction_d) "addiction"
+    q="\"(" + query.replace("|", " OR ") + ") AND " + gene + "\""
     sents=getSentences(q, gene)
     out=str()
     for sent in sents.split("\n"):
-        for drug0 in drug_d:
-            if findWholeWord(drug_d[drug0])(sent) :
-                sent=sent.replace("<b>","").replace("</b>","")
-                sent=re.sub(r'\b(%s)\b' % drug_d[drug0], r'<b>\1</b>', sent, flags=re.I)
-                out+=gene+"\t"+"drug\t" + drug0+"\t"+sent+"\n"
-        for add0 in addiction_d:
-            if findWholeWord(addiction_d[add0])(sent) :
-                sent=sent.replace("<b>","").replace("</b>","")
-                sent=re.sub(r'\b(%s)\b' % addiction_d[add0], r'<b>\1</b>', sent, flags=re.I)
-                out+=gene+"\t"+"addiction\t"+add0+"\t"+sent+"\n"
-    return(out)
-
-def gene_anatomical(gene):
-    q="\"(" + brain.replace("|", " OR ") + ") AND " + gene + "\""
-    sents=getSentences(q,gene)
-    out=str()
-    for sent in sents.split("\n"):
-        for brain0 in brain_d:
-            if findWholeWord(brain_d[brain0])(sent) :
-                sent=sent.replace("<b>","").replace("</b>","")
-                sent=re.sub(r'\b(%s)\b' % brain_d[brain0], r'<b>\1</b>', sent, flags=re.I)
-                out+=gene+"\t"+"brain\t"+brain0+"\t"+sent+"\n"
-    return(out)
-
-def gene_functional(gene):
-    q="\"(" + function.replace("|", " OR ") + ") AND " + gene + "\""
-    sents=getSentences(q,gene)
-    out=str()
-    for sent in sents.split("\n"):
-        for bio0 in function_d:
-            if findWholeWord(function_d[bio0])(sent) :
-                sent=sent.replace("<b>","").replace("</b>","")
-                sent=re.sub(r'\b(%s)\b' % function_d[bio0], r'<b>\1</b>', sent, flags=re.I)
-                out+=gene+"\t"+"function\t"+bio0+"\t"+sent+"\n"
+        for key in cat_d:
+            if findWholeWord(cat_d[key])(sent) :
+                sent=sent.replace("<b>","").replace("</b>","") # remove other highlights
+                sent=re.sub(r'\b(%s)\b' % cat_d[key], r'<b>\1</b>', sent, flags=re.I) # highlight keyword
+                out+=gene+"\t"+ cat + "\t"+key+"\t"+sent+"\n"
     return(out)
 
 def generate_nodes(nodes_d, nodetype):
@@ -117,7 +88,7 @@ brain_d ={"cortex":"cortex|prefrontal|pfc|mPFC|vmpfc|corticostriatal|cortico lim
           "vta":"ventral tegmental|vta|pvta|mesolimbic|limbic|midbrain|mesoaccumbens"
           }
 # brain region has too many short acronyms to just use the undic function, so search PubMed using the following
-brain="cortex|accumbens|striatum|amygadala|hippocampus|tegmental|mesolimbic|infralimbic|prelimbic"
+brain_query_term="cortex|accumbens|striatum|amygadala|hippocampus|tegmental|mesolimbic|infralimbic|prelimbic"
 function_d={"signalling":"signalling|signaling|phosphorylation|glycosylation",
             "transcription":"transcription|methylation|hypomethylation|hypermethylation|histone|ribosome",
             "neuroplasticity":"neuroplasticity|plasticity|long term potentiation|LTP|long term depression|LTD|synaptic|epsp|epsc|neurite|neurogenesis|boutons|mIPSC|IPSC|IPSP",
@@ -18,8 +18,6 @@ def root():
 def about():
     return render_template('about.html')
 
-
-
 @app.route('/progress')
 def progress():
     # only 1-6 terms are allowed
@@ -43,7 +41,7 @@ def progress():
 @app.route("/search")
 def search():
     genes=session['query']
-    percent=round(100/(len(genes)*3),1)
+    percent=round(100/(len(genes)*4),1)
     snt_file=session['path']+"_snt"
     cysdata=open(session['path']+"_cy","w+")
     sntdata=open(snt_file,"w+")
@@ -53,20 +51,32 @@ def search():
     nodes=default_nodes
     progress=0
     for gene in genes:
+        gene=gene.replace("-"," ")
         nodes+="{ data: { id: '" + gene + "', nodecolor:'#E74C3C', fontweight:700, url:'/gene_gene?gene="+gene+"'} },\n"
+        # report progress immediately
         progress+=percent
         yield "data:"+str(progress)+"\n\n"
-        sent0=gene_addiction(gene)
+        addiction=undic(addiction_d)
+        sent0=gene_category(gene, addiction_d, addiction, "addiction")
         e0=generate_edges(sent0, tf_name)
-        sent1=gene_functional(gene)
+        #
+        function=undic(function_d)
+        sent1=gene_category(gene, function_d, function, "function")
         progress+=percent
         yield "data:"+str(progress)+"\n\n"
         e1=generate_edges(sent1, tf_name)
-        sent2=gene_anatomical(gene)
+        #
+        drug=undic(drug_d)
+        sent2=gene_category(gene, drug_d, drug, "drug")
         progress+=percent
+        yield "data:"+str(progress)+"\n\n"
         e2=generate_edges(sent2, tf_name)
-        edges+=e0+e1+e2
-        sentences+=sent0+sent1+sent2
+        # brain has its own query terms that does not include the many short acronyms
+        sent3=gene_category(gene, brain_d, brain_query_term, "brain")
+        progress+=percent
+        e3=generate_edges(sent3, tf_name)
+        edges+=e0+e1+e2+e3
+        sentences+=sent0+sent1+sent2+sent3
         #save data before the last yield
         if (progress>99):
             progress=100
```