From 15dde5133ac6d72846aa0db631e6660d50cb904e Mon Sep 17 00:00:00 2001
From: Hao Chen
Date: Sun, 19 Apr 2020 12:02:28 -0500
Subject: mv some files to utility

---
 utility/process_gwas.py                            |  48 +++++++++
 ...Gene_step0_extract_gene_alias_from_gene_info.sh |   4 +
 utility/topGene_step1_cnt_abstracts.py             | 107 +++++++++++++++++++++
 utility/topGene_step2_cnt_sentences.py             |  65 +++++++++++++
 utility/topGene_step3_generate_html.py             |  57 +++++++++++
 .../topGene_step4_get_pmids_for_all_top_genes.py   |  33 +++++++
 6 files changed, 314 insertions(+)
 create mode 100644 utility/process_gwas.py
 create mode 100755 utility/topGene_step0_extract_gene_alias_from_gene_info.sh
 create mode 100755 utility/topGene_step1_cnt_abstracts.py
 create mode 100755 utility/topGene_step2_cnt_sentences.py
 create mode 100755 utility/topGene_step3_generate_html.py
 create mode 100755 utility/topGene_step4_get_pmids_for_all_top_genes.py

(limited to 'utility')
diff --git a/utility/process_gwas.py b/utility/process_gwas.py
new file mode 100644
index 0000000..eba59c0
--- /dev/null
+++ b/utility/process_gwas.py
@@ -0,0 +1,48 @@
+import re
+
+with open("./addiction_gwas.tsv", "r") as f:
+    for line in f:
+        try:
+            (pmid, trait0, gene0, gene1, snp, pval, trait1)=line.strip().split("\t")
+        except:
+            next
+        key1="unassigned"
+        key2="unassigned"
+        trait=trait0+"; "+trait1
+        genes=gene0+";"+gene1
+        if re.search('cocaine', trait, flags=re.I):
+            key1="addiction"
+            key2="cocaine"
+        elif re.search('smoking|congestive|nicotine', trait, flags=re.I):
+            key1="addiction"
+            key2="nicotine"
+        elif re.search('opioid|morphin|heroin|methadone', trait, flags=re.I):
+            key1="addiction"
+            key2="opioid"
+        elif re.search('amphetam', trait, flags=re.I):
+            key1="addiction"
+            key2="amphetamine"
+        elif re.search('canabis', trait, flags=re.I):
+            key1="addiction"
+            key2="canabis"
+        elif re.search('food', trait, flags=re.I):
+            key1="addiction"
+            key2="food"
+        elif re.search('alcohol', trait, flags=re.I):
+            key1="addiction"
+            key2="alcohol"
+        elif re.search('addiction|abuse', trait, flags=re.I):
+            key1="addiction"
+            key2="addiction"
+        else:
+            key1="behavior"
+            key2="psychiatric"
+        genes=genes.replace(" - ", ";")
+        genes=genes.replace(",", ";")
+        printed=dict()
+        for gene in genes.split(";"):
+            gene=gene.replace(" ","")
+            if gene !="NR" and gene not in  printed:
+                text="SNP:<b>"+snp+"</b>, P value: <b>"+pval+"</b>, Disease/trait:<b> "+trait0+"</b>, Mapped trait:<b> "+trait1+"</b>"
+                print (gene+"\t"+"GWAS"+"\t"+key2+"_GWAS\t"+pmid+"\t"+text)
+            printed[gene]=1
diff --git a/utility/topGene_step0_extract_gene_alias_from_gene_info.sh b/utility/topGene_step0_extract_gene_alias_from_gene_info.sh
new file mode 100755
index 0000000..4d3118b
--- /dev/null
+++ b/utility/topGene_step0_extract_gene_alias_from_gene_info.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+#-e "s/\(|\)/ /g"  -e "s/\[|\]/ /g" 
+grep ^9606 ~/Downloads/gene_info |cut -f 3,5,12|grep -v ^LOC|grep -v -i pseudogene |sed -e "s/\t-//"  -e "s/\t/|/2"  -e "s/\t-//"  -e "s/\t/\|/" -e "s/(\|)\|\[\|\]\|{\|}/ /g" | sort >ncbi_gene_symb_syno_name_txid9606.txt 
+
diff --git a/utility/topGene_step1_cnt_abstracts.py b/utility/topGene_step1_cnt_abstracts.py
new file mode 100755
index 0000000..420c9cf
--- /dev/null
+++ b/utility/topGene_step1_cnt_abstracts.py
@@ -0,0 +1,107 @@
+#!/bin/env python3 
+import os
+import sys
+import re
+import time
+from ratspub_keywords import *
+
+def undic(dic):
+    return "|".join(dic.values())
+
+def gene_addiction_cnt(gene):
+    time.sleep(0.2)
+    q="\'(\"" + addiction.replace("|", "\"[tiab] OR \"")  + "\") AND (\"" + drug.replace("|", "\"[tiab] OR \"", ) + "\") AND (\"" + gene + "\")\'"
+    count=os.popen('esearch -db pubmed  -query ' + q + ' | xtract -pattern ENTREZ_DIRECT -element Count ').read()
+    if (len(count)==0):
+        print("pause")
+        time.sleep(15)
+        return gene_addiction_cnt(gene)
+    else:
+        return (count)
+
+def removeStopWords(terms):
+    out=str()
+    for one in terms.upper().split("|"):
+       if one not in stopWords:
+            out+="|"+one
+    return(out[1:])
+
+def saveStopWord(w):
+    with open (stopword_f,"a") as swf:
+        swf.write(w+"\n")
+    return
+
+# either start with ncbi_gene_symb_syno_name_txid9606 for fresh new counts
+# or recount the results after adding additional stopwords
+
+if len(sys.argv)==2:
+    input_f=sys.argv[1]
+else:
+    input_f="./ncbi_gene_symb_syno_name_txid9606.txt"
+
+addiction=undic(addiction_d)
+drug=undic(drug_d)
+output_f=input_f.replace(".txt","_absCnt.txt")
+out=open(output_f, "w+")
+
+stopword_f="./stop_words_addiction_gene_search.txt"
+with open (stopword_f, "r") as swf:
+    stopWords=swf.read().upper().split("\n")
+    swf.close()
+
+with open (input_f, "r") as f:
+    for line in f:
+        do_search=0
+        inputline=line
+        line=line.replace("-","\ ")
+        # remove the annotated stopword
+        if "'" in line:
+            do_search=1
+            words=line.split("|")
+            line=str()
+            for word in words:
+                # ' is used to mark/annotate a word is a stop word in the results
+                # remove the ' mark 
+                if "'" in word:
+                    word=word.replace("'","")
+                    stopWords.append(word)
+                    saveStopWord(word)
+                line+="|"+word
+            line=line[1:]
+        line=removeStopWords(line)
+        # tab is added if there are abstracts counts
+        if "\t" in line:
+            (gene, count)=line.split("\t")
+            # rerun if count is low, these are less annotated
+        #    if int(count)<50:
+        #        do_search=1
+        else:
+            #no count, 
+            gene=line.strip()
+            do_search=1
+        if do_search==1:
+            # remove synonyms with only two letters
+            if "|" in gene:
+                synos=gene.split("|")
+                # keep the gene name regardless number of characters
+                gene=synos[0]
+                #print ("gene: "+gene + " synos -->" + str(synos[1:]))
+                for syno in synos[1:]:
+                    #synonyms must be at least 3 characters
+                    if len(syno)>3:
+                        gene+="|"+syno
+            gene_q=gene.replace("|", "\"[tiab] OR \"")
+            gene_q+="[tiab]"
+            count=gene_addiction_cnt(gene_q)
+            print("original line->\t"+inputline.strip())
+            print("stopword rmed->\t"+line.strip())
+            print("final  result->\t"+gene+"\t"+count)
+            out.write(gene+"\t"+count)
+        else:
+            print("original resl->\t"+inputline.strip())
+            out.write(inputline)
+
+sorted_f=output_f.replace(".txt","_sorted.txt")
+os.system("sort -k2 -t$'\t' -rn " + output_f + " > " + sorted_f )
+
+
diff --git a/utility/topGene_step2_cnt_sentences.py b/utility/topGene_step2_cnt_sentences.py
new file mode 100755
index 0000000..b05aa7a
--- /dev/null
+++ b/utility/topGene_step2_cnt_sentences.py
@@ -0,0 +1,65 @@
+#!/bin/env python3 
+import os, sys
+import re
+import time
+from nltk.tokenize import sent_tokenize
+from ratspub_keywords import *
+
+def undic(dic):
+    return "|".join(dic.values())
+
+def findWholeWord(w):
+    return re.compile(r'\b({0})\b'.format(w), flags=re.IGNORECASE).search
+
+def getSentences(query, genes):
+    abstracts = os.popen("esearch -db pubmed -query " +  query + " | efetch -format uid |fetch-pubmed -path /run/media/hao/PubMed/Archive/ | xtract -pattern PubmedArticle -element MedlineCitation/PMID,ArticleTitle,AbstractText|sed \"s/-/ /g\"").read()
+    gene_syno=genes.split("|")
+    symb=gene_syno[0]
+    out=str()
+    for row in abstracts.split("\n"):
+        tiab=row.split("\t")
+        pmid = tiab.pop(0)
+        tiab= " ".join(tiab)
+        sentences = sent_tokenize(tiab)
+          ## keep the sentence only if it contains the gene 
+        for sent in sentences:
+            for gene in gene_syno:
+                if findWholeWord(gene)(sent):
+                    sent=re.sub(r'\b(%s)\b' % gene, r'<strong>\1</strong>', sent, flags=re.I)
+                    for drug0 in drug_d:
+                        if findWholeWord(drug_d[drug0])(sent) :
+                            sent=sent.replace("<b>","").replace("</b>","")
+                            sent=re.sub(r'\b(%s)\b' % drug_d[drug0], r'<b>\1</b>', sent, flags=re.I)
+                            out+=symb+"\t"+"drug\t" + drug0+"\t"+pmid+"\t"+sent+"\n"
+                    for add0 in addiction_d:
+                        if findWholeWord(addiction_d[add0])(sent) :
+                            sent=sent.replace("<b>","").replace("</b>","")
+                            sent=re.sub(r'\b(%s)\b' % addiction_d[add0], r'<b>\1</b>', sent, flags=re.I)
+                            out+=symb+"\t"+"addiction\t"+add0+"\t"+pmid+"\t"+sent+"\n"
+    return(out)
+
+addiction=undic(addiction_d)
+drug=undic(drug_d)
+
+
+out=open("topGene_addiction_sentences.tab", "w+")
+cnt=0
+
+if len(sys.argv) != 2:
+    print ("Please provide a sorted gene count file at the command line")
+    sys.exit()
+
+sorted_file=sys.argv[1] #  ncbi_gene_symb_syno_name_txid9606_absCnt_sorted_absCnt_sorted_absCnt_sorted_absCnt_sorted.txt
+with open (sorted_file, "r") as f:
+    for line in f:
+        (genes, abstractCount)=line.strip().split("\t")
+        genes=genes.replace("-","\ ")
+        if int(abstractCount)>20:
+            symb=genes.split("|")[0]
+            print(symb+"-->"+genes)
+            q="\'(\"" + addiction.replace("|", "\"[tiab] OR \"")  + "\") AND (\"" + drug.replace("|", "\"[tiab] OR \"", ) + "\") AND (\"" + genes.replace("|", "\"[tiab] OR \"", ) + "\")\'"
+            sentences=getSentences(q,genes)
+            out.write(sentences)
+out.close()
+
+os.system("cut -f 1,4 topGene_addiction_sentences.tab  |uniq |cut -f 1 |sort |uniq -c |sort -rn > topGeneAbstractCount.tab")
diff --git a/utility/topGene_step3_generate_html.py b/utility/topGene_step3_generate_html.py
new file mode 100755
index 0000000..036325b
--- /dev/null
+++ b/utility/topGene_step3_generate_html.py
@@ -0,0 +1,57 @@
+import re
+import sys
+
+## generate the html page for the top genes
+
+## put gene names and alias in a dictionary
+#ncbi_gene_symb_syno_name_txid9606_absCnt_sorted_absCnt_sorted_absCnt_absCnt_sorted.txt
+if (len(sys.argv) != 2):
+    print ("please provide the name of a sorted gene abstract count file")
+    sys.exit()
+
+geneNames={}
+with open (sys.argv[1],"r") as f:
+    for line in f:
+        (genes, count)=line.strip().split("\t")
+        gene=genes.split("|")
+        names=re.sub(r'^.*?\|', "", genes)
+        geneNames[gene[0]]=names.strip().replace("|", "; ")
+
+out=str()
+html=str()
+with open("./topGeneAbstractCount.tab" ,"r") as gc:
+    cnt=0
+    for  line in gc:
+        cnt+=1
+        line=re.sub(r'^\s+','',line)
+        print (line)
+        pmid_cnt, symb=line.strip().split()
+        out+= symb+"\t"+geneNames[symb]+"\n"
+        html+="<li><a href=\"/showTopGene?topGene="+symb+"\">"+symb+"</a> <span style=\"font-size:small; color:grey\">("+geneNames[symb]+")</span><br>\n"
+        if cnt==200:
+            break
+
+with open("topGene_symb_alias.txt", "w+")  as tg:
+    tg.write(out)
+    tg.close()
+
+
+htmlout='''
+{% extends "layout.html" %}
+{% block content %}
+
+<h4> Top addiction related genes </h4>
+
+<br>
+These genes are ranked by the number of PubMed abstracts that contain the name of the gene and one or more addiction related keyword. Alias were obtained from NCBI gene database and have been curated to remove most, but not all, false matches.
+<hr>
+
+<ol>''' + html  + '''
+</ol>
+{% endblock %}
+'''
+
+with open("./templates/topAddictionGene.html", "w+")  as html_f:
+    html_f.write(htmlout)
+    html_f.close()
+
diff --git a/utility/topGene_step4_get_pmids_for_all_top_genes.py b/utility/topGene_step4_get_pmids_for_all_top_genes.py
new file mode 100755
index 0000000..adf527c
--- /dev/null
+++ b/utility/topGene_step4_get_pmids_for_all_top_genes.py
@@ -0,0 +1,33 @@
+import os
+
+## save all pmids for the top genes so that I don't have to search for these. 
+
+def getPMID(query):
+    print (query)
+    pmids=os.popen("esearch -db pubmed -query \"" +  query + "\" | efetch -format uid").read()
+    return(pmids)
+
+def collectTerms():
+    pmids_f=open("topGene_all.pmid","w+")
+    with open("./topGene_symb_alias.txt", "r") as top:
+        q=str()
+        cnt=0
+        for one in top:
+            cnt+=1
+            (symb, alias)=one.split("\t")
+            q+="|"+symb+"|"+alias.strip()
+            if (cnt==5):
+                print ("\n")
+                q=q[1:]
+                q=q.replace(";", "[tiab] OR ")+"[tiab]"
+                pmids=getPMID(q)
+                pmids_f.write(pmids)
+                cnt=0
+                q=str()
+        print("there should be nothing following the word empty"+q)
+
+collectTerms()
+os.system("sort topGene_all.pmid |uniq > topGene_uniq.pmid" )
+os.system("rm topGene_all.pmid")
+print ("results are in topGen_uniq.pmid")
+
-- 
cgit v1.2.3