From 15dde5133ac6d72846aa0db631e6660d50cb904e Mon Sep 17 00:00:00 2001
From: Hao Chen
Date: Sun, 19 Apr 2020 12:02:28 -0500
Subject: mv some files to utility

---
 process_gwas.py                                    |  48 ---------
 topGene_step0_extract_gene_alias_from_gene_info.sh |   4 -
 topGene_step1_cnt_abstracts.py                     | 107 ---------------------
 topGene_step2_cnt_sentences.py                     |  65 -------------
 topGene_step3_generate_html.py                     |  57 -----------
 topGene_step4_get_pmids_for_all_top_genes.py       |  33 -------
 utility/process_gwas.py                            |  48 +++++++++
 ...Gene_step0_extract_gene_alias_from_gene_info.sh |   4 +
 utility/topGene_step1_cnt_abstracts.py             | 107 +++++++++++++++++++++
 utility/topGene_step2_cnt_sentences.py             |  65 +++++++++++++
 utility/topGene_step3_generate_html.py             |  57 +++++++++++
 .../topGene_step4_get_pmids_for_all_top_genes.py   |  33 +++++++
 12 files changed, 314 insertions(+), 314 deletions(-)
 delete mode 100644 process_gwas.py
 delete mode 100755 topGene_step0_extract_gene_alias_from_gene_info.sh
 delete mode 100755 topGene_step1_cnt_abstracts.py
 delete mode 100755 topGene_step2_cnt_sentences.py
 delete mode 100755 topGene_step3_generate_html.py
 delete mode 100755 topGene_step4_get_pmids_for_all_top_genes.py
 create mode 100644 utility/process_gwas.py
 create mode 100755 utility/topGene_step0_extract_gene_alias_from_gene_info.sh
 create mode 100755 utility/topGene_step1_cnt_abstracts.py
 create mode 100755 utility/topGene_step2_cnt_sentences.py
 create mode 100755 utility/topGene_step3_generate_html.py
 create mode 100755 utility/topGene_step4_get_pmids_for_all_top_genes.py
diff --git a/process_gwas.py b/process_gwas.py
deleted file mode 100644
index eba59c0..0000000
--- a/process_gwas.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import re
-
-with open("./addiction_gwas.tsv", "r") as f:
-    for line in f:
-        try:
-            (pmid, trait0, gene0, gene1, snp, pval, trait1)=line.strip().split("\t")
-        except:
-            next
-        key1="unassigned"
-        key2="unassigned"
-        trait=trait0+"; "+trait1
-        genes=gene0+";"+gene1
-        if re.search('cocaine', trait, flags=re.I):
-            key1="addiction"
-            key2="cocaine"
-        elif re.search('smoking|congestive|nicotine', trait, flags=re.I):
-            key1="addiction"
-            key2="nicotine"
-        elif re.search('opioid|morphin|heroin|methadone', trait, flags=re.I):
-            key1="addiction"
-            key2="opioid"
-        elif re.search('amphetam', trait, flags=re.I):
-            key1="addiction"
-            key2="amphetamine"
-        elif re.search('canabis', trait, flags=re.I):
-            key1="addiction"
-            key2="canabis"
-        elif re.search('food', trait, flags=re.I):
-            key1="addiction"
-            key2="food"
-        elif re.search('alcohol', trait, flags=re.I):
-            key1="addiction"
-            key2="alcohol"
-        elif re.search('addiction|abuse', trait, flags=re.I):
-            key1="addiction"
-            key2="addiction"
-        else:
-            key1="behavior"
-            key2="psychiatric"
-        genes=genes.replace(" - ", ";")
-        genes=genes.replace(",", ";")
-        printed=dict()
-        for gene in genes.split(";"):
-            gene=gene.replace(" ","")
-            if gene !="NR" and gene not in  printed:
-                text="SNP:<b>"+snp+"</b>, P value: <b>"+pval+"</b>, Disease/trait:<b> "+trait0+"</b>, Mapped trait:<b> "+trait1+"</b>"
-                print (gene+"\t"+"GWAS"+"\t"+key2+"_GWAS\t"+pmid+"\t"+text)
-            printed[gene]=1
diff --git a/topGene_step0_extract_gene_alias_from_gene_info.sh b/topGene_step0_extract_gene_alias_from_gene_info.sh
deleted file mode 100755
index 4d3118b..0000000
--- a/topGene_step0_extract_gene_alias_from_gene_info.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/bash
-#-e "s/\(|\)/ /g"  -e "s/\[|\]/ /g" 
-grep ^9606 ~/Downloads/gene_info |cut -f 3,5,12|grep -v ^LOC|grep -v -i pseudogene |sed -e "s/\t-//"  -e "s/\t/|/2"  -e "s/\t-//"  -e "s/\t/\|/" -e "s/(\|)\|\[\|\]\|{\|}/ /g" | sort >ncbi_gene_symb_syno_name_txid9606.txt 
-
diff --git a/topGene_step1_cnt_abstracts.py b/topGene_step1_cnt_abstracts.py
deleted file mode 100755
index 420c9cf..0000000
--- a/topGene_step1_cnt_abstracts.py
+++ /dev/null
@@ -1,107 +0,0 @@
-#!/bin/env python3 
-import os
-import sys
-import re
-import time
-from ratspub_keywords import *
-
-def undic(dic):
-    return "|".join(dic.values())
-
-def gene_addiction_cnt(gene):
-    time.sleep(0.2)
-    q="\'(\"" + addiction.replace("|", "\"[tiab] OR \"")  + "\") AND (\"" + drug.replace("|", "\"[tiab] OR \"", ) + "\") AND (\"" + gene + "\")\'"
-    count=os.popen('esearch -db pubmed  -query ' + q + ' | xtract -pattern ENTREZ_DIRECT -element Count ').read()
-    if (len(count)==0):
-        print("pause")
-        time.sleep(15)
-        return gene_addiction_cnt(gene)
-    else:
-        return (count)
-
-def removeStopWords(terms):
-    out=str()
-    for one in terms.upper().split("|"):
-       if one not in stopWords:
-            out+="|"+one
-    return(out[1:])
-
-def saveStopWord(w):
-    with open (stopword_f,"a") as swf:
-        swf.write(w+"\n")
-    return
-
-# either start with ncbi_gene_symb_syno_name_txid9606 for fresh new counts
-# or recount the results after adding additional stopwords
-
-if len(sys.argv)==2:
-    input_f=sys.argv[1]
-else:
-    input_f="./ncbi_gene_symb_syno_name_txid9606.txt"
-
-addiction=undic(addiction_d)
-drug=undic(drug_d)
-output_f=input_f.replace(".txt","_absCnt.txt")
-out=open(output_f, "w+")
-
-stopword_f="./stop_words_addiction_gene_search.txt"
-with open (stopword_f, "r") as swf:
-    stopWords=swf.read().upper().split("\n")
-    swf.close()
-
-with open (input_f, "r") as f:
-    for line in f:
-        do_search=0
-        inputline=line
-        line=line.replace("-","\ ")
-        # remove the annotated stopword
-        if "'" in line:
-            do_search=1
-            words=line.split("|")
-            line=str()
-            for word in words:
-                # ' is used to mark/annotate a word is a stop word in the results
-                # remove the ' mark 
-                if "'" in word:
-                    word=word.replace("'","")
-                    stopWords.append(word)
-                    saveStopWord(word)
-                line+="|"+word
-            line=line[1:]
-        line=removeStopWords(line)
-        # tab is added if there are abstracts counts
-        if "\t" in line:
-            (gene, count)=line.split("\t")
-            # rerun if count is low, these are less annotated
-        #    if int(count)<50:
-        #        do_search=1
-        else:
-            #no count, 
-            gene=line.strip()
-            do_search=1
-        if do_search==1:
-            # remove synonyms with only two letters
-            if "|" in gene:
-                synos=gene.split("|")
-                # keep the gene name regardless number of characters
-                gene=synos[0]
-                #print ("gene: "+gene + " synos -->" + str(synos[1:]))
-                for syno in synos[1:]:
-                    #synonyms must be at least 3 characters
-                    if len(syno)>3:
-                        gene+="|"+syno
-            gene_q=gene.replace("|", "\"[tiab] OR \"")
-            gene_q+="[tiab]"
-            count=gene_addiction_cnt(gene_q)
-            print("original line->\t"+inputline.strip())
-            print("stopword rmed->\t"+line.strip())
-            print("final  result->\t"+gene+"\t"+count)
-            out.write(gene+"\t"+count)
-        else:
-            print("original resl->\t"+inputline.strip())
-            out.write(inputline)
-
-sorted_f=output_f.replace(".txt","_sorted.txt")
-os.system("sort -k2 -t$'\t' -rn " + output_f + " > " + sorted_f )
-
-
diff --git a/topGene_step2_cnt_sentences.py b/topGene_step2_cnt_sentences.py
deleted file mode 100755
index b05aa7a..0000000
--- a/topGene_step2_cnt_sentences.py
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/bin/env python3 
-import os, sys
-import re
-import time
-from nltk.tokenize import sent_tokenize
-from ratspub_keywords import *
-
-def undic(dic):
-    return "|".join(dic.values())
-
-def findWholeWord(w):
-    return re.compile(r'\b({0})\b'.format(w), flags=re.IGNORECASE).search
-
-def getSentences(query, genes):
-    abstracts = os.popen("esearch -db pubmed -query " +  query + " | efetch -format uid |fetch-pubmed -path /run/media/hao/PubMed/Archive/ | xtract -pattern PubmedArticle -element MedlineCitation/PMID,ArticleTitle,AbstractText|sed \"s/-/ /g\"").read()
-    gene_syno=genes.split("|")
-    symb=gene_syno[0]
-    out=str()
-    for row in abstracts.split("\n"):
-        tiab=row.split("\t")
-        pmid = tiab.pop(0)
-        tiab= " ".join(tiab)
-        sentences = sent_tokenize(tiab)
-          ## keep the sentence only if it contains the gene 
-        for sent in sentences:
-            for gene in gene_syno:
-                if findWholeWord(gene)(sent):
-                    sent=re.sub(r'\b(%s)\b' % gene, r'<strong>\1</strong>', sent, flags=re.I)
-                    for drug0 in drug_d:
-                        if findWholeWord(drug_d[drug0])(sent) :
-                            sent=sent.replace("<b>","").replace("</b>","")
-                            sent=re.sub(r'\b(%s)\b' % drug_d[drug0], r'<b>\1</b>', sent, flags=re.I)
-                            out+=symb+"\t"+"drug\t" + drug0+"\t"+pmid+"\t"+sent+"\n"
-                    for add0 in addiction_d:
-                        if findWholeWord(addiction_d[add0])(sent) :
-                            sent=sent.replace("<b>","").replace("</b>","")
-                            sent=re.sub(r'\b(%s)\b' % addiction_d[add0], r'<b>\1</b>', sent, flags=re.I)
-                            out+=symb+"\t"+"addiction\t"+add0+"\t"+pmid+"\t"+sent+"\n"
-    return(out)
-
-addiction=undic(addiction_d)
-drug=undic(drug_d)
-
-
-out=open("topGene_addiction_sentences.tab", "w+")
-cnt=0
-
-if len(sys.argv) != 2:
-    print ("Please provide a sorted gene count file at the command line")
-    sys.exit()
-
-sorted_file=sys.argv[1] #  ncbi_gene_symb_syno_name_txid9606_absCnt_sorted_absCnt_sorted_absCnt_sorted_absCnt_sorted.txt
-with open (sorted_file, "r") as f:
-    for line in f:
-        (genes, abstractCount)=line.strip().split("\t")
-        genes=genes.replace("-","\ ")
-        if int(abstractCount)>20:
-            symb=genes.split("|")[0]
-            print(symb+"-->"+genes)
-            q="\'(\"" + addiction.replace("|", "\"[tiab] OR \"")  + "\") AND (\"" + drug.replace("|", "\"[tiab] OR \"", ) + "\") AND (\"" + genes.replace("|", "\"[tiab] OR \"", ) + "\")\'"
-            sentences=getSentences(q,genes)
-            out.write(sentences)
-out.close()
-
-os.system("cut -f 1,4 topGene_addiction_sentences.tab  |uniq |cut -f 1 |sort |uniq -c |sort -rn > topGeneAbstractCount.tab")
diff --git a/topGene_step3_generate_html.py b/topGene_step3_generate_html.py
deleted file mode 100755
index 036325b..0000000
--- a/topGene_step3_generate_html.py
+++ /dev/null
@@ -1,57 +0,0 @@
-import re
-import sys
-
-## generate the html page for the top genes
-
-## put gene names and alias in a dictionary
-#ncbi_gene_symb_syno_name_txid9606_absCnt_sorted_absCnt_sorted_absCnt_absCnt_sorted.txt
-if (len(sys.argv) != 2):
-    print ("please provide the name of a sorted gene abstract count file")
-    sys.exit()
-
-geneNames={}
-with open (sys.argv[1],"r") as f:
-    for line in f:
-        (genes, count)=line.strip().split("\t")
-        gene=genes.split("|")
-        names=re.sub(r'^.*?\|', "", genes)
-        geneNames[gene[0]]=names.strip().replace("|", "; ")
-
-out=str()
-html=str()
-with open("./topGeneAbstractCount.tab" ,"r") as gc:
-    cnt=0
-    for  line in gc:
-        cnt+=1
-        line=re.sub(r'^\s+','',line)
-        print (line)
-        pmid_cnt, symb=line.strip().split()
-        out+= symb+"\t"+geneNames[symb]+"\n"
-        html+="<li><a href=\"/showTopGene?topGene="+symb+"\">"+symb+"</a> <span style=\"font-size:small; color:grey\">("+geneNames[symb]+")</span><br>\n"
-        if cnt==200:
-            break
-
-with open("topGene_symb_alias.txt", "w+")  as tg:
-    tg.write(out)
-    tg.close()
-
-
-htmlout='''
-{% extends "layout.html" %}
-{% block content %}
-
-<h4> Top addiction related genes </h4>
-
-<br>
-These genes are ranked by the number of PubMed abstracts that contain the name of the gene and one or more addiction related keyword. Alias were obtained from NCBI gene database and have been curated to remove most, but not all, false matches.
-<hr>
-
-<ol>''' + html  + '''
-</ol>
-{% endblock %}
-'''
-
-with open("./templates/topAddictionGene.html", "w+")  as html_f:
-    html_f.write(htmlout)
-    html_f.close()
-
diff --git a/topGene_step4_get_pmids_for_all_top_genes.py b/topGene_step4_get_pmids_for_all_top_genes.py
deleted file mode 100755
index adf527c..0000000
--- a/topGene_step4_get_pmids_for_all_top_genes.py
+++ /dev/null
@@ -1,33 +0,0 @@
-import os
-
-## save all pmids for the top genes so that I don't have to search for these. 
-
-def getPMID(query):
-    print (query)
-    pmids=os.popen("esearch -db pubmed -query \"" +  query + "\" | efetch -format uid").read()
-    return(pmids)
-
-def collectTerms():
-    pmids_f=open("topGene_all.pmid","w+")
-    with open("./topGene_symb_alias.txt", "r") as top:
-        q=str()
-        cnt=0
-        for one in top:
-            cnt+=1
-            (symb, alias)=one.split("\t")
-            q+="|"+symb+"|"+alias.strip()
-            if (cnt==5):
-                print ("\n")
-                q=q[1:]
-                q=q.replace(";", "[tiab] OR ")+"[tiab]"
-                pmids=getPMID(q)
-                pmids_f.write(pmids)
-                cnt=0
-                q=str()
-        print("there should be nothing following the word empty"+q)
-
-collectTerms()
-os.system("sort topGene_all.pmid |uniq > topGene_uniq.pmid" )
-os.system("rm topGene_all.pmid")
-print ("results are in topGen_uniq.pmid")
-
diff --git a/utility/process_gwas.py b/utility/process_gwas.py
new file mode 100644
index 0000000..eba59c0
--- /dev/null
+++ b/utility/process_gwas.py
@@ -0,0 +1,48 @@
+import re
+
+# Reformat ./addiction_gwas.tsv into one tab-separated output row per gene:
+# gene, "GWAS", <keyword>_GWAS, PMID and an HTML snippet describing the SNP.
+# Rows are read from the fixed path below; the result is printed to stdout.
+with open("./addiction_gwas.tsv", "r") as f:
+    for line in f:
+        try:
+            (pmid, trait0, gene0, gene1, snp, pval, trait1)=line.strip().split("\t")
+        except ValueError:
+            # Not exactly seven fields: skip the row. The former bare `next`
+            # was a no-op, so such a row was processed with stale values.
+            continue
+        # keywords are searched in both trait columns at once
+        trait=trait0+"; "+trait1
+        genes=gene0+";"+gene1
+        # the first matching keyword group decides the output label
+        if re.search('cocaine', trait, flags=re.I):
+            key2="cocaine"
+        elif re.search('smoking|congestive|nicotine', trait, flags=re.I):
+            key2="nicotine"
+        elif re.search('opioid|morphin|heroin|methadone', trait, flags=re.I):
+            key2="opioid"
+        elif re.search('amphetam', trait, flags=re.I):
+            key2="amphetamine"
+        elif re.search('cann?abis', trait, flags=re.I):
+            # 'cann?abis' also matches the correct spelling "cannabis"; the
+            # label below deliberately keeps its existing spelling.
+            key2="canabis"
+        elif re.search('food', trait, flags=re.I):
+            key2="food"
+        elif re.search('alcohol', trait, flags=re.I):
+            key2="alcohol"
+        elif re.search('addiction|abuse', trait, flags=re.I):
+            key2="addiction"
+        else:
+            key2="psychiatric"
+        # " - " and "," are treated as gene separators, like ";"
+        genes=genes.replace(" - ", ";")
+        genes=genes.replace(",", ";")
+        printed=set()
+        for gene in genes.split(";"):
+            gene=gene.replace(" ","")
+            # "NR" entries are skipped; each gene is printed once per row
+            if gene !="NR" and gene not in  printed:
+                text="SNP:<b>"+snp+"</b>, P value: <b>"+pval+"</b>, Disease/trait:<b> "+trait0+"</b>, Mapped trait:<b> "+trait1+"</b>"
+                print (gene+"\t"+"GWAS"+"\t"+key2+"_GWAS\t"+pmid+"\t"+text)
+            printed.add(gene)
diff --git a/utility/topGene_step0_extract_gene_alias_from_gene_info.sh b/utility/topGene_step0_extract_gene_alias_from_gene_info.sh
new file mode 100755
index 0000000..4d3118b
--- /dev/null
+++ b/utility/topGene_step0_extract_gene_alias_from_gene_info.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+# Keep gene_info rows starting with 9606, cut fields 3,5,12, drop LOC*/pseudogene lines, remove "<tab>-" placeholders, join fields with "|", blank out ()[]{} and sort.
+grep ^9606 ~/Downloads/gene_info |cut -f 3,5,12|grep -v ^LOC|grep -v -i pseudogene |sed -e "s/\t-//"  -e "s/\t/|/2"  -e "s/\t-//"  -e "s/\t/\|/" -e "s/(\|)\|\[\|\]\|{\|}/ /g" | sort >ncbi_gene_symb_syno_name_txid9606.txt 
+
diff --git a/utility/topGene_step1_cnt_abstracts.py b/utility/topGene_step1_cnt_abstracts.py
new file mode 100755
index 0000000..420c9cf
--- /dev/null
+++ b/utility/topGene_step1_cnt_abstracts.py
@@ -0,0 +1,107 @@
+#!/bin/env python3 
+import os
+import sys
+import re
+import time
+from ratspub_keywords import *
+
+def undic(dic):  # join every value of dic into one "|"-separated string
+    return "|".join(dic[key] for key in dic)
+
+def gene_addiction_cnt(gene):
+    """Return the raw esearch Count output for addiction AND drug AND gene; retry in a loop while it is empty."""
+    q="\'(\"" + addiction.replace("|", "\"[tiab] OR \"")  + "\") AND (\"" + drug.replace("|", "\"[tiab] OR \"", ) + "\") AND (\"" + gene + "\")\'"
+    while True:
+        time.sleep(0.2)
+        count=os.popen('esearch -db pubmed  -query ' + q + ' | xtract -pattern ENTREZ_DIRECT -element Count ').read()
+        if count:
+            return count
+        print("pause")
+        time.sleep(15)
+
+def removeStopWords(terms):
+    """Upper-case terms and drop the "|"-separated entries found in stopWords."""
+    # stopWords is the module-level list read from the stop-word file
+    kept=[one for one in terms.upper().split("|") if one not in stopWords]
+    # the surviving entries are joined again with the same "|" separator
+    return "|".join(kept)
+
+def saveStopWord(w):
+    """Append w, followed by a newline, to the file named by stopword_f."""
+    with open(stopword_f, "a") as handle:
+        handle.write(w+"\n")
+
+# Input: either ncbi_gene_symb_syno_name_txid9606.txt for fresh new counts,
+# or a previous result file to recount after marking additional stopwords.
+
+if len(sys.argv)==2:
+    input_f=sys.argv[1]
+else:
+    input_f="./ncbi_gene_symb_syno_name_txid9606.txt"
+
+addiction=undic(addiction_d)
+drug=undic(drug_d)
+output_f=input_f.replace(".txt","_absCnt.txt")
+out=open(output_f, "w+")
+
+stopword_f="./stop_words_addiction_gene_search.txt"
+with open (stopword_f, "r") as swf:
+    stopWords=swf.read().upper().split("\n")
+
+with open (input_f, "r") as f:
+    for line in f:
+        do_search=0
+        inputline=line
+        line=line.replace("-","\\ ")  # same text as the old "\ " literal, without the invalid escape
+        # a ' in the line marks newly annotated stop words: register them and search again
+        if "'" in line:
+            do_search=1
+            words=line.split("|")
+            line=str()
+            for word in words:
+                # strip the ' mark and remember the word in memory and on disk;
+                # upper-cased in memory because removeStopWords compares upper-cased terms
+                if "'" in word:
+                    word=word.replace("'","")
+                    stopWords.append(word.upper())
+                    saveStopWord(word)
+                line+="|"+word
+            line=line[1:]
+        line=removeStopWords(line)
+        # a tab means the line already carries an abstract count
+        if "\t" in line:
+            (gene, count)=line.split("\t")
+            # lines that already have a count are searched again only when a
+            # stop word was annotated above; otherwise the input line is
+            # copied to the output unchanged
+        else:
+            #no count yet: always search
+            gene=line.strip()
+            do_search=1
+        if do_search==1:
+            # drop short synonyms; NOTE(review): len>3 also drops 3-letter synonyms - confirm intended
+            if "|" in gene:
+                synos=gene.split("|")
+                # keep the gene name regardless number of characters
+                gene=synos[0]
+                # only the remaining entries are filtered by length
+                for syno in synos[1:]:
+                    # kept only when longer than 3 characters
+                    if len(syno)>3:
+                        gene+="|"+syno
+            gene_q=gene.replace("|", "\"[tiab] OR \"")
+            gene_q+="[tiab]"
+            count=gene_addiction_cnt(gene_q)
+            print("original line->\t"+inputline.strip())
+            print("stopword rmed->\t"+line.strip())
+            print("final  result->\t"+gene+"\t"+count)
+            out.write(gene+"\t"+count)
+        else:
+            print("original resl->\t"+inputline.strip())
+            out.write(inputline)
+
+out.close()  # flush buffered rows before the external sort reads the file
+sorted_f=output_f.replace(".txt","_sorted.txt")
+os.system("sort -k2 -t$'\t' -rn " + output_f + " > " + sorted_f )
+
+
diff --git a/utility/topGene_step2_cnt_sentences.py b/utility/topGene_step2_cnt_sentences.py
new file mode 100755
index 0000000..b05aa7a
--- /dev/null
+++ b/utility/topGene_step2_cnt_sentences.py
@@ -0,0 +1,65 @@
+#!/bin/env python3 
+import os, sys
+import re
+import time
+from nltk.tokenize import sent_tokenize
+from ratspub_keywords import *
+
+def undic(dic):  # join every value of dic into one "|"-separated string
+    return "|".join(dic[key] for key in dic)
+
+def findWholeWord(w):  # bound search() of a case-insensitive, word-delimited pattern built from w
+    return re.compile(r'\b(%s)\b' % w, re.I).search
+
+def getSentences(query, genes):
+    """Collect sentences that mention one of the genes together with a keyword.
+
+    genes is "|"-separated; its first entry is the symbol written to the output.
+    Returns lines of: symbol, "drug"/"addiction", keyword name, PMID, sentence.
+    """
+    abstracts = os.popen("esearch -db pubmed -query " +  query + " | efetch -format uid |fetch-pubmed -path /run/media/hao/PubMed/Archive/ | xtract -pattern PubmedArticle -element MedlineCitation/PMID,ArticleTitle,AbstractText|sed \"s/-/ /g\"").read()
+    gene_syno=genes.split("|")
+    symb=gene_syno[0]
+    rows=[]
+    for record in abstracts.split("\n"):
+        pmid, *fields = record.split("\t")
+        for sent in sent_tokenize(" ".join(fields)):
+            for gene in gene_syno:
+                if not findWholeWord(gene)(sent):
+                    continue
+                # mark the gene, then emit one row per matching drug / addiction keyword
+                sent=re.sub(r'\b(%s)\b' % gene, r'<strong>\1</strong>', sent, flags=re.I)
+                for category, keywords in (("drug", drug_d), ("addiction", addiction_d)):
+                    for key, pattern in keywords.items():
+                        if findWholeWord(pattern)(sent):
+                            # only the keyword matched last stays wrapped in <b>
+                            sent=sent.replace("<b>","").replace("</b>","")
+                            sent=re.sub(r'\b(%s)\b' % pattern, r'<b>\1</b>', sent, flags=re.I)
+                            rows.append(symb+"\t"+category+"\t"+key+"\t"+pmid+"\t"+sent+"\n")
+    return "".join(rows)
+
+addiction=undic(addiction_d)
+drug=undic(drug_d)
+
+# check the command line before opening the output file, so that a usage error cannot truncate earlier results
+if len(sys.argv) != 2:
+    print ("Please provide a sorted gene count file at the command line")
+    sys.exit()
+
+sorted_file=sys.argv[1] #  ncbi_gene_symb_syno_name_txid9606_absCnt_sorted_absCnt_sorted_absCnt_sorted_absCnt_sorted.txt
+out=open("topGene_addiction_sentences.tab", "w+")
+# only genes with more than 20 abstracts are searched for sentences
+with open (sorted_file, "r") as f:
+    for line in f:
+        (genes, abstractCount)=line.strip().split("\t")
+        genes=genes.replace("-","\\ ")  # same text as the old "\ " literal, without the invalid escape
+        if int(abstractCount)>20:
+            symb=genes.split("|")[0]
+            print(symb+"-->"+genes)
+            q="\'(\"" + addiction.replace("|", "\"[tiab] OR \"")  + "\") AND (\"" + drug.replace("|", "\"[tiab] OR \"", ) + "\") AND (\"" + genes.replace("|", "\"[tiab] OR \"", ) + "\")\'"
+            sentences=getSentences(q,genes)
+            out.write(sentences)
+out.close()
+
+# tally the adjacent-unique gene/PMID pairs per gene, largest count first
+os.system("cut -f 1,4 topGene_addiction_sentences.tab  |uniq |cut -f 1 |sort |uniq -c |sort -rn > topGeneAbstractCount.tab")
diff --git a/utility/topGene_step3_generate_html.py b/utility/topGene_step3_generate_html.py
new file mode 100755
index 0000000..036325b
--- /dev/null
+++ b/utility/topGene_step3_generate_html.py
@@ -0,0 +1,57 @@
+import re
+import sys
+
+## generate the html page for the top genes
+
+## put gene names and alias in a dictionary
+# example input: ncbi_gene_symb_syno_name_txid9606_absCnt_sorted_absCnt_sorted_absCnt_absCnt_sorted.txt
+if (len(sys.argv) != 2):
+    print ("please provide the name of a sorted gene abstract count file")
+    sys.exit()
+
+geneNames={}
+with open (sys.argv[1],"r") as f:
+    for line in f:
+        (genes, count)=line.strip().split("\t")
+        gene=genes.split("|")
+        names=re.sub(r'^.*?\|', "", genes)
+        geneNames[gene[0]]=names.strip().replace("|", "; ")
+
+out=str()
+html=str()
+with open("./topGeneAbstractCount.tab" ,"r") as gc:
+    cnt=0
+    for  line in gc:
+        cnt+=1
+        line=re.sub(r'^\s+','',line)
+        print (line)
+        # split once only: everything after the count is the symbol, even if it contains a space
+        pmid_cnt, symb=line.strip().split(None, 1)
+        out+= symb+"\t"+geneNames[symb]+"\n"
+        html+="<li><a href=\"/showTopGene?topGene="+symb+"\">"+symb+"</a> <span style=\"font-size:small; color:grey\">("+geneNames[symb]+")</span><br>\n"
+        # stop after the first 200 genes
+        if cnt==200:
+            break
+
+with open("topGene_symb_alias.txt", "w+")  as tg:
+    tg.write(out)
+
+
+htmlout='''
+{% extends "layout.html" %}
+{% block content %}
+
+<h4> Top addiction related genes </h4>
+
+<br>
+These genes are ranked by the number of PubMed abstracts that contain the name of the gene and one or more addiction related keyword. Alias were obtained from NCBI gene database and have been curated to remove most, but not all, false matches.
+<hr>
+
+<ol>''' + html  + '''
+</ol>
+{% endblock %}
+'''
+
+with open("./templates/topAddictionGene.html", "w+")  as html_f:
+    html_f.write(htmlout)
+
diff --git a/utility/topGene_step4_get_pmids_for_all_top_genes.py b/utility/topGene_step4_get_pmids_for_all_top_genes.py
new file mode 100755
index 0000000..adf527c
--- /dev/null
+++ b/utility/topGene_step4_get_pmids_for_all_top_genes.py
@@ -0,0 +1,33 @@
+import os
+
+## save all pmids for the top genes so that I don't have to search for these. 
+
+def getPMID(query):
+    """Echo query, run it through esearch | efetch and return the raw UID output."""
+    print (query)
+    return os.popen("esearch -db pubmed -query \"" +  query + "\" | efetch -format uid").read()
+
+def collectTerms():
+    """Query PubMed for the genes in ./topGene_symb_alias.txt, five genes per query.
+
+    The UIDs returned by getPMID are written to topGene_all.pmid. The last
+    batch may hold fewer than five genes and is queried like the others.
+    Each input line must hold exactly two tab-separated fields (symbol, alias).
+    """
+    # read all rows first so that they can be sliced into batches
+    with open("./topGene_symb_alias.txt", "r") as top:
+        rows=[one.split("\t") for one in top]
+    # NOTE(review): the "|" separators stay in the query text unchanged - confirm esearch treats them as intended
+    with open("topGene_all.pmid","w+") as pmids_f:
+        for start in range(0, len(rows), 5):
+            # a short final batch is queried as well instead of being dropped
+            q="|".join(symb+"|"+alias.strip() for (symb, alias) in rows[start:start+5])
+            print ("\n")
+            q=q.replace(";", "[tiab] OR ")+"[tiab]"
+            pmids_f.write(getPMID(q))
+
+collectTerms()
+# de-duplicate the collected PMIDs, then remove the intermediate file
+os.system("sort topGene_all.pmid |uniq > topGene_uniq.pmid; rm topGene_all.pmid")
+print ("results are in topGene_uniq.pmid")
+
-- 
cgit 1.4.1