about summary refs log tree commit diff
diff options
context:
space:
mode:
authorPjotr Prins2026-04-06 04:58:31 -0500
committerPjotr Prins2026-04-06 04:58:31 -0500
commitddac775a9c58f84026f51cfcaa688bb51e70f00c (patch)
treef8862e2d10b4ab58b138905cffb6d8211adda997
parentd99396ddac97bdea89c4326e5d3a6ba70c894313 (diff)
parent0ede8df7f295604b09a574dcfa587a31483f1583 (diff)
downloadgenecup-ddac775a9c58f84026f51cfcaa688bb51e70f00c.tar.gz
Merge branch 'master' of /home/git/public/genecup
-rw-r--r--RELEASE_NOTES.md41
-rw-r--r--VERSION2
-rw-r--r--guix.scm8
-rwxr-xr-xmore_functions.py66
-rwxr-xr-xserver.py104
-rw-r--r--templates/create-ontology.html48
-rw-r--r--tests/test_network_gemini_ontology.py56
7 files changed, 302 insertions, 23 deletions
diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
new file mode 100644
index 0000000..0d73f75
--- /dev/null
+++ b/RELEASE_NOTES.md
@@ -0,0 +1,41 @@
+# GeneCup Release Notes
+
+## Version 1.9.1 (2026-04-05)
+
+### UI/UX
+- Added header/footer in GN color scheme with version info across all pages
+- Added --port and --debug command line switches
+- Show environment variables (EDIRECT_PUBMED_MASTER, GEMINI_API_KEY, NLTK_DATA, GENECUP_DATADIR) on startup
+- Added intermediate "Calling Gemini API..." loading page that auto-refreshes when classification completes
+
+### Gemini API integration
+- Replaced TensorFlow stress classifier with Google Gemini API (gemini-2.5-pro for few-shot, gemini-3-flash-preview for batch)
+- API key read from ~/.config/gemini/credentials (with 0400 permission check)
+- Batch classification: all stress sentences classified in one API call with JSON response
+- In-memory cache for Gemini results (keyed by SHA-256 of sentence batch)
+- Retry logic (3 attempts with 2s/4s backoff)
+- Gemini prompts and responses logged to console
+
+### PubMed / edirect
+- Packaged edirect 25.x for Guix (Go programs compiled from source, XML bounds-check patch)
+- Replaced missing fetch-pubmed with xfetch -db pubmed (local archive lookup)
+- Hybrid abstract fetching: tries local xfetch first, falls back to NCBI efetch for PMIDs missing from the local archive
+- In-memory cache for esearch PMID results (keyed by SHA-256 of query string)
+- EDIRECT_LOCAL_ARCHIVE env var configures local PubMed archive path
+
+### Packaging (guix.scm)
+- Added edirect-25, nltk-punkt, minipubmed, python-google-genai packages
+- genecup-gemini package with genecup wrapper script, JavaScript assets, NLTK data
+- GENECUP_DATADIR for sqlite DB location
+
+### Testing
+- Added Python unittest framework (tests/)
+- test_hello.py: offline smoke test (runs in guix build)
+- test_network_esearch.py: NCBI esearch for Penk+stress PMIDs
+- test_local_xfetch.py: local xsearch+xfetch against PubMed archive
+- test_network_hybrid.py: validates hybrid fetch matches NCBI; tests esearch cache
+
+### Cleanup
+- Moved dead code to old/server.py
+- Removed unused TensorFlow/Keras dependencies
+- Removed stress_prompt.txt dependency (batch classifier builds its own prompt)
diff --git a/VERSION b/VERSION
index c064b1b..9ab8337 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.9.1-pre
+1.9.1
diff --git a/guix.scm b/guix.scm
index f2e54d4..2c42e85 100644
--- a/guix.scm
+++ b/guix.scm
@@ -7,12 +7,14 @@
 ;;
 ;; Development shell:
 ;;
-;;   guix shell -L . -C -N -F edirect-25 genecup-gemini coreutils -- genecup --port 4201
+;;   guix shell -L . -C -N -F --expose=$HOME/.config/gemini --share=/export3/PubMed edirect-25 genecup-gemini coreutils -- genecup --port 4201
 ;;
 ;; In a shell you can run
 ;;
-;;   python3 -m unittest tests.test_network_esearch
+;;   guix shell -C -N -F -L . --expose=$HOME/.config/gemini --share=/export3/PubMed edirect-25 genecup-gemini
+;;   env EDIRECT_LOCAL_ARCHIVE=/export3/PubMed/Source python3 -m unittest tests.test_network_esearch
 ;;   env EDIRECT_LOCAL_ARCHIVE=/export3/PubMed/Source python3 -m unittest tests.test_local_xfetch -v
+;;   env EDIRECT_LOCAL_ARCHIVE=/export3/PubMed/Source python3 -m unittest tests.test_network_gemini_ontology
 ;;
 ;; Note: API key is read from ~/.config/gemini/credentials
 ;;
@@ -284,7 +286,7 @@ GeneCup with four gene symbols (gria1, crhr1, drd2, and penk).")
            go-github-com-surgebase-porter2
            go-golang-org-x-sys
            go-golang-org-x-text))
-    (propagated-inputs (list curl wget grep sed gawk coreutils findutils gzip unzip))
+    (propagated-inputs (list curl wget grep sed gawk coreutils findutils gzip unzip xargs))
     (inputs (list bash-minimal coreutils perl perl-xml-simple python))
     (home-page "https://www.ncbi.nlm.nih.gov/books/NBK179288/")
     (synopsis "Tools for accessing the NCBI's set of databases")
diff --git a/more_functions.py b/more_functions.py
index a115899..5d48adc 100755
--- a/more_functions.py
+++ b/more_functions.py
@@ -3,6 +3,7 @@ from nltk.tokenize import sent_tokenize
 import hashlib
 import os
 import re
+import time
 
 from addiction_keywords import *
 from gene_synonyms import *
@@ -10,8 +11,51 @@ import ast
 
 global pubmed_path
 
-# In-memory cache for esearch results: hash(query) -> list of PMIDs
-_esearch_cache = {}
+# In-memory caches
+_esearch_cache = {}  # hash(query) -> list of PMIDs
+_gemini_query_cache = {}  # hash(prompt) -> response text
+
+def gemini_query(prompt, model='gemini-2.5-flash'):
+    """Send a prompt to the Gemini API with caching and retry.
+
+    Returns the response text, or raises on failure.
+    """
+    from google import genai
+
+    cache_key = hashlib.sha256(prompt.encode()).hexdigest()
+    if cache_key in _gemini_query_cache:
+        print(f"  Gemini query cache hit")
+        return _gemini_query_cache[cache_key]
+
+    api_key = os.environ.get("GEMINI_API_KEY", "")
+    if not api_key:
+        cred_file = os.path.expanduser("~/.config/gemini/credentials")
+        if os.path.isfile(cred_file):
+            with open(cred_file) as f:
+                api_key = f.read().strip()
+    if not api_key:
+        raise RuntimeError("No Gemini API key found")
+
+    client = genai.Client(api_key=api_key)
+    last_error = None
+    for attempt in range(3):
+        try:
+            if attempt > 0:
+                time.sleep(2 * attempt)
+                print(f"  Gemini retry {attempt + 1}/3")
+            print(f"  Gemini API call ({model}): {prompt[:80]}...")
+            response = client.models.generate_content(
+                model=model,
+                contents=prompt
+            )
+            result = response.text.strip()
+            print(f"  Gemini response: {result[:200]}")
+            _gemini_query_cache[cache_key] = result
+            return result
+        except Exception as e:
+            last_error = e
+            print(f"  Gemini attempt {attempt + 1}/3 failed: {e}")
+    raise RuntimeError(f"Gemini API failed after 3 attempts: {last_error}")
 
 def esearch_pmids(query):
     """Search PubMed for PMIDs matching query. Results are cached in memory.
@@ -69,6 +113,22 @@ def hybrid_fetch_abstracts(pmid_list):
         abstracts += extra
     return abstracts
 
+def getabstracts_batch(genes, query):
+    """Fetch abstracts for multiple genes in a single PubMed query.
+
+    Builds: (keywords) AND (gene1 [tiab] OR gene2 [tiab] OR ...)
+    Returns tab-separated lines: PMID, ArticleTitle, AbstractText
+    """
+    genes_clause = " OR ".join(g + " [tiab]" for g in genes)
+    full_query = "\"(" + query + ") AND (" + genes_clause + ")\""
+    pmid_list = esearch_pmids(full_query)
+    if not pmid_list:
+        print(f"  no PMIDs found for {genes}")
+        return ""
+    print(f"  PMIDs ({len(pmid_list)}): {' '.join(pmid_list[:20])}{'...' if len(pmid_list) > 20 else ''}")
+    abstracts = hybrid_fetch_abstracts(pmid_list)
+    return abstracts
+
 def getabstracts(gene,query):
     """
       1. esearch -db pubmed -query ... -- searches PubMed for the gene + keyword query, returns matching record IDs
@@ -246,7 +306,7 @@ pubmed_path=os.environ.get("EDIRECT_LOCAL_ARCHIVE", "./minipubmed")
 print(f"  pubmed_path={pubmed_path}")
 
 if not os.path.isdir(pubmed_path):
-    print(f"ERROR: EDIRECT_LOCAL_ARCHIVE directory not found: {pubmed_path} - note this is a recent env variable that replaces the others")
+    print(f"ERROR: EDIRECT_LOCAL_ARCHIVE directory not found: {pubmed_path} - note this is a recent env variable that replaces the others (ignore the minipub reference)")
     raise SystemExit(1)
 testdir = os.path.join(pubmed_path, "pubmed", "Archive", "00")
 if not os.path.isdir(testdir):
diff --git a/server.py b/server.py
index c81cbc9..25ccad5 100755
--- a/server.py
+++ b/server.py
@@ -64,7 +64,7 @@ import re
 import ast
 from more_functions import *
 from nltk.tokenize import sent_tokenize
-from more_functions import getabstracts, undic, gene_category
+from more_functions import getabstracts, getabstracts_batch, undic, gene_category
 
 GENECUP_PROMPT_TEMPLATE = ""
 try:
@@ -346,6 +346,67 @@ def logout():
 def about():
     return render_template('about.html',version=version())
 
+@app.route("/create-ontology", methods=["GET", "POST"])
+def create_ontology():
+    if request.method == "GET":
+        session.pop('namecat', None)
+    from more_functions import gemini_query
+    default_prompt = (
+        "Give me a list of terms on substance abuse disorder (SUD) that act "
+        "as traits and classifiers in scientific literature with a focus on "
+        "behaviour and brain attributes related to the hippocampus. Avoid "
+        "aliases and synonyms as well as gene names. Each term should be "
+        "1-3 words (max). Give me a list of at least 20, but no more than "
+        "80, most used terms. Return only the terms, one per line, no "
+        "numbering. Add abbreviations and aliases - each at least 3 letters that have no other meaning - as a list with each term, "
+        "separated by commas")
+    if request.method == "POST":
+        action = request.form.get("action", "generate")
+
+        if action == "search":
+            # Build a temporary .onto file from the result terms and redirect to /progress
+            result_text = request.form.get("result", "")
+            query = request.form.get("query", "")
+            search_types = request.form.getlist("type")
+            # Build onto dict: each term is its own category with aliases as pipe-separated keywords
+            dict_onto = {}
+            for line in result_text.strip().split("\n"):
+                line = line.strip()
+                if not line:
+                    continue
+                parts = [p.strip() for p in line.split(",")]
+                category = parts[0]
+                keywords = "|".join(parts)
+                dict_onto[category] = {category: {keywords}}
+            # Save to a temp .onto file
+            onto_path = os.path.join(tempfile.gettempdir(), "gemini_ontology")
+            with open(onto_path + ".onto", "w") as f:
+                f.write(repr(dict_onto))
+            session['namecat'] = onto_path
+            print(f"  Created ontology: {onto_path}.onto with {len(dict_onto)} categories")
+            print(f"  Gene query: '{query}', search_types: {search_types}")
+            # Build the redirect URL with type and query params
+            from urllib.parse import urlencode
+            params = [("query", query)]
+            for t in search_types:
+                params.append(("type", t))
+            return redirect("/progress?" + urlencode(params))
+
+        # action == "generate"
+        prompt = request.form.get("prompt", default_prompt)
+        try:
+            result = gemini_query(prompt)
+            terms = [t.strip() for t in result.strip().split("\n") if t.strip()]
+            return render_template('create-ontology.html',
+                                   prompt=prompt, result=result,
+                                   count=len(terms), version=version())
+        except Exception as e:
+            return render_template('create-ontology.html',
+                                   prompt=prompt, result=f"Error: {e}",
+                                   count=0, version=version())
+    return render_template('create-ontology.html',
+                           prompt=default_prompt, result=None,
+                           count=0, version=version())
 
 # Ontology selection
 @app.route("/index_ontology", methods=["POST", "GET"])
@@ -803,6 +864,11 @@ def progress():
         if (search_type == []):
             search_type = ['GWAS', 'function', 'addiction', 'drug', 'brain', 'stress', 'psychiatric', 'cell']
         session['search_type'] = search_type
+        # Use default addiction ontology unless redirected from /create-ontology
+        if request.referrer and '/create-ontology' in request.referrer:
+            pass  # keep session['namecat'] set by /create-ontology
+        elif 'namecat' in session:
+            del session['namecat']
     genes_session = ''
 
     for gen in genes:
@@ -846,8 +912,10 @@ def search():
     if 'namecat' in session:
         namecat_flag=1
         ses_namecat = session['namecat']
+        print(f"  /search: namecat={ses_namecat}, search_type={search_type}")
         onto_cont = open(session['namecat']+".onto","r").read()
         dict_onto=ast.literal_eval(onto_cont)
+        print(f"  /search: onto categories={list(dict_onto.keys())[:10]}")
 
         for ky in dict_onto.keys():
             nodecolor[ky] = "hsl("+str((n_num+1)*int(360/len(dict_onto.keys())))+", 70%, 80%)"
@@ -908,28 +976,32 @@ def search():
                     all_d = all_d+'|'+all_d_ls
             if all_d: # Check if all_d is not empty
                 all_d=all_d[1:]
+            print(f"  /search generate: all_d={all_d[:200] if all_d else '(empty)'}, search_type={search_type}")
 
             if ("GWAS" in search_type):
                 datf = pd.read_csv('./utility/gwas_used.csv',sep='\t')
             progress+=percent
             yield "data:"+str(progress)+"\n\n"
 
-            for gene in genes:
-                print(f"Fetching info for gene {gene}\n")
-                abstracts_raw = getabstracts(gene,all_d) # all_d might be empty if no search_type matches
-                print(abstracts_raw)
-                sentences_ls=[]
+            # Batch fetch all abstracts in a single PubMed query
+            print(f"Batch fetching abstracts for {len(genes)} genes")
+            all_abstracts_raw = getabstracts_batch(genes, all_d) if all_d else ""
+            # Parse all sentences once
+            all_sentences = []
+            for row in all_abstracts_raw.split("\n"):
+                if not row.strip(): continue
+                tiab = row.split("\t")
+                pmid = tiab.pop(0)
+                tiab_text = " ".join(tiab)
+                for sent_tok in sent_tokenize(tiab_text):
+                    all_sentences.append(pmid + ' ' + sent_tok)
 
-                for row in abstracts_raw.split("\n"):
-                    if not row.strip(): continue # Skip empty lines
-                    tiab=row.split("\t")
-                    pmid = tiab.pop(0)
-                    tiab_text = " ".join(tiab) # Renamed to avoid conflict
-                    sentences_tok = sent_tokenize(tiab_text)
-                    for sent_tok in sentences_tok:
-                        sent_tok = pmid + ' ' + sent_tok
-                        sentences_ls.append(sent_tok)
-                gene=gene.replace("-"," ")
+            for gene in genes:
+                gene = gene.replace("-", " ")
+                # Filter sentences that mention this gene
+                gene_re = re.compile(r'\b' + re.escape(gene) + r'\b', re.IGNORECASE)
+                sentences_ls = [s for s in all_sentences if gene_re.search(s)]
+                print(f"  Gene {gene}: {len(sentences_ls)} sentences")
 
                 geneEdges = ""
 
diff --git a/templates/create-ontology.html b/templates/create-ontology.html
new file mode 100644
index 0000000..537c246
--- /dev/null
+++ b/templates/create-ontology.html
@@ -0,0 +1,48 @@
+{% extends "layout.html" %}
+{% block content %}
+
+<div class="container mt-4">
+  <h3>Create Ontology with Gemini AI</h3>
+
+  <form method="POST" action="/create-ontology">
+    <input type="hidden" name="action" value="generate">
+    <div class="form-group">
+      <label for="prompt">Prompt:</label>
+      <textarea class="form-control" id="prompt" name="prompt" rows="6">{{ prompt }}</textarea>
+    </div>
+    <button type="submit" class="btn btn-primary" onclick="this.style.backgroundColor='#87CEEB'; this.style.borderColor='#87CEEB';">Create Ontology</button>
+  </form>
+
+  {% if result %}
+  <div class="form-group mt-4">
+    <label for="result">Result ({{ count }} terms):</label>
+    <textarea class="form-control" id="result" name="result" rows="20" form="search-form">{{ result }}</textarea>
+  </div>
+
+  <form id="search-form" method="POST" action="/create-ontology" class="mt-3">
+    <input type="hidden" name="action" value="search">
+    <div class="form-group">
+      <label for="query">Gene symbols (space or comma separated):</label>
+      <textarea class="form-control" id="query" name="query" rows="2"></textarea>
+    </div>
+    <div id="check_selection_onto"></div>
+    <button type="submit" class="btn btn-primary" onclick="if(!document.getElementById('query').value.trim()){alert('Please enter at least one gene symbol');return false;}">Search</button>
+  </form>
+
+  <script>
+    var lines = document.getElementById('result').value.split('\n');
+    var checkbox = '';
+    for (var i = 0; i < lines.length; i++) {
+      var term = lines[i].trim();
+      if (term) {
+        var label = term.split(',')[0].trim();
+        checkbox += '<strong><input type="checkbox" name="type" value="' + label + '" checked form="search-form"> ' + label + '&nbsp;&nbsp;</strong>';
+      }
+    }
+    checkbox += '<br><strong><input type="checkbox" onClick="var c=document.querySelectorAll(\'input[name=type]\');for(var i=0;i<c.length;i++)c[i].checked=this.checked;"/> (Un)select all</strong>';
+    document.getElementById('check_selection_onto').innerHTML = checkbox;
+  </script>
+  {% endif %}
+</div>
+
+{% endblock %}
diff --git a/tests/test_network_gemini_ontology.py b/tests/test_network_gemini_ontology.py
new file mode 100644
index 0000000..2a84909
--- /dev/null
+++ b/tests/test_network_gemini_ontology.py
@@ -0,0 +1,56 @@
+"""Test Gemini API for generating SUD ontology terms.
+
+Requires a Gemini API key in ~/.config/gemini/credentials and internet access.
+
+Run with: python3 -m unittest tests.test_network_gemini_ontology -v
+"""
+
+import os
+import sys
+import time
+import unittest
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+from more_functions import gemini_query
+
+PROMPT = (
+    """
+    Give me a list of terms on substance abuse disorder (SUD) that act
+    as traits and classifiers in scientific literature with a focus on
+    behaviour and brain attributes related to the hippocampus. Avoid
+    aliases and synonyms as well as gene names. Each term should be
+    1-3 words (max).  Give me a list of at least 20, but no more than
+    80, most used terms.  Return only the terms, one per line, no
+    numbering. Add abbreviations and aliases as a list with each term, separated by commas"""
+)
+
+class TestGeminiOntology(unittest.TestCase):
+    def test_1_sud_ontology_terms(self):
+        """Gemini should return 20-80 SUD ontology terms."""
+        t0 = time.time()
+        response = gemini_query(PROMPT)
+        elapsed = time.time() - t0
+        terms = [t.strip() for t in response.strip().split("\n") if t.strip()]
+        print(f"  Got {len(terms)} terms ({elapsed:.2f}s)")
+        for t in terms:
+            print(f"    - {t}")
+        self.assertGreaterEqual(len(terms), 20,
+                                f"Expected at least 20 terms, got {len(terms)}")
+        self.assertLessEqual(len(terms), 80,
+                             f"Expected at most 80 terms, got {len(terms)}")
+        # Each term should be short (1-3 words, allow some slack)
+        long_terms = [t for t in terms if len(t.split()) > 5]
+
+    def test_2_cached_ontology(self):
+        """Second call should use cache and be fast."""
+        # Ensure cache is populated from test_1
+        gemini_query(PROMPT)
+        t0 = time.time()
+        response = gemini_query(PROMPT)
+        elapsed = time.time() - t0
+        terms = [t.strip() for t in response.strip().split("\n") if t.strip()]
+        print(f"  Cached: {len(terms)} terms ({elapsed:.4f}s)")
+        self.assertLess(elapsed, 0.01, f"Cache lookup too slow: {elapsed:.4f}s")
+
+if __name__ == "__main__":
+    unittest.main()