From 5f4cef3640f84092e5692e16865002a832b7838c Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Mon, 6 Apr 2026 09:51:26 +0200
Subject: Added a test that creates an ontology

---
 more_functions.py | 50 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 47 insertions(+), 3 deletions(-)

(limited to 'more_functions.py')

diff --git a/more_functions.py b/more_functions.py
index a115899..35e3646 100755
--- a/more_functions.py
+++ b/more_functions.py
@@ -3,6 +3,7 @@ from nltk.tokenize import sent_tokenize
 import hashlib
 import os
 import re
+import time
 
 from addiction_keywords import *
 from gene_synonyms import *
@@ -10,8 +11,51 @@ import ast
 
 global pubmed_path
 
-# In-memory cache for esearch results: hash(query) -> list of PMIDs
-_esearch_cache = {}
+# In-memory caches
+_esearch_cache = {}  # hash(query) -> list of PMIDs
+_gemini_query_cache = {}  # hash(model + prompt) -> response text
+
+def gemini_query(prompt, model='gemini-2.5-flash'):
+    """Send a prompt to the Gemini API; answers are cached in memory per (model, prompt).
+
+    Makes up to 3 attempts. Returns the stripped response text; raises RuntimeError if no API key is found or every attempt fails.
+    """
+    from google import genai  # imported lazily, only when a Gemini query is made
+
+    cache_key = hashlib.sha256(f"{model}\n{prompt}".encode()).hexdigest()  # model is part of the key so models never share answers
+    if cache_key in _gemini_query_cache:
+        print("  Gemini query cache hit")
+        return _gemini_query_cache[cache_key]
+
+    api_key = os.environ.get("GEMINI_API_KEY", "")  # the environment variable wins over the credentials file
+    if not api_key:
+        cred_file = os.path.expanduser("~/.config/gemini/credentials")
+        if os.path.isfile(cred_file):
+            with open(cred_file) as f:
+                api_key = f.read().strip()
+    if not api_key:
+        raise RuntimeError("No Gemini API key found")
+
+    client = genai.Client(api_key=api_key)
+    last_error = None
+    for attempt in range(3):
+        try:
+            if attempt > 0:
+                time.sleep(2 * attempt)  # back off 2s, then 4s, before retrying
+                print(f"  Gemini retry {attempt + 1}/3")
+            print(f"  Gemini API call ({model}): {prompt[:80]}...")
+            response = client.models.generate_content(
+                model=model,
+                contents=prompt
+            )
+            result = response.text.strip()
+            print(f"  Gemini response: {result[:200]}")
+            _gemini_query_cache[cache_key] = result
+            return result
+        except Exception as e:  # broad on purpose: any failure just triggers the next attempt
+            last_error = e
+            print(f"  Gemini attempt {attempt + 1}/3 failed: {e}")
+    raise RuntimeError(f"Gemini API failed after 3 attempts: {last_error}") from last_error
 
 def esearch_pmids(query):
     """Search PubMed for PMIDs matching query. Results are cached in memory.
@@ -246,7 +290,7 @@ pubmed_path=os.environ.get("EDIRECT_LOCAL_ARCHIVE", "./minipubmed")
 print(f"  pubmed_path={pubmed_path}")
 
 if not os.path.isdir(pubmed_path):
-    print(f"ERROR: EDIRECT_LOCAL_ARCHIVE directory not found: {pubmed_path} - note this is a recent env variable that replaces the others")
+    print(f"ERROR: EDIRECT_LOCAL_ARCHIVE directory not found: {pubmed_path} - note this is a recent env variable that replaces the others (ignore the minipub reference)")
     raise SystemExit(1)
 testdir = os.path.join(pubmed_path, "pubmed", "Archive", "00")
 if not os.path.isdir(testdir):
-- 
cgit 1.4.1


From acca175362eb1d1ce2b0cd263c39537b2b8a6f2b Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Mon, 6 Apr 2026 11:44:20 +0200
Subject: Combine pubmed online search to a single query

---
 more_functions.py | 16 ++++++++++++++++
 server.py         | 35 +++++++++++++++++++----------------
 2 files changed, 35 insertions(+), 16 deletions(-)

(limited to 'more_functions.py')

diff --git a/more_functions.py b/more_functions.py
index 35e3646..5d48adc 100755
--- a/more_functions.py
+++ b/more_functions.py
@@ -113,6 +113,22 @@ def hybrid_fetch_abstracts(pmid_list):
         abstracts += extra
     return abstracts
 
+def getabstracts_batch(genes, query):
+    """Run one combined PubMed search covering every gene in `genes`.
+
+    Query sent to esearch_pmids(): "(query) AND (g1 [tiab] OR g2 [tiab] OR ...)", with the literal double quotes.
+    Returns the hybrid_fetch_abstracts() text (tab-separated PMID, ArticleTitle, AbstractText lines), or "" when no PMIDs match.
+    """
+    tiab_terms = [name + " [tiab]" for name in genes]
+    search_expr = "".join(['"(', query, ") AND (", " OR ".join(tiab_terms), ')"'])
+    hits = esearch_pmids(search_expr)
+    if hits:
+        more = '...' if len(hits) > 20 else ''  # only the first 20 PMIDs are echoed to the log
+        print(f"  PMIDs ({len(hits)}): {' '.join(hits[:20])}{more}")
+        return hybrid_fetch_abstracts(hits)
+    print(f"  no PMIDs found for {genes}")
+    return ""
+
 def getabstracts(gene,query):
     """
       1. esearch -db pubmed -query ... -- searches PubMed for the gene + keyword query, returns matching record IDs
diff --git a/server.py b/server.py
index 79dda7d..25ccad5 100755
--- a/server.py
+++ b/server.py
@@ -64,7 +64,7 @@ import re
 import ast
 from more_functions import *
 from nltk.tokenize import sent_tokenize
-from more_functions import getabstracts, undic, gene_category
+from more_functions import getabstracts, getabstracts_batch, undic, gene_category
 
 GENECUP_PROMPT_TEMPLATE = ""
 try:
@@ -983,22 +983,25 @@ def search():
             progress+=percent
             yield "data:"+str(progress)+"\n\n"
 
-            for gene in genes:
-                print(f"Fetching info for gene {gene}\n")
-                abstracts_raw = getabstracts(gene,all_d) # all_d might be empty if no search_type matches
-                print(abstracts_raw)
-                sentences_ls=[]
+            # Batch fetch all abstracts in a single PubMed query
+            print(f"Batch fetching abstracts for {len(genes)} genes")
+            all_abstracts_raw = getabstracts_batch(genes, all_d) if all_d else ""
+            # Parse all sentences once
+            all_sentences = []
+            for row in all_abstracts_raw.split("\n"):
+                if not row.strip(): continue
+                tiab = row.split("\t")
+                pmid = tiab.pop(0)
+                tiab_text = " ".join(tiab)
+                for sent_tok in sent_tokenize(tiab_text):
+                    all_sentences.append(pmid + ' ' + sent_tok)
 
-                for row in abstracts_raw.split("\n"):
-                    if not row.strip(): continue # Skip empty lines
-                    tiab=row.split("\t")
-                    pmid = tiab.pop(0)
-                    tiab_text = " ".join(tiab) # Renamed to avoid conflict
-                    sentences_tok = sent_tokenize(tiab_text)
-                    for sent_tok in sentences_tok:
-                        sent_tok = pmid + ' ' + sent_tok
-                        sentences_ls.append(sent_tok)
-                gene=gene.replace("-"," ")
+            for gene in genes:
+                gene = gene.replace("-", " ")
+                # Filter sentences that mention this gene
+                gene_re = re.compile(r'\b' + re.escape(gene) + r'\b', re.IGNORECASE)
+                sentences_ls = [s for s in all_sentences if gene_re.search(s)]
+                print(f"  Gene {gene}: {len(sentences_ls)} sentences")
 
                 geneEdges = ""
 
-- 
cgit 1.4.1