about summary refs log tree commit diff
path: root/more_functions.py
diff options
context:
space:
mode:
Diffstat (limited to 'more_functions.py')
-rwxr-xr-x more_functions.py 66
1 files changed, 63 insertions, 3 deletions
diff --git a/more_functions.py b/more_functions.py
index a115899..5d48adc 100755
--- a/more_functions.py
+++ b/more_functions.py
@@ -3,6 +3,7 @@ from nltk.tokenize import sent_tokenize
 import hashlib
 import os
 import re
+import time
 
 from addiction_keywords import *
 from gene_synonyms import *
@@ -10,8 +11,51 @@ import ast
 
 global pubmed_path
 
-# In-memory cache for esearch results: hash(query) -> list of PMIDs
-_esearch_cache = {}
+# In-memory caches
+_esearch_cache = {}  # hash(query) -> list of PMIDs
+_gemini_query_cache = {}  # hash(prompt) -> response text
+
+def gemini_query(prompt, model='gemini-2.5-flash'):
+    """Send `prompt` to the Gemini model `model`, with an in-memory cache and up to 3 attempts.
+
+    Returns the stripped response text; raises RuntimeError if no API key is found or every attempt fails.
+    """
+    from google import genai  # function-scope import: the client library is only loaded when this is called
+
+    cache_key = hashlib.sha256(prompt.encode()).hexdigest()  # key covers the prompt only; `model` is not part of it
+    if cache_key in _gemini_query_cache:
+        print(f"  Gemini query cache hit")
+        return _gemini_query_cache[cache_key]
+
+    api_key = os.environ.get("GEMINI_API_KEY", "")  # the environment variable takes precedence over the file
+    if not api_key:
+        cred_file = os.path.expanduser("~/.config/gemini/credentials")  # fallback: the stripped file content is the key
+        if os.path.isfile(cred_file):
+            with open(cred_file) as f:
+                api_key = f.read().strip()
+    if not api_key:
+        raise RuntimeError("No Gemini API key found")
+
+    client = genai.Client(api_key=api_key)
+    last_error = None  # most recent exception, reported if all attempts fail
+    for attempt in range(3):
+        try:
+            if attempt > 0:
+                time.sleep(2 * attempt)  # wait 2 s before the 2nd attempt and 4 s before the 3rd
+                print(f"  Gemini retry {attempt + 1}/3")
+            print(f"  Gemini API call ({model}): {prompt[:80]}...")  # log only the first 80 characters of the prompt
+            response = client.models.generate_content(
+                model=model,
+                contents=prompt
+            )
+            result = response.text.strip()
+            print(f"  Gemini response: {result[:200]}")  # log only the first 200 characters of the reply
+            _gemini_query_cache[cache_key] = result  # only successful responses are cached
+            return result
+        except Exception as e:  # catches anything raised in the try body, so every kind of failure is retried
+            last_error = e
+            print(f"  Gemini attempt {attempt + 1}/3 failed: {e}")
+    raise RuntimeError(f"Gemini API failed after 3 attempts: {last_error}")
 
 def esearch_pmids(query):
     """Search PubMed for PMIDs matching query. Results are cached in memory.
@@ -69,6 +113,22 @@ def hybrid_fetch_abstracts(pmid_list):
         abstracts += extra
     return abstracts
 
+def getabstracts_batch(genes, query):
+    """Fetch abstracts for multiple genes in a single PubMed query.
+
+    Builds: (query) AND (gene1 [tiab] OR gene2 [tiab] OR ...), with the whole expression wrapped in literal double quotes.
+    Returns an empty string if no PMIDs match, else the output of hybrid_fetch_abstracts (described as tab-separated lines: PMID, ArticleTitle, AbstractText).
+    """
+    genes_clause = " OR ".join(g + " [tiab]" for g in genes)  # an empty `genes` gives an empty clause, i.e. "... AND ()"
+    full_query = "\"(" + query + ") AND (" + genes_clause + ")\""  # the double quotes are part of the query string itself
+    pmid_list = esearch_pmids(full_query)
+    if not pmid_list:
+        print(f"  no PMIDs found for {genes}")
+        return ""  # an empty string, not None, signals "nothing found"
+    print(f"  PMIDs ({len(pmid_list)}): {' '.join(pmid_list[:20])}{'...' if len(pmid_list) > 20 else ''}")  # log at most the first 20 PMIDs
+    abstracts = hybrid_fetch_abstracts(pmid_list)
+    return abstracts
+
 def getabstracts(gene,query):
     """
       1. esearch -db pubmed -query ... -- searches PubMed for the gene + keyword query, returns matching record IDs
@@ -246,7 +306,7 @@ pubmed_path=os.environ.get("EDIRECT_LOCAL_ARCHIVE", "./minipubmed")
 print(f"  pubmed_path={pubmed_path}")
 
 if not os.path.isdir(pubmed_path):
-    print(f"ERROR: EDIRECT_LOCAL_ARCHIVE directory not found: {pubmed_path} - note this is a recent env variable that replaces the others")
+    print(f"ERROR: EDIRECT_LOCAL_ARCHIVE directory not found: {pubmed_path} - note this is a recent env variable that replaces the others (ignore the minipub reference)")
     raise SystemExit(1)
 testdir = os.path.join(pubmed_path, "pubmed", "Archive", "00")
 if not os.path.isdir(testdir):