From 5f4cef3640f84092e5692e16865002a832b7838c Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Mon, 6 Apr 2026 09:51:26 +0200 Subject: Added a test that creates an ontology --- more_functions.py | 50 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 3 deletions(-) (limited to 'more_functions.py') diff --git a/more_functions.py b/more_functions.py index a115899..35e3646 100755 --- a/more_functions.py +++ b/more_functions.py @@ -3,6 +3,7 @@ from nltk.tokenize import sent_tokenize import hashlib import os import re +import time from addiction_keywords import * from gene_synonyms import * @@ -10,8 +11,51 @@ import ast global pubmed_path -# In-memory cache for esearch results: hash(query) -> list of PMIDs -_esearch_cache = {} +# In-memory caches +_esearch_cache = {} # hash(query) -> list of PMIDs +_gemini_query_cache = {} # hash(prompt) -> response text + +def gemini_query(prompt, model='gemini-2.5-flash'): + """Send a prompt to the Gemini API with caching and retry. + + Returns the response text, or raises on failure. + """ + from google import genai + + cache_key = hashlib.sha256(prompt.encode()).hexdigest() + if cache_key in _gemini_query_cache: + print(f" Gemini query cache hit") + return _gemini_query_cache[cache_key] + + api_key = os.environ.get("GEMINI_API_KEY", "") + if not api_key: + cred_file = os.path.expanduser("~/.config/gemini/credentials") + if os.path.isfile(cred_file): + with open(cred_file) as f: + api_key = f.read().strip() + if not api_key: + raise RuntimeError("No Gemini API key found") + + client = genai.Client(api_key=api_key) + last_error = None + for attempt in range(3): + try: + if attempt > 0: + time.sleep(2 * attempt) + print(f" Gemini retry {attempt + 1}/3") + print(f" Gemini API call ({model}): {prompt[:80]}...") + response = client.models.generate_content( + model=model, + contents=prompt + ) + result = response.text.strip() + print(f" Gemini response: {result[:200]}") + _gemini_query_cache[cache_key] = result + return result + except Exception as e: + last_error = e + print(f" Gemini attempt {attempt + 1}/3 failed: {e}") + raise RuntimeError(f"Gemini API failed after 3 attempts: {last_error}") def esearch_pmids(query): """Search PubMed for PMIDs matching query. Results are cached in memory. @@ -246,7 +290,7 @@ pubmed_path=os.environ.get("EDIRECT_LOCAL_ARCHIVE", "./minipubmed") print(f" pubmed_path={pubmed_path}") if not os.path.isdir(pubmed_path): - print(f"ERROR: EDIRECT_LOCAL_ARCHIVE directory not found: {pubmed_path} - note this is a recent env variable that replaces the others") + print(f"ERROR: EDIRECT_LOCAL_ARCHIVE directory not found: {pubmed_path} - note this is a recent env variable that replaces the others (ignore the minipub reference)") raise SystemExit(1) testdir = os.path.join(pubmed_path, "pubmed", "Archive", "00") if not os.path.isdir(testdir): -- cgit 1.4.1 From acca175362eb1d1ce2b0cd263c39537b2b8a6f2b Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Mon, 6 Apr 2026 11:44:20 +0200 Subject: Combine pubmed online search to a single query --- more_functions.py | 16 ++++++++++++++++ server.py | 35 +++++++++++++++++++---------------- 2 files changed, 35 insertions(+), 16 deletions(-) (limited to 'more_functions.py') diff --git a/more_functions.py b/more_functions.py index 35e3646..5d48adc 100755 --- a/more_functions.py +++ b/more_functions.py @@ -113,6 +113,22 @@ def hybrid_fetch_abstracts(pmid_list): abstracts += extra return abstracts +def getabstracts_batch(genes, query): + """Fetch abstracts for multiple genes in a single PubMed query. + + Builds: (keywords) AND (gene1 [tiab] OR gene2 [tiab] OR ...) + Returns tab-separated lines: PMID, ArticleTitle, AbstractText + """ + genes_clause = " OR ".join(g + " [tiab]" for g in genes) + full_query = "\"(" + query + ") AND (" + genes_clause + ")\"" + pmid_list = esearch_pmids(full_query) + if not pmid_list: + print(f" no PMIDs found for {genes}") + return "" + print(f" PMIDs ({len(pmid_list)}): {' '.join(pmid_list[:20])}{'...' if len(pmid_list) > 20 else ''}") + abstracts = hybrid_fetch_abstracts(pmid_list) + return abstracts + def getabstracts(gene,query): """ 1. esearch -db pubmed -query ... -- searches PubMed for the gene + keyword query, returns matching record IDs diff --git a/server.py b/server.py index 79dda7d..25ccad5 100755 --- a/server.py +++ b/server.py @@ -64,7 +64,7 @@ import re import ast from more_functions import * from nltk.tokenize import sent_tokenize -from more_functions import getabstracts, undic, gene_category +from more_functions import getabstracts, getabstracts_batch, undic, gene_category GENECUP_PROMPT_TEMPLATE = "" try: @@ -983,22 +983,25 @@ def search(): progress+=percent yield "data:"+str(progress)+"\n\n" - for gene in genes: - print(f"Fetching info for gene {gene}\n") - abstracts_raw = getabstracts(gene,all_d) # all_d might be empty if no search_type matches - print(abstracts_raw) - sentences_ls=[] + # Batch fetch all abstracts in a single PubMed query + print(f"Batch fetching abstracts for {len(genes)} genes") + all_abstracts_raw = getabstracts_batch(genes, all_d) if all_d else "" + # Parse all sentences once + all_sentences = [] + for row in all_abstracts_raw.split("\n"): + if not row.strip(): continue + tiab = row.split("\t") + pmid = tiab.pop(0) + tiab_text = " ".join(tiab) + for sent_tok in sent_tokenize(tiab_text): + all_sentences.append(pmid + ' ' + sent_tok) - for row in abstracts_raw.split("\n"): - if not row.strip(): continue # Skip empty lines - tiab=row.split("\t") - pmid = tiab.pop(0) - tiab_text = " ".join(tiab) # Renamed to avoid conflict - sentences_tok = sent_tokenize(tiab_text) - for sent_tok in sentences_tok: - sent_tok = pmid + ' ' + sent_tok - sentences_ls.append(sent_tok) - gene=gene.replace("-"," ") + for gene in genes: + gene = gene.replace("-", " ") + # Filter sentences that mention this gene + gene_re = re.compile(r'\b' + re.escape(gene) + r'\b', re.IGNORECASE) + sentences_ls = [s for s in all_sentences if gene_re.search(s)] + print(f" Gene {gene}: {len(sentences_ls)} sentences") geneEdges = "" -- cgit 1.4.1