about summary refs log tree commit diff
path: root/more_functions.py
diff options
context:
space:
mode:
authorPjotr Prins2026-04-05 17:24:54 +0200
committerPjotr Prins2026-04-05 17:24:54 +0200
commit8f84075f667fd3ad523d395cc1c66c07bbef0c23 (patch)
treede35852163b4f9f08a4941ae62e0938e947e4b58 /more_functions.py
parent10782144441bb72362f73c2b3db5bc66eb4c5fb1 (diff)
downloadgenecup-8f84075f667fd3ad523d395cc1c66c07bbef0c23.tar.gz
Cache PMID search hits
Diffstat (limited to 'more_functions.py')
-rwxr-xr-xmore_functions.py29
1 files changed, 23 insertions, 6 deletions
diff --git a/more_functions.py b/more_functions.py
index d2c2040..a6cb86f 100755
--- a/more_functions.py
+++ b/more_functions.py
@@ -1,5 +1,6 @@
 #!/bin/env python3
 from nltk.tokenize import sent_tokenize
+import hashlib
 import os
 import re
 
@@ -9,6 +10,25 @@ import ast
 
 global pubmed_path
 
+# In-memory cache for esearch results: SHA-256 hex digest of the query -> list of PMID strings
+_esearch_cache = {}
+
+def esearch_pmids(query):
+    """Search PubMed for PMIDs matching query. Results are cached in memory.
+
+    Returns a list of PMID strings, or [] if none found.
+    """
+    key = hashlib.sha256(query.encode()).hexdigest()  # cache key: SHA-256 hex digest of the query text
+    if key in _esearch_cache:
+        print(f"  esearch cache hit for: {query}")
+        return _esearch_cache[key]  # hands back the stored list object itself, not a copy
+    pmid_cmd = "esearch -db pubmed -query " + query + " | efetch -format uid"  # NOTE(review): query is spliced in unescaped
+    print(f"  popen: {pmid_cmd}")
+    pmids = os.popen(pmid_cmd).read().strip()  # only stdout is read; the exit status is not inspected
+    pmid_list = [p for p in pmids.split("\n") if p.strip()] if pmids else []  # one PMID per line, blank lines dropped
+    _esearch_cache[key] = pmid_list  # an empty result is cached too
+    return pmid_list
+
 def undic(dic):
     all_s=''
     for s in dic:
@@ -62,14 +82,11 @@ def getabstracts(gene,query):
     """
 
     query="\"(" + query + ") AND (" + gene + " [tiab])\""
-    # Step 1: fetch PMIDs from PubMed
-    pmid_cmd = "esearch -db pubmed -query " + query + " | efetch -format uid"
-    print(f"  popen: {pmid_cmd}")
-    pmids = os.popen(pmid_cmd).read().strip()
-    if not pmids:
+    # Step 1: fetch PMIDs from PubMed (cached)
+    pmid_list = esearch_pmids(query)
+    if not pmid_list:
         print(f"  no PMIDs found for {gene}")
         return ""
-    pmid_list = pmids.split("\n")
     print(f"  PMIDs ({len(pmid_list)}): {' '.join(pmid_list)}")
     # Step 2: fetch abstracts via hybrid local+NCBI
     abstracts = hybrid_fetch_abstracts(pmid_list)