about summary refs log tree commit diff
diff options
context:
space:
mode:
author    Pjotr Prins 2026-04-05 17:24:54 +0200
committer Pjotr Prins 2026-04-05 17:24:54 +0200
commit    8f84075f667fd3ad523d395cc1c66c07bbef0c23 (patch)
tree      de35852163b4f9f08a4941ae62e0938e947e4b58
parent    10782144441bb72362f73c2b3db5bc66eb4c5fb1 (diff)
download  genecup-8f84075f667fd3ad523d395cc1c66c07bbef0c23.tar.gz
Cache PMID search hits
-rwxr-xr-x  more_functions.py             29
-rw-r--r--  tests/test_network_hybrid.py  45
2 files changed, 49 insertions, 25 deletions
diff --git a/more_functions.py b/more_functions.py
index d2c2040..a6cb86f 100755
--- a/more_functions.py
+++ b/more_functions.py
@@ -1,5 +1,6 @@
 #!/bin/env python3
 from nltk.tokenize import sent_tokenize
+import hashlib
 import os
 import re
 
@@ -9,6 +10,25 @@ import ast
 
 global pubmed_path
 
+# In-memory cache for esearch results: hash(query) -> list of PMIDs
+_esearch_cache = {}
+
+def esearch_pmids(query):
+    """Search PubMed for PMIDs matching query. Results are cached in memory.
+
+    Returns a list of PMID strings, or [] if none found.
+    """
+    key = hashlib.sha256(query.encode()).hexdigest()
+    if key in _esearch_cache:
+        print(f"  esearch cache hit for: {query}")
+        return _esearch_cache[key]
+    pmid_cmd = "esearch -db pubmed -query " + query + " | efetch -format uid"
+    print(f"  popen: {pmid_cmd}")
+    pmids = os.popen(pmid_cmd).read().strip()
+    pmid_list = [p for p in pmids.split("\n") if p.strip()] if pmids else []
+    _esearch_cache[key] = pmid_list
+    return pmid_list
+
 def undic(dic):
     all_s=''
     for s in dic:
@@ -62,14 +82,11 @@ def getabstracts(gene,query):
     """
 
     query="\"(" + query + ") AND (" + gene + " [tiab])\""
-    # Step 1: fetch PMIDs from PubMed
-    pmid_cmd = "esearch -db pubmed -query " + query + " | efetch -format uid"
-    print(f"  popen: {pmid_cmd}")
-    pmids = os.popen(pmid_cmd).read().strip()
-    if not pmids:
+    # Step 1: fetch PMIDs from PubMed (cached)
+    pmid_list = esearch_pmids(query)
+    if not pmid_list:
         print(f"  no PMIDs found for {gene}")
         return ""
-    pmid_list = pmids.split("\n")
     print(f"  PMIDs ({len(pmid_list)}): {' '.join(pmid_list)}")
     # Step 2: fetch abstracts via hybrid local+NCBI
     abstracts = hybrid_fetch_abstracts(pmid_list)
diff --git a/tests/test_network_hybrid.py b/tests/test_network_hybrid.py
index 2e28a16..05ca174 100644
--- a/tests/test_network_hybrid.py
+++ b/tests/test_network_hybrid.py
@@ -6,42 +6,36 @@ Run with: EDIRECT_LOCAL_ARCHIVE=/export3/PubMed/Source python3 -m unittest tests
 """
 
 import os
-import subprocess
 import sys
+import time
 import unittest
 
 # Add project root to path so we can import more_functions
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
-from more_functions import hybrid_fetch_abstracts
+from more_functions import esearch_pmids, hybrid_fetch_abstracts
 
 ARCHIVE = os.environ.get("EDIRECT_LOCAL_ARCHIVE", "/export3/PubMed/Source")
+QUERY = '"(stress) AND (Penk [tiab])"'
 
 @unittest.skipUnless(os.path.isdir(ARCHIVE),
                      f"EDIRECT_LOCAL_ARCHIVE not found: {ARCHIVE}")
 class TestNetworkHybrid(unittest.TestCase):
-    def test_hybrid_matches_esearch(self):
+    def test_1_hybrid_matches_esearch(self):
         """Hybrid xfetch+efetch should return same PMIDs as pure esearch."""
-        env = os.environ.copy()
-        env["EDIRECT_LOCAL_ARCHIVE"] = ARCHIVE
-        query = "(stress) AND (Penk [tiab])"
-
-        # Step 1: get PMIDs from NCBI
-        r1 = subprocess.run(
-            ["sh", "-c",
-             f'esearch -db pubmed -query "{query}" | efetch -format uid'],
-            capture_output=True, text=True, timeout=120, env=env)
-        self.assertEqual(r1.returncode, 0, r1.stderr)
-        ncbi_pmids = sorted(set(r1.stdout.strip().split("\n")))
-        ncbi_pmids = [p for p in ncbi_pmids if p.strip()]
-        print(f"  NCBI esearch: {len(ncbi_pmids)} PMIDs")
-
-        # Step 2: hybrid fetch using the shared function
+        t0 = time.time()
+        ncbi_pmids = esearch_pmids(QUERY)
+        t_search = time.time() - t0
+        self.assertGreater(len(ncbi_pmids), 0)
+        print(f"  NCBI esearch: {len(ncbi_pmids)} PMIDs ({t_search:.2f}s)")
+
+        t0 = time.time()
         abstracts = hybrid_fetch_abstracts(ncbi_pmids)
+        t_fetch = time.time() - t0
         hybrid_pmids = set()
         for line in abstracts.strip().split("\n"):
             if line.strip():
                 hybrid_pmids.add(line.split("\t")[0])
-        print(f"  Hybrid total: {len(hybrid_pmids)} abstracts")
+        print(f"  Hybrid total: {len(hybrid_pmids)} abstracts ({t_fetch:.2f}s)")
 
         # Some articles have no abstract (letters, editorials) so
         # hybrid may be slightly less than NCBI. Allow up to 5% gap.
@@ -50,5 +44,18 @@ class TestNetworkHybrid(unittest.TestCase):
         self.assertLessEqual(gap, max(1, len(ncbi_pmids) // 20),
                              f"Too many missing: hybrid {len(hybrid_pmids)} vs NCBI {len(ncbi_pmids)}")
 
+    def test_2_cached_esearch(self):
+        """Second esearch call should use cache and be fast."""
+        # First call to populate cache (may already be cached from test_1)
+        pmids1 = esearch_pmids(QUERY)
+
+        t0 = time.time()
+        pmids2 = esearch_pmids(QUERY)
+        t_cached = time.time() - t0
+
+        print(f"  Cached esearch: {len(pmids2)} PMIDs ({t_cached:.4f}s)")
+        self.assertEqual(pmids1, pmids2, "Cached results differ from first call")
+        self.assertLess(t_cached, 0.01, f"Cache lookup too slow: {t_cached:.4f}s")
+
 if __name__ == "__main__":
     unittest.main()