Cache PMID search hits

author: Pjotr Prins 2026-04-05 17:24:54 +0200
committer: Pjotr Prins 2026-04-05 17:24:54 +0200
commit: 8f84075f667fd3ad523d395cc1c66c07bbef0c23 (patch)
tree: de35852163b4f9f08a4941ae62e0938e947e4b58 /tests
parent: 10782144441bb72362f73c2b3db5bc66eb4c5fb1 (diff)
download: genecup-8f84075f667fd3ad523d395cc1c66c07bbef0c23.tar.gz
1 file changed, 26 insertions, 19 deletions
diff --git a/tests/test_network_hybrid.py b/tests/test_network_hybrid.py
index 2e28a16..05ca174 100644
--- a/tests/test_network_hybrid.py
+++ b/tests/test_network_hybrid.py
@@ -6,42 +6,36 @@ Run with: EDIRECT_LOCAL_ARCHIVE=/export3/PubMed/Source python3 -m unittest tests
 """
 
 import os
-import subprocess
 import sys
+import time
 import unittest
 
 # Add project root to path so we can import more_functions
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
-from more_functions import hybrid_fetch_abstracts
+from more_functions import esearch_pmids, hybrid_fetch_abstracts
 
 ARCHIVE = os.environ.get("EDIRECT_LOCAL_ARCHIVE", "/export3/PubMed/Source")
+QUERY = '"(stress) AND (Penk [tiab])"'
 
 @unittest.skipUnless(os.path.isdir(ARCHIVE),
                      f"EDIRECT_LOCAL_ARCHIVE not found: {ARCHIVE}")
 class TestNetworkHybrid(unittest.TestCase):
-    def test_hybrid_matches_esearch(self):
+    def test_1_hybrid_matches_esearch(self):
         """Hybrid xfetch+efetch should return same PMIDs as pure esearch."""
-        env = os.environ.copy()
-        env["EDIRECT_LOCAL_ARCHIVE"] = ARCHIVE
-        query = "(stress) AND (Penk [tiab])"
-
-        # Step 1: get PMIDs from NCBI
-        r1 = subprocess.run(
-            ["sh", "-c",
-             f'esearch -db pubmed -query "{query}" | efetch -format uid'],
-            capture_output=True, text=True, timeout=120, env=env)
-        self.assertEqual(r1.returncode, 0, r1.stderr)
-        ncbi_pmids = sorted(set(r1.stdout.strip().split("\n")))
-        ncbi_pmids = [p for p in ncbi_pmids if p.strip()]
-        print(f"  NCBI esearch: {len(ncbi_pmids)} PMIDs")
-
-        # Step 2: hybrid fetch using the shared function
+        t0 = time.time()
+        ncbi_pmids = esearch_pmids(QUERY)
+        t_search = time.time() - t0
+        self.assertGreater(len(ncbi_pmids), 0)
+        print(f"  NCBI esearch: {len(ncbi_pmids)} PMIDs ({t_search:.2f}s)")
+
+        t0 = time.time()
         abstracts = hybrid_fetch_abstracts(ncbi_pmids)
+        t_fetch = time.time() - t0
         hybrid_pmids = set()
         for line in abstracts.strip().split("\n"):
             if line.strip():
                 hybrid_pmids.add(line.split("\t")[0])
-        print(f"  Hybrid total: {len(hybrid_pmids)} abstracts")
+        print(f"  Hybrid total: {len(hybrid_pmids)} abstracts ({t_fetch:.2f}s)")
 
         # Some articles have no abstract (letters, editorials) so
         # hybrid may be slightly less than NCBI. Allow up to 5% gap.
@@ -50,5 +44,18 @@ class TestNetworkHybrid(unittest.TestCase):
         self.assertLessEqual(gap, max(1, len(ncbi_pmids) // 20),
                              f"Too many missing: hybrid {len(hybrid_pmids)} vs NCBI {len(ncbi_pmids)}")
 
+    def test_2_cached_esearch(self):
+        """Second esearch call should use cache and be fast."""
+        # First call to populate cache (may already be cached from test_1)
+        pmids1 = esearch_pmids(QUERY)
+
+        t0 = time.time()
+        pmids2 = esearch_pmids(QUERY)
+        t_cached = time.time() - t0
+
+        print(f"  Cached esearch: {len(pmids2)} PMIDs ({t_cached:.4f}s)")
+        self.assertEqual(pmids1, pmids2, "Cached results differ from first call")
+        self.assertLess(t_cached, 0.01, f"Cache lookup too slow: {t_cached:.4f}s")
+
 if __name__ == "__main__":
     unittest.main()
author	Pjotr Prins	2026-04-05 17:24:54 +0200
committer	Pjotr Prins	2026-04-05 17:24:54 +0200
commit	8f84075f667fd3ad523d395cc1c66c07bbef0c23 (patch)
tree	de35852163b4f9f08a4941ae62e0938e947e4b58 /tests
parent	10782144441bb72362f73c2b3db5bc66eb4c5fb1 (diff)
download	genecup-8f84075f667fd3ad523d395cc1c66c07bbef0c23.tar.gz