diff options
| author | Pjotr Prins | 2026-04-05 17:24:54 +0200 |
|---|---|---|
| committer | Pjotr Prins | 2026-04-05 17:24:54 +0200 |
| commit | 8f84075f667fd3ad523d395cc1c66c07bbef0c23 (patch) | |
| tree | de35852163b4f9f08a4941ae62e0938e947e4b58 /tests | |
| parent | 10782144441bb72362f73c2b3db5bc66eb4c5fb1 (diff) | |
| download | genecup-8f84075f667fd3ad523d395cc1c66c07bbef0c23.tar.gz | |
Cache PMID search hits
Diffstat (limited to 'tests')
| -rw-r--r-- | tests/test_network_hybrid.py | 45 |
1 file changed, 26 insertions, 19 deletions
```diff
diff --git a/tests/test_network_hybrid.py b/tests/test_network_hybrid.py
index 2e28a16..05ca174 100644
--- a/tests/test_network_hybrid.py
+++ b/tests/test_network_hybrid.py
@@ -6,42 +6,36 @@ Run with: EDIRECT_LOCAL_ARCHIVE=/export3/PubMed/Source python3 -m unittest tests
 """
 
 import os
-import subprocess
 import sys
+import time
 import unittest
 
 # Add project root to path so we can import more_functions
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
-from more_functions import hybrid_fetch_abstracts
+from more_functions import esearch_pmids, hybrid_fetch_abstracts
 
 ARCHIVE = os.environ.get("EDIRECT_LOCAL_ARCHIVE", "/export3/PubMed/Source")
+QUERY = '"(stress) AND (Penk [tiab])"'
 
 
 @unittest.skipUnless(os.path.isdir(ARCHIVE), f"EDIRECT_LOCAL_ARCHIVE not found: {ARCHIVE}")
 class TestNetworkHybrid(unittest.TestCase):
-    def test_hybrid_matches_esearch(self):
+    def test_1_hybrid_matches_esearch(self):
         """Hybrid xfetch+efetch should return same PMIDs as pure esearch."""
-        env = os.environ.copy()
-        env["EDIRECT_LOCAL_ARCHIVE"] = ARCHIVE
-        query = "(stress) AND (Penk [tiab])"
-
-        # Step 1: get PMIDs from NCBI
-        r1 = subprocess.run(
-            ["sh", "-c",
-             f'esearch -db pubmed -query "{query}" | efetch -format uid'],
-            capture_output=True, text=True, timeout=120, env=env)
-        self.assertEqual(r1.returncode, 0, r1.stderr)
-        ncbi_pmids = sorted(set(r1.stdout.strip().split("\n")))
-        ncbi_pmids = [p for p in ncbi_pmids if p.strip()]
-        print(f" NCBI esearch: {len(ncbi_pmids)} PMIDs")
-
-        # Step 2: hybrid fetch using the shared function
+        t0 = time.time()
+        ncbi_pmids = esearch_pmids(QUERY)
+        t_search = time.time() - t0
+        self.assertGreater(len(ncbi_pmids), 0)
+        print(f" NCBI esearch: {len(ncbi_pmids)} PMIDs ({t_search:.2f}s)")
+
+        t0 = time.time()
         abstracts = hybrid_fetch_abstracts(ncbi_pmids)
+        t_fetch = time.time() - t0
         hybrid_pmids = set()
         for line in abstracts.strip().split("\n"):
             if line.strip():
                 hybrid_pmids.add(line.split("\t")[0])
-        print(f" Hybrid total: {len(hybrid_pmids)} abstracts")
+        print(f" Hybrid total: {len(hybrid_pmids)} abstracts ({t_fetch:.2f}s)")
 
         # Some articles have no abstract (letters, editorials) so
         # hybrid may be slightly less than NCBI. Allow up to 5% gap.
@@ -50,5 +44,18 @@ class TestNetworkHybrid(unittest.TestCase):
         self.assertLessEqual(gap, max(1, len(ncbi_pmids) // 20),
                              f"Too many missing: hybrid {len(hybrid_pmids)} vs NCBI {len(ncbi_pmids)}")
 
+    def test_2_cached_esearch(self):
+        """Second esearch call should use cache and be fast."""
+        # First call to populate cache (may already be cached from test_1)
+        pmids1 = esearch_pmids(QUERY)
+
+        t0 = time.time()
+        pmids2 = esearch_pmids(QUERY)
+        t_cached = time.time() - t0
+
+        print(f" Cached esearch: {len(pmids2)} PMIDs ({t_cached:.4f}s)")
+        self.assertEqual(pmids1, pmids2, "Cached results differ from first call")
+        self.assertLess(t_cached, 0.01, f"Cache lookup too slow: {t_cached:.4f}s")
+
 if __name__ == "__main__":
     unittest.main()
```
