diff options
| -rwxr-xr-x | more_functions.py | 29 | ||||
| -rw-r--r-- | tests/test_network_hybrid.py | 45 |
2 files changed, 49 insertions, 25 deletions
diff --git a/more_functions.py b/more_functions.py
index d2c2040..a6cb86f 100755
--- a/more_functions.py
+++ b/more_functions.py
@@ -1,5 +1,6 @@
 #!/bin/env python3
 from nltk.tokenize import sent_tokenize
+import hashlib
 import os
 import re
 
@@ -9,6 +10,25 @@ import ast
 
 global pubmed_path
 
+# In-memory cache for esearch results: hash(query) -> list of PMIDs
+_esearch_cache = {}
+
+def esearch_pmids(query):
+    """Search PubMed for PMIDs matching query. Results are cached in memory.
+
+    Returns a list of PMID strings, or [] if none found.
+    """
+    key = hashlib.sha256(query.encode()).hexdigest()
+    if key in _esearch_cache:
+        print(f" esearch cache hit for: {query}")
+        return _esearch_cache[key]
+    pmid_cmd = "esearch -db pubmed -query " + query + " | efetch -format uid"
+    print(f" popen: {pmid_cmd}")
+    pmids = os.popen(pmid_cmd).read().strip()
+    pmid_list = [p for p in pmids.split("\n") if p.strip()] if pmids else []
+    _esearch_cache[key] = pmid_list
+    return pmid_list
+
 def undic(dic):
     all_s=''
     for s in dic:
@@ -62,14 +82,11 @@ def getabstracts(gene,query):
     """
     query="\"(" + query + ") AND (" + gene + " [tiab])\""
 
-    # Step 1: fetch PMIDs from PubMed
-    pmid_cmd = "esearch -db pubmed -query " + query + " | efetch -format uid"
-    print(f" popen: {pmid_cmd}")
-    pmids = os.popen(pmid_cmd).read().strip()
-    if not pmids:
+    # Step 1: fetch PMIDs from PubMed (cached)
+    pmid_list = esearch_pmids(query)
+    if not pmid_list:
         print(f" no PMIDs found for {gene}")
         return ""
-    pmid_list = pmids.split("\n")
     print(f" PMIDs ({len(pmid_list)}): {' '.join(pmid_list)}")
     # Step 2: fetch abstracts via hybrid local+NCBI
     abstracts = hybrid_fetch_abstracts(pmid_list)
diff --git a/tests/test_network_hybrid.py b/tests/test_network_hybrid.py
index 2e28a16..05ca174 100644
--- a/tests/test_network_hybrid.py
+++ b/tests/test_network_hybrid.py
@@ -6,42 +6,36 @@ Run with: EDIRECT_LOCAL_ARCHIVE=/export3/PubMed/Source python3 -m unittest tests
 """
 
 import os
-import subprocess
 import sys
+import time
 import unittest
 
 # Add project root to path so we can import more_functions
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
 
-from more_functions import hybrid_fetch_abstracts
+from more_functions import esearch_pmids, hybrid_fetch_abstracts
 
 ARCHIVE = os.environ.get("EDIRECT_LOCAL_ARCHIVE", "/export3/PubMed/Source")
+QUERY = '"(stress) AND (Penk [tiab])"'
 
 @unittest.skipUnless(os.path.isdir(ARCHIVE), f"EDIRECT_LOCAL_ARCHIVE not found: {ARCHIVE}")
 class TestNetworkHybrid(unittest.TestCase):
-    def test_hybrid_matches_esearch(self):
+    def test_1_hybrid_matches_esearch(self):
         """Hybrid xfetch+efetch should return same PMIDs as pure esearch."""
-        env = os.environ.copy()
-        env["EDIRECT_LOCAL_ARCHIVE"] = ARCHIVE
-        query = "(stress) AND (Penk [tiab])"
-
-        # Step 1: get PMIDs from NCBI
-        r1 = subprocess.run(
-            ["sh", "-c",
-             f'esearch -db pubmed -query "{query}" | efetch -format uid'],
-            capture_output=True, text=True, timeout=120, env=env)
-        self.assertEqual(r1.returncode, 0, r1.stderr)
-        ncbi_pmids = sorted(set(r1.stdout.strip().split("\n")))
-        ncbi_pmids = [p for p in ncbi_pmids if p.strip()]
-        print(f" NCBI esearch: {len(ncbi_pmids)} PMIDs")
-
-        # Step 2: hybrid fetch using the shared function
+        t0 = time.time()
+        ncbi_pmids = esearch_pmids(QUERY)
+        t_search = time.time() - t0
+        self.assertGreater(len(ncbi_pmids), 0)
+        print(f" NCBI esearch: {len(ncbi_pmids)} PMIDs ({t_search:.2f}s)")
+
+        t0 = time.time()
         abstracts = hybrid_fetch_abstracts(ncbi_pmids)
+        t_fetch = time.time() - t0
         hybrid_pmids = set()
         for line in abstracts.strip().split("\n"):
             if line.strip():
                 hybrid_pmids.add(line.split("\t")[0])
-        print(f" Hybrid total: {len(hybrid_pmids)} abstracts")
+        print(f" Hybrid total: {len(hybrid_pmids)} abstracts ({t_fetch:.2f}s)")
 
         # Some articles have no abstract (letters, editorials) so
         # hybrid may be slightly less than NCBI. Allow up to 5% gap.
@@ -50,5 +44,18 @@ class TestNetworkHybrid(unittest.TestCase):
         self.assertLessEqual(gap, max(1, len(ncbi_pmids) // 20),
             f"Too many missing: hybrid {len(hybrid_pmids)} vs NCBI {len(ncbi_pmids)}")
 
+    def test_2_cached_esearch(self):
+        """Second esearch call should use cache and be fast."""
+        # First call to populate cache (may already be cached from test_1)
+        pmids1 = esearch_pmids(QUERY)
+
+        t0 = time.time()
+        pmids2 = esearch_pmids(QUERY)
+        t_cached = time.time() - t0
+
+        print(f" Cached esearch: {len(pmids2)} PMIDs ({t_cached:.4f}s)")
+        self.assertEqual(pmids1, pmids2, "Cached results differ from first call")
+        self.assertLess(t_cached, 0.01, f"Cache lookup too slow: {t_cached:.4f}s")
+
 if __name__ == "__main__":
     unittest.main()
