1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
|
"""Test hybrid search: local xfetch with NCBI efetch fallback.
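Requires a local PubMed archive directory (EDIRECT_LOCAL_ARCHIVE, default /export3/PubMed/Source) and internet access; the tests are skipped if the directory does not exist.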
Run with: EDIRECT_LOCAL_ARCHIVE=/export3/PubMed/Source python3 -m unittest tests.test_network_hybrid -v
"""
import os
import sys
import time
import unittest
# Add project root to path so we can import more_functions
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from more_functions import esearch_pmids, hybrid_fetch_abstracts
# Search term passed to esearch_pmids() by every test in this module.
# NOTE(review): the double quotes are part of the string value itself —
# presumably the downstream command expects a pre-quoted term; confirm
# against esearch_pmids().
QUERY = '"(stress) AND (Penk [tiab])"'

# Local archive directory: taken from the EDIRECT_LOCAL_ARCHIVE environment
# variable, falling back to /export3/PubMed/Source when it is unset.
ARCHIVE = os.getenv("EDIRECT_LOCAL_ARCHIVE", "/export3/PubMed/Source")
@unittest.skipUnless(os.path.isdir(ARCHIVE),
                     f"EDIRECT_LOCAL_ARCHIVE not found: {ARCHIVE}")
class TestNetworkHybrid(unittest.TestCase):
    """Checks of ``esearch_pmids`` and ``hybrid_fetch_abstracts`` for QUERY.

    The whole class is skipped when ``ARCHIVE`` is not an existing directory.
    Elapsed times are measured with ``time.perf_counter()``, which is
    monotonic and uses the highest-resolution clock available; wall-clock
    ``time.time()`` can jump or be too coarse for the 10 ms bound in
    ``test_2_cached_esearch``.
    """

    def test_1_hybrid_matches_esearch(self):
        """Hybrid fetch should cover (almost) every PMID found by esearch.

        Only the *number* of distinct PMIDs is compared, not their identity.
        NOTE(review): comparing the actual PMID sets would be stricter, but
        the element type returned by esearch_pmids() is not visible here.
        """
        t0 = time.perf_counter()
        ncbi_pmids = esearch_pmids(QUERY)
        t_search = time.perf_counter() - t0
        self.assertGreater(len(ncbi_pmids), 0)
        print(f" NCBI esearch: {len(ncbi_pmids)} PMIDs ({t_search:.2f}s)")

        t0 = time.perf_counter()
        abstracts = hybrid_fetch_abstracts(ncbi_pmids)
        t_fetch = time.perf_counter() - t0

        # Every non-blank line is treated as one record whose first
        # tab-separated field is the PMID; the set removes duplicates.
        hybrid_pmids = {
            line.split("\t")[0]
            for line in abstracts.strip().split("\n")
            if line.strip()
        }
        print(f" Hybrid total: {len(hybrid_pmids)} abstracts ({t_fetch:.2f}s)")

        # Some articles have no abstract (letters, editorials) so
        # hybrid may be slightly less than NCBI. Allow up to 5% gap
        # (and always tolerate at least one missing record).
        gap = len(ncbi_pmids) - len(hybrid_pmids)
        print(f" Gap: {gap} PMIDs without abstracts")
        self.assertLessEqual(
            gap, max(1, len(ncbi_pmids) // 20),
            f"Too many missing: hybrid {len(hybrid_pmids)} vs NCBI {len(ncbi_pmids)}")

    def test_2_cached_esearch(self):
        """A repeated esearch call should return equal results in under 10 ms.

        NOTE(review): this presumes esearch_pmids() caches per query; the
        cache itself lives in more_functions and is not visible here.
        """
        # First call to populate cache (may already be cached from test_1)
        pmids1 = esearch_pmids(QUERY)

        # Time only the second call.
        t0 = time.perf_counter()
        pmids2 = esearch_pmids(QUERY)
        t_cached = time.perf_counter() - t0
        print(f" Cached esearch: {len(pmids2)} PMIDs ({t_cached:.4f}s)")

        self.assertEqual(pmids1, pmids2, "Cached results differ from first call")
        self.assertLess(t_cached, 0.01, f"Cache lookup too slow: {t_cached:.4f}s")
# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|