about summary refs log tree commit diff
diff options
context:
space:
mode:
author Pjotr Prins 2026-04-05 17:21:00 +0200
committer Pjotr Prins 2026-04-05 17:21:00 +0200
commit 10782144441bb72362f73c2b3db5bc66eb4c5fb1 (patch)
tree de885fc11ba6aa600313a8feae40bc727f814d6f
parent 39e8a0eae7902eca703a61a44a0638198537a7ac (diff)
download genecup-10782144441bb72362f73c2b3db5bc66eb4c5fb1.tar.gz
Make it more DRY
-rwxr-xr-x more_functions.py 48
-rw-r--r-- tests/test_network_hybrid.py 35
2 files changed, 40 insertions, 43 deletions
diff --git a/more_functions.py b/more_functions.py
index d4271f4..d2c2040 100755
--- a/more_functions.py
+++ b/more_functions.py
@@ -21,6 +21,33 @@ def findWholeWord(w):
     return re.compile(r'\b({0})\b'.format(w), flags=re.IGNORECASE).search
 
 
+def hybrid_fetch_abstracts(pmid_list):
+    """Fetch abstracts for a list of PMIDs: try local xfetch first,
+    fall back to NCBI efetch for any missing.
+
+    Returns tab-separated lines: PMID, ArticleTitle, AbstractText
+    with hyphens replaced by spaces.
+    """
+    extract = "xtract -pattern PubmedArticle -element MedlineCitation/PMID,ArticleTitle,AbstractText"
+    safe_pmids = "\n".join(p.replace("'", "") for p in pmid_list)
+    # Try local xfetch
+    abs_cmd = "echo '" + safe_pmids + "' | xfetch -db pubmed | " + extract + " | sed \"s/-/ /g\""
+    print(f"  popen(local): {abs_cmd}")
+    abstracts = os.popen(abs_cmd).read()
+    # Check which PMIDs came back with abstracts
+    found_pmids = set()
+    for line in abstracts.strip().split("\n"):
+        if line.strip():
+            found_pmids.add(line.split("\t")[0])
+    missing = [p for p in pmid_list if p not in found_pmids]
+    if missing:
+        print(f"  {len(missing)} PMIDs missing from local, falling back to NCBI efetch")
+        fallback_cmd = "echo '" + "\n".join(missing) + "' | efetch -db pubmed -format xml | " + extract + " | sed \"s/-/ /g\""
+        print(f"  popen(ncbi): {fallback_cmd}")
+        extra = os.popen(fallback_cmd).read()
+        abstracts += extra
+    return abstracts
+
 def getabstracts(gene,query):
     """
       1. esearch -db pubmed -query ... -- searches PubMed for the gene + keyword query, returns matching record IDs
@@ -44,25 +71,8 @@ def getabstracts(gene,query):
         return ""
     pmid_list = pmids.split("\n")
     print(f"  PMIDs ({len(pmid_list)}): {' '.join(pmid_list)}")
-    # Step 2: fetch abstracts -- try local xfetch first, fall back to NCBI efetch
-    safe_pmids = pmids.replace("'", "")
-    extract = "xtract -pattern PubmedArticle -element MedlineCitation/PMID,ArticleTitle,AbstractText"
-    # Try local xfetch
-    abs_cmd = "echo '" + safe_pmids + "' | xfetch -db pubmed | " + extract + " | sed \"s/-/ /g\""
-    print(f"  popen(local): {abs_cmd}")
-    abstracts = os.popen(abs_cmd).read()
-    # Check which PMIDs came back with abstracts
-    found_pmids = set()
-    for line in abstracts.strip().split("\n"):
-        if line.strip():
-            found_pmids.add(line.split("\t")[0])
-    missing = [p for p in pmid_list if p not in found_pmids]
-    if missing:
-        print(f"  {len(missing)} PMIDs missing from local, falling back to NCBI efetch")
-        fallback_cmd = "echo '" + "\n".join(missing) + "' | efetch -db pubmed -format xml | " + extract + " | sed \"s/-/ /g\""
-        print(f"  popen(ncbi): {fallback_cmd}")
-        extra = os.popen(fallback_cmd).read()
-        abstracts += extra
+    # Step 2: fetch abstracts via hybrid local+NCBI
+    abstracts = hybrid_fetch_abstracts(pmid_list)
     return(abstracts)
 
 def getSentences(gene, sentences_ls):
diff --git a/tests/test_network_hybrid.py b/tests/test_network_hybrid.py
index d2912d4..2e28a16 100644
--- a/tests/test_network_hybrid.py
+++ b/tests/test_network_hybrid.py
@@ -7,8 +7,13 @@ Run with: EDIRECT_LOCAL_ARCHIVE=/export3/PubMed/Source python3 -m unittest tests
 
 import os
 import subprocess
+import sys
 import unittest
 
+# Add project root to path so we can import more_functions
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+from more_functions import hybrid_fetch_abstracts
+
 ARCHIVE = os.environ.get("EDIRECT_LOCAL_ARCHIVE", "/export3/PubMed/Source")
 
 @unittest.skipUnless(os.path.isdir(ARCHIVE),
@@ -19,7 +24,6 @@ class TestNetworkHybrid(unittest.TestCase):
         env = os.environ.copy()
         env["EDIRECT_LOCAL_ARCHIVE"] = ARCHIVE
         query = "(stress) AND (Penk [tiab])"
-        extract = "xtract -pattern PubmedArticle -element MedlineCitation/PMID"
 
         # Step 1: get PMIDs from NCBI
         r1 = subprocess.run(
@@ -31,29 +35,12 @@ class TestNetworkHybrid(unittest.TestCase):
         ncbi_pmids = [p for p in ncbi_pmids if p.strip()]
         print(f"  NCBI esearch: {len(ncbi_pmids)} PMIDs")
 
-        # Step 2: local xfetch
-        pmid_str = "\\n".join(ncbi_pmids)
-        r2 = subprocess.run(
-            ["sh", "-c",
-             f'printf "{pmid_str}" | xfetch -db pubmed | {extract}'],
-            capture_output=True, text=True, timeout=120, env=env)
-        local_pmids = set(r2.stdout.strip().split("\n")) - {""}
-        print(f"  Local xfetch: {len(local_pmids)} abstracts")
-
-        # Step 3: fallback efetch for missing
-        missing = [p for p in ncbi_pmids if p not in local_pmids]
-        print(f"  Missing from local: {len(missing)}")
-        fallback_pmids = set()
-        if missing:
-            missing_str = "\\n".join(missing)
-            r3 = subprocess.run(
-                ["sh", "-c",
-                 f'printf "{missing_str}" | efetch -db pubmed -format xml | {extract}'],
-                capture_output=True, text=True, timeout=120, env=env)
-            fallback_pmids = set(r3.stdout.strip().split("\n")) - {""}
-            print(f"  NCBI fallback: {len(fallback_pmids)} abstracts")
-
-        hybrid_pmids = sorted(local_pmids | fallback_pmids)
+        # Step 2: hybrid fetch using the shared function
+        abstracts = hybrid_fetch_abstracts(ncbi_pmids)
+        hybrid_pmids = set()
+        for line in abstracts.strip().split("\n"):
+            if line.strip():
+                hybrid_pmids.add(line.split("\t")[0])
         print(f"  Hybrid total: {len(hybrid_pmids)} abstracts")
 
         # Some articles have no abstract (letters, editorials) so