about summary refs log tree commit diff
diff options
context:
space:
mode:
authorPjotr Prins2026-04-05 16:14:01 +0200
committerPjotr Prins2026-04-05 16:14:01 +0200
commitcd5e20b411e90e6ba1d82a7159fb0ac874da9c71 (patch)
tree509ff99e668aea96bc5467a8e05dc53fdaa9c7a9
parentd0876ab1f89c107db906be6c82a413308cb54c19 (diff)
downloadgenecup-cd5e20b411e90e6ba1d82a7159fb0ac874da9c71.tar.gz
Add hybrid search
-rwxr-xr-xmore_functions.py26
-rw-r--r--tests/test_local_xfetch.py5
2 files changed, 23 insertions, 8 deletions
diff --git a/more_functions.py b/more_functions.py
index 96e20e5..d4271f4 100755
--- a/more_functions.py
+++ b/more_functions.py
@@ -25,8 +25,8 @@ def getabstracts(gene,query):
     """
       1. esearch -db pubmed -query ... -- searches PubMed for the gene + keyword query, returns matching record IDs
       2. efetch -format uid -- fetches just the PMIDs (unique identifiers) from the search results
-      3. xfetch -db pubmed -- looks up those PMIDs in the local PubMed mirror (avoids hitting NCBI servers
-         for the full abstracts)
+      3. xfetch -db pubmed -- looks up those PMIDs in the local PubMed mirror first;
+         falls back to efetch (NCBI API) for any PMIDs missing abstracts locally
       4. xtract -pattern PubmedArticle -element MedlineCitation/PMID,ArticleTitle,AbstractText -- extracts PMID, title, and
          abstract text from the XML into tab-separated fields
       5. sed "s/-/ /g" -- replaces hyphens with spaces (so hyphenated gene names match keyword searches later)
@@ -44,11 +44,25 @@ def getabstracts(gene,query):
         return ""
     pmid_list = pmids.split("\n")
     print(f"  PMIDs ({len(pmid_list)}): {' '.join(pmid_list)}")
-    # Step 2: fetch abstracts from local mirror
-    abs_cmd = "echo '" + pmids.replace("'", "") + "' | xfetch -db pubmed" \
-        + " | xtract -pattern PubmedArticle -element MedlineCitation/PMID,ArticleTitle,AbstractText | sed \"s/-/ /g\""
-    print(f"  popen: {abs_cmd}")
+    # Step 2: fetch abstracts -- try local xfetch first, fall back to NCBI efetch
+    safe_pmids = pmids.replace("'", "")
+    extract = "xtract -pattern PubmedArticle -element MedlineCitation/PMID,ArticleTitle,AbstractText"
+    # Try local xfetch
+    abs_cmd = "echo '" + safe_pmids + "' | xfetch -db pubmed | " + extract + " | sed \"s/-/ /g\""
+    print(f"  popen(local): {abs_cmd}")
     abstracts = os.popen(abs_cmd).read()
+    # Check which PMIDs came back with abstracts
+    found_pmids = set()
+    for line in abstracts.strip().split("\n"):
+        if line.strip():
+            found_pmids.add(line.split("\t")[0])
+    missing = [p for p in pmid_list if p not in found_pmids]
+    if missing:
+        print(f"  {len(missing)} PMIDs missing from local, falling back to NCBI efetch")
+        fallback_cmd = "echo '" + "\n".join(missing) + "' | efetch -db pubmed -format xml | " + extract + " | sed \"s/-/ /g\""
+        print(f"  popen(ncbi): {fallback_cmd}")
+        extra = os.popen(fallback_cmd).read()
+        abstracts += extra
     return(abstracts)
 
 def getSentences(gene, sentences_ls):
diff --git a/tests/test_local_xfetch.py b/tests/test_local_xfetch.py
index 7b7bc8f..7ab7e93 100644
--- a/tests/test_local_xfetch.py
+++ b/tests/test_local_xfetch.py
@@ -29,10 +29,11 @@ class TestLocalXfetch(unittest.TestCase):
         self.assertGreater(len(output), 0, "Expected non-empty XML output")
         self.assertIn("PubmedArticle", output,
                       "Expected PubmedArticle XML elements")
-        # Count articles
+        # Count articles -- local index may be incomplete compared to NCBI
+        # (depends on how far the indexing pipeline ran)
         count = output.count("<PubmedArticle>")
         print(f"  Found {count} PubmedArticle records for Penk+stress (local)")
-        self.assertGreater(count, 10, "Expected at least 10 PubmedArticles")
+        self.assertGreater(count, 5, "Expected at least 5 PubmedArticles")
 
 if __name__ == "__main__":
     unittest.main()