about summary refs log tree commit diff
path: root/more_functions.py
diff options
context:
space:
mode:
Diffstat (limited to 'more_functions.py')
-rwxr-xr-xmore_functions.py26
1 files changed, 20 insertions, 6 deletions
diff --git a/more_functions.py b/more_functions.py
index 96e20e5..d4271f4 100755
--- a/more_functions.py
+++ b/more_functions.py
@@ -25,8 +25,8 @@ def getabstracts(gene,query):
     """
       1. esearch -db pubmed -query ... -- searches PubMed for the gene + keyword query, returns matching record IDs
       2. efetch -format uid -- fetches just the PMIDs (unique identifiers) from the search results
-      3. xfetch -db pubmed -- looks up those PMIDs in the local PubMed mirror (avoids hitting NCBI servers
-         for the full abstracts)
+      3. xfetch -db pubmed -- looks up those PMIDs in the local PubMed mirror first;
+         falls back to efetch (NCBI API) for any PMIDs missing abstracts locally
       4. xtract -pattern PubmedArticle -element MedlineCitation/PMID,ArticleTitle,AbstractText -- extracts PMID, title, and
          abstract text from the XML into tab-separated fields
       5. sed "s/-/ /g" -- replaces hyphens with spaces (so hyphenated gene names match keyword searches later)
@@ -44,11 +44,25 @@ def getabstracts(gene,query):
         return ""
     pmid_list = pmids.split("\n")
     print(f"  PMIDs ({len(pmid_list)}): {' '.join(pmid_list)}")
-    # Step 2: fetch abstracts from local mirror
-    abs_cmd = "echo '" + pmids.replace("'", "") + "' | xfetch -db pubmed" \
-        + " | xtract -pattern PubmedArticle -element MedlineCitation/PMID,ArticleTitle,AbstractText | sed \"s/-/ /g\""
-    print(f"  popen: {abs_cmd}")
+    # Step 2: fetch abstracts -- try local xfetch first, fall back to NCBI efetch
+    safe_pmids = pmids.replace("'", "")
+    extract = "xtract -pattern PubmedArticle -element MedlineCitation/PMID,ArticleTitle,AbstractText"
+    # Try local xfetch
+    abs_cmd = "echo '" + safe_pmids + "' | xfetch -db pubmed | " + extract + " | sed \"s/-/ /g\""
+    print(f"  popen(local): {abs_cmd}")
     abstracts = os.popen(abs_cmd).read()
+    # Check which PMIDs came back with abstracts
+    found_pmids = set()
+    for line in abstracts.strip().split("\n"):
+        if line.strip():
+            found_pmids.add(line.split("\t")[0])
+    missing = [p for p in pmid_list if p not in found_pmids]
+    if missing:
+        print(f"  {len(missing)} PMIDs missing from local, falling back to NCBI efetch")
+        fallback_cmd = "echo '" + "\n".join(missing) + "' | efetch -db pubmed -format xml | " + extract + " | sed \"s/-/ /g\""
+        print(f"  popen(ncbi): {fallback_cmd}")
+        extra = os.popen(fallback_cmd).read()
+        abstracts += extra
     return(abstracts)
 
 def getSentences(gene, sentences_ls):