diff options
| author | Pjotr Prins | 2026-04-05 17:21:00 +0200 |
|---|---|---|
| committer | Pjotr Prins | 2026-04-05 17:21:00 +0200 |
| commit | 10782144441bb72362f73c2b3db5bc66eb4c5fb1 (patch) | |
| tree | de885fc11ba6aa600313a8feae40bc727f814d6f | |
| parent | 39e8a0eae7902eca703a61a44a0638198537a7ac (diff) | |
| download | genecup-10782144441bb72362f73c2b3db5bc66eb4c5fb1.tar.gz | |
Make it more DRY
| -rwxr-xr-x | more_functions.py | 48 | ||||
| -rw-r--r-- | tests/test_network_hybrid.py | 35 |
2 files changed, 40 insertions, 43 deletions
diff --git a/more_functions.py b/more_functions.py index d4271f4..d2c2040 100755 --- a/more_functions.py +++ b/more_functions.py @@ -21,6 +21,33 @@ def findWholeWord(w): return re.compile(r'\b({0})\b'.format(w), flags=re.IGNORECASE).search +def hybrid_fetch_abstracts(pmid_list): + """Fetch abstracts for a list of PMIDs: try local xfetch first, + fall back to NCBI efetch for any missing. + + Returns tab-separated lines: PMID, ArticleTitle, AbstractText + with hyphens replaced by spaces. + """ + extract = "xtract -pattern PubmedArticle -element MedlineCitation/PMID,ArticleTitle,AbstractText" + safe_pmids = "\n".join(p.replace("'", "") for p in pmid_list) + # Try local xfetch + abs_cmd = "echo '" + safe_pmids + "' | xfetch -db pubmed | " + extract + " | sed \"s/-/ /g\"" + print(f" popen(local): {abs_cmd}") + abstracts = os.popen(abs_cmd).read() + # Check which PMIDs came back with abstracts + found_pmids = set() + for line in abstracts.strip().split("\n"): + if line.strip(): + found_pmids.add(line.split("\t")[0]) + missing = [p for p in pmid_list if p not in found_pmids] + if missing: + print(f" {len(missing)} PMIDs missing from local, falling back to NCBI efetch") + fallback_cmd = "echo '" + "\n".join(missing) + "' | efetch -db pubmed -format xml | " + extract + " | sed \"s/-/ /g\"" + print(f" popen(ncbi): {fallback_cmd}") + extra = os.popen(fallback_cmd).read() + abstracts += extra + return abstracts + def getabstracts(gene,query): """ 1. esearch -db pubmed -query ... -- searches PubMed for the gene + keyword query, returns matching record IDs @@ -44,25 +71,8 @@ def getabstracts(gene,query): return "" pmid_list = pmids.split("\n") print(f" PMIDs ({len(pmid_list)}): {' '.join(pmid_list)}") - # Step 2: fetch abstracts -- try local xfetch first, fall back to NCBI efetch - safe_pmids = pmids.replace("'", "") - extract = "xtract -pattern PubmedArticle -element MedlineCitation/PMID,ArticleTitle,AbstractText" - # Try local xfetch - abs_cmd = "echo '" + safe_pmids + "' | xfetch -db pubmed | " + extract + " | sed \"s/-/ /g\"" - print(f" popen(local): {abs_cmd}") - abstracts = os.popen(abs_cmd).read() - # Check which PMIDs came back with abstracts - found_pmids = set() - for line in abstracts.strip().split("\n"): - if line.strip(): - found_pmids.add(line.split("\t")[0]) - missing = [p for p in pmid_list if p not in found_pmids] - if missing: - print(f" {len(missing)} PMIDs missing from local, falling back to NCBI efetch") - fallback_cmd = "echo '" + "\n".join(missing) + "' | efetch -db pubmed -format xml | " + extract + " | sed \"s/-/ /g\"" - print(f" popen(ncbi): {fallback_cmd}") - extra = os.popen(fallback_cmd).read() - abstracts += extra + # Step 2: fetch abstracts via hybrid local+NCBI + abstracts = hybrid_fetch_abstracts(pmid_list) return(abstracts) def getSentences(gene, sentences_ls): diff --git a/tests/test_network_hybrid.py b/tests/test_network_hybrid.py index d2912d4..2e28a16 100644 --- a/tests/test_network_hybrid.py +++ b/tests/test_network_hybrid.py @@ -7,8 +7,13 @@ Run with: EDIRECT_LOCAL_ARCHIVE=/export3/PubMed/Source python3 -m unittest tests import os import subprocess +import sys import unittest +# Add project root to path so we can import more_functions +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) +from more_functions import hybrid_fetch_abstracts + ARCHIVE = os.environ.get("EDIRECT_LOCAL_ARCHIVE", "/export3/PubMed/Source") @unittest.skipUnless(os.path.isdir(ARCHIVE), @@ -19,7 +24,6 @@ class TestNetworkHybrid(unittest.TestCase): env = os.environ.copy() env["EDIRECT_LOCAL_ARCHIVE"] = ARCHIVE query = "(stress) AND (Penk [tiab])" - extract = "xtract -pattern PubmedArticle -element MedlineCitation/PMID" # Step 1: get PMIDs from NCBI r1 = subprocess.run( @@ -31,29 +35,12 @@ class TestNetworkHybrid(unittest.TestCase): ncbi_pmids = [p for p in ncbi_pmids if p.strip()] print(f" NCBI esearch: {len(ncbi_pmids)} PMIDs") - # Step 2: local xfetch - pmid_str = "\\n".join(ncbi_pmids) - r2 = subprocess.run( - ["sh", "-c", - f'printf "{pmid_str}" | xfetch -db pubmed | {extract}'], - capture_output=True, text=True, timeout=120, env=env) - local_pmids = set(r2.stdout.strip().split("\n")) - {""} - print(f" Local xfetch: {len(local_pmids)} abstracts") - - # Step 3: fallback efetch for missing - missing = [p for p in ncbi_pmids if p not in local_pmids] - print(f" Missing from local: {len(missing)}") - fallback_pmids = set() - if missing: - missing_str = "\\n".join(missing) - r3 = subprocess.run( - ["sh", "-c", - f'printf "{missing_str}" | efetch -db pubmed -format xml | {extract}'], - capture_output=True, text=True, timeout=120, env=env) - fallback_pmids = set(r3.stdout.strip().split("\n")) - {""} - print(f" NCBI fallback: {len(fallback_pmids)} abstracts") - - hybrid_pmids = sorted(local_pmids | fallback_pmids) + # Step 2: hybrid fetch using the shared function + abstracts = hybrid_fetch_abstracts(ncbi_pmids) + hybrid_pmids = set() + for line in abstracts.strip().split("\n"): + if line.strip(): + hybrid_pmids.add(line.split("\t")[0]) print(f" Hybrid total: {len(hybrid_pmids)} abstracts") # Some articles have no abstract (letters, editorials) so |
