Use xfetch -db pubmed, the short replacement for fetch-pubmed -path /path/to/archive, instead of the hand-rolled archive lookup!

author: Pjotr Prins 2026-04-05 11:01:21 +0200
committer: Pjotr Prins 2026-04-05 11:01:21 +0200
commit: 45be8b7f1e4bc046fd2e5b1a6bdf4c6db4057788 (patch)
tree: 96c20267b28a546defc98509bdae5a1caee0e90c
parent: 867e2a6c1d99f503ef388c2417bfffaa83de7754 (diff)
download: genecup-45be8b7f1e4bc046fd2e5b1a6bdf4c6db4057788.tar.gz
2 files changed, 5 insertions, 14 deletions
diff --git a/guix.scm b/guix.scm
index a67e737..905f1ac 100644
--- a/guix.scm
+++ b/guix.scm
@@ -116,8 +116,8 @@ detection tokenizer (tab format), used by NLTK's sent_tokenize function.")
                                         "/share/minipubmed")))
                 ;; Generate test.xml from pmid.list
                 (with-directory-excursion "minipubmed"
-                  ;; Generate test.xml by looking up PMIDs in the local archive
-                  (system "for uid in $(cat pmid.list); do p=$(printf '%.2s/%.2s/%.2s' \"$uid\" \"${uid#??}\" \"${uid#????}\"); f=PubMed/Archive/${p}/${uid}.xml.gz; [ -f \"$f\" ] && zcat \"$f\"; done > test.xml"))
+                  ;; Generate test.xml from pmid.list using xfetch
+                  (system "cat pmid.list | xfetch -db pubmed > test.xml"))
                 (mkdir-p out)
                 (copy-recursively "minipubmed" out)))))))
     (inputs (list edirect-25))
diff --git a/more_functions.py b/more_functions.py
index e304ddc..ef43e30 100755
--- a/more_functions.py
+++ b/more_functions.py
@@ -25,8 +25,8 @@ def getabstracts(gene,query):
     """
       1. esearch -db pubmed -query ... -- searches PubMed for the gene + keyword query, returns matching record IDs
       2. efetch -format uid -- fetches just the PMIDs (unique identifiers) from the search results
-      3. look up PMIDs in the local PubMed mirror archive (avoids hitting NCBI servers
-         for the full abstracts) -- archive stores gzipped XML by PMID path
+      3. xfetch -db pubmed -- looks up those PMIDs in the local PubMed mirror (avoids hitting NCBI servers
+         for the full abstracts)
       4. xtract -pattern PubmedArticle -element MedlineCitation/PMID,ArticleTitle,AbstractText -- extracts PMID, title, and
          abstract text from the XML into tab-separated fields
       5. sed "s/-/ /g" -- replaces hyphens with spaces (so hyphenated gene names match keyword searches later)
@@ -45,16 +45,7 @@ def getabstracts(gene,query):
     pmid_list = pmids.split("\n")
     print(f"  PMIDs ({len(pmid_list)}): {' '.join(pmid_list)}")
     # Step 2: fetch abstracts from local mirror
-    # fetch-pubmed is not shipped with edirect 25.x, so we look up
-    # the archive files directly: PMID 30990365 -> 30/99/03/30990365.xml.gz
-    fetch_script = (
-        "for uid in " + pmids.replace("\n", " ").replace("'", "") + "; do "
-        "p=$(printf '%.2s/%.2s/%.2s' \"$uid\" \"${uid#??}\" \"${uid#????}\"); "
-        "f=" + pubmed_path + "/Archive/${p}/${uid}.xml.gz; "
-        "[ -f \"$f\" ] && zcat \"$f\"; "
-        "done"
-    )
-    abs_cmd = fetch_script \
+    abs_cmd = "echo '" + pmids.replace("'", "") + "' | xfetch -db pubmed" \
         + " | xtract -pattern PubmedArticle -element MedlineCitation/PMID,ArticleTitle,AbstractText | sed \"s/-/ /g\""
     print(f"  popen: {abs_cmd}")
     abstracts = os.popen(abs_cmd).read()
author	Pjotr Prins	2026-04-05 11:01:21 +0200
committer	Pjotr Prins	2026-04-05 11:01:21 +0200
commit	45be8b7f1e4bc046fd2e5b1a6bdf4c6db4057788 (patch)
tree	96c20267b28a546defc98509bdae5a1caee0e90c
parent	867e2a6c1d99f503ef388c2417bfffaa83de7754 (diff)
download	genecup-45be8b7f1e4bc046fd2e5b1a6bdf4c6db4057788.tar.gz