diff options
| author | Pjotr Prins | 2026-04-05 10:38:53 +0200 |
|---|---|---|
| committer | Pjotr Prins | 2026-04-05 10:38:53 +0200 |
| commit | 210a4895f41094b594b8e34a48d3173b07e12184 (patch) | |
| tree | b09d9ac509085a699d804b86078fa09010014b97 | |
| parent | a8f87dd3acfdcca2c1f29dc23345eed0d50ee1ba (diff) | |
| download | genecup-210a4895f41094b594b8e34a48d3173b07e12184.tar.gz | |
Unpacking the archived XML files directly works as an alternative to fetch-pubmed, but I think the implementation is not complete yet
| -rw-r--r-- | guix.scm | 3 | ||||
| -rwxr-xr-x | more_functions.py | 15 |
2 files changed, 14 insertions, 4 deletions
diff --git a/guix.scm b/guix.scm index 8772e29..7c58143 100644 --- a/guix.scm +++ b/guix.scm @@ -116,7 +116,8 @@ detection tokenizer (tab format), used by NLTK's sent_tokenize function.") "/share/minipubmed"))) ;; Generate test.xml from pmid.list (with-directory-excursion "minipubmed" - (system "cat pmid.list | fetch-pubmed -path PubMed/Archive/ > test.xml")) + ;; Generate test.xml by looking up PMIDs in the local archive + (system "for uid in $(cat pmid.list); do p=$(printf '%.2s/%.2s/%.2s' \"$uid\" \"${uid#??}\" \"${uid#????}\"); f=PubMed/Archive/${p}/${uid}.xml.gz; [ -f \"$f\" ] && zcat \"$f\"; done > test.xml")) (mkdir-p out) (copy-recursively "minipubmed" out))))))) (inputs (list edirect-25)) diff --git a/more_functions.py b/more_functions.py index 5dc52d4..e304ddc 100755 --- a/more_functions.py +++ b/more_functions.py @@ -25,8 +25,8 @@ def getabstracts(gene,query): """ 1. esearch -db pubmed -query ... -- searches PubMed for the gene + keyword query, returns matching record IDs 2. efetch -format uid -- fetches just the PMIDs (unique identifiers) from the search results - 3. fetch-pubmed -path <pubmed_path> -- looks up those PMIDs in the local PubMed mirror (avoids hitting NCBI servers - for the full abstracts) + 3. look up PMIDs in the local PubMed mirror archive (avoids hitting NCBI servers + for the full abstracts) -- archive stores gzipped XML by PMID path 4. xtract -pattern PubmedArticle -element MedlineCitation/PMID,ArticleTitle,AbstractText -- extracts PMID, title, and abstract text from the XML into tab-separated fields 5. 
sed "s/-/ /g" -- replaces hyphens with spaces (so hyphenated gene names match keyword searches later) @@ -45,7 +45,16 @@ def getabstracts(gene,query): pmid_list = pmids.split("\n") print(f" PMIDs ({len(pmid_list)}): {' '.join(pmid_list)}") # Step 2: fetch abstracts from local mirror - abs_cmd = "echo '" + pmids.replace("'", "") + "' | fetch-pubmed -path " + pubmed_path \ + # fetch-pubmed is not shipped with edirect 25.x, so we look up + # the archive files directly: PMID 30990365 -> 30/99/03/30990365.xml.gz + fetch_script = ( + "for uid in " + pmids.replace("\n", " ").replace("'", "") + "; do " + "p=$(printf '%.2s/%.2s/%.2s' \"$uid\" \"${uid#??}\" \"${uid#????}\"); " + "f=" + pubmed_path + "/Archive/${p}/${uid}.xml.gz; " + "[ -f \"$f\" ] && zcat \"$f\"; " + "done" + ) + abs_cmd = fetch_script \ + " | xtract -pattern PubmedArticle -element MedlineCitation/PMID,ArticleTitle,AbstractText | sed \"s/-/ /g\"" print(f" popen: {abs_cmd}") abstracts = os.popen(abs_cmd).read() |
