Split pipe commands, so we can see how long they take

author: Pjotr Prins 2026-03-29 09:59:27 +0200
committer: Pjotr Prins 2026-03-29 09:59:27 +0200
commit: db9974a1b64c643a760c67e30d3787d47cd28a34 (patch)
tree: 69439aa8668c397569ed4f4d771f4fab7e38a17c
parent: 07b9a4222bab8128a5dcceca985fb17db0c98e05 (diff)
download: genecup-db9974a1b64c643a760c67e30d3787d47cd28a34.tar.gz
1 files changed, 27 insertions, 5 deletions
diff --git a/more_functions.py b/more_functions.py
index f94ec34..2c1198b 100755
--- a/more_functions.py
+++ b/more_functions.py
@@ -20,13 +20,35 @@ def undic(dic):
 def findWholeWord(w):
     return re.compile(r'\b({0})\b'.format(w), flags=re.IGNORECASE).search
 
+
 def getabstracts(gene,query):
+    """
+      1. esearch -db pubmed -query ... -- runs the gene + keyword query against PubMed, producing a search handle for the matching records
+      2. efetch -format uid -- fetches just the PMIDs (unique identifiers) from the search results
+      3. fetch-pubmed -path <pubmed_path> -- looks up those PMIDs in the local PubMed mirror (avoids hitting NCBI servers
+         for the full abstracts)
+      4. xtract -pattern PubmedArticle -element MedlineCitation/PMID,ArticleTitle,AbstractText -- extracts PMID, title, and
+         abstract text from the XML into tab-separated fields
+      5. sed "s/-/ /g" -- replaces hyphens with spaces (so hyphenated gene names match keyword searches later)
+
+    In short: search PubMed remotely for matching articles, get their PMIDs, retrieve the full XML from the local mirror, then extract the PMID + title + abstract as tab-separated text. Only PMIDs are retrieved from NCBI: esearch itself just creates a search handle on NCBI's servers, and efetch -format uid pulls back only the numeric PMIDs from that handle, so no abstracts or XML are fetched from NCBI. Steps 1-2 run as one shell command and steps 3-5 as a second, with the PMID list printed in between.
+    """
+
     query="\"(" + query + ") AND (" + gene + " [tiab])\""
-    cmd = "esearch -db pubmed -query " +  query \
-        + " | efetch -format uid |fetch-pubmed -path "+ pubmed_path \
-        + " | xtract -pattern PubmedArticle -element MedlineCitation/PMID,ArticleTitle,AbstractText|sed \"s/-/ /g\""
-    print(f"  popen: {cmd}")
-    abstracts = os.popen(cmd).read()
+    # Step 1: fetch PMIDs from PubMed
+    pmid_cmd = "esearch -db pubmed -query " + query + " | efetch -format uid"
+    print(f"  popen: {pmid_cmd}")
+    pmids = os.popen(pmid_cmd).read().strip()
+    if not pmids:
+        print(f"  no PMIDs found for {gene}")
+        return ""
+    pmid_list = pmids.split("\n")
+    print(f"  PMIDs ({len(pmid_list)}): {' '.join(pmid_list)}")
+    # Step 2: fetch abstracts from local mirror
+    abs_cmd = "echo '" + pmids.replace("'", "") + "' | fetch-pubmed -path " + pubmed_path \
+        + " | xtract -pattern PubmedArticle -element MedlineCitation/PMID,ArticleTitle,AbstractText | sed \"s/-/ /g\""
+    print(f"  popen: {abs_cmd}")
+    abstracts = os.popen(abs_cmd).read()
     return(abstracts)
 
 def getSentences(gene, sentences_ls):
author	Pjotr Prins	2026-03-29 09:59:27 +0200
committer	Pjotr Prins	2026-03-29 09:59:27 +0200
commit	db9974a1b64c643a760c67e30d3787d47cd28a34 (patch)
tree	69439aa8668c397569ed4f4d771f4fab7e38a17c
parent	07b9a4222bab8128a5dcceca985fb17db0c98e05 (diff)
download	genecup-db9974a1b64c643a760c67e30d3787d47cd28a34.tar.gz