diff options
Diffstat (limited to 'more_functions.py')
| -rwxr-xr-x | more_functions.py | 26 |
1 files changed, 20 insertions, 6 deletions
diff --git a/more_functions.py b/more_functions.py index 96e20e5..d4271f4 100755 --- a/more_functions.py +++ b/more_functions.py @@ -25,8 +25,8 @@ def getabstracts(gene,query): """ 1. esearch -db pubmed -query ... -- searches PubMed for the gene + keyword query, returns matching record IDs 2. efetch -format uid -- fetches just the PMIDs (unique identifiers) from the search results - 3. xfetch -db pubmed -- looks up those PMIDs in the local PubMed mirror (avoids hitting NCBI servers - for the full abstracts) + 3. xfetch -db pubmed -- looks up those PMIDs in the local PubMed mirror first; + falls back to efetch (NCBI API) for any PMIDs missing abstracts locally 4. xtract -pattern PubmedArticle -element MedlineCitation/PMID,ArticleTitle,AbstractText -- extracts PMID, title, and abstract text from the XML into tab-separated fields 5. sed "s/-/ /g" -- replaces hyphens with spaces (so hyphenated gene names match keyword searches later) @@ -44,11 +44,25 @@ def getabstracts(gene,query): return "" pmid_list = pmids.split("\n") print(f" PMIDs ({len(pmid_list)}): {' '.join(pmid_list)}") - # Step 2: fetch abstracts from local mirror - abs_cmd = "echo '" + pmids.replace("'", "") + "' | xfetch -db pubmed" \ - + " | xtract -pattern PubmedArticle -element MedlineCitation/PMID,ArticleTitle,AbstractText | sed \"s/-/ /g\"" - print(f" popen: {abs_cmd}") + # Step 2: fetch abstracts -- try local xfetch first, fall back to NCBI efetch + safe_pmids = pmids.replace("'", "") + extract = "xtract -pattern PubmedArticle -element MedlineCitation/PMID,ArticleTitle,AbstractText" + # Try local xfetch + abs_cmd = "echo '" + safe_pmids + "' | xfetch -db pubmed | " + extract + " | sed \"s/-/ /g\"" + print(f" popen(local): {abs_cmd}") abstracts = os.popen(abs_cmd).read() + # Check which PMIDs came back with abstracts + found_pmids = set() + for line in abstracts.strip().split("\n"): + if line.strip(): + found_pmids.add(line.split("\t")[0]) + missing = [p for p in pmid_list if p not in found_pmids] + if missing: + print(f" {len(missing)} PMIDs missing from local, falling back to NCBI efetch") + fallback_cmd = "echo '" + "\n".join(missing) + "' | efetch -db pubmed -format xml | " + extract + " | sed \"s/-/ /g\"" + print(f" popen(ncbi): {fallback_cmd}") + extra = os.popen(fallback_cmd).read() + abstracts += extra return(abstracts) def getSentences(gene, sentences_ls): |
