diff options
| author | Pjotr Prins | 2026-04-05 17:24:54 +0200 |
|---|---|---|
| committer | Pjotr Prins | 2026-04-05 17:24:54 +0200 |
| commit | 8f84075f667fd3ad523d395cc1c66c07bbef0c23 (patch) | |
| tree | de35852163b4f9f08a4941ae62e0938e947e4b58 /more_functions.py | |
| parent | 10782144441bb72362f73c2b3db5bc66eb4c5fb1 (diff) | |
| download | genecup-8f84075f667fd3ad523d395cc1c66c07bbef0c23.tar.gz | |
Cache PMID search hits
Diffstat (limited to 'more_functions.py')
| -rwxr-xr-x | more_functions.py | 29 |
1 file changed, 23 insertions, 6 deletions
diff --git a/more_functions.py b/more_functions.py index d2c2040..a6cb86f 100755 --- a/more_functions.py +++ b/more_functions.py @@ -1,5 +1,6 @@ #!/bin/env python3 from nltk.tokenize import sent_tokenize +import hashlib import os import re @@ -9,6 +10,25 @@ import ast global pubmed_path +# In-memory cache for esearch results: hash(query) -> list of PMIDs +_esearch_cache = {} + +def esearch_pmids(query): + """Search PubMed for PMIDs matching query. Results are cached in memory. + + Returns a list of PMID strings, or [] if none found. + """ + key = hashlib.sha256(query.encode()).hexdigest() + if key in _esearch_cache: + print(f" esearch cache hit for: {query}") + return _esearch_cache[key] + pmid_cmd = "esearch -db pubmed -query " + query + " | efetch -format uid" + print(f" popen: {pmid_cmd}") + pmids = os.popen(pmid_cmd).read().strip() + pmid_list = [p for p in pmids.split("\n") if p.strip()] if pmids else [] + _esearch_cache[key] = pmid_list + return pmid_list + def undic(dic): all_s='' for s in dic: @@ -62,14 +82,11 @@ def getabstracts(gene,query): """ query="\"(" + query + ") AND (" + gene + " [tiab])\"" - # Step 1: fetch PMIDs from PubMed - pmid_cmd = "esearch -db pubmed -query " + query + " | efetch -format uid" - print(f" popen: {pmid_cmd}") - pmids = os.popen(pmid_cmd).read().strip() - if not pmids: + # Step 1: fetch PMIDs from PubMed (cached) + pmid_list = esearch_pmids(query) + if not pmid_list: print(f" no PMIDs found for {gene}") return "" - pmid_list = pmids.split("\n") print(f" PMIDs ({len(pmid_list)}): {' '.join(pmid_list)}") # Step 2: fetch abstracts via hybrid local+NCBI abstracts = hybrid_fetch_abstracts(pmid_list) |
