diff options (diffstat)

 guix.scm          (-rw-r--r--) | 12
 more_functions.py (-rwxr-xr-x) |  2
 server.py         (-rwxr-xr-x) | 10

 3 files changed, 13 insertions(+), 11 deletions(-)
diff --git a/guix.scm b/guix.scm
index dfc303d..748a7f6 100644
--- a/guix.scm
+++ b/guix.scm
@@ -43,9 +43,9 @@
 (define nltk-punkt-source
   (origin
     (method url-fetch)
-    (uri "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip")
+    (uri "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt_tab.zip")
     (sha256
-     (base32 "1v306rjpjfcqd8mh276lfz8s1d22zgj8n0lfzh5nbbxfjj4hghsi"))))
+     (base32 "01h11srafj57yvp74xkidikh6m7ch7qscz21lck7f9vlg4c68zz5"))))

 (define-public nltk-punkt
   (package
@@ -66,14 +66,14 @@
           (replace 'install
             (lambda* (#:key outputs #:allow-other-keys)
               (let ((out (string-append (assoc-ref outputs "out")
-                                        "/share/nltk_data/tokenizers/punkt")))
+                                        "/share/nltk_data/tokenizers/punkt_tab")))
                 (mkdir-p out)
-                (copy-recursively "punkt" out)))))))
+                (copy-recursively "punkt_tab" out)))))))
     (native-inputs (list unzip))
     (home-page "https://www.nltk.org/nltk_data/")
-    (synopsis "NLTK Punkt sentence tokenizer models")
+    (synopsis "NLTK Punkt_Tab sentence tokenizer models")
     (description "Pre-trained models for the Punkt sentence boundary
-detection tokenizer, used by NLTK's sent_tokenize function.")
+detection tokenizer (tab format), used by NLTK's sent_tokenize function.")
     (license license:asl2.0)))

 (define minipubmed-source
diff --git a/more_functions.py b/more_functions.py
index 0c7fbd4..4261692 100755
--- a/more_functions.py
+++ b/more_functions.py
@@ -184,6 +184,6 @@ print(f" pubmed_path={pubmed_path}")
 if not os.path.isdir(pubmed_path):
     print(f"ERROR: EDIRECT_PUBMED_MASTER directory not found: {pubmed_path}")
     raise SystemExit(1)
-if not os.path.isdir(os.path.join(pubmed_path, "Archive")):
+if not os.path.isdir(os.path.join(pubmed_path, "PubMed", "Archive")):
     print(f"ERROR: PubMed/Archive not found in {pubmed_path}")
     raise SystemExit(1)
diff --git a/server.py b/server.py
index 0cbbfc4..6db682a 100755
--- a/server.py
+++ b/server.py
@@ -25,7 +25,7 @@
 load_dotenv()
 import os
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
-# nltk.download('punkt') - we should prefetch
+# nltk.download('punkt') # we should prefetch
 # import pickle # Removed
 from collections import Counter
 from datetime import datetime
@@ -101,13 +101,14 @@ def get_sentences_from_file(file_path, gene_name, category_name=None):
     return matching_sentences


-nltk.data.path.append("./nlp/")
+# nltk expects tokenizers at nltk_data/tokenizers/punkt
+# nltk.data.path.append("./nlp/")

 # Validate punkt tokenizer is available
 try:
-    nltk.data.find('tokenizers/punkt')
+    nltk.data.find('tokenizers/punkt_tab')
 except LookupError:
-    print("ERROR: NLTK punkt tokenizer not found. Set NLTK_DATA or install punkt data.")
+    print("ERROR: NLTK punkt_tab tokenizer not found. Set NLTK_DATA or install punkt_tab data.")
     print(" NLTK data paths: " + str(nltk.data.path))
     raise SystemExit(1)

@@ -1016,6 +1017,7 @@ def search():
             yield "data:"+str(progress)+"\n\n"

         for gene in genes:
+            print(f"Fetching info for gene {gene}\n")
             abstracts_raw = getabstracts(gene,all_d) # all_d might be empty if no search_type matches
             print(abstracts_raw)
             sentences_ls=[]
