about summary refs log tree commit diff
diff options
context:
space:
mode:
author Pjotr Prins 2026-03-28 11:18:21 +0100
committer Pjotr Prins 2026-03-28 11:18:21 +0100
commit 2d821f4ff808027a67da6548cba6bedc4b69bb62 (patch)
tree 796d41661ac0a7104b09b82883fef928009b0ef0
parent 95e839c648c1946a6b0186421d89003a1126bf9e (diff)
download genecup-2d821f4ff808027a67da6548cba6bedc4b69bb62.tar.gz
Use punkt_tab instead of punkt. See https://openillumi.com/en/en-nltk-punkt-tab-lookuperror-fix/
-rw-r--r-- guix.scm 12
-rwxr-xr-x more_functions.py 2
-rwxr-xr-x server.py 10
3 files changed, 13 insertions, 11 deletions
diff --git a/guix.scm b/guix.scm
index dfc303d..748a7f6 100644
--- a/guix.scm
+++ b/guix.scm
@@ -43,9 +43,9 @@
 (define nltk-punkt-source
   (origin
     (method url-fetch)
-    (uri "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip")
+    (uri "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt_tab.zip")
     (sha256
-     (base32 "1v306rjpjfcqd8mh276lfz8s1d22zgj8n0lfzh5nbbxfjj4hghsi"))))
+     (base32 "01h11srafj57yvp74xkidikh6m7ch7qscz21lck7f9vlg4c68zz5"))))
 
 (define-public nltk-punkt
   (package
@@ -66,14 +66,14 @@
           (replace 'install
             (lambda* (#:key outputs #:allow-other-keys)
               (let ((out (string-append (assoc-ref outputs "out")
-                                        "/share/nltk_data/tokenizers/punkt")))
+                                        "/share/nltk_data/tokenizers/punkt_tab")))
                 (mkdir-p out)
-                (copy-recursively "punkt" out)))))))
+                (copy-recursively "punkt_tab" out)))))))
     (native-inputs (list unzip))
     (home-page "https://www.nltk.org/nltk_data/")
-    (synopsis "NLTK Punkt sentence tokenizer models")
+    (synopsis "NLTK Punkt_Tab sentence tokenizer models")
     (description "Pre-trained models for the Punkt sentence boundary
-detection tokenizer, used by NLTK's sent_tokenize function.")
+detection tokenizer (tab format), used by NLTK's sent_tokenize function.")
     (license license:asl2.0)))
 
 (define minipubmed-source
diff --git a/more_functions.py b/more_functions.py
index 0c7fbd4..4261692 100755
--- a/more_functions.py
+++ b/more_functions.py
@@ -184,6 +184,6 @@ print(f"  pubmed_path={pubmed_path}")
 if not os.path.isdir(pubmed_path):
     print(f"ERROR: EDIRECT_PUBMED_MASTER directory not found: {pubmed_path}")
     raise SystemExit(1)
-if not os.path.isdir(os.path.join(pubmed_path, "Archive")):
+if not os.path.isdir(os.path.join(pubmed_path, "PubMed", "Archive")):
     print(f"ERROR: PubMed/Archive not found in {pubmed_path}")
     raise SystemExit(1)
diff --git a/server.py b/server.py
index 0cbbfc4..6db682a 100755
--- a/server.py
+++ b/server.py
@@ -25,7 +25,7 @@ load_dotenv()
 import os
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
 
-# nltk.download('punkt') - we should prefetch
+# nltk.download('punkt') # we should prefetch
 # import pickle # Removed
 from collections import Counter
 from datetime import datetime
@@ -101,13 +101,14 @@ def get_sentences_from_file(file_path, gene_name, category_name=None):
     return matching_sentences
 
 
-nltk.data.path.append("./nlp/")
+# nltk expects tokenizers at nltk_data/tokenizers/punkt
+# nltk.data.path.append("./nlp/")
 
 # Validate punkt tokenizer is available
 try:
-    nltk.data.find('tokenizers/punkt')
+    nltk.data.find('tokenizers/punkt_tab')
 except LookupError:
-    print("ERROR: NLTK punkt tokenizer not found. Set NLTK_DATA or install punkt data.")
+    print("ERROR: NLTK punkt_tab tokenizer not found. Set NLTK_DATA or install punkt_tab data.")
     print("  NLTK data paths: " + str(nltk.data.path))
     raise SystemExit(1)
 
@@ -1016,6 +1017,7 @@ def search():
             yield "data:"+str(progress)+"\n\n"
 
             for gene in genes:
+                print(f"Fetching info for gene {gene}\n")
                 abstracts_raw = getabstracts(gene,all_d) # all_d might be empty if no search_type matches
                 print(abstracts_raw)
                 sentences_ls=[]