about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- README.md 40
-rw-r--r-- guix.scm 14
-rwxr-xr-x more_functions.py 9
-rwxr-xr-x server.py 8
4 files changed, 53 insertions, 18 deletions
diff --git a/README.md b/README.md
index 08676b3..fa0f54f 100644
--- a/README.md
+++ b/README.md
@@ -25,10 +25,40 @@ Live searches are conducted through PubMed to get relevant PMIDs, which are then
 3. sort the genes based on the number of abstracts with useful sentences.
 4. generate the final list, include symbol, alias, and name
 
+# Run a test server
+
+You can use the [guix.scm](./guix.scm) package definition to build and run genecup:
+
+```sh
+env GEMINI_API_KEY="AIzaSy**" `guix build -L . genecup-gemini`/server.py --port 4201
+```
+
+Note that the build includes minipubmed and punkt for testing!
+
+# Run a production server
+
 ## Install local mirror of PubMed
 
 - Following the instruction provided by NCBI: https://www.nlm.nih.gov/dataguide/edirect/archive.html
 
+Point the `EDIRECT_PUBMED_MASTER` environment variable to the directory that holds the local mirror:
+
+```
+env EDIRECT_PUBMED_MASTER=/export3/PubMed GEMINI_API_KEY="AIzaSy**" `guix build -L . genecup-gemini`/server.py --port 4201
+```
+
+Environment variables used:
+
+```
+EDIRECT_PUBMED_MASTER
+GEMINI_API_KEY
+NLTK_DATA
+TMPDIR
+```
+
+# Development
+
+
 ## Mini PubMed for testing
 
 For testing or code development, it is useful to have a small collection of PubMed abstracts in the same format as the local PubMed mirror. We provide 2473 abstracts that can be used to test four gene symbols (gria1, crhr1, drd2, and penk).
@@ -56,15 +86,7 @@ wget https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenize
 unzip punkt.zip
 ```
 
-# Run the server
-
-You can use the [guix.scm](./guix.scm) container to run genecup:
-
-```sh
-GeneCup$ guix shell -L . -C -N -F genecup-gemini coreutils edirect -- env EDIRECT_PUBMED_MASTER=./minipubmed NLTK_DATA=./minipubmed GEMINI_API_KEY="AIza****" ./server.py --port 4201
-```
-
-## Development
+## Source code
 
 The source code and data are in a git repository: https://git.genenetwork.org/genecup/
 
diff --git a/guix.scm b/guix.scm
index fc6ebbc..f5e2d83 100644
--- a/guix.scm
+++ b/guix.scm
@@ -192,15 +192,10 @@ access to Gemini models.")
             (lambda* (#:key inputs #:allow-other-keys)
               (delete-file "minipubmed.tgz")
               (let ((pubmed (string-append (assoc-ref inputs "minipubmed")
-                                           "/share/minipubmed"))
-                    (punkt  (string-append (assoc-ref inputs "nltk-punkt")
-                                           "/share/nltk_data/tokenizers/punkt")))
+                                           "/share/minipubmed")))
                 ;; Patch default pubmed path to store location
                 (substitute* "more_functions.py"
-                  (("\\./minipubmed") pubmed))
-                ;; Copy punkt tokenizer data
-                (mkdir-p "nlp/tokenizers")
-                (copy-recursively punkt "nlp/tokenizers/punkt"))))
+                  (("\\./minipubmed") pubmed)))))
           (replace 'install
             (lambda* (#:key outputs #:allow-other-keys)
               (let ((out (assoc-ref outputs "out")))
@@ -248,7 +243,10 @@ access to Gemini models.")
                                         ,(dirname (which "dirname"))
                                         ,(dirname (which "grep"))
                                         ,(dirname (which "sed"))))
-                  `("GUIX_PYTHONPATH" ":" prefix (,path)))))))))
+                  `("GUIX_PYTHONPATH" ":" prefix (,path))
+                  `("NLTK_DATA" ":" prefix
+                    (,(string-append (assoc-ref inputs "nltk-punkt")
+                                     "/share/nltk_data"))))))))))
     (propagated-inputs
      (list
        python-bcrypt
diff --git a/more_functions.py b/more_functions.py
index 8162acc..7f12215 100755
--- a/more_functions.py
+++ b/more_functions.py
@@ -175,5 +175,12 @@ def searchArchived(sets, query, filetype,sents, path_user):
         gwas_json+="{ \"id\": \"" + edgeID + "\", \"source\": \"" + query + "\", \"target\": \"" + key + "\", \"sentCnt\": \"" + str(catCnt[key]) + "\",  \"url\":\"/sentences?edgeID=" + edgeID + "\" },\n"
     return(nodes+edges,gwas_json,sn_file)
 
-pubmed_path=os.environ["EDIRECT_PUBMED_MASTER"]
+pubmed_path=os.environ.get("EDIRECT_PUBMED_MASTER", "./minipubmed")
+
+if not os.path.isdir(pubmed_path):
+    print(f"ERROR: EDIRECT_PUBMED_MASTER directory not found: {pubmed_path}")
+    raise SystemExit(1)
+if not os.path.isdir(os.path.join(pubmed_path, "PubMed", "Archive")):
+    print(f"ERROR: PubMed/Archive not found in {pubmed_path}")
+    raise SystemExit(1)
 
diff --git a/server.py b/server.py
index ad8c29d..868572e 100755
--- a/server.py
+++ b/server.py
@@ -103,6 +103,14 @@ def get_sentences_from_file(file_path, gene_name, category_name=None):
 
 nltk.data.path.append("./nlp/")
 
+# Validate punkt tokenizer is available
+try:
+    nltk.data.find('tokenizers/punkt')
+except LookupError:
+    print("ERROR: NLTK punkt tokenizer not found. Set NLTK_DATA or install punkt data.")
+    print("  NLTK data paths: " + str(nltk.data.path))
+    raise SystemExit(1)
+
 # Initialize database within application context
 with app.app_context():
     db.create_all()