diff options
| author | Pjotr Prins | 2026-04-06 04:58:31 -0500 |
|---|---|---|
| committer | Pjotr Prins | 2026-04-06 04:58:31 -0500 |
| commit | ddac775a9c58f84026f51cfcaa688bb51e70f00c (patch) | |
| tree | f8862e2d10b4ab58b138905cffb6d8211adda997 | |
| parent | d99396ddac97bdea89c4326e5d3a6ba70c894313 (diff) | |
| parent | 0ede8df7f295604b09a574dcfa587a31483f1583 (diff) | |
| download | genecup-ddac775a9c58f84026f51cfcaa688bb51e70f00c.tar.gz | |
Merge branch 'master' of /home/git/public/genecup
| -rw-r--r-- | RELEASE_NOTES.md | 41 | ||||
| -rw-r--r-- | VERSION | 2 | ||||
| -rw-r--r-- | guix.scm | 8 | ||||
| -rwxr-xr-x | more_functions.py | 66 | ||||
| -rwxr-xr-x | server.py | 104 | ||||
| -rw-r--r-- | templates/create-ontology.html | 48 | ||||
| -rw-r--r-- | tests/test_network_gemini_ontology.py | 56 |
7 files changed, 302 insertions, 23 deletions
diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md new file mode 100644 index 0000000..0d73f75 --- /dev/null +++ b/RELEASE_NOTES.md @@ -0,0 +1,41 @@ +# GeneCup Release Notes + +## Version 1.9.1 (2026-04-05) + +### UI/UX +- Added header/footer in GN color scheme with version info across all pages +- Added --port and --debug command line switches +- Show environment variables (EDIRECT_PUBMED_MASTER, GEMINI_API_KEY, NLTK_DATA, GENECUP_DATADIR) on startup +- Added intermediate "Calling Gemini API..." loading page that auto-refreshes when classification completes + +### Gemini API integration +- Replaced TensorFlow stress classifier with Google Gemini API (gemini-2.5-pro for few-shot, gemini-3-flash-preview for batch) +- API key read from ~/.config/gemini/credentials (with 0400 permission check) +- Batch classification: all stress sentences classified in one API call with JSON response +- In-memory cache for Gemini results (keyed by SHA-256 of sentence batch) +- Retry logic (3 attempts with 2s/4s backoff) +- Gemini prompts and responses logged to console + +### PubMed / edirect +- Packaged edirect 25.x for Guix (Go programs compiled from source, XML bounds-check patch) +- Replaced missing fetch-pubmed with xfetch -db pubmed (local archive lookup) +- Hybrid abstract fetching: tries local xfetch first, falls back to NCBI efetch for PMIDs missing from the local archive +- In-memory cache for esearch PMID results (keyed by SHA-256 of query string) +- EDIRECT_LOCAL_ARCHIVE env var configures local PubMed archive path + +### Packaging (guix.scm) +- Added edirect-25, nltk-punkt, minipubmed, python-google-genai packages +- genecup-gemini package with genecup wrapper script, JavaScript assets, NLTK data +- GENECUP_DATADIR for sqlite DB location + +### Testing +- Added Python unittest framework (tests/) +- test_hello.py: offline smoke test (runs in guix build) +- test_network_esearch.py: NCBI esearch for Penk+stress PMIDs +- test_local_xfetch.py: local xsearch+xfetch against PubMed 
archive +- test_network_hybrid.py: validates hybrid fetch matches NCBI; tests esearch cache + +### Cleanup +- Moved dead code to old/server.py +- Removed unused TensorFlow/Keras dependencies +- Removed stress_prompt.txt dependency (batch classifier builds its own prompt) diff --git a/VERSION b/VERSION index c064b1b..9ab8337 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.9.1-pre +1.9.1 diff --git a/guix.scm b/guix.scm index f2e54d4..2c42e85 100644 --- a/guix.scm +++ b/guix.scm @@ -7,12 +7,14 @@ ;; ;; Development shell: ;; -;; guix shell -L . -C -N -F edirect-25 genecup-gemini coreutils -- genecup --port 4201 +;; guix shell -L . -C -N -F --expose=$HOME/.config/gemini --share=/export3/PubMed edirect-25 genecup-gemini coreutils -- genecup --port 4201 ;; ;; In a shell you can run ;; -;; python3 -m unittest tests.test_network_esearch +;; guix shell -C -N -F -L . --expose=$HOME/.config/gemini --share=/export3/PubMed edirect-25 genecup-gemini +;; env EDIRECT_LOCAL_ARCHIVE=/export3/PubMed/Source python3 -m unittest tests.test_network_esearch ;; env EDIRECT_LOCAL_ARCHIVE=/export3/PubMed/Source python3 -m unittest tests.test_local_xfetch -v +;; env EDIRECT_LOCAL_ARCHIVE=/export3/PubMed/Source python3 -m unittest tests.test_network_gemini_ontology ;; ;; Note: API key is read from ~/.config/gemini/credentials ;; @@ -284,7 +286,7 @@ GeneCup with four gene symbols (gria1, crhr1, drd2, and penk).") go-github-com-surgebase-porter2 go-golang-org-x-sys go-golang-org-x-text)) - (propagated-inputs (list curl wget grep sed gawk coreutils findutils gzip unzip)) + (propagated-inputs (list curl wget grep sed gawk coreutils findutils gzip unzip xargs)) (inputs (list bash-minimal coreutils perl perl-xml-simple python)) (home-page "https://www.ncbi.nlm.nih.gov/books/NBK179288/") (synopsis "Tools for accessing the NCBI's set of databases") diff --git a/more_functions.py b/more_functions.py index a115899..5d48adc 100755 --- a/more_functions.py +++ b/more_functions.py @@ -3,6 +3,7 @@ from 
# In-memory caches
_esearch_cache = {}  # hash(query) -> list of PMIDs
_gemini_query_cache = {}  # hash(model + prompt) -> response text


def _gemini_cache_key(prompt, model):
    """Return the cache key for one (model, prompt) pair."""
    # The model is part of the key so the same prompt sent to two different
    # models never shares a cached answer. NUL cannot occur in a model name,
    # so it is a safe separator.
    return hashlib.sha256(f"{model}\x00{prompt}".encode()).hexdigest()


def _gemini_api_key():
    """Return the Gemini API key, or "" when none is configured.

    GEMINI_API_KEY takes precedence; otherwise the key is read from
    ~/.config/gemini/credentials when that file exists.
    """
    api_key = os.environ.get("GEMINI_API_KEY", "")
    if api_key:
        return api_key
    cred_file = os.path.expanduser("~/.config/gemini/credentials")
    if os.path.isfile(cred_file):
        with open(cred_file) as f:
            return f.read().strip()
    return ""


def gemini_query(prompt, model='gemini-2.5-flash'):
    """Send a prompt to the Gemini API with caching and retry.

    Args:
        prompt: Prompt text sent as the request contents.
        model: Name of the Gemini model to query.

    Returns:
        The stripped response text. Results are cached in memory per
        (model, prompt), so a repeated call does not hit the network.

    Raises:
        RuntimeError: if no API key is configured, or if all 3 attempts
            fail (the last underlying error is chained as the cause).
    """
    cache_key = _gemini_cache_key(prompt, model)
    if cache_key in _gemini_query_cache:
        print(" Gemini query cache hit")
        return _gemini_query_cache[cache_key]

    # Imported lazily and only on a cache miss: cached answers stay available
    # without paying for (or depending on) the SDK import.
    from google import genai

    api_key = _gemini_api_key()
    if not api_key:
        raise RuntimeError("No Gemini API key found")

    client = genai.Client(api_key=api_key)
    max_attempts = 3
    last_error = None
    for attempt in range(max_attempts):
        if attempt > 0:
            # Linear backoff: 2s before the second try, 4s before the third.
            time.sleep(2 * attempt)
            print(f" Gemini retry {attempt + 1}/{max_attempts}")
        try:
            print(f" Gemini API call ({model}): {prompt[:80]}...")
            response = client.models.generate_content(
                model=model,
                contents=prompt,
            )
            text = response.text
            # The SDK can hand back no text at all (e.g. a blocked or empty
            # candidate); treat that as a failed attempt instead of letting
            # None.strip() raise, and never cache an empty answer.
            if text is None or not text.strip():
                raise ValueError("empty response")
            result = text.strip()
            print(f" Gemini response: {result[:200]}")
            _gemini_query_cache[cache_key] = result
            return result
        except Exception as e:  # boundary around a third-party client call
            last_error = e
            print(f" Gemini attempt {attempt + 1}/{max_attempts} failed: {e}")
    raise RuntimeError(
        f"Gemini API failed after {max_attempts} attempts: {last_error}"
    ) from last_error
def getabstracts_batch(genes, query):
    """Fetch abstracts for multiple genes in a single PubMed query.

    Builds: (keywords) AND (gene1 [tiab] OR gene2 [tiab] OR ...)

    Args:
        genes: Iterable of gene symbols; blank entries are ignored.
        query: Keyword expression placed in the first parenthesised clause.

    Returns:
        Whatever hybrid_fetch_abstracts returns for the matching PMIDs
        (tab-separated lines: PMID, ArticleTitle, AbstractText), or "" when
        there are no usable gene symbols or no PMIDs match.
    """
    gene_terms = [g.strip() for g in genes if g and g.strip()]
    if not gene_terms:
        # Without this guard the query would end in "AND ()", which is not a
        # valid PubMed expression, and a pointless search would still be run.
        print(" no genes given, skipping PubMed query")
        return ""

    genes_clause = " OR ".join(g + " [tiab]" for g in gene_terms)
    # NOTE(review): the query is wrapped in literal double quotes, presumably
    # because esearch_pmids interpolates it into a command line -- confirm,
    # and confirm gene symbols are sanitised before they reach this point.
    full_query = "\"(" + query + ") AND (" + genes_clause + ")\""
    pmid_list = esearch_pmids(full_query)
    if not pmid_list:
        print(f" no PMIDs found for {genes}")
        return ""
    # Log at most the first 20 PMIDs to keep the console readable.
    print(f" PMIDs ({len(pmid_list)}): {' '.join(pmid_list[:20])}{'...' if len(pmid_list) > 20 else ''}")
    return hybrid_fetch_abstracts(pmid_list)
def _ontology_from_terms(result_text):
    """Turn "term, alias, alias" lines into the nested .onto dictionary.

    Each non-empty line becomes its own category, keyed by the first
    comma-separated field; the category's single keyword string is all
    non-empty fields joined with "|".
    """
    dict_onto = {}
    for line in result_text.splitlines():
        fields = [p.strip() for p in line.split(",")]
        category = fields[0]
        if not category:
            continue
        # Drop empty fields ("a,,b" or a trailing comma): an empty alternative
        # in a "|"-joined keyword string would otherwise look like "a||b".
        keywords = "|".join(p for p in fields if p)
        dict_onto[category] = {category: {keywords}}
    return dict_onto


@app.route("/create-ontology", methods=["GET", "POST"])
def create_ontology():
    """Generate an ontology term list with Gemini and start a search with it.

    GET shows the form with the default prompt and forgets any ontology
    previously stored in the session. POST with action "generate" (the
    default) sends the prompt to Gemini and re-renders the form with the
    result. POST with action "search" writes the submitted terms to a
    temporary .onto file, stores its path (without extension) in
    session['namecat'] and redirects to /progress with the gene query and
    the selected term types.
    """
    if request.method == "GET":
        session.pop('namecat', None)
    from more_functions import gemini_query
    default_prompt = (
        "Give me a list of terms on substance abuse disorder (SUD) that act "
        "as traits and classifiers in scientific literature with a focus on "
        "behaviour and brain attributes related to the hippocampus. Avoid "
        "aliases and synonyms as well as gene names. Each term should be "
        "1-3 words (max). Give me a list of at least 20, but no more than "
        "80, most used terms. Return only the terms, one per line, no "
        "numbering. Add abbreviations and aliases - each at least 3 letters that have no other meaning - as a list with each term, "
        "separated by commas")

    if request.method != "POST":
        return render_template('create-ontology.html',
                               prompt=default_prompt, result=None,
                               count=0, version=version())

    action = request.form.get("action", "generate")

    if action == "search":
        result_text = request.form.get("result", "")
        query = request.form.get("query", "")
        search_types = request.form.getlist("type")
        dict_onto = _ontology_from_terms(result_text)

        # One uniquely named file per request: a fixed name in the shared temp
        # directory let concurrent sessions overwrite each other's ontology.
        # NOTE(review): these files are never removed here -- confirm whether
        # a cleanup step is wanted.
        fd, onto_file = tempfile.mkstemp(prefix="gemini_ontology_",
                                         suffix=".onto")
        with os.fdopen(fd, "w") as f:
            f.write(repr(dict_onto))
        # The session stores the path without the ".onto" extension.
        onto_path = onto_file[:-len(".onto")]
        session['namecat'] = onto_path
        print(f" Created ontology: {onto_path}.onto with {len(dict_onto)} categories")
        print(f" Gene query: '{query}', search_types: {search_types}")

        # Build the redirect URL with type and query params
        from urllib.parse import urlencode
        params = [("query", query)] + [("type", t) for t in search_types]
        return redirect("/progress?" + urlencode(params))

    # action == "generate"
    prompt = request.form.get("prompt", default_prompt)
    try:
        result = gemini_query(prompt)
    except Exception as e:  # request boundary: show the failure on the page
        return render_template('create-ontology.html',
                               prompt=prompt, result=f"Error: {e}",
                               count=0, version=version())
    terms = [t.strip() for t in result.strip().split("\n") if t.strip()]
    return render_template('create-ontology.html',
                           prompt=prompt, result=result,
                           count=len(terms), version=version())
{% extends "layout.html" %}
{% block content %}
{# Form for generating an ontology term list with Gemini and, once a result
   exists, for searching genes against the (editable) generated terms. #}

<div class="container mt-4">
  <h3>Create Ontology with Gemini AI</h3>

  <form method="POST" action="/create-ontology">
    <input type="hidden" name="action" value="generate">
    <div class="form-group">
      <label for="prompt">Prompt:</label>
      <textarea class="form-control" id="prompt" name="prompt" rows="6">{{ prompt }}</textarea>
    </div>
    <button type="submit" class="btn btn-primary" onclick="this.style.backgroundColor='#87CEEB'; this.style.borderColor='#87CEEB';">Create Ontology</button>
  </form>

  {% if result %}
  <div class="form-group mt-4">
    <label for="result">Result ({{ count }} terms):</label>
    {# Lives outside the search form but is submitted with it via form="search-form". #}
    <textarea class="form-control" id="result" name="result" rows="20" form="search-form">{{ result }}</textarea>
  </div>

  <form id="search-form" method="POST" action="/create-ontology" class="mt-3">
    <input type="hidden" name="action" value="search">
    <div class="form-group">
      <label for="query">Gene symbols (space or comma separated):</label>
      <textarea class="form-control" id="query" name="query" rows="2"></textarea>
    </div>
    <div id="check_selection_onto"></div>
    <button type="submit" class="btn btn-primary" onclick="if(!document.getElementById('query').value.trim()){alert('Please enter at least one gene symbol');return false;}">Search</button>
  </form>

  <script>
    // Build one pre-checked "type" checkbox per result line, labelled with
    // the text before the first comma. Nodes are created through the DOM API
    // rather than by concatenating markup into innerHTML, so a term that
    // contains quotes or angle brackets cannot break or inject HTML.
    (function () {
      var container = document.getElementById('check_selection_onto');
      var lines = document.getElementById('result').value.split('\n');

      lines.forEach(function (line) {
        var label = line.split(',')[0].trim();
        if (!label) {
          return;
        }
        var box = document.createElement('input');
        box.type = 'checkbox';
        box.name = 'type';
        box.value = label;
        box.checked = true;
        box.setAttribute('form', 'search-form');

        var wrapper = document.createElement('strong');
        wrapper.appendChild(box);
        wrapper.appendChild(document.createTextNode(' ' + label + ' '));
        container.appendChild(wrapper);
      });

      // "(Un)select all" toggle: copies its own state to every type checkbox.
      var toggle = document.createElement('input');
      toggle.type = 'checkbox';
      toggle.addEventListener('click', function () {
        var boxes = document.querySelectorAll('input[name=type]');
        for (var i = 0; i < boxes.length; i++) {
          boxes[i].checked = toggle.checked;
        }
      });
      var toggleWrapper = document.createElement('strong');
      toggleWrapper.appendChild(toggle);
      toggleWrapper.appendChild(document.createTextNode(' (Un)select all'));
      container.appendChild(document.createElement('br'));
      container.appendChild(toggleWrapper);
    })();
  </script>
  {% endif %}
</div>

{% endblock %}
class TestGeminiOntology(unittest.TestCase):
    """Network tests: query Gemini with PROMPT and check the term list.

    Both tests call gemini_query, so they need a Gemini API key and
    internet access.
    """

    def test_1_sud_ontology_terms(self):
        """Gemini should return 20-80 short SUD ontology terms."""
        t0 = time.time()
        response = gemini_query(PROMPT)
        elapsed = time.time() - t0
        terms = [t.strip() for t in response.strip().split("\n") if t.strip()]
        print(f" Got {len(terms)} terms ({elapsed:.2f}s)")
        for t in terms:
            print(f" - {t}")
        self.assertGreaterEqual(len(terms), 20,
                                f"Expected at least 20 terms, got {len(terms)}")
        self.assertLessEqual(len(terms), 80,
                             f"Expected at most 80 terms, got {len(terms)}")
        # Each term should be short (1-3 words, allow some slack). PROMPT asks
        # for comma-separated aliases after each term, so only the first
        # comma-separated field is measured.
        long_terms = [t for t in terms if len(t.split(",")[0].split()) > 5]
        self.assertEqual(long_terms, [],
                         f"Terms longer than 5 words: {long_terms}")

    def test_2_cached_ontology(self):
        """Second call should return the cached response and be fast."""
        # Populate the cache (a no-op when test_1 already ran in this process).
        first = gemini_query(PROMPT)
        t0 = time.time()
        response = gemini_query(PROMPT)
        elapsed = time.time() - t0
        terms = [t.strip() for t in response.strip().split("\n") if t.strip()]
        print(f" Cached: {len(terms)} terms ({elapsed:.4f}s)")
        self.assertEqual(response, first,
                         "Cached response differs from the first response")
        # A real API round trip takes far longer than 100 ms; the bound is
        # kept loose so a busy machine does not fail the test spuriously.
        self.assertLess(elapsed, 0.1, f"Cache lookup too slow: {elapsed:.4f}s")
