From 26df13607eec77f35d8f2da28cb88c197fcf1b9c Mon Sep 17 00:00:00 2001 From: Alexander_Kabui Date: Thu, 12 Sep 2024 17:59:00 +0300 Subject: feat: implement text transformer for xapian searches. --- gn2/wqflask/views.py | 43 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 3 deletions(-) (limited to 'gn2') diff --git a/gn2/wqflask/views.py b/gn2/wqflask/views.py index f723958a..bcab1187 100644 --- a/gn2/wqflask/views.py +++ b/gn2/wqflask/views.py @@ -268,11 +268,47 @@ def gsearchtable(): return flask.jsonify(current_page) + def clean_xapian_query(query: str) -> str: - """ Remove filler words in xapian query - TODO: FIXME """ - return query + Clean and optimize a Xapian query string by removing filler words, + and ensuring the query is tailored for optimal results from Fahamu. + + Args: + query (str): The original Xapian query string. + + Returns: + str: The cleaned and optimized query string. + """ + xapian_prefixes = { + "author", + "species", + "group", + "tissue", + "dataset", + "symbol", + "description", + "rif", + "wiki", + } + xapian_operators = {"AND", "NOT", "OR", "XOR", "NEAR", "ADJ"} + range_prefixes = {"mean", "peak", "position", "peakmb", "additive", "year"} + query_context = ["genes"] + cleaned_query_parts = [] + for token in query.split(): + if token in xapian_operators or any( + prefix in token for prefix in range_prefixes if ".." 
in token + ): + continue + prefix, _, suffix = token.partition(":") + if prefix in xapian_prefixes: + query_context.insert(0, prefix) + cleaned_query_parts.append(f"{prefix} {suffix}") + else: + cleaned_query_parts.append(prefix) + cleaned_query = " ".join(cleaned_query_parts) + context = ",".join(query_context) + return f"Provide answer on {cleaned_query} context {context}" @app.route("/gnqna", methods=["POST", "GET"]) @@ -298,6 +334,7 @@ def gnqna(): query_type = request.args.get("type") if query_type == "xapian": query = clean_xapian_query(query) + # todo; check if is empty safe_query = urllib.parse.urlencode({"query": query}) search_result = requests.put( urljoin(GN3_LOCAL_URL, f"/api/llm/search?{safe_query}"), -- cgit v1.2.3 From 88a7b6ca80f482fc464ce05bab86088dc9e3f1ca Mon Sep 17 00:00:00 2001 From: Alexander_Kabui Date: Fri, 13 Sep 2024 10:21:30 +0300 Subject: Use if statement for clarity. --- gn2/wqflask/views.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'gn2') diff --git a/gn2/wqflask/views.py b/gn2/wqflask/views.py index bcab1187..2c75df41 100644 --- a/gn2/wqflask/views.py +++ b/gn2/wqflask/views.py @@ -296,11 +296,11 @@ def clean_xapian_query(query: str) -> str: query_context = ["genes"] cleaned_query_parts = [] for token in query.split(): - if token in xapian_operators or any( - prefix in token for prefix in range_prefixes if ".." in token - ): + if token in xapian_operators: continue prefix, _, suffix = token.partition(":") + if ".." in suffix and prefix in range_prefixes: + continue if prefix in xapian_prefixes: query_context.insert(0, prefix) cleaned_query_parts.append(f"{prefix} {suffix}") -- cgit v1.2.3 From 6441bdde2a5705aa04b1e556366b40f6b4e5b8e4 Mon Sep 17 00:00:00 2001 From: Alexander_Kabui Date: Fri, 13 Sep 2024 10:45:54 +0300 Subject: Move the clean xapian query function to gn2 helper functions. 
def clean_xapian_query(query: str) -> str:
    """Rewrite a Xapian search query as a natural-language prompt.

    The query is split on whitespace and each token is handled as follows:

    * Xapian boolean/proximity operators (``AND``, ``OR``, ...) are dropped.
    * Range filters such as ``mean:5..10`` (a range prefix followed by a
      value containing ``..``) are dropped.
    * ``field:value`` tokens whose field is a recognised Xapian prefix become
      ``"field value"`` and the field name is recorded as context.
    * Every other token is kept verbatim, including tokens that contain a
      colon but whose prefix is not a recognised field (e.g. ``foo:bar``).

    Args:
        query: The original Xapian query string.

    Returns:
        A prompt of the form
        ``"Provide answer on <terms> context <field,...,genes>"``.
        Context fields are listed most-recently-seen first, without
        duplicates, and always end with ``genes``. When no term survives
        the filtering (for instance an empty query) the ``<terms>`` part is
        empty; callers that care should check for that themselves.
    """
    xapian_prefixes = {
        "author",
        "species",
        "group",
        "tissue",
        "dataset",
        "symbol",
        "description",
        "rif",
        "wiki",
    }
    xapian_operators = {"AND", "NOT", "OR", "XOR", "NEAR", "ADJ"}
    range_prefixes = {"mean", "peak", "position", "peakmb", "additive", "year"}

    seen_prefixes = []  # recognised fields, in first-seen order, no repeats
    cleaned_query_parts = []
    for token in query.split():
        if token in xapian_operators:
            continue
        prefix, _, suffix = token.partition(":")
        if ".." in suffix and prefix in range_prefixes:
            # Numeric range filters carry no meaning in a text prompt.
            continue
        if prefix in xapian_prefixes:
            # Record each field once so the context has no duplicates.
            if prefix not in seen_prefixes:
                seen_prefixes.append(prefix)
            cleaned_query_parts.append(f"{prefix} {suffix}")
        else:
            # Not a known field: keep the whole token so that text after an
            # unrelated colon is not silently discarded.
            cleaned_query_parts.append(token)

    cleaned_query = " ".join(cleaned_query_parts)
    # Later fields come first; "genes" is always the trailing default context.
    context = ",".join([*reversed(seen_prefixes), "genes"])
    return f"Provide answer on {cleaned_query} context {context}"
in suffix and prefix in range_prefixes: - continue - if prefix in xapian_prefixes: - query_context.insert(0, prefix) - cleaned_query_parts.append(f"{prefix} {suffix}") - else: - cleaned_query_parts.append(prefix) - cleaned_query = " ".join(cleaned_query_parts) - context = ",".join(query_context) - return f"Provide answer on {cleaned_query} context {context}" - - @app.route("/gnqna", methods=["POST", "GET"]) @require_oauth2 def gnqna(): @@ -336,7 +294,7 @@ def gnqna(): query = clean_xapian_query(query) # todo; check if is empty safe_query = urllib.parse.urlencode({"query": query}) - search_result = requests.put( + search_result = requests.get( urljoin(GN3_LOCAL_URL, f"/api/llm/search?{safe_query}"), headers={"Authorization": f"Bearer {token}"}, ) -- cgit v1.2.3