diff options
author | Alexander Kabui | 2024-09-13 11:01:38 +0300 |
---|---|---|
committer | GitHub | 2024-09-13 11:01:38 +0300 |
commit | e823d509b2e2485ab65b1e26a7d4c40042714c25 (patch) | |
tree | 53326f8c78c3699563be0cc3562e5624c2cad66f | |
parent | 63de4403b6e2186c5758443ec9163fc57b8a012f (diff) | |
parent | 3617205075688b2de4f2318578babfd709272b65 (diff) | |
download | genenetwork2-e823d509b2e2485ab65b1e26a7d4c40042714c25.tar.gz |
Merge pull request #868 from genenetwork/feature/xapian-to-llm-text-transformer
feat: implement text transformer for xapian searches.
-rw-r--r-- | gn2/utility/helper_functions.py | 45 | ||||
-rw-r--r-- | gn2/wqflask/views.py | 20 |
2 files changed, 47 insertions, 18 deletions
def clean_xapian_query(query: str) -> str:
    """Turn a Xapian search string into a natural-language prompt.

    Boolean operators and numeric range filters are dropped, recognised
    ``prefix:value`` terms are rewritten as ``prefix value``, and every
    recognised prefix is recorded as context for the downstream
    question-answering service.

    Args:
        query: The original Xapian query string.

    Returns:
        A prompt of the form
        ``"Provide answer on <terms> context <prefixes>,genes"``, or an
        empty string when no searchable term is left after cleaning, so
        callers can detect an empty query with a simple truthiness check.
    """
    xapian_prefixes = {
        "author",
        "species",
        "group",
        "tissue",
        "dataset",
        "symbol",
        "description",
        "rif",
        "wiki",
    }
    # Operators are matched case-sensitively on purpose: lowercase
    # "and"/"or"/"not" are treated as ordinary search words.
    xapian_operators = {"AND", "NOT", "OR", "XOR", "NEAR", "ADJ"}
    range_prefixes = {"mean", "peak", "position", "peakmb", "additive", "year"}
    query_context = ["genes"]
    cleaned_query_parts = []
    for token in query.split():
        if token in xapian_operators:
            continue
        prefix, _, suffix = token.partition(":")
        # Prefixes are compared case-insensitively ("Species:mouse").
        key = prefix.lower()
        if ".." in suffix and key in range_prefixes:
            # Numeric ranges carry no meaning for a text question.
            continue
        if key in xapian_prefixes and suffix:
            if key not in query_context:
                # Most recently seen prefix goes first; no duplicates.
                query_context.insert(0, key)
            cleaned_query_parts.append(f"{key} {suffix}")
        else:
            # Keep the whole token so "a:b" with an unknown prefix (or a
            # known prefix with no value) is not silently truncated.
            cleaned_query_parts.append(token)
    if not cleaned_query_parts:
        return ""
    cleaned_query = " ".join(cleaned_query_parts)
    context = ",".join(query_context)
    return f"Provide answer on {cleaned_query} context {context}"
- TODO: FIXME - """ - xapian_prefixes = set(["author", "species", "group", "tissue", "dataset", "symbol", "description", "rif", "wiki"]) - range_prefixes = set(["mean", "peak", "position", "peakmb", "additive", "year"]) - final_query = [] - for word in query.split(): - split_word = word.split(":") - if len(split_word) > 0 and split_word[0].lower() in xapian_prefixes: - final_query.append(split_word[1]) - continue - if split_word[0].lower() in range_prefixes: - # no need to search for ranges - continue - return " ".join(final_query) - @app.route("/gnqna", methods=["POST", "GET"]) @require_oauth2 @@ -310,6 +293,7 @@ def gnqna(): query_type = request.args.get("type") if query_type == "xapian": query = clean_xapian_query(query) + # todo; check if is empty safe_query = urllib.parse.urlencode({"query": query}) search_result = requests.get( urljoin(GN3_LOCAL_URL, f"/api/llm/search?{safe_query}"), |