# Provenance: commit 6441bdde2a5705aa04b1e556366b40f6b4e5b8e4 by
# Alexander_Kabui (Fri, 13 Sep 2024 10:45:54 +0300), "Move the clean xapian
# query function to gn2 helper functions." The function below was added to
# gn2/utility/helper_functions.py, after that module's imports of
# gn2.utility.tools.get_setting and gn2.wqflask.database.database_connection.


def clean_xapian_query(query: str) -> str:
    """Turn a Xapian search query into a plain-language prompt string.

    The query is split on whitespace and each token is handled as follows:

    * Xapian boolean/proximity operators (``AND``, ``NOT``, ``OR``, ``XOR``,
      ``NEAR``, ``ADJ``) are dropped.
    * Range filters such as ``mean:5..10`` (a range prefix whose value
      contains ``..``) are dropped.
    * ``prefix:value`` tokens are rewritten as ``prefix value``. When the
      prefix is one of the recognised field prefixes it is also recorded
      (once) as context, most recently seen first.
    * Every other token is kept unchanged.

    Args:
        query: The original Xapian query string.

    Returns:
        A string of the form
        ``"Provide answer on <cleaned query> context <ctx1,ctx2,...,genes>"``.
        Per the original docstring this is intended as input for Fahamu.
    """
    xapian_prefixes = {
        "author",
        "species",
        "group",
        "tissue",
        "dataset",
        "symbol",
        "description",
        "rif",
        "wiki",
    }
    xapian_operators = {"AND", "NOT", "OR", "XOR", "NEAR", "ADJ"}
    range_prefixes = {"mean", "peak", "position", "peakmb", "additive", "year"}

    # "genes" is always the last (default) context entry.
    query_context = ["genes"]
    cleaned_query_parts = []
    for token in query.split():
        if token in xapian_operators:
            continue
        prefix, _, suffix = token.partition(":")
        if ".." in suffix and prefix in range_prefixes:
            # Numeric range filters carry no meaning in a text prompt.
            continue
        # Record each recognised prefix only once so the context list does
        # not contain duplicates when a field is searched several times.
        if prefix in xapian_prefixes and prefix not in query_context:
            query_context.insert(0, prefix)
        # Keep both sides of "prefix:value" (the value used to be lost for
        # unrecognised prefixes) and skip empty sides so that "species:" or
        # ":foo" do not leave stray whitespace in the prompt.
        words = " ".join(part for part in (prefix, suffix) if part)
        if words:
            cleaned_query_parts.append(words)

    cleaned_query = " ".join(cleaned_query_parts)
    context = ",".join(query_context)
    return f"Provide answer on {cleaned_query} context {context}"


# NOTE: in the original patch the diff context continues here with the start
# of ``get_species_dataset_trait(self, start_vars)``; its body is truncated in
# the patch and is therefore not reproduced.