about | summary | refs | log | tree | commit | diff
path: root/gn2
diff options
context:
space:
mode:
Diffstat (limited to 'gn2')
-rw-r--r-- gn2/utility/helper_functions.py | 45
-rw-r--r-- gn2/wqflask/views.py | 20
2 files changed, 47 insertions, 18 deletions
diff --git a/gn2/utility/helper_functions.py b/gn2/utility/helper_functions.py
index fc101959..8c35df5f 100644
--- a/gn2/utility/helper_functions.py
+++ b/gn2/utility/helper_functions.py
@@ -8,6 +8,51 @@ from gn2.utility.tools import get_setting
from gn2.wqflask.database import database_connection
+
def clean_xapian_query(query: str) -> str:
    """
    Clean a Xapian query string and rephrase it as a prompt for Fahamu.

    The query is split on whitespace and every token is handled as follows:

    * Upper-case Xapian operators (``AND``, ``OR``, ``NOT``, ``XOR``,
      ``NEAR``, ``ADJ``) are dropped.
    * Range filters such as ``mean:5..10`` are dropped.
    * ``prefix:value`` tokens whose prefix is a known search field become
      ``"prefix value"`` and the prefix is remembered as context for the
      prompt. Prefixes are matched case-insensitively and emitted in
      lower case.
    * Every other token is kept verbatim.

    Args:
        query (str): The original Xapian query string.

    Returns:
        str: A prompt of the form
            ``"Provide answer on <terms> context <fields>,genes"``. The
            wrapper text is always present, even when no terms survive
            the cleaning.
    """
    xapian_prefixes = {
        "author",
        "species",
        "group",
        "tissue",
        "dataset",
        "symbol",
        "description",
        "rif",
        "wiki",
    }
    xapian_operators = {"AND", "NOT", "OR", "XOR", "NEAR", "ADJ"}
    range_prefixes = {"mean", "peak", "position", "peakmb", "additive", "year"}
    # "genes" is always the last (most general) context entry.
    query_context = ["genes"]
    cleaned_query_parts = []
    for token in query.split():
        if token in xapian_operators:
            continue
        raw_prefix, _, suffix = token.partition(":")
        # Users type prefixes in any case ("Species:mouse"); normalise so
        # they are still recognised.
        prefix = raw_prefix.lower()
        if ".." in suffix and prefix in range_prefixes:
            # Numeric ranges carry no meaning for a natural-language prompt.
            continue
        if suffix and prefix in xapian_prefixes:
            # Most recently seen field goes first; list each field only once.
            if prefix not in query_context:
                query_context.insert(0, prefix)
            cleaned_query_parts.append(f"{prefix} {suffix}")
        else:
            # Keep the whole token so text after an unrecognised "prefix:"
            # is not silently lost.
            cleaned_query_parts.append(token)
    cleaned_query = " ".join(cleaned_query_parts)
    context = ",".join(query_context)
    return f"Provide answer on {cleaned_query} context {context}"
+
+
+
+
def get_species_dataset_trait(self, start_vars):
if "temp_trait" in list(start_vars.keys()):
if start_vars['temp_trait'] == "True":
diff --git a/gn2/wqflask/views.py b/gn2/wqflask/views.py
index aaf40d41..e306cc2c 100644
--- a/gn2/wqflask/views.py
+++ b/gn2/wqflask/views.py
@@ -89,6 +89,7 @@ from gn2.utility.tools import GN3_LOCAL_URL
from gn2.utility.tools import JS_TWITTER_POST_FETCHER_PATH
from gn2.utility.tools import JS_GUIX_PATH
from gn2.utility.helper_functions import get_species_groups
+from gn2.utility.helper_functions import clean_xapian_query
from gn2.utility.redis_tools import get_redis_conn
import gn2.utility.hmac as hmac
@@ -268,24 +269,6 @@ def gsearchtable():
return flask.jsonify(current_page)
-def clean_xapian_query(query: str) -> str:
- """ Remove filler words in xapian query
- This is a temporary solution that works for some query. A better solution is being worked on.
- TODO: FIXME
- """
- xapian_prefixes = set(["author", "species", "group", "tissue", "dataset", "symbol", "description", "rif", "wiki"])
- range_prefixes = set(["mean", "peak", "position", "peakmb", "additive", "year"])
- final_query = []
- for word in query.split():
- split_word = word.split(":")
- if len(split_word) > 0 and split_word[0].lower() in xapian_prefixes:
- final_query.append(split_word[1])
- continue
- if split_word[0].lower() in range_prefixes:
- # no need to search for ranges
- continue
- return " ".join(final_query)
-
@app.route("/gnqna", methods=["POST", "GET"])
@require_oauth2
@@ -310,6 +293,7 @@ def gnqna():
query_type = request.args.get("type")
if query_type == "xapian":
query = clean_xapian_query(query)
+ # todo; check if is empty
safe_query = urllib.parse.urlencode({"query": query})
search_result = requests.get(
urljoin(GN3_LOCAL_URL, f"/api/llm/search?{safe_query}"),