diff options
author | Alexander_Kabui | 2024-09-12 17:59:00 +0300 |
---|---|---|
committer | Alexander_Kabui | 2024-09-12 17:59:00 +0300 |
commit | 26df13607eec77f35d8f2da28cb88c197fcf1b9c (patch) | |
tree | 0a9cbd929d04479c423219145ba1d7ec5a055e04 | |
parent | 5acb80117c002b72dc591e0ce0c3083a2695bf0a (diff) | |
download | genenetwork2-26df13607eec77f35d8f2da28cb88c197fcf1b9c.tar.gz |
feat: implement text transformer for xapian searches.
-rw-r--r-- | gn2/wqflask/views.py | 43 |
1 file changed, 40 insertions, 3 deletions
```diff
diff --git a/gn2/wqflask/views.py b/gn2/wqflask/views.py
index f723958a..bcab1187 100644
--- a/gn2/wqflask/views.py
+++ b/gn2/wqflask/views.py
@@ -268,11 +268,47 @@ def gsearchtable():
     return flask.jsonify(current_page)
 
 
+
 def clean_xapian_query(query: str) -> str:
-    """ Remove filler words in xapian query
-    TODO: FIXME
     """
-    return query
+    Clean and optimize a Xapian query string by removing filler words,
+    and ensuring the query is tailored for optimal results from Fahamu.
+
+    Args:
+        query (str): The original Xapian query string.
+
+    Returns:
+        str: The cleaned and optimized query string.
+    """
+    xapian_prefixes = {
+        "author",
+        "species",
+        "group",
+        "tissue",
+        "dataset",
+        "symbol",
+        "description",
+        "rif",
+        "wiki",
+    }
+    xapian_operators = {"AND", "NOT", "OR", "XOR", "NEAR", "ADJ"}
+    range_prefixes = {"mean", "peak", "position", "peakmb", "additive", "year"}
+    query_context = ["genes"]
+    cleaned_query_parts = []
+    for token in query.split():
+        if token in xapian_operators or any(
+            prefix in token for prefix in range_prefixes if ".." in token
+        ):
+            continue
+        prefix, _, suffix = token.partition(":")
+        if prefix in xapian_prefixes:
+            query_context.insert(0, prefix)
+            cleaned_query_parts.append(f"{prefix} {suffix}")
+        else:
+            cleaned_query_parts.append(prefix)
+    cleaned_query = " ".join(cleaned_query_parts)
+    context = ",".join(query_context)
+    return f"Provide answer on {cleaned_query} context {context}"
 
 
 @app.route("/gnqna", methods=["POST", "GET"])
@@ -298,6 +334,7 @@ def gnqna():
     query_type = request.args.get("type")
     if query_type == "xapian":
         query = clean_xapian_query(query)
+        # todo; check if is empty
     safe_query = urllib.parse.urlencode({"query": query})
     search_result = requests.put(
         urljoin(GN3_LOCAL_URL, f"/api/llm/search?{safe_query}"),
```