Merge pull request #868 from genenetwork/feature/xapian-to-llm-text-transformer

feat: implement text transformer for xapian searches.
author: Alexander Kabui 2024-09-13 11:01:38 +0300
committer: GitHub 2024-09-13 11:01:38 +0300
commit: e823d509b2e2485ab65b1e26a7d4c40042714c25 (patch)
tree: 53326f8c78c3699563be0cc3562e5624c2cad66f /gn2/utility
parent: 63de4403b6e2186c5758443ec9163fc57b8a012f (diff)
parent: 3617205075688b2de4f2318578babfd709272b65 (diff)
download: genenetwork2-e823d509b2e2485ab65b1e26a7d4c40042714c25.tar.gz
1 files changed, 45 insertions, 0 deletions
diff --git a/gn2/utility/helper_functions.py b/gn2/utility/helper_functions.py
index fc101959..8c35df5f 100644
--- a/gn2/utility/helper_functions.py
+++ b/gn2/utility/helper_functions.py
@@ -8,6 +8,51 @@ from gn2.utility.tools import get_setting
 from gn2.wqflask.database import database_connection
 
 
+
+def clean_xapian_query(query: str) -> str:
+    """Turn a Xapian query string into a plain-text prompt for Fahamu.
+
+    Boolean operators (``AND``, ``OR``, ...) and numeric range filters such
+    as ``mean:5..10`` are dropped.  Every ``prefix:value`` token becomes
+    ``prefix value``.  Recognised field prefixes are additionally listed,
+    once each and in reverse order of first appearance, in the trailing
+    context, which always ends with ``genes``.
+
+    Args:
+        query (str): The original Xapian query string.
+
+    Returns:
+        str: ``Provide answer on <terms> context <fields>``.
+    """
+    xapian_prefixes = {
+        "author", "species", "group", "tissue", "dataset",
+        "symbol", "description", "rif", "wiki",
+    }
+    xapian_operators = {"AND", "NOT", "OR", "XOR", "NEAR", "ADJ"}
+    range_prefixes = {"mean", "peak", "position", "peakmb", "additive", "year"}
+    query_context = ["genes"]
+    cleaned_query_parts = []
+    for token in query.split():
+        if token in xapian_operators:
+            continue
+        prefix, _, suffix = token.partition(":")
+        if ".." in suffix and prefix in range_prefixes:
+            continue
+        # Record each field once; repeats would only add noise to the context.
+        if prefix in xapian_prefixes and prefix not in query_context:
+            query_context.insert(0, prefix)
+        # Keep the value after an unrecognised prefix too ("gene:Shh" must not
+        # collapse to "gene"); strip() copes with an empty prefix or value.
+        text = f"{prefix} {suffix}".strip()
+        if text:
+            cleaned_query_parts.append(text)
+    cleaned_query = " ".join(cleaned_query_parts)
+    context = ",".join(query_context)
+    return f"Provide answer on {cleaned_query} context {context}"
+
+
+
+
 def get_species_dataset_trait(self, start_vars):
     if "temp_trait" in list(start_vars.keys()):
         if start_vars['temp_trait'] == "True":
author	Alexander Kabui	2024-09-13 11:01:38 +0300
committer	GitHub	2024-09-13 11:01:38 +0300
commit	e823d509b2e2485ab65b1e26a7d4c40042714c25 (patch)
tree	53326f8c78c3699563be0cc3562e5624c2cad66f /gn2/utility
parent	63de4403b6e2186c5758443ec9163fc57b8a012f (diff)
parent	3617205075688b2de4f2318578babfd709272b65 (diff)
download	genenetwork2-e823d509b2e2485ab65b1e26a7d4c40042714c25.tar.gz