about summary refs log tree commit diff
diff options
context:
space:
mode:
author Arun Isaac 2023-01-24 13:14:04 +0000
committer Arun Isaac 2023-01-24 13:22:20 +0000
commit be2034b4e6666861c42702d663e1638c06e4793a (patch)
tree 7a8a0d895fc380393840877c53030d370a006b99
parent 99c0139cd37f72d73196eb97c7a56ee143a70600 (diff)
download genenetwork3-be2034b4e6666861c42702d663e1638c06e4793a.tar.gz
search: Assume triplets mean synteny is requested.
* gn3/api/search.py (query_subqueries, query_terms, parse_synteny_field,
is_synteny_on, remove_synteny_field): Delete functions.
(parse_query): Assume triplets mean synteny is requested.
-rw-r--r-- gn3/api/search.py 171
1 file changed, 47 insertions, 124 deletions
diff --git a/gn3/api/search.py b/gn3/api/search.py
index 8ce51e3..d25f362 100644
--- a/gn3/api/search.py
+++ b/gn3/api/search.py
@@ -40,28 +40,6 @@ def combine_queries(operator: int, *queries: xapian.Query) -> xapian.Query:
     return reduce(partial(xapian.Query, operator), queries)
 
 
-def query_subqueries(query: xapian.Query) -> list[xapian.Query]:
-    """Return list of child queries in query."""
-    return [query.get_subquery(i) for i in range(query.get_num_subqueries())]
-
-
-def query_terms(query: xapian.Query) -> list[str]:
-    """Return list of terms in query."""
-    # Unfortunately, the TermIterator from python xapian bindings seems
-    # buggy. So, we resort to traversing the query tree.
-    # python xapian bindings do not expose xapian.Query.LEAF_TERM. This is
-    # most likely a bug.
-    leaf_type = 100
-    if query.get_type() == leaf_type:
-        # We have no choice but to access the protected _get_terms_begin method.
-        # pylint: disable=protected-access
-        return [query._get_terms_begin().get_term().decode("utf-8")]
-    else:
-        return reduce(lambda result, subquery: result + query_terms(subquery),
-                      query_subqueries(query),
-                      [])
-
-
 class FieldProcessor(xapian.FieldProcessor):
     """
     Field processor for use in a xapian query parser.
@@ -200,110 +178,55 @@ def parse_location_field(species_query: xapian.Query,
             .maybe(xapian.Query.MatchNothing, make_query))
 
 
-def parse_synteny_field(synteny_prefix: str, query: bytes) -> xapian.Query:
-    """Parse synteny field and return a xapian query."""
-    if query.decode("utf-8") in ["on", "off"]:
-        return xapian.Query(synteny_prefix + query.decode("utf-8"))
-    else:
-        return xapian.Query(xapian.Query.OP_INVALID)
-
-
-def is_synteny_on(synteny_prefix: str, query: xapian.Query) -> bool:
-    """Check if synteny search is requested in query."""
-    return synteny_prefix + "on" in query_terms(query)
-
-
-def remove_synteny_field(synteny_prefix: str, query: xapian.Query,
-                         parent_operator: int = xapian.Query.OP_AND) -> xapian.Query:
-    """Return a new query with the synteny field removed."""
-    # Note that this function only supports queries that exclusively use the
-    # AND, OR, FILTER, WEIGHT, RANGE and INVALID operators.
-    # python xapian bindings do not expose xapian.Query.LEAF_TERM. This is
-    # most likely a bug.
-    leaf_type = 100
-    # Handle leaf node.
-    if query.get_type() == leaf_type:
-        if not any(term.startswith(synteny_prefix) for term in query_terms(query)):
-            return query
-        elif parent_operator in [xapian.Query.OP_AND, xapian.Query.OP_FILTER]:
-            return xapian.Query.MatchAll
-        elif parent_operator == xapian.Query.OP_OR:
-            return xapian.Query.MatchNothing
-        else:
-            raise ValueError("Unexpected operator in query", query.get_type())
-    # Recurse on non-leaf nodes with the AND, OR or FILTER operators as root.
-    elif query.get_type() in (xapian.Query.OP_AND, xapian.Query.OP_OR,
-                              xapian.Query.OP_FILTER, xapian.Query.OP_SCALE_WEIGHT):
-        return combine_queries(query.get_type(),
-                               *[remove_synteny_field(synteny_prefix, subquery, query.get_type())
-                                 for subquery in query_subqueries(query)])
-    # Return other supported non-leaf nodes verbatim.
-    elif query.get_type() in [xapian.Query.OP_VALUE_RANGE, xapian.Query.OP_INVALID]:
-        return query
-    # Raise an exception on unsupported non-leaf nodes.
-    else:
-        raise ValueError("Unexpected operator in query", query.get_type())
-
-
 def parse_query(synteny_files_directory: Path, query: str):
     """Parse search query using GeneNetwork specific field processors."""
-    synteny_prefix = "XSYN"
-
-    def make_query_parser(synteny: bool) -> xapian.QueryParser:
-        queryparser = xapian.QueryParser()
-        queryparser.set_stemmer(xapian.Stem("en"))
-        queryparser.set_stemming_strategy(queryparser.STEM_SOME)
-        species_prefix = "XS"
-        chromosome_prefix = "XC"
-        queryparser.add_boolean_prefix("author", "A")
-        queryparser.add_boolean_prefix("species", species_prefix)
-        queryparser.add_boolean_prefix("group", "XG")
-        queryparser.add_boolean_prefix("tissue", "XI")
-        queryparser.add_boolean_prefix("dataset", "XDS")
-        queryparser.add_boolean_prefix("symbol", "XY")
-        queryparser.add_boolean_prefix("chr", chromosome_prefix)
-        queryparser.add_boolean_prefix("peakchr", "XPC")
-        queryparser.add_prefix("description", "XD")
-        queryparser.add_prefix("synteny", FieldProcessor(partial(parse_synteny_field,
-                                                                 synteny_prefix)))
-        range_prefixes = ["mean", "peak", "mb", "peakmb", "additive", "year"]
-        for i, prefix in enumerate(range_prefixes):
-            queryparser.add_rangeprocessor(xapian.NumberRangeProcessor(i, prefix + ":"))
-
-        # Add field processors for location shorthands.
-        species_shorthands = {"Hs": "human",
-                              "Mm": "mouse"}
-        for shorthand, species in species_shorthands.items():
-            field_processors = [partial(parse_location_field,
-                                        xapian.Query(species_prefix + species),
-                                        chromosome_prefix,
-                                        range_prefixes.index("mb"),
-                                        Just)]
-            # If human and synteny is requested, add liftover.
-            # With synteny search, we search for the same gene sequences
-            # across different species. But, the same gene sequences may be
-            # present in very different chromosomal positions in different
-            # species. So, we first liftover.
-            if shorthand == "Hs" and synteny:
-                chain_files = {"mouse": "hg19ToMm10-chains.over.chain.gz"}
-                for lifted_species, chain_file in chain_files.items():
-                    field_processors.append(
-                        partial(parse_location_field,
-                                xapian.Query(species_prefix + lifted_species),
-                                chromosome_prefix,
-                                range_prefixes.index("mb"),
-                                partial(liftover_interval,
-                                        synteny_files_directory / chain_file)))
-            queryparser.add_boolean_prefix(
-                shorthand,
-                FieldProcessor(field_processor_or(*field_processors)))
-        return queryparser
-
-    return remove_synteny_field(
-        synteny_prefix,
-        make_query_parser(is_synteny_on(synteny_prefix,
-                                        make_query_parser(False).parse_query(query)))
-        .parse_query(query))
+    queryparser = xapian.QueryParser()
+    queryparser.set_stemmer(xapian.Stem("en"))
+    queryparser.set_stemming_strategy(queryparser.STEM_SOME)
+    species_prefix = "XS"
+    chromosome_prefix = "XC"
+    queryparser.add_boolean_prefix("author", "A")
+    queryparser.add_boolean_prefix("species", species_prefix)
+    queryparser.add_boolean_prefix("group", "XG")
+    queryparser.add_boolean_prefix("tissue", "XI")
+    queryparser.add_boolean_prefix("dataset", "XDS")
+    queryparser.add_boolean_prefix("symbol", "XY")
+    queryparser.add_boolean_prefix("chr", chromosome_prefix)
+    queryparser.add_boolean_prefix("peakchr", "XPC")
+    queryparser.add_prefix("description", "XD")
+    range_prefixes = ["mean", "peak", "mb", "peakmb", "additive", "year"]
+    for i, prefix in enumerate(range_prefixes):
+        queryparser.add_rangeprocessor(xapian.NumberRangeProcessor(i, prefix + ":"))
+
+    # Add field processors for synteny triplets.
+    species_shorthands = {"Hs": "human",
+                          "Mm": "mouse"}
+    for shorthand, species in species_shorthands.items():
+        field_processors = [partial(parse_location_field,
+                                    xapian.Query(species_prefix + species),
+                                    chromosome_prefix,
+                                    range_prefixes.index("mb"),
+                                    Just)]
+        # With synteny search, we search for the same gene sequences
+        # across different species. But, the same gene sequences may be
+        # present in very different chromosomal positions in different
+        # species. So, we first liftover.
+        # TODO: Implement liftover and synteny search for species other than
+        # human.
+        if shorthand == "Hs":
+            chain_files = {"mouse": "hg19ToMm10-chains.over.chain.gz"}
+            for lifted_species, chain_file in chain_files.items():
+                field_processors.append(
+                    partial(parse_location_field,
+                            xapian.Query(species_prefix + lifted_species),
+                            chromosome_prefix,
+                            range_prefixes.index("mb"),
+                            partial(liftover_interval,
+                                    synteny_files_directory / chain_file)))
+        queryparser.add_boolean_prefix(
+            shorthand,
+            FieldProcessor(field_processor_or(*field_processors)))
+    return queryparser.parse_query(query)
 
 
 @search.route("/")