about summary refs log tree commit diff
diff options
context:
space:
mode:
author Arun Isaac 2023-01-15 17:36:30 +0000
committer Arun Isaac 2023-01-18 01:15:47 +0000
commit d4af1981e80614b4e1b8c1570b5d80d9bfbe6b67 (patch)
tree 72c2965434709b37beb076c13f1a94d51c7a03c3
parent e4f572c7e2b4a553ec5231a9d8440c0b39bee5e3 (diff)
download genenetwork3-d4af1981e80614b4e1b8c1570b5d80d9bfbe6b67.tar.gz
Support location shorthands in search queries.
* gn3/api/search.py: Import partial and reduce from functools. Import Callable
from typing.
(ChromosomalPosition, ChromosomalInterval, FieldProcessor): New classes.
(apply_si_suffix, combine_queries, parse_location_field, interval_start,
interval_end): New functions.
(parse_query): Add field processors for location shorthands.
-rw-r--r-- gn3/api/search.py 101
1 files changed, 99 insertions, 2 deletions
diff --git a/gn3/api/search.py b/gn3/api/search.py
index 3d8cbd0..83951a1 100644
--- a/gn3/api/search.py
+++ b/gn3/api/search.py
@@ -1,9 +1,13 @@
 """Search using Xapian index."""
 
+from collections import namedtuple
 import json
+from functools import partial, reduce
+from typing import Callable
 import urllib.parse
 
 from flask import abort, Blueprint, current_app, jsonify, request
+from pymonad.maybe import Just, Maybe, Nothing
 import xapian
 
 from gn3.monads import MonadicDict
@@ -11,24 +15,117 @@ from gn3.db_utils import xapian_database
 
 search = Blueprint("search", __name__)
 
+ChromosomalPosition = namedtuple("ChromosomalPosition", "chromosome position")
+ChromosomalInterval = namedtuple("ChromosomalInterval", "chromosome start end")
+FieldProcessorFunction = Callable[[str], xapian.Query]
+
+
+def interval_start(interval: ChromosomalInterval) -> Maybe[ChromosomalPosition]:
+    """Return start of a ChromosomalInterval as a ChromosomalPosition."""
+    return interval.start.map(lambda start: ChromosomalPosition(interval.chromosome, start))
+
+
+def interval_end(interval: ChromosomalInterval) -> Maybe[ChromosomalPosition]:
+    """Return end of a ChromosomalInterval as a ChromosomalPosition."""
+    return interval.end.map(lambda end: ChromosomalPosition(interval.chromosome, end))
+
+
+def combine_queries(operator: int, *queries: xapian.Query) -> xapian.Query:
+    """Combine xapian queries using operator."""
+    return reduce(partial(xapian.Query, operator), queries)
+
+
+class FieldProcessor(xapian.FieldProcessor):
+    """
+    Field processor for use in a xapian query parser.
+
+    This class allows us to create any field processor without creating a
+    separate class for each. To create a field processor, you only have to
+    pass FieldProcessor a function. This function may be a closure. All
+    additional state required by the field processor is contained in the
+    lexical environment of the closure.
+    """
+    def __init__(self, proc: FieldProcessorFunction) -> None:
+        super().__init__()
+        self.proc = proc
+    def __call__(self, query: str) -> xapian.Query:
+        return self.proc(query)
+
+
+def parse_range(range_string: str) -> tuple[Maybe[str], Maybe[str]]:
+    """Parse xapian range strings such as start..end."""
+    start, end = range_string.split("..")
+    return (Nothing if start == "" else Just(start),
+            Nothing if end == "" else Just(end))
+
+
+def apply_si_suffix(location: str) -> int:
+    """Apply SI suffixes kilo, mega, giga and convert to bases."""
+    suffixes = {"k": 3, "m": 6, "g": 9}
+    return int(float(location[:-1])*10**suffixes.get(location[-1].lower(), 0))
+
+
+def parse_location_field(species: str, species_prefix: str,
+                         chromosome_prefix: str, location_slot: int,
+                         query: bytes) -> xapian.Query:
+    """Parse location shorthands and return a xapian query.
+
+    Location shorthands compress species, chromosome and position into a
+    single field. e.g., Hs:chr2:1M..1.2M
+    """
+    def split_query(query: str) -> ChromosomalInterval:
+        """Split query into chromosome and location tuple."""
+        chromosome, location = query.lower().split(":")
+        if not chromosome.startswith("chr"):
+            raise ValueError
+        return ChromosomalInterval(chromosome.removeprefix("chr"),
+                                   *[location.map(apply_si_suffix)
+                                     for location in parse_range(location)])
+
+
+    try:
+        interval = split_query(query.decode("utf-8"))
+    except ValueError:
+        return xapian.Query(xapian.Query.OP_INVALID)
+    return combine_queries(xapian.Query.OP_AND,
+                           xapian.Query(species_prefix + species),
+                           xapian.Query(chromosome_prefix + interval.chromosome),
+                           xapian.NumberRangeProcessor(location_slot)
+                           (interval.start.maybe("", str),
+                            interval.end.maybe("", str)))
+
 
 def parse_query(query: str):
     """Parse search query using GeneNetwork specific field processors."""
     queryparser = xapian.QueryParser()
     queryparser.set_stemmer(xapian.Stem("en"))
     queryparser.set_stemming_strategy(queryparser.STEM_SOME)
+    species_prefix = "XS"
+    chromosome_prefix = "XC"
     queryparser.add_boolean_prefix("author", "A")
-    queryparser.add_boolean_prefix("species", "XS")
+    queryparser.add_boolean_prefix("species", species_prefix)
     queryparser.add_boolean_prefix("group", "XG")
     queryparser.add_boolean_prefix("tissue", "XI")
     queryparser.add_boolean_prefix("dataset", "XDS")
     queryparser.add_boolean_prefix("symbol", "XY")
-    queryparser.add_boolean_prefix("chr", "XC")
+    queryparser.add_boolean_prefix("chr", chromosome_prefix)
     queryparser.add_boolean_prefix("peakchr", "XPC")
     queryparser.add_prefix("description", "XD")
     range_prefixes = ["mean", "peak", "mb", "peakmb", "additive", "year"]
     for i, prefix in enumerate(range_prefixes):
         queryparser.add_rangeprocessor(xapian.NumberRangeProcessor(i, prefix + ":"))
+
+    # Add field processors for location shorthands.
+    species_shorthands = {"Hs": "human",
+                          "Mm": "mouse",
+                          "Rn": "rat"}
+    for shorthand, species in species_shorthands.items():
+        queryparser.add_boolean_prefix(
+            shorthand, FieldProcessor(partial(parse_location_field,
+                                              species,
+                                              species_prefix,
+                                              chromosome_prefix,
+                                              range_prefixes.index("mb"))))
     return queryparser.parse_query(query)