diff options
Diffstat (limited to 'gn3/api/search.py')
-rw-r--r-- | gn3/api/search.py | 101 |
1 files changed, 99 insertions, 2 deletions
diff --git a/gn3/api/search.py b/gn3/api/search.py index 3d8cbd0..83951a1 100644 --- a/gn3/api/search.py +++ b/gn3/api/search.py @@ -1,9 +1,13 @@ """Search using Xapian index.""" +from collections import namedtuple import json +from functools import partial, reduce +from typing import Callable import urllib.parse from flask import abort, Blueprint, current_app, jsonify, request +from pymonad.maybe import Just, Maybe, Nothing import xapian from gn3.monads import MonadicDict @@ -11,24 +15,117 @@ from gn3.db_utils import xapian_database search = Blueprint("search", __name__) +ChromosomalPosition = namedtuple("ChromosomalPosition", "chromosome position") +ChromosomalInterval = namedtuple("ChromosomalInterval", "chromosome start end") +FieldProcessorFunction = Callable[[str], xapian.Query] + + +def interval_start(interval: ChromosomalInterval) -> Maybe[ChromosomalPosition]: + """Return start of a ChromosomalInterval as a ChromosomalPosition.""" + return interval.start.map(lambda start: ChromosomalPosition(interval.chromosome, start)) + + +def interval_end(interval: ChromosomalInterval) -> Maybe[ChromosomalPosition]: + """Return end of a ChromosomalInterval as a ChromosomalPosition.""" + return interval.end.map(lambda end: ChromosomalPosition(interval.chromosome, end)) + + +def combine_queries(operator: int, *queries: xapian.Query) -> xapian.Query: + """Combine xapian queries using operator.""" + return reduce(partial(xapian.Query, operator), queries) + + +class FieldProcessor(xapian.FieldProcessor): + """ + Field processor for use in a xapian query parser. + + This class allows us to create any field processor without creating a + separate class for each. To create a field processor, you only have to + pass FieldProcessor a function. This function may be a closure. All + additional state required by the field processor is contained in the + lexical environment of the closure. + """ + def __init__(self, proc: FieldProcessorFunction) -> None: + super().__init__() + self.proc = proc + def __call__(self, query: str) -> xapian.Query: + return self.proc(query) + + +def parse_range(range_string: str) -> tuple[Maybe[str], Maybe[str]]: + """Parse xapian range strings such as start..end.""" + start, end = range_string.split("..") + return (Nothing if start == "" else Just(start), + Nothing if end == "" else Just(end)) + + +def apply_si_suffix(location: str) -> int: + """Apply SI suffixes kilo, mega, giga and convert to bases.""" + suffixes = {"k": 3, "m": 6, "g": 9} + return int(float(location[:-1])*10**suffixes.get(location[-1].lower(), 0)) + + +def parse_location_field(species: str, species_prefix: str, + chromosome_prefix: str, location_slot: int, + query: bytes) -> xapian.Query: + """Parse location shorthands and return a xapian query. + + Location shorthands compress species, chromosome and position into a + single field. e.g., Hs:chr2:1M..1.2M + """ + def split_query(query: str) -> ChromosomalInterval: + """Split query into chromosome and location tuple.""" + chromosome, location = query.lower().split(":") + if not chromosome.startswith("chr"): + raise ValueError + return ChromosomalInterval(chromosome.removeprefix("chr"), + *[location.map(apply_si_suffix) + for location in parse_range(location)]) + + + try: + interval = split_query(query.decode("utf-8")) + except ValueError: + return xapian.Query(xapian.Query.OP_INVALID) + return combine_queries(xapian.Query.OP_AND, + xapian.Query(species_prefix + species), + xapian.Query(chromosome_prefix + interval.chromosome), + xapian.NumberRangeProcessor(location_slot) + (interval.start.maybe("", str), + interval.end.maybe("", str))) + def parse_query(query: str): """Parse search query using GeneNetwork specific field processors.""" queryparser = xapian.QueryParser() queryparser.set_stemmer(xapian.Stem("en")) queryparser.set_stemming_strategy(queryparser.STEM_SOME) + species_prefix = "XS" + chromosome_prefix = "XC" queryparser.add_boolean_prefix("author", "A") - queryparser.add_boolean_prefix("species", "XS") + queryparser.add_boolean_prefix("species", species_prefix) queryparser.add_boolean_prefix("group", "XG") queryparser.add_boolean_prefix("tissue", "XI") queryparser.add_boolean_prefix("dataset", "XDS") queryparser.add_boolean_prefix("symbol", "XY") - queryparser.add_boolean_prefix("chr", "XC") + queryparser.add_boolean_prefix("chr", chromosome_prefix) queryparser.add_boolean_prefix("peakchr", "XPC") queryparser.add_prefix("description", "XD") range_prefixes = ["mean", "peak", "mb", "peakmb", "additive", "year"] for i, prefix in enumerate(range_prefixes): queryparser.add_rangeprocessor(xapian.NumberRangeProcessor(i, prefix + ":")) + + # Add field processors for location shorthands. + species_shorthands = {"Hs": "human", + "Mm": "mouse", + "Rn": "rat"} + for shorthand, species in species_shorthands.items(): + queryparser.add_boolean_prefix( + shorthand, FieldProcessor(partial(parse_location_field, + species, + species_prefix, + chromosome_prefix, + range_prefixes.index("mb")))) return queryparser.parse_query(query) |