-rw-r--r--  gn3/api/search.py  77
1 file changed, 64 insertions, 13 deletions
diff --git a/gn3/api/search.py b/gn3/api/search.py
index aa844ee..755f975 100644
--- a/gn3/api/search.py
+++ b/gn3/api/search.py
@@ -21,7 +21,8 @@ search = Blueprint("search", __name__)
 
 ChromosomalPosition = namedtuple("ChromosomalPosition", "chromosome position")
 ChromosomalInterval = namedtuple("ChromosomalInterval", "chromosome start end")
-IntervalLiftoverFunction = Callable[[ChromosomalInterval], Maybe[ChromosomalInterval]]
+IntervalLiftoverFunction = Callable[[
+    ChromosomalInterval], Maybe[ChromosomalInterval]]
 FieldProcessorFunction = Callable[[str], xapian.Query]
 
 
@@ -50,9 +51,11 @@ class FieldProcessor(xapian.FieldProcessor):
     additional state required by the field processor is contained in the
     lexical environment of the closure.
     """
+
     def __init__(self, proc: FieldProcessorFunction) -> None:
         super().__init__()
         self.proc = proc
+
     def __call__(self, query: str) -> xapian.Query:
         return self.proc(query)
 
@@ -83,7 +86,7 @@ def liftover(chain_file: Path, position: ChromosomalPosition) -> Maybe[Chromosom
             if line.startswith('chain'):
                 (target, query, query_size, query_strand) = split_chain_line(line)
                 if (target.chromosome == position.chromosome
-                    and target.start <= position.position < target.end):
+                        and target.start <= position.position < target.end):
                     transformed_position = query.start + position.position - target.start
                     if query_strand == '-':
                         transformed_position = query_size - 1 - transformed_position
@@ -113,8 +116,9 @@ def liftover_interval(chain_file: str, interval: ChromosomalInterval) -> Chromos
             # practice. But, in this case, there doesn't seem to be a way
             # around.
             .map(lambda interval: ChromosomalInterval(interval.chromosome,
-                                              Just(min(interval.start.value, interval.end.value)),
-                                              Just(max(interval.start.value, interval.end.value)))
+                                                      Just(
+                                                          min(interval.start.value, interval.end.value)),
+                                                      Just(max(interval.start.value, interval.end.value)))
                  if interval.start.is_just() and interval.end.is_just()
                  else interval))
 
@@ -139,7 +143,7 @@ def parse_position(spec: str) -> tuple[Maybe[int], Maybe[int]]:
     """Parse position specifiation converting point locations to ranges."""
     # Range
     if ".." in spec:
-        return tuple(limit.map(apply_si_suffix) # type: ignore
+        return tuple(limit.map(apply_si_suffix)  # type: ignore
                      for limit in parse_range(spec))
     # If point location, assume +/- 50 kbases on either side.
     else:
@@ -152,9 +156,9 @@ def parse_position_field(location_slot: int, query: bytes) -> xapian.Query:
     """Parse position and return a xapian query."""
     start, end = parse_position(query.decode("utf-8"))
     # TODO: Convert the xapian index to use bases instead of megabases.
-    to_megabases = lambda x: str(Decimal(x)/10**6)
+    def to_megabases(x): return str(Decimal(x)/10**6)
     return (xapian.NumberRangeProcessor(location_slot)
-            (start.maybe("", to_megabases), end.maybe("", to_megabases))) # type: ignore
+            (start.maybe("", to_megabases), end.maybe("", to_megabases)))  # type: ignore
 
 
 def parse_location_field(species_query: xapian.Query,
@@ -176,10 +180,11 @@ def parse_location_field(species_query: xapian.Query,
 
     def make_query(interval: ChromosomalInterval) -> xapian.Query:
         # TODO: Convert the xapian index to use bases instead of megabases.
-        to_megabases = lambda x: str(Decimal(x)/10**6)
+        def to_megabases(x): return str(Decimal(x)/10**6)
         return combine_queries(xapian.Query.OP_AND,
                                species_query,
-                               xapian.Query(chromosome_prefix + interval.chromosome),
+                               xapian.Query(chromosome_prefix +
+                                            interval.chromosome),
                                xapian.NumberRangeProcessor(location_slot)
                                (interval.start.maybe("", to_megabases),
                                 interval.end.maybe("", to_megabases)))
@@ -213,12 +218,14 @@ def parse_query(synteny_files_directory: Path, query: str):
     for i, prefix in enumerate(range_prefixes):
         # Treat position specially since it needs its own field processor.
         if prefix == "position":
-            position_field_processor = FieldProcessor(partial(parse_position_field, i))
+            position_field_processor = FieldProcessor(
+                partial(parse_position_field, i))
             queryparser.add_boolean_prefix(prefix, position_field_processor)
             # Alias the position prefix with pos.
             queryparser.add_boolean_prefix("pos", position_field_processor)
         else:
-            queryparser.add_rangeprocessor(xapian.NumberRangeProcessor(i, prefix + ":"))
+            queryparser.add_rangeprocessor(
+                xapian.NumberRangeProcessor(i, prefix + ":"))
 
     # Add field processors for synteny triplets.
     species_shorthands = {"Hs": "human",
@@ -265,10 +272,13 @@ def search_results():
     if results_per_page > maximum_results_per_page:
         abort(400, description="Requested too many search results")
 
-    query = parse_query(Path(current_app.config["DATA_DIR"]) / "synteny", querystring)
+    query = parse_query(
+        Path(current_app.config["DATA_DIR"]) / "synteny", querystring)
     traits = []
     # pylint: disable=invalid-name
-    with xapian_database(current_app.config["XAPIAN_DB_PATH"]) as db:
+
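+    # NOTE: hardcoded index path, presumably for local testing; this
+    # previously came from current_app.config["XAPIAN_DB_PATH"].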
+    xapian_db_path = "/tmp/xapian"
+    with xapian_database(xapian_db_path) as db:
         enquire = xapian.Enquire(db)
         # Filter documents by type.
         enquire.set_query(xapian.Query(xapian.Query.OP_FILTER,
@@ -286,3 +296,44 @@ def search_results():
                                               "dopt": "Abstract"}))
             traits.append(trait.data)
     return jsonify(traits)
+
+
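+# Length bounds for the edge n-grams (prefixes) generated for autocomplete
+# matching.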
+NGRAM_MIN_LENGTH = 2
+NGRAM_MAX_LENGTH = 15
+
+
+def edge_ngram_terms(values, min_len=NGRAM_MIN_LENGTH, max_len=NGRAM_MAX_LENGTH):
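+    """Return the edge n-grams (prefixes) of every word in a string.
+
+    Illustrative example, using the default length limits:
+    edge_ngram_terms("brain") -> ["br", "bra", "brai", "brain"].
+    """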
+    prefixes = []
+    for word in values.split():
+        # Every prefix of the word whose length lies between min_len and
+        # max_len (inclusive) counts as an edge n-gram.
+        for i in range(min_len, min(len(word), max_len) + 1):
+            prefixes.append(word[:i])
+    return prefixes
+
+
+@search.route("/autocomplete")
+def autocomplete():
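+    """Return autocomplete suggestions for a search query.
+
+    Illustrative usage, with the path relative to wherever this blueprint
+    is mounted: GET /autocomplete?query=bra -> {"autocomplete": [...]}.
+    """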
+    # Experimental.
+    querystring = request.args.get("query", default="")
+    # TODO: read the xapian index path from
+    # current_app.config["XAPIAN_DB_PATH"] instead of hardcoding it.
+    with xapian_database("/tmp/xapian") as db:
+        queryparser = xapian.QueryParser()
+        queryparser.set_stemmer(xapian.Stem('en'))
+        queryparser.set_stemming_strategy(queryparser.STEM_NONE)
+        # Trial field prefix so the query parser understands "species:"
+        # queries; prefixes for the other fields still need to be added.
+        queryparser.add_prefix("species", "XS")
+        queries = queryparser.parse_query(querystring)
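+        # Collect the distinct terms the query parser extracted from the
+        # query string.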
+        init_term = queries._get_terms_begin()
+        filtered = set()
+        while init_term != queries._get_terms_end():
+            filtered.add(init_term.get_term().decode())
+            next(init_term)
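+        # Match every edge n-gram of the extracted terms against the terms
+        # stored in the index.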
+        matched = set()
+        for ngram in edge_ngram_terms(" ".join(filtered)):
+            matched.update(term.term.decode()
+                           for term in db.allterms(ngram))
+        return jsonify({
+            "autocomplete": sorted(matched, key=len)
+        })