aboutsummaryrefslogtreecommitdiff
path: root/gn3/api/search.py
blob: 3d8cbd080124bc28991fbe248ad0db5ca1a863bc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
"""Search using Xapian index."""

import json
import urllib.parse

from flask import abort, Blueprint, current_app, jsonify, request
import xapian

from gn3.monads import MonadicDict
from gn3.db_utils import xapian_database

search = Blueprint("search", __name__)


def parse_query(query: str):
    """Turn a GeneNetwork search string into a Xapian query.

    A fresh QueryParser is configured with an English stemmer, the
    field prefixes recognised in search strings, and one numeric range
    processor per range field. The result of parsing `query` with that
    parser is returned.
    """
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("en"))
    parser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)

    # Field name as typed in the search string -> term prefix, for the
    # fields registered as boolean prefixes.
    boolean_fields = (
        ("author", "A"),
        ("species", "XS"),
        ("group", "XG"),
        ("tissue", "XI"),
        ("dataset", "XDS"),
        ("symbol", "XY"),
        ("chr", "XC"),
        ("peakchr", "XPC"),
    )
    for field, term_prefix in boolean_fields:
        parser.add_boolean_prefix(field, term_prefix)

    # "description" is the only field registered with add_prefix rather
    # than add_boolean_prefix.
    parser.add_prefix("description", "XD")

    # Each range field gets a NumberRangeProcessor whose first argument
    # is the field's position in this tuple, so the order here matters.
    range_fields = ("mean", "peak", "mb", "peakmb", "additive", "year")
    slot = 0
    for field in range_fields:
        parser.add_rangeprocessor(
            xapian.NumberRangeProcessor(slot, f"{field}:"))
        slot += 1

    return parser.parse_query(query)


@search.route("/")
def search_results():
    """Search Xapian index and return a list of results.

    Query-string parameters:
      type     -- document type used to filter matches (default "gene").
      query    -- search string handed to parse_query (default "").
      page     -- 1-based page number (default 1).
      per_page -- number of results per page (default 100, at most 10000).

    Returns a JSON array holding the stored data of each matching
    document. For "phenotype" searches every entry additionally gets a
    "pubmed_link" derived from its "pubmed_id".

    Aborts with 404 when page is below 1, and with 400 when per_page is
    negative or above the maximum, or when the query string cannot be
    parsed.
    """
    args = request.args
    search_type = args.get("type", default="gene")
    querystring = args.get("query", default="")
    page = args.get("page", default=1, type=int)
    if page < 1:
        abort(404, description="Requested page does not exist")
    results_per_page = args.get("per_page", default=100, type=int)
    maximum_results_per_page = 10000
    # A negative count cannot be passed on to get_mset; reject it here as
    # a client error instead of letting it surface as a server error.
    if results_per_page < 0:
        abort(400, description="Requested a negative number of search results")
    if results_per_page > maximum_results_per_page:
        abort(400, description="Requested too many search results")

    # The query string is user input; a malformed one is the client's
    # mistake, so report it as 400 rather than an unhandled exception.
    try:
        query = parse_query(querystring)
    except xapian.QueryParserError as exc:
        abort(400, description=f"Invalid search query: {exc.get_msg()}")

    traits = []
    # Offset of the first result on the requested page.
    first_result = (page - 1) * results_per_page
    # pylint: disable=invalid-name
    with xapian_database(current_app.config["XAPIAN_DB_PATH"]) as db:
        enquire = xapian.Enquire(db)
        # Filter documents by type.
        enquire.set_query(xapian.Query(xapian.Query.OP_FILTER,
                                       query,
                                       xapian.Query(f"XT{search_type}")))
        for xapian_match in enquire.get_mset(first_result, results_per_page):
            trait = MonadicDict(json.loads(xapian_match.document.get_data()))
            # Add PubMed link to phenotype search results.
            if search_type == "phenotype":
                trait["pubmed_link"] = trait["pubmed_id"].map(
                    lambda pubmed_id: "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?"
                    + urllib.parse.urlencode({"cmd": "Retrieve",
                                              "db": "PubMed",
                                              "list_uids": pubmed_id,
                                              "dopt": "Abstract"}))
            traits.append(trait.data)
    return jsonify(traits)