about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Arun Isaac, 2022-09-21 15:11:00 +0530
committer: Arun Isaac, 2022-09-26 13:39:45 +0530
commit: 1b90cf71577f414c71dc0190940f4234732cdd28 (patch)
tree: 103effeaf4455cb1b532b81abf598196313eba73
parent: 73327a58707d7321e2d4c616da442e598ed13c57 (diff)
download: genenetwork2-1b90cf71577f414c71dc0190940f4234732cdd28.tar.gz
Implement global search using xapian.
* wqflask/wqflask/gsearch.py: Import Just, Maybe and Nothing from pymonad.maybe; curry from pymonad.tools; xapian; MonadicDict from utility.monads; xapian_database from wqflask.database. Do not import datetime; database_connection from wqflask.database; base.data_set; base.trait; db; utility.type_checking; utility.
(is_permitted_for_listing): New function.
(GSearch.__init__): Reimplement using xapian.
* wqflask/wqflask/templates/gsearch_gene.html (block js): Accept dictionaries with missing keys, replacing them with "N/A".
* wqflask/wqflask/templates/gsearch_pheno.html: Likewise; test for the presence of the "pubmed_id" key instead of comparing it to "N/A".
-rw-r--r-- wqflask/wqflask/gsearch.py 391
-rw-r--r-- wqflask/wqflask/templates/gsearch_gene.html 29
-rw-r--r-- wqflask/wqflask/templates/gsearch_pheno.html 22
3 files changed, 116 insertions, 326 deletions
diff --git a/wqflask/wqflask/gsearch.py b/wqflask/wqflask/gsearch.py
index b2224831..861d8c9d 100644
--- a/wqflask/wqflask/gsearch.py
+++ b/wqflask/wqflask/gsearch.py
@@ -1,319 +1,90 @@
import json
-import datetime as dt
from types import SimpleNamespace
-from wqflask.database import database_connection
-from base.data_set import create_dataset
-from base.trait import create_trait
-from db import webqtlDatabaseFunction
+from pymonad.maybe import Just, Maybe, Nothing
+from pymonad.tools import curry
+import xapian
from base import webqtlConfig
-
-from utility import hmac
-
from utility.authentication_tools import check_resource_availability
-from utility.type_checking import is_float, is_int, is_str, get_float, get_int, get_string
+from utility.monads import MonadicDict
+from wqflask.database import xapian_database
+
+
+def is_permitted_for_listing(trait, search_type):
+ """Check if it is permissible to list trait in search results."""
+ dataset_type = {"gene": "ProbeSet", "phenotype": "Publish"}
+ dataset_ob = (Maybe.apply(curry(2, lambda id, species:
+ SimpleNamespace(id=id,
+ type=dataset_type[search_type],
+ name=trait["dataset"],
+ species=species)))
+ .to_arguments(trait["dataset_id"], trait["species"]))
+ return (Maybe.apply(curry(2, check_resource_availability))
+ .to_arguments(dataset_ob, trait["name"])
+ .map(lambda permissions:
+ ((isinstance(permissions["data"], list)) and ("view" in permissions["data"]))
+ or (permissions["data"] != 'no-access'))
+ .maybe(False, lambda x: x))
class GSearch:
-
- def __init__(self, kw):
- assert('type' in kw)
- assert('terms' in kw)
-
- self.type = kw['type']
- self.terms = kw['terms']
- assert(is_str(self.type))
-
- if self.type == "gene":
- _result = ()
- with database_connection() as conn, conn.cursor() as cursor:
- cursor.execute(
- "SELECT Species.`Name` AS species_name, "
- "InbredSet.`Name` AS inbredset_name, "
- "Tissue.`Name` AS tissue_name, "
- "ProbeSetFreeze.Name AS probesetfreeze_name, "
- "ProbeSetFreeze.FullName AS "
- "probesetfreeze_fullname, ProbeSet.Name AS "
- "probeset_name, ProbeSet.Symbol AS "
- "probeset_symbol, CAST(ProbeSet.`description` AS BINARY) "
- "AS probeset_description, ProbeSet.Chr AS chr, "
- "ProbeSet.Mb AS mb, ProbeSetXRef.Mean AS mean, "
- "ProbeSetXRef.LRS AS lrs, ProbeSetXRef.`Locus` "
- "AS locus, ProbeSetXRef.`pValue` AS pvalue, "
- "ProbeSetXRef.`additive` AS additive, "
- "ProbeSetFreeze.Id AS probesetfreeze_id, "
- "Geno.Chr as geno_chr, Geno.Mb as geno_mb "
- "FROM Species INNER JOIN InbredSet ON "
- "InbredSet.`SpeciesId`=Species.`Id` "
- "INNER JOIN ProbeFreeze ON "
- "ProbeFreeze.InbredSetId=InbredSet.`Id` "
- "INNER JOIN Tissue ON ProbeFreeze.`TissueId`=Tissue.`Id` "
- "INNER JOIN ProbeSetFreeze ON "
- "ProbeSetFreeze.ProbeFreezeId=ProbeFreeze.Id "
- "INNER JOIN ProbeSetXRef ON "
- "ProbeSetXRef.ProbeSetFreezeId=ProbeSetFreeze.Id "
- "INNER JOIN ProbeSet ON "
- "ProbeSet.Id = ProbeSetXRef.ProbeSetId "
- "LEFT JOIN Geno ON ProbeSetXRef.Locus = Geno.Name "
- "AND Geno.SpeciesId = Species.Id WHERE "
- "( MATCH "
- "(ProbeSet.Name, ProbeSet.description, ProbeSet.symbol, "
- "ProbeSet.alias, ProbeSet.GenbankId, ProbeSet.UniGeneId, "
- "ProbeSet.Probe_Target_Description) "
- "AGAINST (%s IN BOOLEAN MODE) ) "
- "AND ProbeSetFreeze.confidentiality < 1 AND "
- "ProbeSetFreeze.public > 0 ORDER BY species_name, "
- "inbredset_name, tissue_name, probesetfreeze_name, "
- "probeset_name LIMIT 6000", (self.terms,)
- )
- _result = cursor.fetchall()
-
- trait_list = []
- dataset_to_permissions = {}
- for i, line in enumerate(_result):
- this_trait = {}
- this_trait['index'] = i + 1
- this_trait['name'] = line[5]
- this_trait['dataset'] = line[3]
- this_trait['dataset_fullname'] = line[4]
- this_trait['hmac'] = hmac.data_hmac(
- '{}:{}'.format(line[5], line[3]))
- this_trait['species'] = line[0]
- this_trait['group'] = line[1]
- this_trait['tissue'] = line[2]
- this_trait['symbol'] = "N/A"
- if line[6]:
- this_trait['symbol'] = line[6]
- this_trait['description'] = "N/A"
- if line[7]:
- this_trait['description'] = line[7].decode(
- 'utf-8', 'replace')
- this_trait['location_repr'] = "N/A"
- if (line[8] != "NULL" and line[8] != "") and (line[9] != 0):
- this_trait['location_repr'] = 'Chr%s: %.6f' % (
- line[8], float(line[9]))
-
- this_trait['LRS_score_repr'] = "N/A"
- this_trait['additive'] = "N/A"
- this_trait['mean'] = "N/A"
-
- if line[11] != "" and line[11] != None:
- this_trait['LRS_score_repr'] = f"{float(line[11]) / 4.61:.1f}"
- if line[14] != "" and line[14] != None:
- this_trait['additive'] = f"{line[14]:.3f}"
- if line[10] != "" and line[10] != None:
- this_trait['mean'] = f"{line[10]:.3f}"
-
- locus_chr = line[16]
- locus_mb = line[17]
-
- max_lrs_text = "N/A"
- if locus_chr and locus_mb:
- max_lrs_text = f"Chr{locus_chr}: {locus_mb}"
- this_trait['max_lrs_text'] = max_lrs_text
-
- this_trait['additive'] = "N/A"
- if line[14] != "" and line[14] != None:
- this_trait['additive'] = '%.3f' % line[14]
- this_trait['dataset_id'] = line[15]
-
- dataset_ob = SimpleNamespace(
- id=this_trait["dataset_id"], type="ProbeSet", name=this_trait["dataset"], species=this_trait["species"])
- if dataset_ob.id not in dataset_to_permissions:
- permissions = check_resource_availability(dataset_ob)
- dataset_to_permissions[dataset_ob.id] = permissions
- else:
- pemissions = dataset_to_permissions[dataset_ob.id]
- if type(permissions['data']) is list:
- if "view" not in permissions['data']:
- continue
- else:
- if permissions['data'] == 'no-access':
- continue
-
- trait_list.append(this_trait)
-
- self.trait_count = len(trait_list)
- self.trait_list = trait_list
-
- self.header_fields = ['Index',
- 'Record',
- 'Species',
- 'Group',
- 'Tissue',
- 'Dataset',
- 'Symbol',
- 'Description',
- 'Location',
- 'Mean',
- '-logP',
- '-logP Location',
- 'Additive Effect']
-
- self.header_data_names = [
- 'index',
- 'name',
- 'species',
- 'group',
- 'tissue',
- 'dataset_fullname',
- 'symbol',
- 'description',
- 'location_repr',
- 'mean',
- 'LRS_score_repr',
- 'max_lrs_text',
- 'additive',
- ]
-
- elif self.type == "phenotype":
- search_term = self.terms
- group_clause = ""
- if "_" in self.terms:
- if len(self.terms.split("_")[0]) == 3:
- search_term = self.terms.split("_")[1]
- group_clause = "AND InbredSet.`InbredSetCode` = '{}'".format(
- self.terms.split("_")[0])
- _result = ()
- with database_connection() as conn, conn.cursor() as cursor:
- cursor.execute(
- "SELECT Species.`Name`, InbredSet.`Name`, "
- "PublishFreeze.`Name`, PublishFreeze.`FullName`, "
- "PublishXRef.`Id`, CAST(Phenotype.`Pre_publication_description` "
- "AS BINARY), CAST(Phenotype.`Post_publication_description` "
- "AS BINARY), Publication.`Authors`, Publication.`Year`, "
- "Publication.`PubMed_ID`, PublishXRef.`LRS`, "
- "PublishXRef.`additive`, InbredSet.`InbredSetCode`, "
- "PublishXRef.`mean`, PublishFreeze.Id, Geno.Chr as geno_chr, "
- "Geno.Mb as geno_mb FROM Species "
- "INNER JOIN InbredSet ON InbredSet.`SpeciesId`=Species.`Id` "
- "INNER JOIN PublishFreeze ON "
- "PublishFreeze.`InbredSetId`=InbredSet.`Id` "
- "INNER JOIN PublishXRef ON "
- "PublishXRef.`InbredSetId`=InbredSet.`Id` "
- "INNER JOIN Phenotype ON "
- "PublishXRef.`PhenotypeId`=Phenotype.`Id` "
- "INNER JOIN Publication ON "
- "PublishXRef.`PublicationId`=Publication.`Id` "
- "LEFT JOIN Geno ON PublishXRef.Locus = Geno.Name "
- "AND Geno.SpeciesId = Species.Id WHERE "
- "((MATCH (Phenotype.Post_publication_description, "
- "Phenotype.Pre_publication_description, "
- "Phenotype.Pre_publication_abbreviation, "
- "Phenotype.Post_publication_abbreviation, "
- "Phenotype.Lab_code) AGAINST (%s IN BOOLEAN MODE) ) "
- "OR (MATCH (Publication.Abstract, Publication.Title, "
- "Publication.Authors) AGAINST (%s IN BOOLEAN MODE) ) "
- f") {group_clause} ORDER BY Species.`Name`, "
- "InbredSet.`Name`, PublishXRef.`Id` LIMIT 6000",
- ((search_term,)*2)
- )
- _result = cursor.fetchall()
- trait_list = []
- for i, line in enumerate(_result):
- trait_dict = {}
- trait_dict['index'] = i + 1
- trait_dict['name'] = str(line[4])
- if len(str(line[12])) == 3:
- trait_dict['display_name'] = str(
- line[12]) + "_" + trait_dict['name']
- else:
- trait_dict['display_name'] = trait_dict['name']
- trait_dict['dataset'] = line[2]
- trait_dict['dataset_fullname'] = line[3]
- trait_dict['hmac'] = hmac.data_hmac(
- '{}:{}'.format(line[4], line[2]))
- trait_dict['species'] = line[0]
- trait_dict['group'] = line[1]
- if line[9] != None and line[6] != None:
- trait_dict['description'] = line[6].decode(
- 'utf-8', 'replace')
- elif line[5] != None:
- trait_dict['description'] = line[5].decode(
- 'utf-8', 'replace')
- else:
- trait_dict['description'] = "N/A"
- trait_dict['dataset_id'] = line[14]
-
- trait_dict['LRS_score_repr'] = "N/A"
- trait_dict['additive'] = "N/A"
- trait_dict['mean'] = "N/A"
-
- if line[10] != "" and line[10] != None:
- trait_dict['LRS_score_repr'] = f"{float(line[10]) / 4.61:.1f}"
- # Some Max LRS values in the DB are wrongly listed as 0.000, but shouldn't be displayed
- if trait_dict['LRS_score_repr'] == "0.000":
- trait_dict['LRS_score_repr'] = "N/A"
- if line[11] != "" and line[11] != None:
- trait_dict['additive'] = f"{line[11]:.3f}"
- if line[13] != "" and line[13] != None:
- trait_dict['mean'] = f"{line[13]:.3f}"
-
- locus_chr = line[15]
- locus_mb = line[16]
-
- max_lrs_text = "N/A"
- if locus_chr and locus_mb:
- max_lrs_text = f"Chr{locus_chr}: {locus_mb}"
- trait_dict['max_lrs_text'] = max_lrs_text
-
- trait_dict['authors'] = line[7]
-
- trait_dict['authors'] = line[7]
- trait_dict['authors_display'] = trait_dict['authors']
- author_list = trait_dict['authors'].split(",")
- if len(author_list) >= 2:
- trait_dict['authors_display'] = (",").join(author_list[:2]) + ", et al."
-
- trait_dict['year'] = line[8]
- trait_dict['pubmed_text'] = "N/A"
- trait_dict['pubmed_link'] = "N/A"
- if trait_dict['year'].isdigit():
- trait_dict['pubmed_text'] = trait_dict['year']
- if line[9] != "" and line[9] != None:
- trait_dict['pubmed_link'] = webqtlConfig.PUBMEDLINK_URL % line[8]
- if line[12]:
- trait_dict['display_name'] = line[12] + \
- "_" + str(trait_dict['name'])
-
- dataset_ob = SimpleNamespace(id=trait_dict["dataset_id"], type="Publish", species=trait_dict["species"])
- permissions = check_resource_availability(dataset_ob, trait_dict['name'])
- if type(permissions['data']) is list:
- if "view" not in permissions['data']:
- continue
- else:
- if permissions['data'] == 'no-access':
- continue
-
- trait_list.append(trait_dict)
-
- self.trait_count = len(trait_list)
- self.trait_list = trait_list
-
- self.header_fields = ['Index',
- 'Species',
- 'Group',
- 'Record',
- 'Description',
- 'Authors',
- 'Year',
- 'Max LRS',
- 'Max LRS Location',
- 'Additive Effect']
-
- self.header_data_names = [
- 'index',
- 'name',
- 'species',
- 'group',
- 'tissue',
- 'dataset_fullname',
- 'symbol',
- 'description',
- 'location_repr',
- 'mean',
- 'LRS_score_repr',
- 'max_lrs_text',
- 'additive',
- ]
+ def __init__(self, kwargs):
+ if ("type" not in kwargs) or ("terms" not in kwargs):
+ raise ValueError
+ self.type = kwargs["type"]
+ self.terms = kwargs["terms"]
+
+ queryparser = xapian.QueryParser()
+ queryparser.set_stemmer(xapian.Stem("en"))
+ queryparser.set_stemming_strategy(queryparser.STEM_SOME)
+ querystring = self.terms
+ query = queryparser.parse_query(querystring)
+ # FIXME: Handle presentation (that is, formatting strings for
+ # display) in the template rendering, not when retrieving
+ # search results.
+ chr_mb = curry(2, lambda chr, mb: f"Chr{chr}: {mb:.6f}")
+ format3f = lambda x: f"{x:.3f}"
+ hmac = curry(2, lambda dataset, dataset_fullname: f"{dataset_fullname}:{dataset}")
+ self.trait_list = []
+ # pylint: disable=invalid-name
+ with xapian_database() as db:
+ enquire = xapian.Enquire(db)
+ # Filter documents by type.
+ enquire.set_query(xapian.Query(xapian.Query.OP_FILTER,
+ query,
+ xapian.Query(f"XT{self.type}")))
+ for i, trait in enumerate(
+ [trait for xapian_match in enquire.get_mset(0, db.get_doccount())
+ if is_permitted_for_listing(
+ trait := MonadicDict(json.loads(xapian_match.document.get_data())),
+ search_type=self.type)]):
+ trait["index"] = Just(i)
+ trait["location_repr"] = (Maybe.apply(chr_mb)
+ .to_arguments(trait.pop("chr"), trait.pop("mb")))
+ trait["LRS_score_repr"] = trait.pop("lrs").map(format3f)
+ trait["additive"] = trait["additive"].map(format3f)
+ trait["mean"] = trait["mean"].map(format3f)
+ trait["max_lrs_text"] = (Maybe.apply(chr_mb)
+ .to_arguments(trait.pop("geno_chr"), trait.pop("geno_mb")))
+ if self.type == "gene":
+ trait["hmac"] = (Maybe.apply(hmac)
+ .to_arguments(trait["dataset"], trait["dataset_fullname"]))
+ elif self.type == "phenotype":
+ trait["display_name"] = trait["name"]
+ inbredsetcode = trait.pop("inbredsetcode")
+ if inbredsetcode.map(len) == Just(3):
+ trait["display_name"] = (Maybe.apply(lambda inbredsetcode, name:
+ f"{inbredsetcode}_{name}")
+ .to_arguments(inbredsetcode, trait["name"]))
+ trait["hmac"] = (Maybe.apply(hmac)
+ .to_arguments(trait.pop("dataset_fullname"), trait["name"]))
+ trait["authors_display"] = (trait.pop("authors").map(
+ lambda authors:
+ ", ".join(authors[:2] + ["et al."] if len(authors) >=2 else authors)))
+ trait["pubmed_text"] = (trait["year"].bind(
+ lambda year: Just(year) if year.isdigit() else Nothing))
+ trait["pubmed_link"] = (trait["pubmed_id"].map(
+ lambda pubmedid: webqtlConfig.PUBMEDLINK_URL % pubmedid))
+ self.trait_list.append(trait.data)
+ self.trait_count = len(self.trait_list)
diff --git a/wqflask/wqflask/templates/gsearch_gene.html b/wqflask/wqflask/templates/gsearch_gene.html
index 0e96f673..03e5019c 100644
--- a/wqflask/wqflask/templates/gsearch_gene.html
+++ b/wqflask/wqflask/templates/gsearch_gene.html
@@ -93,7 +93,8 @@
'type': "natural",
'width': "30px",
'targets': 1,
- 'data': "index"
+ 'data': "index",
+ 'defaultContent': "N/A"
},
{
'title': "Record",
@@ -111,35 +112,40 @@
'type': "natural",
'width': "60px",
'targets': 3,
- 'data': "species"
+ 'data': "species",
+ 'defaultContent': "N/A"
},
{
'title': "Group",
'type': "natural",
'width': "150px",
'targets': 4,
- 'data': "group"
+ 'data': "group",
+ 'defaultContent': "N/A"
},
{
'title': "Tissue",
'type': "natural",
'width': "150px",
'targets': 5,
- 'data': "tissue"
+ 'data': "tissue",
+ 'defaultContent': "N/A"
},
{
'title': "Dataset",
'type': "natural",
'targets': 6,
'width': "320px",
- 'data': "dataset_fullname"
+ 'data': "dataset_fullname",
+ 'defaultContent': "N/A"
},
{
'title': "Symbol",
'type': "natural",
'width': "60px",
'targets': 7,
- 'data': "symbol"
+ 'data': "symbol",
+ 'defaultContent': "N/A"
},
{
'title': "Description",
@@ -160,7 +166,8 @@
'type': "natural-minus-na",
'width': "125px",
'targets': 9,
- 'data': "location_repr"
+ 'data': "location_repr",
+ 'defaultContent': "N/A"
},
{
'title': "Mean",
@@ -168,7 +175,8 @@
'orderSequence': [ "desc", "asc"],
'width': "30px",
'targets': 10,
- 'data': "mean"
+ 'data': "mean",
+ 'defaultContent': "N/A"
},
{
'title': "<div style='text-align: right; padding-right: 10px;'>Peak</div> <div style='text-align: right;'>-logP <a href=\"{{ url_for('glossary_blueprint.glossary') }}#LRS\" target=\"_blank\" style=\"color: white;\"><sup>?</sup></a></div>",
@@ -176,6 +184,7 @@
'width': "60px",
'targets': 11,
'data': "LRS_score_repr",
+ 'defaultContent': "N/A",
'orderSequence': [ "desc", "asc"]
},
{
@@ -183,7 +192,8 @@
'type': "natural-minus-na",
'width': "125px",
'targets': 12,
- 'data': "max_lrs_text"
+ 'data': "max_lrs_text",
+ 'defaultContent': "N/A"
},
{
'title': "Additive<br>Effect<a href=\"{{ url_for('glossary_blueprint.glossary') }}#A\" target=\"_blank\" style=\"color: white;\"><sup>?</sup></a>",
@@ -191,6 +201,7 @@
'width': "50px",
'targets': 13,
'data': "additive",
+ 'defaultContent': "N/A",
'orderSequence': [ "desc", "asc"]
}
]
diff --git a/wqflask/wqflask/templates/gsearch_pheno.html b/wqflask/wqflask/templates/gsearch_pheno.html
index 6eb7e18a..a1fef2c8 100644
--- a/wqflask/wqflask/templates/gsearch_pheno.html
+++ b/wqflask/wqflask/templates/gsearch_pheno.html
@@ -93,21 +93,24 @@
'type': "natural",
'width': "30px",
'targets': 1,
- 'data': "index"
+ 'data': "index",
+ 'defaultContent': "N/A"
},
{
'title': "Species",
'type': "natural",
'width': "60px",
'targets': 2,
- 'data': "species"
+ 'data': "species",
+ 'defaultContent': "N/A"
},
{
'title': "Group",
'type': "natural",
'width': "100px",
'targets': 3,
- 'data': "group"
+ 'data': "group",
+ 'defaultContent': "N/A"
},
{
'title': "Record",
@@ -139,14 +142,16 @@
'type': "natural-minus-na",
'width': "30px",
'targets': 6,
- 'data': "mean"
+ 'data': "mean",
+ 'defaultContent': "N/A"
},
{
'title': "Authors",
'type': "natural",
'width': "300px",
'targets': 7,
- 'data': "authors_display"
+ 'data': "authors_display",
+ 'defaultContent': "N/A"
},
{
'title': "Year",
@@ -156,7 +161,7 @@
'width': "25px",
'targets': 8,
'render': function(data) {
- if (data.pubmed_id != "N/A"){
+ if ("pubmed_id" in data){
return '<a href="' + data.pubmed_link + '">' + data.pubmed_text + '</a>'
} else {
return data.pubmed_text
@@ -168,6 +173,7 @@
'title': "<div style='text-align: right; padding-right: 10px;'>Peak</div> <div style='text-align: right;'>-logP <a href=\"{{ url_for('glossary_blueprint.glossary') }}#LRS\" target=\"_blank\" style=\"color: white;\"><sup>?</sup></a></div>",
'type': "natural-minus-na",
'data': "LRS_score_repr",
+ 'defaultContent': "N/A",
'width': "60px",
'targets': 9,
'orderSequence': [ "desc", "asc"]
@@ -177,12 +183,14 @@
'type': "natural-minus-na",
'width': "125px",
'targets': 10,
- 'data': "max_lrs_text"
+ 'data': "max_lrs_text",
+ 'defaultContent': "N/A"
},
{
'title': "Additive Effect<a href=\"{{ url_for('glossary_blueprint.glossary') }}#A\" target=\"_blank\" style=\"color: white;\"><sup>?</sup></a>",
'type': "natural-minus-na",
'data': "additive",
+ 'defaultContent': "N/A",
'width': "60px",
'targets': 11,
'orderSequence': [ "desc", "asc"]