From 2b7ecb52fa5f568d7d48fc324bae8231343543bd Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Fri, 14 Apr 2023 13:19:31 +0300 Subject: auth: Add external script to search for phenotypes We need a search through the available phenotype traits in the database when linking the traits to user groups. Unfortunately, the Xapian Search indexes do not (and should not) include the internal identifiers we use to disambiguate the traits. On the other hand, we do not want to present the user with traits that have already been linked to any user group within the search results. The script in this commit, together with the modified queries for fetching the phenotype data form a "hack" of sorts to wrap around the way the search works while ensuring we do not present the user with "non-actionable" (linked) traits in the search results. --- gn3/auth/authorisation/data/phenotypes.py | 45 +++++++++-- scripts/search_phenotypes.py | 119 ++++++++++++++++++++++++++++++ 2 files changed, 157 insertions(+), 7 deletions(-) create mode 100644 scripts/search_phenotypes.py diff --git a/gn3/auth/authorisation/data/phenotypes.py b/gn3/auth/authorisation/data/phenotypes.py index 0d018cf..3ba478a 100644 --- a/gn3/auth/authorisation/data/phenotypes.py +++ b/gn3/auth/authorisation/data/phenotypes.py @@ -1,14 +1,44 @@ """Handle linking of Phenotype data to the Auth(entic|oris)ation system.""" +from typing import Any, Iterable + +from MySQLdb.cursors import DictCursor + import gn3.auth.db as authdb import gn3.db_utils as gn3db from gn3.auth.authorisation.checks import authorised_p -def linked_phenotype_data(conn: authdb.DbConnection) -> tuple[dict, ...]: +def linked_phenotype_data( + authconn: authdb.DbConnection, gn3conn: gn3db.Connection, + species: str = "") -> Iterable[dict[str, Any]]: """Retrieve phenotype data linked to user groups.""" - with authdb.cursor(conn) as cursor: - cursor.execute("SELECT * FROM linked_phenotype_data") - return tuple(dict(row) for row in cursor.fetchall()) - return tuple() + authkeys = ("SpeciesId", "InbredSetId", "PublishFreezeId", "PublishXRefId") + with (authdb.cursor(authconn) as authcursor, + gn3conn.cursor(DictCursor) as gn3cursor): + authcursor.execute("SELECT * FROM linked_phenotype_data") + linked = tuple(tuple(row[key] for key in authkeys) + for row in authcursor.fetchall()) + paramstr = "".join(["(%s, %s, %s, %s)"] * len(linked)) + query = ( + "SELECT spc.SpeciesId, spc.SpeciesName, iset.InbredSetId, " + "iset.InbredSetName, pf.Id AS PublishFreezeId, " + "pf.Name AS dataset_name, pf.FullName AS dataset_fullname, " + "pf.ShortName AS dataset_shortname, pxr.Id AS PublishXRefId " + "FROM " + "Species AS spc " + "INNER JOIN InbredSet AS iset " + "ON spc.SpeciesId=iset.SpeciesId " + "INNER JOIN PublishFreeze AS pf " + "ON iset.InbredSetId=pf.InbredSetId " + "INNER JOIN PublishXRef AS pxr " + "ON pf.InbredSetId=pxr.InbredSetId") + ( + " WHERE" if (len(linked) > 0 or bool(species)) else "") + ( + (" (spc.SpeciesId, iset.InbredSetId, pf.Id, pxr.Id) " + f"NOT IN ({paramstr})") if len(linked) > 0 else "") + ( + " AND"if len(linked) > 0 else "") + ( + " spc.SpeciesName=%s" if bool(species) else "") + params = linked + ((species,) if bool(species) else tuple()) + gn3cursor.execute(query, params) + return (item for item in gn3cursor.fetchall()) @authorised_p(("system:data:link-to-group",), error_description=( @@ -22,10 +52,11 @@ def ungrouped_phenotype_data( params = tuple( (row["SpeciesId"], row["InbredSetId"], row["PublishFreezeId"], row["PublishXRefId"]) - for row in linked_phenotype_data(authconn)) + for row in linked_phenotype_data(authconn, gn3conn)) paramstr = ", ".join(["(?, ?, ?, ?)"] * len(params)) query = ( - "SELECT spc.SpeciesId, iset.InbredSetId, pf.Id AS PublishFreezeId, " + "SELECT spc.SpeciesId, spc.SpeciesName, iset.InbredSetId, " + "iset.InbredSetName, pf.Id AS PublishFreezeId, " "pf.Name AS dataset_name, pf.FullName AS dataset_fullname, " "pf.ShortName AS dataset_shortname, pxr.Id AS PublishXRefId " "FROM " diff --git a/scripts/search_phenotypes.py b/scripts/search_phenotypes.py new file mode 100644 index 0000000..cdd1d96 --- /dev/null +++ b/scripts/search_phenotypes.py @@ -0,0 +1,119 @@ +""" +A script to do search for phenotype traits using the Xapian Search endpoint. +""" +import uuid +import json +import traceback +from urllib.parse import urljoin +from typing import Any, Iterable +from datetime import datetime, timedelta + +import click +import redis +import requests + +from gn3.auth import db as authdb +from gn3 import db_utils as gn3db +from gn3.settings import SQL_URI, AUTH_DB +from gn3.auth.authorisation.data.phenotypes import linked_phenotype_data + +class NoSearchResults(Exception): + """Raise when there are no results for a search.""" + +def do_search( + host: str, query: str, per_page: int, page: int = 1) -> Iterable[dict[str, Any]]: + """Do the search and return the results""" + search_uri = urljoin(host, (f"search/?page={page}&per_page={per_page}" + f"&type=phenotype&query={query}")) + response = requests.get(search_uri) + results = response.json() + if len(results) > 0: + return (item for item in results) + raise NoSearchResults(f"No results for search '{query}'") + +def __filter_object__(search_item): + return (search_item["species"], search_item["group"], + search_item["dataset"], search_item["name"]) + +def remove_selected(search_results, selected: tuple): + """Remove any item that the user has selected.""" + return (item for item in search_results if __filter_object__(item) not in selected) + +def remove_linked(search_results, linked: tuple): + """Remove any item that has been already linked to a user group.""" + return (item for item in search_results if __filter_object__(item) not in linked) + +def save_to_redis(redisconn: redis.Redis, redisname: str, results: tuple[dict[str, Any], ...]): + """Save the results to redis db.""" + key = "search_results" + prev_results = tuple(json.loads(redisconn.hget(redisname, key) or "[]")) + redisconn.hset(redisname, key, json.dumps(prev_results + results)) + +def expire_redis_results(redisconn: redis.Redis, redisname: str): + """Expire the results after a while to ensure they are cleaned up.""" + redisconn.expireat(redisname, datetime.now() + timedelta(minutes=30)) + +@click.command() +@click.argument("species") +@click.argument("query") +@click.argument("job-id", type=click.UUID) +@click.option( + "--host", default="http://localhost:8080/api/", help="The URI to GN3.") +@click.option("--per-page", default=10000, help="Number of results per page.") +@click.option("--selected", default="[]", help="Selected traits.") +@click.option( + "--auth-db-uri", default=AUTH_DB, help="The SQL URI to the auth database.") +@click.option( + "--gn3-db-uri", default=SQL_URI, + help="The SQL URI to the main GN3 database.") +@click.option( + "--redis-uri", default="redis://:@localhost:6379/0", + help="The URI to the redis server.") +def search(# pylint: disable=[too-many-arguments, too-many-locals] + species: str, query: str, job_id: uuid.UUID, host: str, per_page: int, + selected: str, auth_db_uri: str, gn3_db_uri: str, redis_uri: str): + """ + Search for phenotype traits, filtering out any linked and selected traits, + loading more and more pages until the `per_page` quota is fulfilled or the + search runs out of pages. + """ + redisname = f"GN3:JOBS:{job_id}" + with (authdb.connection(auth_db_uri) as authconn, + gn3db.database_connection(gn3_db_uri) as gn3conn, + redis.Redis.from_url(redis_uri, decode_responses=True) as redisconn): + redisconn.hset(redisname, "status", "started") + save_to_redis(redisconn, redisname, tuple()) + try: + search_query = f"species:{species}" + ( + f" AND ({query})" if bool(query) else "") + filter_keys = ("SpeciesName", "InbredSetName", "dataset_name", + "PublishXRefId") + selected_traits = tuple(tuple(item[key] for key in filter_keys) + for item in json.loads(selected)) + linked = tuple(tuple(row[key] for key in filter_keys) + for row in linked_phenotype_data( + authconn, gn3conn, species)) + page = 1 + count = 0 + while count < per_page: + results = tuple(remove_linked( + remove_selected( + do_search(host, search_query, per_page, page), + selected_traits), + linked))[0:per_page-count] + count = count + len(results) + page = page + 1 + save_to_redis(redisconn, redisname, results) + except NoSearchResults as _nsr: + pass + except Exception as _exc: # pylint: disable=[broad-except] + redisconn.hset(redisname, "status", "failed") + redisconn.hset(redisname, "exception", traceback.format_exc()) + expire_redis_results(redisconn, redisname) + return 1 + redisconn.hset(redisname, "status", "completed") + expire_redis_results(redisconn, redisname) + return 0 + +if __name__ == "__main__": + search() # pylint: disable=[no-value-for-parameter] -- cgit v1.2.3