Diffstat (limited to 'uploader/phenotypes')
-rw-r--r--   uploader/phenotypes/__init__.py |   2
-rw-r--r--   uploader/phenotypes/models.py   | 232
-rw-r--r--   uploader/phenotypes/views.py    | 368
3 files changed, 602 insertions, 0 deletions
diff --git a/uploader/phenotypes/__init__.py b/uploader/phenotypes/__init__.py
new file mode 100644
index 0000000..c17d32c
--- /dev/null
+++ b/uploader/phenotypes/__init__.py
@@ -0,0 +1,2 @@
+"""Package for handling ('classical') phenotype data"""
+from .views import phenotypesbp
diff --git a/uploader/phenotypes/models.py b/uploader/phenotypes/models.py
new file mode 100644
index 0000000..9324601
--- /dev/null
+++ b/uploader/phenotypes/models.py
@@ -0,0 +1,232 @@
+"""Database and utility functions for phenotypes."""
+from typing import Optional
+from functools import reduce
+from datetime import datetime
+
+import MySQLdb as mdb
+from MySQLdb.cursors import Cursor, DictCursor
+
+from uploader.db_utils import debug_query
+
+def datasets_by_population(
+        conn: mdb.Connection,
+        species_id: int,
+        population_id: int
+) -> tuple[dict, ...]:
+    """Retrieve all of a population's phenotype studies."""
+    with conn.cursor(cursorclass=DictCursor) as cursor:
+        cursor.execute(
+            "SELECT s.SpeciesId, pf.* FROM Species AS s "
+            "INNER JOIN InbredSet AS iset ON s.Id=iset.SpeciesId "
+            "INNER JOIN PublishFreeze AS pf ON iset.Id=pf.InbredSetId "
+            "WHERE s.Id=%s AND iset.Id=%s;",
+            (species_id, population_id))
+        return tuple(dict(row) for row in cursor.fetchall())
+
+
+def dataset_by_id(conn: mdb.Connection,
+                  species_id: int,
+                  population_id: int,
+                  dataset_id: int) -> dict:
+    """Fetch dataset details by identifier"""
+    with conn.cursor(cursorclass=DictCursor) as cursor:
+        cursor.execute(
+            "SELECT s.SpeciesId, pf.* FROM Species AS s "
+            "INNER JOIN InbredSet AS iset ON s.Id=iset.SpeciesId "
+            "INNER JOIN PublishFreeze AS pf ON iset.Id=pf.InbredSetId "
+            "WHERE s.Id=%s AND iset.Id=%s AND pf.Id=%s",
+            (species_id, population_id, dataset_id))
+        return dict(cursor.fetchone())
+
+
+def phenotypes_count(conn: mdb.Connection,
+                     population_id: int,
+                     dataset_id: int) -> int:
+    """Count the number of phenotypes in the dataset."""
+    with conn.cursor(cursorclass=DictCursor) as cursor:
+        cursor.execute(
+            "SELECT COUNT(*) AS total_phenos FROM Phenotype AS pheno "
+            "INNER JOIN PublishXRef AS pxr ON pheno.Id=pxr.PhenotypeId "
+            "INNER JOIN PublishFreeze AS pf ON pxr.InbredSetId=pf.InbredSetId "
+            "WHERE pxr.InbredSetId=%s AND pf.Id=%s",
+            (population_id, dataset_id))
+        return int(cursor.fetchone()["total_phenos"])
+
+
+def dataset_phenotypes(conn: mdb.Connection,
+                       population_id: int,
+                       dataset_id: int,
+                       offset: int = 0,
+                       limit: Optional[int] = None) -> tuple[dict, ...]:
+    """Fetch the actual phenotypes."""
+    _query = (
+        "SELECT pheno.*, pxr.Id, ist.InbredSetCode FROM Phenotype AS pheno "
+        "INNER JOIN PublishXRef AS pxr ON pheno.Id=pxr.PhenotypeId "
+        "INNER JOIN PublishFreeze AS pf ON pxr.InbredSetId=pf.InbredSetId "
+        "INNER JOIN InbredSet AS ist ON pf.InbredSetId=ist.Id "
+        "WHERE pxr.InbredSetId=%s AND pf.Id=%s") + (
+            f" LIMIT {limit} OFFSET {offset}" if bool(limit) else "")
+    with conn.cursor(cursorclass=DictCursor) as cursor:
+        cursor.execute(_query, (population_id, dataset_id))
+        debug_query(cursor)
+        return tuple(dict(row) for row in cursor.fetchall())
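Note that `dataset_phenotypes` interpolates `limit` and `offset` into the SQL string with an f-string, so callers should only ever pass integers. A minimal sketch of how the counting and paging functions above compose; the connection parameters and the identifier values are assumptions for illustration:

    # Sketch: paging through a dataset's phenotypes (hypothetical identifiers).
    import MySQLdb
    from uploader.phenotypes.models import dataset_phenotypes, phenotypes_count

    conn = MySQLdb.connect(db="db_webqtl")  # assumed connection parameters
    page_size = 20
    total = phenotypes_count(conn, population_id=1, dataset_id=1)
    for offset in range(0, total, page_size):
        page = dataset_phenotypes(conn, population_id=1, dataset_id=1,
                                  offset=offset, limit=page_size)
        for pheno in page:
            print(pheno["Id"], pheno["Post_publication_description"])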
+
+
+def __phenotype_se__(cursor: Cursor,
+                     species_id: int,
+                     population_id: int,
+                     dataset_id: int,
+                     xref_id: str) -> dict:
+    """Fetch standard-error values (if they exist) for a phenotype."""
+    _sequery = (
+        "SELECT pxr.Id AS xref_id, pxr.DataId, str.Id AS StrainId, pse.error, nst.count "
+        "FROM Phenotype AS pheno "
+        "INNER JOIN PublishXRef AS pxr ON pheno.Id=pxr.PhenotypeId "
+        "INNER JOIN PublishSE AS pse ON pxr.DataId=pse.DataId "
+        "INNER JOIN NStrain AS nst ON pse.DataId=nst.DataId "
+        "INNER JOIN Strain AS str ON nst.StrainId=str.Id "
+        "INNER JOIN StrainXRef AS sxr ON str.Id=sxr.StrainId "
+        "INNER JOIN PublishFreeze AS pf ON sxr.InbredSetId=pf.InbredSetId "
+        "INNER JOIN InbredSet AS iset ON pf.InbredSetId=iset.InbredSetId "
+        "WHERE (str.SpeciesId, pxr.InbredSetId, pf.Id, pxr.Id)=(%s, %s, %s, %s)")
+    cursor.execute(_sequery,
+                   (species_id, population_id, dataset_id, xref_id))
+    return {(row["DataId"], row["StrainId"]): {
+        "xref_id": row["xref_id"],
+        "DataId": row["DataId"],
+        "error": row["error"],
+        "count": row["count"]
+    } for row in cursor.fetchall()}
+
+def __organise_by_phenotype__(pheno, row):
+    """Organise disparate data rows into phenotype 'objects'."""
+    _pheno = pheno.get(row["Id"])
+    return {
+        **pheno,
+        row["Id"]: {
+            "Id": row["Id"],
+            "Pre_publication_description": row["Pre_publication_description"],
+            "Post_publication_description": row["Post_publication_description"],
+            "Original_description": row["Original_description"],
+            "Units": row["Units"],
+            "Pre_publication_abbreviation": row["Pre_publication_abbreviation"],
+            "Post_publication_abbreviation": row["Post_publication_abbreviation"],
+            "xref_id": row["pxr.Id"],
+            "data": {
+                **(_pheno["data"] if bool(_pheno) else {}),
+                (row["DataId"], row["StrainId"]): {
+                    "DataId": row["DataId"],
+                    "mean": row["mean"],
+                    "Locus": row["Locus"],
+                    "LRS": row["LRS"],
+                    "additive": row["additive"],
+                    "Sequence": row["Sequence"],
+                    "comments": row["comments"],
+                    "value": row["value"],
+                    "StrainName": row["Name"],
+                    "StrainName2": row["Name2"],
+                    "StrainSymbol": row["Symbol"],
+                    "StrainAlias": row["Alias"]
+                }
+            }
+        }
+    }
+
+
+def __merge_pheno_data_and_se__(data, sedata) -> dict:
+    """Merge phenotype data with the standard errors."""
+    return {
+        key: {**value, **sedata.get(key, {})}
+        for key, value in data.items()
+    }
+
+
+def phenotype_by_id(
+        conn: mdb.Connection,
+        species_id: int,
+        population_id: int,
+        dataset_id: int,
+        xref_id
+) -> Optional[dict]:
+    """Fetch a specific phenotype."""
+    _dataquery = ("SELECT pheno.*, pxr.*, pd.*, str.*, iset.InbredSetCode "
+                  "FROM Phenotype AS pheno "
+                  "INNER JOIN PublishXRef AS pxr ON pheno.Id=pxr.PhenotypeId "
+                  "INNER JOIN PublishData AS pd ON pxr.DataId=pd.Id "
+                  "INNER JOIN Strain AS str ON pd.StrainId=str.Id "
+                  "INNER JOIN StrainXRef AS sxr ON str.Id=sxr.StrainId "
+                  "INNER JOIN PublishFreeze AS pf ON sxr.InbredSetId=pf.InbredSetId "
+                  "INNER JOIN InbredSet AS iset ON pf.InbredSetId=iset.InbredSetId "
+                  "WHERE "
+                  "(str.SpeciesId, pxr.InbredSetId, pf.Id, pxr.Id)=(%s, %s, %s, %s)")
+    with conn.cursor(cursorclass=DictCursor) as cursor:
+        cursor.execute(_dataquery,
+                       (species_id, population_id, dataset_id, xref_id))
+        _pheno: dict = reduce(__organise_by_phenotype__, cursor.fetchall(), {})
+        if bool(_pheno) and len(_pheno.keys()) == 1:
+            _pheno = tuple(_pheno.values())[0]
+            return {
+                **_pheno,
+                "data": tuple(__merge_pheno_data_and_se__(
+                    _pheno["data"],
+                    __phenotype_se__(cursor,
+                                     species_id,
+                                     population_id,
+                                     dataset_id,
+                                     xref_id)).values())
+            }
+        if bool(_pheno) and len(_pheno.keys()) > 1:
+            raise Exception(
+                "We found more than one phenotype with the same identifier!")
+
+        return None
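The merge in `phenotype_by_id` is keyed on `(DataId, StrainId)` tuples; data rows with no matching standard-error row pass through unchanged. A toy illustration of the `__merge_pheno_data_and_se__` semantics, with made-up values:

    # Toy data mirroring the (DataId, StrainId)-keyed dicts built above.
    data = {(12, 3): {"value": 9.2, "StrainName": "BXD1"},
            (12, 4): {"value": 7.5, "StrainName": "BXD2"}}
    sedata = {(12, 3): {"error": 0.41, "count": 6}}
    merged = {key: {**value, **sedata.get(key, {})} for key, value in data.items()}
    # merged[(12, 3)] == {"value": 9.2, "StrainName": "BXD1", "error": 0.41, "count": 6}
    # merged[(12, 4)] is unchanged: no SE row exists for that strain.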
+
+
+def phenotypes_data(conn: mdb.Connection,
+                    population_id: int,
+                    dataset_id: int,
+                    offset: int = 0,
+                    limit: Optional[int] = None) -> tuple[dict, ...]:
+    """Fetch the data for the phenotypes."""
+    # Phenotype -> PublishXRef -> PublishData -> Strain -> StrainXRef -> PublishFreeze
+    _query = ("SELECT pheno.*, pxr.*, pd.*, str.*, iset.InbredSetCode "
+              "FROM Phenotype AS pheno "
+              "INNER JOIN PublishXRef AS pxr ON pheno.Id=pxr.PhenotypeId "
+              "INNER JOIN PublishData AS pd ON pxr.DataId=pd.Id "
+              "INNER JOIN Strain AS str ON pd.StrainId=str.Id "
+              "INNER JOIN StrainXRef AS sxr ON str.Id=sxr.StrainId "
+              "INNER JOIN PublishFreeze AS pf ON sxr.InbredSetId=pf.InbredSetId "
+              "INNER JOIN InbredSet AS iset ON pf.InbredSetId=iset.InbredSetId "
+              "WHERE pxr.InbredSetId=%s AND pf.Id=%s") + (
+                  f" LIMIT {limit} OFFSET {offset}" if bool(limit) else "")
+    with conn.cursor(cursorclass=DictCursor) as cursor:
+        cursor.execute(_query, (population_id, dataset_id))
+        debug_query(cursor)
+        return tuple(dict(row) for row in cursor.fetchall())
+
+
+def save_new_dataset(cursor: Cursor,
+                     population_id: int,
+                     dataset_name: str,
+                     dataset_fullname: str,
+                     dataset_shortname: str) -> dict:
+    """Create a new phenotype dataset."""
+    params = {
+        "population_id": population_id,
+        "dataset_name": dataset_name,
+        "dataset_fullname": dataset_fullname,
+        "dataset_shortname": dataset_shortname,
+        "created": datetime.now().date().isoformat(),
+        "public": 2,
+        "confidentiality": 0,
+        "users": None
+    }
+    cursor.execute(
+        "INSERT INTO PublishFreeze(Name, FullName, ShortName, CreateTime, "
+        "public, InbredSetId, confidentiality, AuthorisedUsers) "
+        "VALUES(%(dataset_name)s, %(dataset_fullname)s, %(dataset_shortname)s, "
+        "%(created)s, %(public)s, %(population_id)s, %(confidentiality)s, "
+        "%(users)s)",
+        params)
+    debug_query(cursor)
+    return {**params, "Id": cursor.lastrowid}
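`save_new_dataset` takes a cursor rather than a connection, leaving transaction control to the caller. A sketch of the intended call pattern; the connection setup and the dataset values are assumptions for illustration:

    # Sketch: creating a dataset inside an explicit transaction.
    import MySQLdb
    from MySQLdb.cursors import DictCursor
    from uploader.phenotypes.models import save_new_dataset

    conn = MySQLdb.connect(db="db_webqtl")  # assumed connection parameters
    try:
        with conn.cursor(cursorclass=DictCursor) as cursor:
            dataset = save_new_dataset(
                cursor,
                population_id=1,  # hypothetical population
                dataset_name="BXDPublish",
                dataset_fullname="BXD Published Phenotypes",
                dataset_shortname="BXDPublish")
        conn.commit()  # the INSERT is only durable after an explicit commit
        print("New PublishFreeze Id:", dataset["Id"])
    except MySQLdb.MySQLError:
        conn.rollback()
        raise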
diff --git a/uploader/phenotypes/views.py b/uploader/phenotypes/views.py
new file mode 100644
index 0000000..02e8078
--- /dev/null
+++ b/uploader/phenotypes/views.py
@@ -0,0 +1,368 @@
+"""Views handling ('classical') phenotypes."""
+import sys
+import uuid
+import json
+from pathlib import Path
+from functools import wraps
+
+from redis import Redis
+from requests.models import Response
+from MySQLdb.cursors import DictCursor
+from flask import (flash,
+                   request,
+                   url_for,
+                   redirect,
+                   Blueprint,
+                   render_template,
+                   current_app as app)
+
+# from r_qtl import r_qtl2 as rqtl2
+from r_qtl import r_qtl2_qc as rqc
+from r_qtl import exceptions as rqe
+
+from uploader import jobs
+from uploader.files import save_file#, fullpath
+from uploader.oauth2.client import oauth2_post
+from uploader.authorisation import require_login
+from uploader.db_utils import database_connection
+from uploader.species.models import all_species, species_by_id
+from uploader.monadic_requests import make_either_error_handler
+from uploader.request_checks import with_species, with_population
+from uploader.datautils import safe_int, order_by_family, enumerate_sequence
+from uploader.population.models import (populations_by_species,
+                                        population_by_species_and_id)
+from uploader.input_validation import (encode_errors,
+                                       decode_errors,
+                                       is_valid_representative_name)
+
+from .models import (dataset_by_id,
+                     phenotype_by_id,
+                     phenotypes_count,
+                     save_new_dataset,
+                     dataset_phenotypes,
+                     datasets_by_population)
+
+phenotypesbp = Blueprint("phenotypes", __name__)
+
+@phenotypesbp.route("/phenotypes", methods=["GET"])
+@require_login
+def index():
+    """Direct entry-point for phenotypes data handling."""
+    with database_connection(app.config["SQL_URI"]) as conn:
+        if not bool(request.args.get("species_id")):
+            return render_template("phenotypes/index.html",
+                                   species=order_by_family(all_species(conn)),
+                                   activelink="phenotypes")
+
+        species = species_by_id(conn, request.args.get("species_id"))
+        if not bool(species):
+            flash("No such species!", "alert-danger")
+            return redirect(url_for("species.populations.phenotypes.index"))
+        return redirect(url_for("species.populations.phenotypes.select_population",
+                                species_id=species["SpeciesId"]))
+
+
+@phenotypesbp.route("<int:species_id>/phenotypes/select-population",
+                    methods=["GET"])
+@require_login
+@with_species(redirect_uri="species.populations.phenotypes.index")
+def select_population(species: dict, **kwargs):# pylint: disable=[unused-argument]
+    """Select the population for your phenotypes."""
+    with database_connection(app.config["SQL_URI"]) as conn:
+        if not bool(request.args.get("population_id")):
+            return render_template("phenotypes/select-population.html",
+                                   species=species,
+                                   populations=order_by_family(
+                                       populations_by_species(
+                                           conn, species["SpeciesId"]),
+                                       order_key="FamilyOrder"),
+                                   activelink="phenotypes")
+
+        population = population_by_species_and_id(
+            conn, species["SpeciesId"], int(request.args["population_id"]))
+        if not bool(population):
+            flash("No such population found!", "alert-danger")
+            return redirect(url_for(
+                "species.populations.phenotypes.select_population",
+                species_id=species["SpeciesId"]))
+
+        return redirect(url_for("species.populations.phenotypes.list_datasets",
+                                species_id=species["SpeciesId"],
+                                population_id=population["Id"]))
+
+
+
+@phenotypesbp.route(
+    "<int:species_id>/populations/<int:population_id>/phenotypes/datasets",
+    methods=["GET"])
+@require_login
+@with_population(species_redirect_uri="species.populations.phenotypes.index",
+                 redirect_uri="species.populations.phenotypes.select_population")
+def list_datasets(species: dict, population: dict, **kwargs):# pylint: disable=[unused-argument]
+    """List available phenotype datasets."""
+    with database_connection(app.config["SQL_URI"]) as conn:
+        return render_template("phenotypes/list-datasets.html",
+                               species=species,
+                               population=population,
+                               datasets=datasets_by_population(
+                                   conn,
+                                   species["SpeciesId"],
+                                   population["Id"]),
+                               activelink="list-datasets")
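The `with_species` and `with_population` decorators follow a validate-then-inject convention: each validates an identifier from the URL, then calls the wrapped view with the corresponding record added as a keyword argument. A stripped-down sketch of that convention; this is not the actual implementation in `uploader.request_checks`, and `lookup_thing` is a hypothetical helper:

    from functools import wraps
    from flask import flash, redirect, url_for

    def lookup_thing(thing_id):
        """Hypothetical lookup; a real version would query the database."""
        return {"Id": thing_id} if thing_id else None

    def with_thing(redirect_uri: str):
        """Sketch of the validate-then-inject decorator convention."""
        def __decorator__(func):
            @wraps(func)
            def __wrapper__(**kwargs):
                thing = lookup_thing(kwargs.get("thing_id"))
                if not bool(thing):
                    flash("No such thing!", "alert-danger")
                    return redirect(url_for(redirect_uri))
                return func(thing=thing, **kwargs)  # inject the validated record
            return __wrapper__
        return __decorator__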
+ "alert-danger") + return select_dataset_uri + return func(dataset=dataset, **kwargs) + return __with_dataset__ + return __decorator__ + + +@phenotypesbp.route( + "<int:species_id>/populations/<int:population_id>/phenotypes/datasets" + "/<int:dataset_id>/view", + methods=["GET"]) +@require_login +@with_dataset( + species_redirect_uri="species.populations.phenotypes.index", + population_redirect_uri="species.populations.phenotypes.select_population", + redirect_uri="species.populations.phenotypes.list_datasets") +def view_dataset(# pylint: disable=[unused-argument] + species: dict, population: dict, dataset: dict, **kwargs): + """View a specific dataset""" + with database_connection(app.config["SQL_URI"]) as conn: + dataset = dataset_by_id( + conn, species["SpeciesId"], population["Id"], dataset["Id"]) + if not bool(dataset): + flash("Could not find such a phenotype dataset!", "alert-danger") + return redirect(url_for( + "species.populations.phenotypes.list_datasets", + species_id=species["SpeciesId"], + population_id=population["Id"])) + + start_at = max(safe_int(request.args.get("start_at") or 0), 0) + count = int(request.args.get("count") or 20) + return render_template("phenotypes/view-dataset.html", + species=species, + population=population, + dataset=dataset, + phenotype_count=phenotypes_count( + conn, population["Id"], dataset["Id"]), + phenotypes=enumerate_sequence( + dataset_phenotypes(conn, + population["Id"], + dataset["Id"], + offset=start_at, + limit=count), + start=start_at+1), + start_from=start_at, + count=count, + activelink="view-dataset") + + +@phenotypesbp.route( + "<int:species_id>/populations/<int:population_id>/phenotypes/datasets" + "/<int:dataset_id>/phenotype/<xref_id>", + methods=["GET"]) +@require_login +@with_dataset( + species_redirect_uri="species.populations.phenotypes.index", + population_redirect_uri="species.populations.phenotypes.select_population", + redirect_uri="species.populations.phenotypes.list_datasets") +def view_phenotype(# pylint: disable=[unused-argument] + species: dict, + population: dict, + dataset: dict, + xref_id: int, + **kwargs +): + """View an individual phenotype from the dataset.""" + def __render__(privileges): + return render_template( + "phenotypes/view-phenotype.html", + species=species, + population=population, + dataset=dataset, + phenotype=phenotype_by_id(conn, + species["SpeciesId"], + population["Id"], + dataset["Id"], + xref_id), + privileges=(privileges + ### For demo! Do not commit this part + + ("group:resource:edit-resource", + "group:resource:delete-resource",) + ### END: For demo! 
+
+
+@phenotypesbp.route(
+    "<int:species_id>/populations/<int:population_id>/phenotypes/datasets"
+    "/<int:dataset_id>/phenotype/<xref_id>",
+    methods=["GET"])
+@require_login
+@with_dataset(
+    species_redirect_uri="species.populations.phenotypes.index",
+    population_redirect_uri="species.populations.phenotypes.select_population",
+    redirect_uri="species.populations.phenotypes.list_datasets")
+def view_phenotype(# pylint: disable=[unused-argument]
+        species: dict,
+        population: dict,
+        dataset: dict,
+        xref_id: int,
+        **kwargs
+):
+    """View an individual phenotype from the dataset."""
+    def __render__(privileges):
+        return render_template(
+            "phenotypes/view-phenotype.html",
+            species=species,
+            population=population,
+            dataset=dataset,
+            phenotype=phenotype_by_id(conn,
+                                      species["SpeciesId"],
+                                      population["Id"],
+                                      dataset["Id"],
+                                      xref_id),
+            privileges=(privileges
+                        ### For demo! Do not commit this part
+                        + ("group:resource:edit-resource",
+                           "group:resource:delete-resource",)
+                        ### END: For demo! Do not commit this part
+                        ),
+            activelink="view-phenotype")
+
+    def __fail__(error):
+        if isinstance(error, Response) and error.json() == "No linked resource!":
+            return __render__(tuple())
+        return make_either_error_handler(
+            "There was an error fetching the roles and privileges.")(error)
+
+    with database_connection(app.config["SQL_URI"]) as conn:
+        return oauth2_post(
+            "/auth/resource/phenotypes/individual/linked-resource",
+            json={
+                "species_id": species["SpeciesId"],
+                "population_id": population["Id"],
+                "dataset_id": dataset["Id"],
+                "xref_id": xref_id
+            }
+        ).then(
+            lambda resource: tuple(
+                privilege["privilege_id"] for role in resource["roles"]
+                for privilege in role["privileges"])
+        ).then(__render__).either(__fail__, lambda resp: resp)
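`oauth2_post` returns an Either-style monadic result: `.then(...)` chains transformations over the success value and is a no-op on failure, while `.either(on_failure, on_success)` collapses the chain at the end. A minimal self-contained sketch of that protocol; the concrete result type in `uploader.monadic_requests` may differ:

    class Right:
        """Success value: .then applies the function, .either takes the right branch."""
        def __init__(self, value):
            self.value = value
        def then(self, func):
            return Right(func(self.value))
        def either(self, _on_left, on_right):
            return on_right(self.value)

    class Left:
        """Failure value: .then is a no-op, .either takes the left branch."""
        def __init__(self, error):
            self.error = error
        def then(self, _func):
            return self
        def either(self, on_left, _on_right):
            return on_left(self.error)

    # Stand-in for oauth2_post returning a linked-resource document.
    result = Right({"roles": [{"privileges": [{"privilege_id": "view"}]}]}).then(
        lambda resource: tuple(
            privilege["privilege_id"] for role in resource["roles"]
            for privilege in role["privileges"])
    ).either(lambda err: f"failed: {err}", lambda privs: privs)
    print(result)  # ('view',)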
+
+
+@phenotypesbp.route(
+    "<int:species_id>/populations/<int:population_id>/phenotypes/datasets/create",
+    methods=["GET", "POST"])
+@require_login
+@with_population(
+    species_redirect_uri="species.populations.phenotypes.index",
+    redirect_uri="species.populations.phenotypes.select_population")
+def create_dataset(species: dict, population: dict, **kwargs):# pylint: disable=[unused-argument]
+    """Create a new phenotype dataset."""
+    with (database_connection(app.config["SQL_URI"]) as conn,
+          conn.cursor(cursorclass=DictCursor) as cursor):
+        if request.method == "GET":
+            return render_template("phenotypes/create-dataset.html",
+                                   activelink="create-dataset",
+                                   species=species,
+                                   population=population,
+                                   **decode_errors(
+                                       request.args.get("error_values", "")))
+
+        form = request.form
+        _errors: tuple[tuple[str, str], ...] = tuple()
+        if not is_valid_representative_name(
+                (form.get("dataset-name") or "").strip()):
+            _errors = _errors + (("dataset-name", "Invalid dataset name."),)
+
+        if not bool((form.get("dataset-fullname") or "").strip()):
+            _errors = _errors + (("dataset-fullname",
+                                  "You must provide a value for 'Full Name'."),)
+
+        if bool(_errors):
+            return redirect(url_for(
+                "species.populations.phenotypes.create_dataset",
+                species_id=species["SpeciesId"],
+                population_id=population["Id"],
+                error_values=encode_errors(_errors, form)))
+
+        dataset_shortname = (
+            form["dataset-shortname"] or form["dataset-name"]).strip()
+        _pheno_dataset = save_new_dataset(
+            cursor,
+            population["Id"],
+            form["dataset-name"].strip(),
+            form["dataset-fullname"].strip(),
+            dataset_shortname)
+        return redirect(url_for("species.populations.phenotypes.list_datasets",
+                                species_id=species["SpeciesId"],
+                                population_id=population["Id"]))
+
+
+@phenotypesbp.route(
+    "<int:species_id>/populations/<int:population_id>/phenotypes/datasets"
+    "/<int:dataset_id>/add-phenotypes",
+    methods=["GET", "POST"])
+@require_login
+@with_dataset(
+    species_redirect_uri="species.populations.phenotypes.index",
+    population_redirect_uri="species.populations.phenotypes.select_population",
+    redirect_uri="species.populations.phenotypes.list_datasets")
+def add_phenotypes(species: dict, population: dict, dataset: dict, **kwargs):# pylint: disable=[unused-argument, too-many-locals]
+    """Add one or more phenotypes to the dataset."""
+    add_phenos_uri = redirect(url_for(
+        "species.populations.phenotypes.add_phenotypes",
+        species_id=species["SpeciesId"],
+        population_id=population["Id"],
+        dataset_id=dataset["Id"]))
+    _redisuri = app.config["REDIS_URL"]
+    _sqluri = app.config["SQL_URI"]
+    with (Redis.from_url(_redisuri, decode_responses=True) as rconn,
+          # database_connection(_sqluri) as conn,
+          # conn.cursor(cursorclass=DictCursor) as cursor
+          ):
+        if request.method == "GET":
+            return render_template("phenotypes/add-phenotypes.html",
+                                   species=species,
+                                   population=population,
+                                   dataset=dataset,
+                                   activelink="add-phenotypes")
+
+        try:
+            ## Handle huge files here...
+            phenobundle = save_file(request.files["phenotypes-bundle"],
+                                    Path(app.config["UPLOAD_FOLDER"]))
+            rqc.validate_bundle(phenobundle)
+        except AssertionError as _aerr:
+            app.logger.debug("File upload error!", exc_info=True)
+            flash("Expected a zipped bundle of files with phenotypes' "
+                  "information.",
+                  "alert-danger")
+            return add_phenos_uri
+        except rqe.RQTLError as rqtlerr:
+            app.logger.debug("Bundle validation error!", exc_info=True)
+            flash("R/qtl2 Error: " + " ".join(rqtlerr.args), "alert-danger")
+            return add_phenos_uri
+
+        _jobid = uuid.uuid4()
+        _namespace = jobs.jobsnamespace()
+        _ttl_seconds = app.config["JOBS_TTL_SECONDS"]
+        _job = jobs.initialise_job(
+            rconn,
+            _namespace,
+            str(_jobid),
+            [sys.executable, "-m", "scripts.rqtl2.phenotypes_qc", _sqluri,
+             _redisuri, _namespace, str(_jobid), str(species["SpeciesId"]),
+             str(population["Id"]), str(dataset["Id"]), "--redisexpiry",
+             str(_ttl_seconds)], "phenotype_qc", _ttl_seconds,
+            {"job-metadata": json.dumps({
+                "speciesid": species["SpeciesId"],
+                "populationid": population["Id"],
+                "datasetid": dataset["Id"],
+                "bundle": str(phenobundle.absolute())})})
+        # jobs.launch_job(
+        #     _job,
+        #     redisuri,
+        #     f"{app.config['UPLOAD_FOLDER']}/job_errors")
+
+        raise NotImplementedError("Please implement this...")
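The commented-out `jobs.launch_job` call and the trailing `NotImplementedError` mark `add_phenotypes` as work in progress: the job record is created in Redis but never launched. For reference, a sketch of the QC worker command line the job record would run, mirroring the argument list passed to `jobs.initialise_job` above; every concrete value here is hypothetical:

    import sys

    # Hypothetical values standing in for the view's runtime configuration.
    command = [
        sys.executable, "-m", "scripts.rqtl2.phenotypes_qc",
        "mysql://user:pass@localhost/db_webqtl",  # _sqluri
        "redis://localhost:6379/0",               # _redisuri
        "jobs-namespace",                         # _namespace (hypothetical)
        "f47ac10b-58cc-4372-a567-0e02b2c3d479",   # str(_jobid)
        "1", "1", "1",                            # species, population, dataset ids
        "--redisexpiry", "86400",                 # str(_ttl_seconds)
    ]
    print(" ".join(command))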