From 06c6a7f7f42e8ff2d33a934ff695efde24d26d65 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 28 Aug 2024 17:12:26 -0500 Subject: Move code handling expression data upload into new module. --- uploader/expression_data/__init__.py | 13 + uploader/expression_data/index.py | 126 ++++ uploader/expression_data/parse.py | 178 +++++ uploader/expression_data/rqtl2.py | 1176 ++++++++++++++++++++++++++++++++++ uploader/expression_data/samples.py | 359 +++++++++++ 5 files changed, 1852 insertions(+) create mode 100644 uploader/expression_data/__init__.py create mode 100644 uploader/expression_data/index.py create mode 100644 uploader/expression_data/parse.py create mode 100644 uploader/expression_data/rqtl2.py create mode 100644 uploader/expression_data/samples.py (limited to 'uploader/expression_data') diff --git a/uploader/expression_data/__init__.py b/uploader/expression_data/__init__.py new file mode 100644 index 0000000..b773bce --- /dev/null +++ b/uploader/expression_data/__init__.py @@ -0,0 +1,13 @@ +"""Package handling upload of files.""" +from flask import Blueprint + +from .rqtl2 import rqtl2 +from .index import indexbp +from .parse import parsebp +from .samples import samples + +exprdatabp = Blueprint("expression-data", __name__) +exprdatabp.register_blueprint(indexbp, url_prefix="/") +exprdatabp.register_blueprint(rqtl2, url_prefix="/rqtl2") +exprdatabp.register_blueprint(parsebp, url_prefix="/parse") +exprdatabp.register_blueprint(samples, url_prefix="/sample") diff --git a/uploader/expression_data/index.py b/uploader/expression_data/index.py new file mode 100644 index 0000000..a334c51 --- /dev/null +++ b/uploader/expression_data/index.py @@ -0,0 +1,126 @@ +"""Entry-point module""" +import os +import mimetypes +from typing import Tuple +from zipfile import ZipFile, is_zipfile + +from werkzeug.utils import secure_filename +from flask import ( + flash, + request, + url_for, + redirect, + Blueprint, + render_template, + current_app as app, + send_from_directory) + +from uploader.db import species +from uploader.authorisation import require_login +from uploader.db_utils import with_db_connection + +indexbp = Blueprint("index", __name__) + + +def errors(rqst) -> Tuple[str, ...]: + """Return a tuple of the errors found in the request `rqst`. If no error is + found, then an empty tuple is returned.""" + def __filetype_error__(): + return ( + ("Invalid file type provided.",) + if rqst.form.get("filetype") not in ("average", "standard-error") + else tuple()) + + def __file_missing_error__(): + return ( + ("No file was uploaded.",) + if ("qc_text_file" not in rqst.files or + rqst.files["qc_text_file"].filename == "") + else tuple()) + + def __file_mimetype_error__(): + text_file = rqst.files["qc_text_file"] + return ( + ( + ("Invalid file! Expected a tab-separated-values file, or a zip " + "file of the a tab-separated-values file."),) + if text_file.mimetype not in ( + "text/plain", "text/tab-separated-values", + "application/zip") + else tuple()) + + return ( + __filetype_error__() + + (__file_missing_error__() or __file_mimetype_error__())) + +def zip_file_errors(filepath, upload_dir) -> Tuple[str, ...]: + """Check the uploaded zip file for errors.""" + zfile_errors: Tuple[str, ...] = tuple() + if is_zipfile(filepath): + with ZipFile(filepath, "r") as zfile: + infolist = zfile.infolist() + if len(infolist) != 1: + zfile_errors = zfile_errors + ( + ("Expected exactly one (1) member file within the uploaded zip " + f"file. 
Got {len(infolist)} member files."),) + if len(infolist) == 1 and infolist[0].is_dir(): + zfile_errors = zfile_errors + ( + ("Expected a member text file in the uploaded zip file. Got a " + "directory/folder."),) + + if len(infolist) == 1 and not infolist[0].is_dir(): + zfile.extract(infolist[0], path=upload_dir) + mime = mimetypes.guess_type(f"{upload_dir}/{infolist[0].filename}") + if mime[0] != "text/tab-separated-values": + zfile_errors = zfile_errors + ( + ("Expected the member text file in the uploaded zip file to" + " be a tab-separated file."),) + + return zfile_errors + + +@indexbp.route("/", methods=["GET"]) +@require_login +def index(): + """Display the expression data index page.""" + return render_template("expression-data/index.html") + + +@indexbp.route("/upload", methods=["GET", "POST"]) +@require_login +def upload_file(): + """Enables uploading the files""" + if request.method == "GET": + return render_template( + "select_species.html", species=with_db_connection(species)) + + upload_dir = app.config["UPLOAD_FOLDER"] + request_errors = errors(request) + if request_errors: + for error in request_errors: + flash(error, "alert-danger error-expr-data") + return redirect(url_for("expression-data.index.upload_file")) + + filename = secure_filename(request.files["qc_text_file"].filename) + if not os.path.exists(upload_dir): + os.mkdir(upload_dir) + + filepath = os.path.join(upload_dir, filename) + request.files["qc_text_file"].save(os.path.join(upload_dir, filename)) + + zip_errors = zip_file_errors(filepath, upload_dir) + if zip_errors: + for error in zip_errors: + flash(error, "alert-danger error-expr-data") + return redirect(url_for("expression-data.index.upload_file")) + + return redirect(url_for("expression-data.parse.parse", + speciesid=request.form["speciesid"], + filename=filename, + filetype=request.form["filetype"])) + +@indexbp.route("/data-review", methods=["GET"]) +@require_login +def data_review(): + """Provide some help on data expectations to the user.""" + return render_template("data_review.html") diff --git a/uploader/expression_data/parse.py b/uploader/expression_data/parse.py new file mode 100644 index 0000000..fc1c3f0 --- /dev/null +++ b/uploader/expression_data/parse.py @@ -0,0 +1,178 @@ +"""File parsing module""" +import os + +import jsonpickle +from redis import Redis +from flask import flash, request, url_for, redirect, Blueprint, render_template +from flask import current_app as app + +from quality_control.errors import InvalidValue, DuplicateHeading + +from uploader import jobs +from uploader.dbinsert import species_by_id +from uploader.db_utils import with_db_connection +from uploader.authorisation import require_login + +parsebp = Blueprint("parse", __name__) + +def isinvalidvalue(item): + """Check whether item is of type InvalidValue""" + return isinstance(item, InvalidValue) + +def isduplicateheading(item): + """Check whether item is of type DuplicateHeading""" + return isinstance(item, DuplicateHeading) + +@parsebp.route("/parse", methods=["GET"]) +@require_login +def parse(): + """Trigger file parsing""" + errors = False + speciesid = request.args.get("speciesid") + filename = request.args.get("filename") + filetype = request.args.get("filetype") + if speciesid is None: + flash("No species selected", "alert-error error-expr-data") + errors = True + else: + try: + speciesid = int(speciesid) + species = with_db_connection( + lambda con: species_by_id(con, speciesid)) + if not bool(species): + flash("No such species.", "alert-error error-expr-data") 
+ errors = True + except ValueError: + flash("Invalid speciesid provided. Expected an integer.", + "alert-error error-expr-data") + errors = True + + if filename is None: + flash("No file provided", "alert-error error-expr-data") + errors = True + + if filetype is None: + flash("No filetype provided", "alert-error error-expr-data") + errors = True + + if filetype not in ("average", "standard-error"): + flash("Invalid filetype provided", "alert-error error-expr-data") + errors = True + + if filename: + filepath = os.path.join(app.config["UPLOAD_FOLDER"], filename) + if not os.path.exists(filepath): + flash("Selected file does not exist (any longer)", + "alert-error error-expr-data") + errors = True + + if errors: + return redirect(url_for("expression-data.index.upload_file")) + + redisurl = app.config["REDIS_URL"] + with Redis.from_url(redisurl, decode_responses=True) as rconn: + job = jobs.launch_job( + jobs.build_file_verification_job( + rconn, app.config["SQL_URI"], redisurl, + speciesid, filepath, filetype, + app.config["JOBS_TTL_SECONDS"]), + redisurl, + f"{app.config['UPLOAD_FOLDER']}/job_errors") + + return redirect(url_for("expression-data.parse.parse_status", job_id=job["jobid"])) + +@parsebp.route("/status/", methods=["GET"]) +def parse_status(job_id: str): + "Retrieve the status of the job" + with Redis.from_url(app.config["REDIS_URL"], decode_responses=True) as rconn: + try: + job = jobs.job(rconn, jobs.jobsnamespace(), job_id) + except jobs.JobNotFound as _exc: + return render_template("no_such_job.html", job_id=job_id), 400 + + error_filename = jobs.error_filename( + job_id, f"{app.config['UPLOAD_FOLDER']}/job_errors") + if os.path.exists(error_filename): + stat = os.stat(error_filename) + if stat.st_size > 0: + return redirect(url_for("parse.fail", job_id=job_id)) + + job_id = job["jobid"] + progress = float(job["percent"]) + status = job["status"] + filename = job.get("filename", "uploaded file") + errors = jsonpickle.decode( + job.get("errors", jsonpickle.encode(tuple()))) + if status in ("success", "aborted"): + return redirect(url_for("expression-data.parse.results", job_id=job_id)) + + if status == "parse-error": + return redirect(url_for("parse.fail", job_id=job_id)) + + app.jinja_env.globals.update( + isinvalidvalue=isinvalidvalue, + isduplicateheading=isduplicateheading) + return render_template( + "job_progress.html", + job_id = job_id, + job_status = status, + progress = progress, + message = job.get("message", ""), + job_name = f"Parsing '{filename}'", + errors=errors) + +@parsebp.route("/results/", methods=["GET"]) +def results(job_id: str): + """Show results of parsing...""" + with Redis.from_url(app.config["REDIS_URL"], decode_responses=True) as rconn: + job = jobs.job(rconn, jobs.jobsnamespace(), job_id) + + if job: + filename = job["filename"] + errors = jsonpickle.decode(job.get("errors", jsonpickle.encode(tuple()))) + app.jinja_env.globals.update( + isinvalidvalue=isinvalidvalue, + isduplicateheading=isduplicateheading) + return render_template( + "parse_results.html", + errors=errors, + job_name = f"Parsing '{filename}'", + user_aborted = job.get("user_aborted"), + job_id=job["jobid"]) + + return render_template("no_such_job.html", job_id=job_id) + +@parsebp.route("/fail/", methods=["GET"]) +def fail(job_id: str): + """Handle parsing failure""" + with Redis.from_url(app.config["REDIS_URL"], decode_responses=True) as rconn: + job = jobs.job(rconn, jobs.jobsnamespace(), job_id) + + if job: + error_filename = jobs.error_filename( + job_id, 
f"{app.config['UPLOAD_FOLDER']}/job_errors") + if os.path.exists(error_filename): + stat = os.stat(error_filename) + if stat.st_size > 0: + return render_template( + "worker_failure.html", job_id=job_id) + + return render_template("parse_failure.html", job=job) + + return render_template("no_such_job.html", job_id=job_id) + +@parsebp.route("/abort", methods=["POST"]) +@require_login +def abort(): + """Handle user request to abort file processing""" + job_id = request.form["job_id"] + + with Redis.from_url(app.config["REDIS_URL"], decode_responses=True) as rconn: + job = jobs.job(rconn, jobs.jobsnamespace(), job_id) + + if job: + rconn.hset(name=jobs.job_key(jobs.jobsnamespace(), job_id), + key="user_aborted", + value=int(True)) + + return redirect(url_for("expression-data.parse.parse_status", job_id=job_id)) diff --git a/uploader/expression_data/rqtl2.py b/uploader/expression_data/rqtl2.py new file mode 100644 index 0000000..48df66c --- /dev/null +++ b/uploader/expression_data/rqtl2.py @@ -0,0 +1,1176 @@ +"""Module to handle uploading of R/qtl2 bundles."""#pylint: disable=[too-many-lines] +import sys +import json +import traceback +from pathlib import Path +from datetime import date +from uuid import UUID, uuid4 +from functools import partial +from zipfile import ZipFile, is_zipfile +from typing import Union, Callable, Optional + +import MySQLdb as mdb +from redis import Redis +from MySQLdb.cursors import DictCursor +from werkzeug.utils import secure_filename +from flask import ( + flash, + escape, + request, + jsonify, + url_for, + redirect, + Response, + Blueprint, + render_template, + current_app as app) + +from r_qtl import r_qtl2 + +from uploader import jobs +from uploader.files import save_file, fullpath +from uploader.dbinsert import species as all_species +from uploader.db_utils import with_db_connection, database_connection + +from uploader.authorisation import require_login +from uploader.db.platforms import platform_by_id, platforms_by_species +from uploader.db.averaging import averaging_methods, averaging_method_by_id +from uploader.db.tissues import all_tissues, tissue_by_id, create_new_tissue +from uploader.db import ( + species_by_id, + save_population, + populations_by_species, + population_by_species_and_id,) +from uploader.db.datasets import ( + geno_dataset_by_id, + geno_datasets_by_species_and_population, + + probeset_study_by_id, + probeset_create_study, + probeset_dataset_by_id, + probeset_create_dataset, + probeset_datasets_by_study, + probeset_studies_by_species_and_population) + +rqtl2 = Blueprint("rqtl2", __name__) + + +@rqtl2.route("/", methods=["GET", "POST"]) +@rqtl2.route("/select-species", methods=["GET", "POST"]) +@require_login +def select_species(): + """Select the species.""" + if request.method == "GET": + return render_template("rqtl2/index.html", species=with_db_connection(all_species)) + + species_id = request.form.get("species_id") + species = with_db_connection( + lambda conn: species_by_id(conn, species_id)) + if bool(species): + return redirect(url_for( + "expression-data.rqtl2.select_population", species_id=species_id)) + flash("Invalid species or no species selected!", "alert-error error-rqtl2") + return redirect(url_for("expression-data.rqtl2.select_species")) + + +@rqtl2.route("/upload/species//select-population", + methods=["GET", "POST"]) +@require_login +def select_population(species_id: int): + """Select/Create the population to organise data under.""" + with database_connection(app.config["SQL_URI"]) as conn: + species = 
species_by_id(conn, species_id) + if not bool(species): + flash("Invalid species selected!", "alert-error error-rqtl2") + return redirect(url_for("expression-data.rqtl2.select_species")) + + if request.method == "GET": + return render_template( + "rqtl2/select-population.html", + species=species, + populations=populations_by_species(conn, species_id)) + + population = population_by_species_and_id( + conn, species["SpeciesId"], request.form.get("inbredset_id")) + if not bool(population): + flash("Invalid Population!", "alert-error error-rqtl2") + return redirect( + url_for("expression-data.rqtl2.select_population", pgsrc="error"), + code=307) + + return redirect(url_for("expression-data.rqtl2.upload_rqtl2_bundle", + species_id=species["SpeciesId"], + population_id=population["InbredSetId"])) + + +@rqtl2.route("/upload/species//create-population", + methods=["POST"]) +@require_login +def create_population(species_id: int): + """Create a new population for the given species.""" + population_page = redirect(url_for("expression-data.rqtl2.select_population", + species_id=species_id)) + with database_connection(app.config["SQL_URI"]) as conn: + species = species_by_id(conn, species_id) + population_name = request.form.get("inbredset_name", "").strip() + population_fullname = request.form.get("inbredset_fullname", "").strip() + if not bool(species): + flash("Invalid species!", "alert-error error-rqtl2") + return redirect(url_for("expression-data.rqtl2.select_species")) + if not bool(population_name): + flash("Invalid Population Name!", "alert-error error-rqtl2") + return population_page + if not bool(population_fullname): + flash("Invalid Population Full Name!", "alert-error error-rqtl2") + return population_page + new_population = save_population(conn, { + "SpeciesId": species["SpeciesId"], + "Name": population_name, + "InbredSetName": population_fullname, + "FullName": population_fullname, + "Family": request.form.get("inbredset_family") or None, + "Description": request.form.get("description") or None + }) + + flash("Population created successfully.", "alert-success") + return redirect( + url_for("expression-data.rqtl2.upload_rqtl2_bundle", + species_id=species_id, + population_id=new_population["population_id"], + pgsrc="create-population"), + code=307) + + +class __RequestError__(Exception): #pylint: disable=[invalid-name] + """Internal class to avoid pylint's `too-many-return-statements` error.""" + + +@rqtl2.route(("/upload/species//population/" + "/rqtl2-bundle"), + methods=["GET", "POST"]) +@require_login +def upload_rqtl2_bundle(species_id: int, population_id: int): + """Allow upload of R/qtl2 bundle.""" + with database_connection(app.config["SQL_URI"]) as conn: + species = species_by_id(conn, species_id) + population = population_by_species_and_id( + conn, species["SpeciesId"], population_id) + if not bool(species): + flash("Invalid species!", "alert-error error-rqtl2") + return redirect(url_for("expression-data.rqtl2.select_species")) + if not bool(population): + flash("Invalid Population!", "alert-error error-rqtl2") + return redirect( + url_for("expression-data.rqtl2.select_population", pgsrc="error"), + code=307) + if request.method == "GET" or ( + request.method == "POST" + and bool(request.args.get("pgsrc"))): + return render_template("rqtl2/upload-rqtl2-bundle-step-01.html", + species=species, + population=population) + + try: + app.logger.debug("Files in the form: %s", request.files) + the_file = save_file(request.files["rqtl2_bundle_file"], + Path(app.config["UPLOAD_FOLDER"])) 
+ except AssertionError: + app.logger.debug(traceback.format_exc()) + flash("Please provide a valid R/qtl2 zip bundle.", + "alert-error error-rqtl2") + return redirect(url_for("expression-data.rqtl2.upload_rqtl2_bundle", + species_id=species_id, + population_id=population_id)) + + if not is_zipfile(str(the_file)): + app.logger.debug("The file is not a zip file.") + raise __RequestError__("Invalid file! Expected a zip file.") + + jobid = trigger_rqtl2_bundle_qc( + species_id, + population_id, + the_file, + request.files["rqtl2_bundle_file"].filename)#type: ignore[arg-type] + return redirect(url_for( + "expression-data.rqtl2.rqtl2_bundle_qc_status", jobid=jobid)) + + +def trigger_rqtl2_bundle_qc( + species_id: int, + population_id: int, + rqtl2bundle: Path, + originalfilename: str +) -> UUID: + """Trigger QC on the R/qtl2 bundle.""" + redisuri = app.config["REDIS_URL"] + with Redis.from_url(redisuri, decode_responses=True) as rconn: + jobid = uuid4() + redis_ttl_seconds = app.config["JOBS_TTL_SECONDS"] + jobs.launch_job( + jobs.initialise_job( + rconn, + jobs.jobsnamespace(), + str(jobid), + [sys.executable, "-m", "scripts.qc_on_rqtl2_bundle", + app.config["SQL_URI"], app.config["REDIS_URL"], + jobs.jobsnamespace(), str(jobid), str(species_id), + str(population_id), "--redisexpiry", + str(redis_ttl_seconds)], + "rqtl2-bundle-qc-job", + redis_ttl_seconds, + {"job-metadata": json.dumps({ + "speciesid": species_id, + "populationid": population_id, + "rqtl2-bundle-file": str(rqtl2bundle.absolute()), + "original-filename": originalfilename})}), + redisuri, + f"{app.config['UPLOAD_FOLDER']}/job_errors") + return jobid + + +def chunk_name(uploadfilename: str, chunkno: int) -> str: + """Generate chunk name from original filename and chunk number""" + if uploadfilename == "": + raise ValueError("Name cannot be empty!") + if chunkno < 1: + raise ValueError("Chunk number must be greater than zero") + return f"{secure_filename(uploadfilename)}_part_{chunkno:05d}" + + +def chunks_directory(uniqueidentifier: str) -> Path: + """Compute the directory where chunks are temporarily stored.""" + if uniqueidentifier == "": + raise ValueError("Unique identifier cannot be empty!") + return Path(app.config["UPLOAD_FOLDER"], f"tempdir_{uniqueidentifier}") + + +@rqtl2.route(("/upload/species//population/" + "/rqtl2-bundle-chunked"), + methods=["GET"]) +@require_login +def upload_rqtl2_bundle_chunked_get(# pylint: disable=["unused-argument"] + species_id: int, + population_id: int +): + """ + Extension to the `upload_rqtl2_bundle` endpoint above that provides a way + for testing whether all the chunks have been uploaded and to assist with + resuming a failed expression-data. 
+ """ + fileid = request.args.get("resumableIdentifier", type=str) or "" + filename = request.args.get("resumableFilename", type=str) or "" + chunk = request.args.get("resumableChunkNumber", type=int) or 0 + if not(fileid or filename or chunk): + return jsonify({ + "message": "At least one required query parameter is missing.", + "error": "BadRequest", + "statuscode": 400 + }), 400 + + if Path(chunks_directory(fileid), + chunk_name(filename, chunk)).exists(): + return "OK" + + return jsonify({ + "message": f"Chunk {chunk} was not found.", + "error": "NotFound", + "statuscode": 404 + }), 404 + + +def __merge_chunks__(targetfile: Path, chunkpaths: tuple[Path, ...]) -> Path: + """Merge the chunks into a single file.""" + with open(targetfile, "ab") as _target: + for chunkfile in chunkpaths: + with open(chunkfile, "rb") as _chunkdata: + _target.write(_chunkdata.read()) + + chunkfile.unlink() + return targetfile + + +@rqtl2.route(("/upload/species//population/" + "/rqtl2-bundle-chunked"), + methods=["POST"]) +@require_login +def upload_rqtl2_bundle_chunked_post(species_id: int, population_id: int): + """ + Extension to the `upload_rqtl2_bundle` endpoint above that allows large + files to be uploaded in chunks. + + This should hopefully speed up uploads, and if done right, even enable + resumable uploads + """ + _totalchunks = request.form.get("resumableTotalChunks", type=int) or 0 + _chunk = request.form.get("resumableChunkNumber", default=1, type=int) + _uploadfilename = request.form.get( + "resumableFilename", default="", type=str) or "" + _fileid = request.form.get( + "resumableIdentifier", default="", type=str) or "" + _targetfile = Path(app.config["UPLOAD_FOLDER"], _fileid) + + if _targetfile.exists(): + return jsonify({ + "message": ( + "A file with a similar unique identifier has previously been " + "uploaded and possibly is/has being/been processed."), + "error": "BadRequest", + "statuscode": 400 + }), 400 + + try: + # save chunk data + chunks_directory(_fileid).mkdir(exist_ok=True, parents=True) + request.files["file"].save(Path(chunks_directory(_fileid), + chunk_name(_uploadfilename, _chunk))) + + # Check whether upload is complete + chunkpaths = tuple( + Path(chunks_directory(_fileid), chunk_name(_uploadfilename, _achunk)) + for _achunk in range(1, _totalchunks+1)) + if all(_file.exists() for _file in chunkpaths): + # merge_files and clean up chunks + __merge_chunks__(_targetfile, chunkpaths) + chunks_directory(_fileid).rmdir() + jobid = trigger_rqtl2_bundle_qc( + species_id, population_id, _targetfile, _uploadfilename) + return url_for( + "expression-data.rqtl2.rqtl2_bundle_qc_status", jobid=jobid) + except Exception as exc:# pylint: disable=[broad-except] + msg = "Error processing uploaded file chunks." 
+ app.logger.error(msg, exc_info=True, stack_info=True) + return jsonify({ + "message": msg, + "error": type(exc).__name__, + "error-description": " ".join(str(arg) for arg in exc.args), + "error-trace": traceback.format_exception(exc) + }), 500 + + return "OK" + + +@rqtl2.route("/upload/species/rqtl2-bundle/qc-status/", + methods=["GET", "POST"]) +@require_login +def rqtl2_bundle_qc_status(jobid: UUID): + """Check the status of the QC jobs.""" + with (Redis.from_url(app.config["REDIS_URL"], decode_responses=True) as rconn, + database_connection(app.config["SQL_URI"]) as dbconn): + try: + thejob = jobs.job(rconn, jobs.jobsnamespace(), jobid) + messagelistname = thejob.get("log-messagelist") + logmessages = (rconn.lrange(messagelistname, 0, -1) + if bool(messagelistname) else []) + jobstatus = thejob["status"] + if jobstatus == "error": + return render_template("rqtl2/rqtl2-qc-job-error.html", + job=thejob, + errorsgeneric=json.loads( + thejob.get("errors-generic", "[]")), + errorsgeno=json.loads( + thejob.get("errors-geno", "[]")), + errorspheno=json.loads( + thejob.get("errors-pheno", "[]")), + errorsphenose=json.loads( + thejob.get("errors-phenose", "[]")), + errorsphenocovar=json.loads( + thejob.get("errors-phenocovar", "[]")), + messages=logmessages) + if jobstatus == "success": + jobmeta = json.loads(thejob["job-metadata"]) + species = species_by_id(dbconn, jobmeta["speciesid"]) + return render_template( + "rqtl2/rqtl2-qc-job-results.html", + species=species, + population=population_by_species_and_id( + dbconn, species["SpeciesId"], jobmeta["populationid"]), + rqtl2bundle=Path(jobmeta["rqtl2-bundle-file"]).name, + rqtl2bundleorig=jobmeta["original-filename"]) + + def compute_percentage(thejob, filetype) -> Union[str, None]: + if f"{filetype}-linecount" in thejob: + return "100" + if f"{filetype}-filesize" in thejob: + percent = ((int(thejob.get(f"{filetype}-checked", 0)) + / + int(thejob.get(f"{filetype}-filesize", 1))) + * 100) + return f"{percent:.2f}" + return None + + return render_template( + "rqtl2/rqtl2-qc-job-status.html", + job=thejob, + geno_percent=compute_percentage(thejob, "geno"), + pheno_percent=compute_percentage(thejob, "pheno"), + phenose_percent=compute_percentage(thejob, "phenose"), + messages=logmessages) + except jobs.JobNotFound: + return render_template("rqtl2/no-such-job.html", jobid=jobid) + + +def redirect_on_error(flaskroute, **kwargs): + """Utility to redirect on error""" + return redirect(url_for(flaskroute, **kwargs, pgsrc="error"), + code=(307 if request.method == "POST" else 302)) + + +def check_species(conn: mdb.Connection, formargs: dict) -> Optional[ + tuple[str, Response]]: + """ + Check whether the 'species_id' value is provided, and whether a + corresponding species exists in the database. + + Maybe give the function a better name...""" + speciespage = redirect_on_error("expression-data.rqtl2.select_species") + if "species_id" not in formargs: + return "You MUST provide the Species identifier.", speciespage + + if not bool(species_by_id(conn, formargs["species_id"])): + return "No species with the provided identifier exists.", speciespage + + return None + + +def check_population(conn: mdb.Connection, + formargs: dict, + species_id) -> Optional[tuple[str, Response]]: + """ + Check whether the 'population_id' value is provided, and whether a + corresponding population exists in the database. 
+ + Maybe give the function a better name...""" + poppage = redirect_on_error( + "expression-data.rqtl2.select_species", species_id=species_id) + if "population_id" not in formargs: + return "You MUST provide the Population identifier.", poppage + + if not bool(population_by_species_and_id( + conn, species_id, formargs["population_id"])): + return "No population with the provided identifier exists.", poppage + + return None + + +def check_r_qtl2_bundle(formargs: dict, + species_id, + population_id) -> Optional[tuple[str, Response]]: + """Check for the existence of the R/qtl2 bundle.""" + fileuploadpage = redirect_on_error("expression-data.rqtl2.upload_rqtl2_bundle", + species_id=species_id, + population_id=population_id) + if not "rqtl2_bundle_file" in formargs: + return ( + "You MUST provide a R/qtl2 zip bundle for expression-data.", fileuploadpage) + + if not Path(fullpath(formargs["rqtl2_bundle_file"])).exists(): + return "No R/qtl2 bundle with the given name exists.", fileuploadpage + + return None + + +def check_geno_dataset(conn: mdb.Connection, + formargs: dict, + species_id, + population_id) -> Optional[tuple[str, Response]]: + """Check for the Genotype dataset.""" + genodsetpg = redirect_on_error("expression-data.rqtl2.select_dataset_info", + species_id=species_id, + population_id=population_id) + if not bool(formargs.get("geno-dataset-id")): + return ( + "You MUST provide a valid Genotype dataset identifier", genodsetpg) + + with conn.cursor(cursorclass=DictCursor) as cursor: + cursor.execute("SELECT * FROM GenoFreeze WHERE Id=%s", + (formargs["geno-dataset-id"],)) + results = cursor.fetchall() + if not bool(results): + return ("No genotype dataset with the provided identifier exists.", + genodsetpg) + if len(results) > 1: + return ( + "Data corruption: More than one genotype dataset with the same " + "identifier.", + genodsetpg) + + return None + +def check_tissue( + conn: mdb.Connection,formargs: dict) -> Optional[tuple[str, Response]]: + """Check for tissue/organ/biological material.""" + selectdsetpg = redirect_on_error("expression-data.rqtl2.select_dataset_info", + species_id=formargs["species_id"], + population_id=formargs["population_id"]) + if not bool(formargs.get("tissueid", "").strip()): + return ("No tissue/organ/biological material provided.", selectdsetpg) + + with conn.cursor(cursorclass=DictCursor) as cursor: + cursor.execute("SELECT * FROM Tissue WHERE Id=%s", + (formargs["tissueid"],)) + results = cursor.fetchall() + if not bool(results): + return ("No tissue/organ with the provided identifier exists.", + selectdsetpg) + + if len(results) > 1: + return ( + "Data corruption: More than one tissue/organ with the same " + "identifier.", + selectdsetpg) + + return None + + +def check_probe_study(conn: mdb.Connection, + formargs: dict, + species_id, + population_id) -> Optional[tuple[str, Response]]: + """Check for the ProbeSet study.""" + dsetinfopg = redirect_on_error("expression-data.rqtl2.select_dataset_info", + species_id=species_id, + population_id=population_id) + if not bool(formargs.get("probe-study-id")): + return "No probeset study was selected!", dsetinfopg + + if not bool(probeset_study_by_id(conn, formargs["probe-study-id"])): + return ("No probeset study with the provided identifier exists", + dsetinfopg) + + return None + + +def check_probe_dataset(conn: mdb.Connection, + formargs: dict, + species_id, + population_id) -> Optional[tuple[str, Response]]: + """Check for the ProbeSet dataset.""" + dsetinfopg = 
redirect_on_error("expression-data.rqtl2.select_dataset_info", + species_id=species_id, + population_id=population_id) + if not bool(formargs.get("probe-dataset-id")): + return "No probeset dataset was selected!", dsetinfopg + + if not bool(probeset_dataset_by_id(conn, formargs["probe-dataset-id"])): + return ("No probeset dataset with the provided identifier exists", + dsetinfopg) + + return None + + +def with_errors(endpointthunk: Callable, *checkfns): + """Run 'endpointthunk' with error checking.""" + formargs = {**dict(request.args), **dict(request.form)} + errors = tuple(item for item in (_fn(formargs=formargs) for _fn in checkfns) + if item is not None) + if len(errors) > 0: + flash(errors[0][0], "alert-error error-rqtl2") + return errors[0][1] + + return endpointthunk() + + +@rqtl2.route(("/upload/species//population/" + "/rqtl2-bundle/select-geno-dataset"), + methods=["POST"]) +@require_login +def select_geno_dataset(species_id: int, population_id: int): + """Select from existing geno datasets.""" + with database_connection(app.config["SQL_URI"]) as conn: + def __thunk__(): + geno_dset = geno_datasets_by_species_and_population( + conn, species_id, population_id) + if not bool(geno_dset): + flash("No genotype dataset was provided!", + "alert-error error-rqtl2") + return redirect(url_for("expression-data.rqtl2.select_geno_dataset", + species_id=species_id, + population_id=population_id, + pgsrc="error"), + code=307) + + flash("Genotype accepted", "alert-success error-rqtl2") + return redirect(url_for("expression-data.rqtl2.select_dataset_info", + species_id=species_id, + population_id=population_id, + pgsrc="expression-data.rqtl2.select_geno_dataset"), + code=307) + + return with_errors(__thunk__, + partial(check_species, conn=conn), + partial(check_population, conn=conn, + species_id=species_id), + partial(check_r_qtl2_bundle, + species_id=species_id, + population_id=population_id), + partial(check_geno_dataset, + conn=conn, + species_id=species_id, + population_id=population_id)) + + +@rqtl2.route(("/upload/species//population/" + "/rqtl2-bundle/create-geno-dataset"), + methods=["POST"]) +@require_login +def create_geno_dataset(species_id: int, population_id: int): + """Create a new geno dataset.""" + with database_connection(app.config["SQL_URI"]) as conn: + def __thunk__(): + sgeno_page = redirect(url_for("expression-data.rqtl2.select_dataset_info", + species_id=species_id, + population_id=population_id, + pgsrc="error"), + code=307) + errorclasses = "alert-error error-rqtl2 error-rqtl2-create-geno-dataset" + if not bool(request.form.get("dataset-name")): + flash("You must provide the dataset name", errorclasses) + return sgeno_page + if not bool(request.form.get("dataset-fullname")): + flash("You must provide the dataset full name", errorclasses) + return sgeno_page + public = 2 if request.form.get("dataset-public") == "on" else 0 + + with conn.cursor(cursorclass=DictCursor) as cursor: + datasetname = request.form["dataset-name"] + new_dataset = { + "name": datasetname, + "fname": request.form.get("dataset-fullname"), + "sname": request.form.get("dataset-shortname") or datasetname, + "today": date.today().isoformat(), + "pub": public, + "isetid": population_id + } + cursor.execute("SELECT * FROM GenoFreeze WHERE Name=%s", + (datasetname,)) + results = cursor.fetchall() + if bool(results): + flash( + f"A genotype dataset with name '{escape(datasetname)}' " + "already exists.", + errorclasses) + return redirect(url_for("expression-data.rqtl2.select_dataset_info", + 
species_id=species_id, + population_id=population_id, + pgsrc="error"), + code=307) + cursor.execute( + "INSERT INTO GenoFreeze(" + "Name, FullName, ShortName, CreateTime, public, InbredSetId" + ") " + "VALUES(" + "%(name)s, %(fname)s, %(sname)s, %(today)s, %(pub)s, %(isetid)s" + ")", + new_dataset) + flash("Created dataset successfully.", "alert-success") + return render_template( + "rqtl2/create-geno-dataset-success.html", + species=species_by_id(conn, species_id), + population=population_by_species_and_id( + conn, species_id, population_id), + rqtl2_bundle_file=request.form["rqtl2_bundle_file"], + geno_dataset={**new_dataset, "id": cursor.lastrowid}) + + return with_errors(__thunk__, + partial(check_species, conn=conn), + partial(check_population, conn=conn, species_id=species_id), + partial(check_r_qtl2_bundle, + species_id=species_id, + population_id=population_id)) + + +@rqtl2.route(("/upload/species//population/" + "/rqtl2-bundle/select-tissue"), + methods=["POST"]) +@require_login +def select_tissue(species_id: int, population_id: int): + """Select from existing tissues.""" + with database_connection(app.config["SQL_URI"]) as conn: + def __thunk__(): + if not bool(request.form.get("tissueid", "").strip()): + flash("Invalid tissue selection!", + "alert-error error-select-tissue error-rqtl2") + + return redirect(url_for("expression-data.rqtl2.select_dataset_info", + species_id=species_id, + population_id=population_id, + pgsrc="expression-data.rqtl2.select_geno_dataset"), + code=307) + + return with_errors(__thunk__, + partial(check_species, conn=conn), + partial(check_population, + conn=conn, + species_id=species_id), + partial(check_r_qtl2_bundle, + species_id=species_id, + population_id=population_id), + partial(check_geno_dataset, + conn=conn, + species_id=species_id, + population_id=population_id)) + +@rqtl2.route(("/upload/species//population/" + "/rqtl2-bundle/create-tissue"), + methods=["POST"]) +@require_login +def create_tissue(species_id: int, population_id: int): + """Add new tissue, organ or biological material to the system.""" + form = request.form + datasetinfopage = redirect( + url_for("expression-data.rqtl2.select_dataset_info", + species_id=species_id, + population_id=population_id, + pgsrc="expression-data.rqtl2.select_geno_dataset"), + code=307) + with database_connection(app.config["SQL_URI"]) as conn: + tissuename = form.get("tissuename", "").strip() + tissueshortname = form.get("tissueshortname", "").strip() + if not bool(tissuename): + flash("Organ/Tissue name MUST be provided.", + "alert-error error-create-tissue error-rqtl2") + return datasetinfopage + + if not bool(tissueshortname): + flash("Organ/Tissue short name MUST be provided.", + "alert-error error-create-tissue error-rqtl2") + return datasetinfopage + + try: + tissue = create_new_tissue(conn, tissuename, tissueshortname) + flash("Tissue created successfully!", "alert-success") + return render_template( + "rqtl2/create-tissue-success.html", + species=species_by_id(conn, species_id), + population=population_by_species_and_id( + conn, species_id, population_id), + rqtl2_bundle_file=request.form["rqtl2_bundle_file"], + geno_dataset=geno_dataset_by_id( + conn, + int(request.form["geno-dataset-id"])), + tissue=tissue) + except mdb.IntegrityError as _ierr: + flash("Tissue/Organ with that short name already exists!", + "alert-error error-create-tissue error-rqtl2") + return datasetinfopage + + +@rqtl2.route(("/upload/species//population/" + "/rqtl2-bundle/select-probeset-study"), + methods=["POST"]) 
+@require_login +def select_probeset_study(species_id: int, population_id: int): + """Select or create a probeset study.""" + with database_connection(app.config["SQL_URI"]) as conn: + def __thunk__(): + summary_page = redirect(url_for("expression-data.rqtl2.select_dataset_info", + species_id=species_id, + population_id=population_id), + code=307) + if not bool(probeset_study_by_id(conn, int(request.form["probe-study-id"]))): + flash("Invalid study selected!", "alert-error error-rqtl2") + return summary_page + + return summary_page + return with_errors(__thunk__, + partial(check_species, conn=conn), + partial(check_population, + conn=conn, + species_id=species_id), + partial(check_r_qtl2_bundle, + species_id=species_id, + population_id=population_id), + partial(check_geno_dataset, + conn=conn, + species_id=species_id, + population_id=population_id), + partial(check_tissue, conn=conn), + partial(check_probe_study, + conn=conn, + species_id=species_id, + population_id=population_id)) + + +@rqtl2.route(("/upload/species//population/" + "/rqtl2-bundle/select-probeset-dataset"), + methods=["POST"]) +@require_login +def select_probeset_dataset(species_id: int, population_id: int): + """Select or create a probeset dataset.""" + with database_connection(app.config["SQL_URI"]) as conn: + def __thunk__(): + summary_page = redirect(url_for("expression-data.rqtl2.select_dataset_info", + species_id=species_id, + population_id=population_id), + code=307) + if not bool(probeset_study_by_id(conn, int(request.form["probe-study-id"]))): + flash("Invalid study selected!", "alert-error error-rqtl2") + return summary_page + + return summary_page + + return with_errors(__thunk__, + partial(check_species, conn=conn), + partial(check_population, + conn=conn, + species_id=species_id), + partial(check_r_qtl2_bundle, + species_id=species_id, + population_id=population_id), + partial(check_geno_dataset, + conn=conn, + species_id=species_id, + population_id=population_id), + partial(check_tissue, conn=conn), + partial(check_probe_study, + conn=conn, + species_id=species_id, + population_id=population_id), + partial(check_probe_dataset, + conn=conn, + species_id=species_id, + population_id=population_id)) + + +@rqtl2.route(("/upload/species//population/" + "/rqtl2-bundle/create-probeset-study"), + methods=["POST"]) +@require_login +def create_probeset_study(species_id: int, population_id: int): + """Create a new probeset study.""" + errorclasses = "alert-error error-rqtl2 error-rqtl2-create-probeset-study" + with database_connection(app.config["SQL_URI"]) as conn: + def __thunk__(): + form = request.form + dataset_info_page = redirect( + url_for("expression-data.rqtl2.select_dataset_info", + species_id=species_id, + population_id=population_id), + code=307) + + if not (bool(form.get("platformid")) and + bool(platform_by_id(conn, int(form["platformid"])))): + flash("Invalid platform selected.", errorclasses) + return dataset_info_page + + if not (bool(form.get("tissueid")) and + bool(tissue_by_id(conn, int(form["tissueid"])))): + flash("Invalid tissue selected.", errorclasses) + return dataset_info_page + + studyname = form["studyname"] + try: + study = probeset_create_study( + conn, population_id, int(form["platformid"]), int(form["tissueid"]), + studyname, form.get("studyfullname") or "", + form.get("studyshortname") or "") + except mdb.IntegrityError as _ierr: + flash(f"ProbeSet study with name '{escape(studyname)}' already " + "exists.", + errorclasses) + return dataset_info_page + return render_template( + 
"rqtl2/create-probe-study-success.html", + species=species_by_id(conn, species_id), + population=population_by_species_and_id( + conn, species_id, population_id), + rqtl2_bundle_file=request.form["rqtl2_bundle_file"], + geno_dataset=geno_dataset_by_id( + conn, + int(request.form["geno-dataset-id"])), + study=study) + + return with_errors(__thunk__, + partial(check_species, conn=conn), + partial(check_population, + conn=conn, + species_id=species_id), + partial(check_r_qtl2_bundle, + species_id=species_id, + population_id=population_id), + partial(check_geno_dataset, + conn=conn, + species_id=species_id, + population_id=population_id), + partial(check_tissue, conn=conn)) + + +@rqtl2.route(("/upload/species//population/" + "/rqtl2-bundle/create-probeset-dataset"), + methods=["POST"]) +@require_login +def create_probeset_dataset(species_id: int, population_id: int):#pylint: disable=[too-many-return-statements] + """Create a new probeset dataset.""" + errorclasses = "alert-error error-rqtl2 error-rqtl2-create-probeset-dataset" + with database_connection(app.config["SQL_URI"]) as conn: + def __thunk__():#pylint: disable=[too-many-return-statements] + form = request.form + summary_page = redirect(url_for("expression-data.rqtl2.select_dataset_info", + species_id=species_id, + population_id=population_id), + code=307) + if not bool(form.get("averageid")): + flash("Averaging method not selected!", errorclasses) + return summary_page + if not bool(form.get("datasetname")): + flash("Dataset name not provided!", errorclasses) + return summary_page + if not bool(form.get("datasetfullname")): + flash("Dataset full name not provided!", errorclasses) + return summary_page + + tissue = tissue_by_id(conn, form.get("tissueid", "").strip()) + + study = probeset_study_by_id(conn, int(form["probe-study-id"])) + if not bool(study): + flash("Invalid ProbeSet study provided!", errorclasses) + return summary_page + + avgmethod = averaging_method_by_id(conn, int(form["averageid"])) + if not bool(avgmethod): + flash("Invalid averaging method provided!", errorclasses) + return summary_page + + try: + dset = probeset_create_dataset(conn, + int(form["probe-study-id"]), + int(form["averageid"]), + form["datasetname"], + form["datasetfullname"], + form["datasetshortname"], + form["datasetpublic"] == "on", + form.get( + "datasetdatascale", "log2")) + except mdb.IntegrityError as _ierr: + app.logger.debug("Possible integrity error: %s", traceback.format_exc()) + flash(("IntegrityError: The data you provided has some errors: " + f"{_ierr.args}"), + errorclasses) + return summary_page + except Exception as _exc:# pylint: disable=[broad-except] + app.logger.debug("Error creating ProbeSet dataset: %s", + traceback.format_exc()) + flash(("There was a problem creating your dataset. 
Please try " + "again."), + errorclasses) + return summary_page + return render_template( + "rqtl2/create-probe-dataset-success.html", + species=species_by_id(conn, species_id), + population=population_by_species_and_id( + conn, species_id, population_id), + rqtl2_bundle_file=request.form["rqtl2_bundle_file"], + geno_dataset=geno_dataset_by_id( + conn, + int(request.form["geno-dataset-id"])), + tissue=tissue, + study=study, + avgmethod=avgmethod, + dataset=dset) + + return with_errors(__thunk__, + partial(check_species, conn=conn), + partial(check_population, + conn=conn, + species_id=species_id), + partial(check_r_qtl2_bundle, + species_id=species_id, + population_id=population_id), + partial(check_geno_dataset, + conn=conn, + species_id=species_id, + population_id=population_id), + partial(check_tissue, conn=conn), + partial(check_probe_study, + conn=conn, + species_id=species_id, + population_id=population_id)) + + +@rqtl2.route(("/upload/species//population/" + "/rqtl2-bundle/dataset-info"), + methods=["POST"]) +@require_login +def select_dataset_info(species_id: int, population_id: int): + """ + If `geno` files exist in the R/qtl2 bundle, prompt user to provide the + dataset the genotypes belong to. + """ + form = request.form + with database_connection(app.config["SQL_URI"]) as conn: + def __thunk__(): + species = species_by_id(conn, species_id) + population = population_by_species_and_id( + conn, species_id, population_id) + thefile = fullpath(form["rqtl2_bundle_file"]) + with ZipFile(str(thefile), "r") as zfile: + cdata = r_qtl2.control_data(zfile) + + geno_dataset = geno_dataset_by_id( + conn,form.get("geno-dataset-id", "").strip()) + if "geno" in cdata and not bool(form.get("geno-dataset-id")): + return render_template( + "rqtl2/select-geno-dataset.html", + species=species, + population=population, + rqtl2_bundle_file=thefile.name, + datasets=geno_datasets_by_species_and_population( + conn, species_id, population_id)) + + tissue = tissue_by_id(conn, form.get("tissueid", "").strip()) + if "pheno" in cdata and not bool(tissue): + return render_template( + "rqtl2/select-tissue.html", + species=species, + population=population, + rqtl2_bundle_file=thefile.name, + geno_dataset=geno_dataset, + studies=probeset_studies_by_species_and_population( + conn, species_id, population_id), + platforms=platforms_by_species(conn, species_id), + tissues=all_tissues(conn)) + + probeset_study = probeset_study_by_id( + conn, form.get("probe-study-id", "").strip()) + if "pheno" in cdata and not bool(probeset_study): + return render_template( + "rqtl2/select-probeset-study-id.html", + species=species, + population=population, + rqtl2_bundle_file=thefile.name, + geno_dataset=geno_dataset, + studies=probeset_studies_by_species_and_population( + conn, species_id, population_id), + platforms=platforms_by_species(conn, species_id), + tissue=tissue) + probeset_study = probeset_study_by_id( + conn, int(form["probe-study-id"])) + + probeset_dataset = probeset_dataset_by_id( + conn, form.get("probe-dataset-id", "").strip()) + if "pheno" in cdata and not bool(probeset_dataset): + return render_template( + "rqtl2/select-probeset-dataset.html", + species=species, + population=population, + rqtl2_bundle_file=thefile.name, + geno_dataset=geno_dataset, + probe_study=probeset_study, + tissue=tissue, + datasets=probeset_datasets_by_study( + conn, int(form["probe-study-id"])), + avgmethods=averaging_methods(conn)) + + return render_template("rqtl2/summary-info.html", + species=species, + population=population, + 
rqtl2_bundle_file=thefile.name, + geno_dataset=geno_dataset, + tissue=tissue, + probe_study=probeset_study, + probe_dataset=probeset_dataset) + + return with_errors(__thunk__, + partial(check_species, conn=conn), + partial(check_population, + conn=conn, + species_id=species_id), + partial(check_r_qtl2_bundle, + species_id=species_id, + population_id=population_id)) + + +@rqtl2.route(("/upload/species//population/" + "/rqtl2-bundle/confirm-bundle-details"), + methods=["POST"]) +@require_login +def confirm_bundle_details(species_id: int, population_id: int): + """Confirm the details and trigger R/qtl2 bundle processing...""" + redisuri = app.config["REDIS_URL"] + with (database_connection(app.config["SQL_URI"]) as conn, + Redis.from_url(redisuri, decode_responses=True) as rconn): + def __thunk__(): + redis_ttl_seconds = app.config["JOBS_TTL_SECONDS"] + jobid = str(uuid4()) + _job = jobs.launch_job( + jobs.initialise_job( + rconn, + jobs.jobsnamespace(), + jobid, + [ + sys.executable, "-m", "scripts.process_rqtl2_bundle", + app.config["SQL_URI"], app.config["REDIS_URL"], + jobs.jobsnamespace(), jobid, "--redisexpiry", + str(redis_ttl_seconds)], + "R/qtl2 Bundle Upload", + redis_ttl_seconds, + { + "bundle-metadata": json.dumps({ + "speciesid": species_id, + "populationid": population_id, + "rqtl2-bundle-file": str(fullpath( + request.form["rqtl2_bundle_file"])), + "geno-dataset-id": request.form.get( + "geno-dataset-id", ""), + "probe-study-id": request.form.get( + "probe-study-id", ""), + "probe-dataset-id": request.form.get( + "probe-dataset-id", ""), + **({ + "platformid": probeset_study_by_id( + conn, + int(request.form["probe-study-id"]))["ChipId"] + } if bool(request.form.get("probe-study-id")) else {}) + }) + }), + redisuri, + f"{app.config['UPLOAD_FOLDER']}/job_errors") + + return redirect(url_for("expression-data.rqtl2.rqtl2_processing_status", + jobid=jobid)) + + return with_errors(__thunk__, + partial(check_species, conn=conn), + partial(check_population, + conn=conn, + species_id=species_id), + partial(check_r_qtl2_bundle, + species_id=species_id, + population_id=population_id), + partial(check_geno_dataset, + conn=conn, + species_id=species_id, + population_id=population_id), + partial(check_probe_study, + conn=conn, + species_id=species_id, + population_id=population_id), + partial(check_probe_dataset, + conn=conn, + species_id=species_id, + population_id=population_id)) + + +@rqtl2.route("/status/") +def rqtl2_processing_status(jobid: UUID): + """Retrieve the status of the job processing the uploaded R/qtl2 bundle.""" + with Redis.from_url(app.config["REDIS_URL"], decode_responses=True) as rconn: + try: + thejob = jobs.job(rconn, jobs.jobsnamespace(), jobid) + + messagelistname = thejob.get("log-messagelist") + logmessages = (rconn.lrange(messagelistname, 0, -1) + if bool(messagelistname) else []) + + if thejob["status"] == "error": + return render_template( + "rqtl2/rqtl2-job-error.html", job=thejob, messages=logmessages) + if thejob["status"] == "success": + return render_template("rqtl2/rqtl2-job-results.html", + job=thejob, + messages=logmessages) + + return render_template( + "rqtl2/rqtl2-job-status.html", job=thejob, messages=logmessages) + except jobs.JobNotFound as _exc: + return render_template("rqtl2/no-such-job.html", jobid=jobid) diff --git a/uploader/expression_data/samples.py b/uploader/expression_data/samples.py new file mode 100644 index 0000000..95b9b73 --- /dev/null +++ b/uploader/expression_data/samples.py @@ -0,0 +1,359 @@ +"""Code regarding samples""" 
+import os +import sys +import csv +import uuid +from pathlib import Path +from typing import Iterator + +import MySQLdb as mdb +from redis import Redis +from MySQLdb.cursors import DictCursor +from flask import ( + flash, + request, + url_for, + redirect, + Blueprint, + render_template, + current_app as app) + +from functional_tools import take + +from uploader import jobs +from uploader.files import save_file +from uploader.authorisation import require_login +from uploader.input_validation import is_integer_input +from uploader.db_utils import ( + with_db_connection, + database_connection, + with_redis_connection) +from uploader.db import ( + species_by_id, + save_population, + population_by_id, + populations_by_species, + species as fetch_species) + +samples = Blueprint("samples", __name__) + +@samples.route("/upload/species", methods=["GET", "POST"]) +@require_login +def select_species(): + """Select the species.""" + if request.method == "GET": + return render_template("samples/select-species.html", + species=with_db_connection(fetch_species)) + + index_page = redirect(url_for("expression-data.index.upload_file")) + species_id = request.form.get("species_id") + if bool(species_id): + species_id = int(species_id) + species = with_db_connection( + lambda conn: species_by_id(conn, species_id)) + if bool(species): + return redirect(url_for( + "samples.select_population", species_id=species_id)) + flash("Invalid species selected!", "alert-error") + flash("You need to select a species", "alert-error") + return index_page + +@samples.route("/upload/species//create-population", + methods=["POST"]) +@require_login +def create_population(species_id: int): + """Create new grouping/population.""" + if not is_integer_input(species_id): + flash("You did not provide a valid species. Please select one to " + "continue.", + "alert-danger") + return redirect(url_for("expression-data.samples.select_species")) + species = with_db_connection(lambda conn: species_by_id(conn, species_id)) + if not bool(species): + flash("Species with given ID was not found.", "alert-danger") + return redirect(url_for("expression-data.samples.select_species")) + + species_page = redirect(url_for("expression-data.samples.select_species"), code=307) + with database_connection(app.config["SQL_URI"]) as conn: + species = species_by_id(conn, species_id) + pop_name = request.form.get("inbredset_name", "").strip() + pop_fullname = request.form.get("inbredset_fullname", "").strip() + + if not bool(species): + flash("Invalid species!", "alert-error error-create-population") + return species_page + if (not bool(pop_name)) or (not bool(pop_fullname)): + flash("You *MUST* provide a grouping/population name", + "alert-error error-create-population") + return species_page + + pop = save_population(conn, { + "SpeciesId": species["SpeciesId"], + "Name": pop_name, + "InbredSetName": pop_fullname, + "FullName": pop_fullname, + "Family": request.form.get("inbredset_family") or None, + "Description": request.form.get("description") or None + }) + + flash("Grouping/Population created successfully.", "alert-success") + return redirect(url_for("samples.upload_samples", + species_id=species_id, + population_id=pop["population_id"])) + +@samples.route("/upload/species//population", + methods=["GET", "POST"]) +@require_login +def select_population(species_id: int): + """Select from existing groupings/populations.""" + if not is_integer_input(species_id): + flash("You did not provide a valid species. 
Please select one to " + "continue.", + "alert-danger") + return redirect(url_for("expression-data.samples.select_species")) + species = with_db_connection(lambda conn: species_by_id(conn, species_id)) + if not bool(species): + flash("Species with given ID was not found.", "alert-danger") + return redirect(url_for("expression-data.samples.select_species")) + + if request.method == "GET": + return render_template( + "samples/select-population.html", + species=species, + populations=with_db_connection( + lambda conn: populations_by_species(conn, species_id))) + + population_page = redirect(url_for( + "samples.select_population", species_id=species_id), code=307) + _population_id = request.form.get("inbredset_id") + if not is_integer_input(_population_id): + flash("You did not provide a valid population. Please select one to " + "continue.", + "alert-danger") + return population_page + population = with_db_connection( + lambda conn: population_by_id(conn, _population_id)) + if not bool(population): + flash("Invalid grouping/population!", + "alert-error error-select-population") + return population_page + + return redirect(url_for("samples.upload_samples", + species_id=species_id, + population_id=_population_id), + code=307) + +def read_samples_file(filepath, separator: str, firstlineheading: bool, **kwargs) -> Iterator[dict]: + """Read the samples file.""" + with open(filepath, "r", encoding="utf-8") as inputfile: + reader = csv.DictReader( + inputfile, + fieldnames=( + None if firstlineheading + else ("Name", "Name2", "Symbol", "Alias")), + delimiter=separator, + quotechar=kwargs.get("quotechar", '"')) + for row in reader: + yield row + +def save_samples_data(conn: mdb.Connection, + speciesid: int, + file_data: Iterator[dict]): + """Save the samples to DB.""" + data = ({**row, "SpeciesId": speciesid} for row in file_data) + total = 0 + with conn.cursor() as cursor: + while True: + batch = take(data, 5000) + if len(batch) == 0: + break + cursor.executemany( + "INSERT INTO Strain(Name, Name2, SpeciesId, Symbol, Alias) " + "VALUES(" + " %(Name)s, %(Name2)s, %(SpeciesId)s, %(Symbol)s, %(Alias)s" + ") ON DUPLICATE KEY UPDATE Name=Name", + batch) + total += len(batch) + print(f"\tSaved {total} samples total so far.") + +def cross_reference_samples(conn: mdb.Connection, + species_id: int, + population_id: int, + strain_names: Iterator[str]): + """Link samples to their population.""" + with conn.cursor(cursorclass=DictCursor) as cursor: + cursor.execute( + "SELECT MAX(OrderId) AS loid FROM StrainXRef WHERE InbredSetId=%s", + (population_id,)) + last_order_id = (cursor.fetchone()["loid"] or 10) + total = 0 + while True: + batch = take(strain_names, 5000) + if len(batch) == 0: + break + params_str = ", ".join(["%s"] * len(batch)) + ## This query is slow -- investigate. 
+ cursor.execute( + "SELECT s.Id FROM Strain AS s LEFT JOIN StrainXRef AS sx " + "ON s.Id = sx.StrainId WHERE s.SpeciesId=%s AND s.Name IN " + f"({params_str}) AND sx.StrainId IS NULL", + (species_id,) + tuple(batch)) + strain_ids = (sid["Id"] for sid in cursor.fetchall()) + params = tuple({ + "pop_id": population_id, + "strain_id": strain_id, + "order_id": last_order_id + (order_id * 10), + "mapping": "N", + "pedigree": None + } for order_id, strain_id in enumerate(strain_ids, start=1)) + cursor.executemany( + "INSERT INTO StrainXRef( " + " InbredSetId, StrainId, OrderId, Used_for_mapping, PedigreeStatus" + ")" + "VALUES (" + " %(pop_id)s, %(strain_id)s, %(order_id)s, %(mapping)s, " + " %(pedigree)s" + ")", + params) + last_order_id += (len(params) * 10) + total += len(batch) + print(f"\t{total} total samples cross-referenced to the population " + "so far.") + +def build_sample_upload_job(# pylint: disable=[too-many-arguments] + speciesid: int, + populationid: int, + samplesfile: Path, + separator: str, + firstlineheading: bool, + quotechar: str): + """Define the async command to run the actual samples data upload.""" + return [ + sys.executable, "-m", "scripts.insert_samples", app.config["SQL_URI"], + str(speciesid), str(populationid), str(samplesfile.absolute()), + separator, f"--redisuri={app.config['REDIS_URL']}", + f"--quotechar={quotechar}" + ] + (["--firstlineheading"] if firstlineheading else []) + +@samples.route("/upload/species//populations//samples", + methods=["GET", "POST"]) +@require_login +def upload_samples(species_id: int, population_id: int):#pylint: disable=[too-many-return-statements] + """Upload the samples.""" + samples_uploads_page = redirect(url_for("samples.upload_samples", + species_id=species_id, + population_id=population_id)) + if not is_integer_input(species_id): + flash("You did not provide a valid species. Please select one to " + "continue.", + "alert-danger") + return redirect(url_for("expression-data.samples.select_species")) + species = with_db_connection(lambda conn: species_by_id(conn, species_id)) + if not bool(species): + flash("Species with given ID was not found.", "alert-danger") + return redirect(url_for("expression-data.samples.select_species")) + + if not is_integer_input(population_id): + flash("You did not provide a valid population. 
Please select one " + "to continue.", + "alert-danger") + return redirect(url_for("samples.select_population", + species_id=species_id), + code=307) + population = with_db_connection( + lambda conn: population_by_id(conn, int(population_id))) + if not bool(population): + flash("Invalid grouping/population!", "alert-error") + return redirect(url_for("samples.select_population", + species_id=species_id), + code=307) + + if request.method == "GET" or request.files.get("samples_file") is None: + return render_template("samples/upload-samples.html", + species=species, + population=population) + + try: + samples_file = save_file(request.files["samples_file"], + Path(app.config["UPLOAD_FOLDER"])) + except AssertionError: + flash("You need to provide a file with the samples data.", + "alert-error") + return samples_uploads_page + + firstlineheading = (request.form.get("first_line_heading") == "on") + + separator = request.form.get("separator", ",") + if separator == "other": + separator = request.form.get("other_separator", ",") + if not bool(separator): + flash("You need to provide a separator character.", "alert-error") + return samples_uploads_page + + quotechar = (request.form.get("field_delimiter", '"') or '"') + + redisuri = app.config["REDIS_URL"] + with Redis.from_url(redisuri, decode_responses=True) as rconn: + the_job = jobs.launch_job( + jobs.initialise_job( + rconn, + jobs.jobsnamespace(), + str(uuid.uuid4()), + build_sample_upload_job( + species["SpeciesId"], + population["InbredSetId"], + samples_file, + separator, + firstlineheading, + quotechar), + "samples_upload", + app.config["JOBS_TTL_SECONDS"], + {"job_name": f"Samples Upload: {samples_file.name}"}), + redisuri, + f"{app.config['UPLOAD_FOLDER']}/job_errors") + return redirect(url_for( + "samples.upload_status", job_id=the_job["jobid"])) + +@samples.route("/upload/status/", methods=["GET"]) +def upload_status(job_id: uuid.UUID): + """Check on the status of a samples upload job.""" + job = with_redis_connection(lambda rconn: jobs.job( + rconn, jobs.jobsnamespace(), job_id)) + if job: + status = job["status"] + if status == "success": + return render_template("samples/upload-success.html", job=job) + + if status == "error": + return redirect(url_for("samples.upload_failure", job_id=job_id)) + + error_filename = Path(jobs.error_filename( + job_id, f"{app.config['UPLOAD_FOLDER']}/job_errors")) + if error_filename.exists(): + stat = os.stat(error_filename) + if stat.st_size > 0: + return redirect(url_for( + "samples.upload_failure", job_id=job_id)) + + return render_template( + "samples/upload-progress.html", + job=job) # maybe also handle this? + + return render_template("no_such_job.html", job_id=job_id), 400 + +@samples.route("/upload/failure/", methods=["GET"]) +def upload_failure(job_id: uuid.UUID): + """Display the errors of the samples upload failure.""" + job = with_redis_connection(lambda rconn: jobs.job( + rconn, jobs.jobsnamespace(), job_id)) + if not bool(job): + return render_template("no_such_job.html", job_id=job_id), 400 + + error_filename = Path(jobs.error_filename( + job_id, f"{app.config['UPLOAD_FOLDER']}/job_errors")) + if error_filename.exists(): + stat = os.stat(error_filename) + if stat.st_size > 0: + return render_template("worker_failure.html", job_id=job_id) + + return render_template("samples/upload-failure.html", job=job) -- cgit v1.2.3
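
Note: the patch defines and nests the blueprints (`indexbp`, `rqtl2`, `parsebp`, `samples` under `exprdatabp`) but does not include the application-side registration. Below is a minimal sketch of how the new package might be mounted on the Flask application; the `create_app` factory name and the "/expression-data" URL prefix are illustrative assumptions and do not appear anywhere in this commit.

from flask import Flask

from uploader.expression_data import exprdatabp


def create_app() -> Flask:
    """Hypothetical application factory (name assumed, not part of this patch)."""
    app = Flask(__name__)
    # The child blueprints (index, rqtl2, parse, samples) are already attached
    # to `exprdatabp` in uploader/expression_data/__init__.py, so mounting the
    # parent blueprint once exposes all of their routes, e.g.
    # /expression-data/upload and /expression-data/rqtl2/select-species
    # (under the assumed "/expression-data" prefix).
    app.register_blueprint(exprdatabp, url_prefix="/expression-data")
    return app

Because the `url_for` calls throughout the patch use dotted endpoint names such as "expression-data.rqtl2.select_species" and "expression-data.index.upload_file", the parent blueprint must be registered under the name "expression-data", which the `Blueprint("expression-data", __name__)` declaration in `__init__.py` above already guarantees regardless of the URL prefix chosen here.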