diff options
Diffstat (limited to 'scripts')
| -rw-r--r-- | scripts/cli/options.py | 10 | ||||
| -rw-r--r-- | scripts/insert_samples.py | 16 | ||||
| -rw-r--r-- | scripts/load_phenotypes_to_db.py | 52 | ||||
| -rw-r--r-- | scripts/phenotypes/__init__.py | 1 | ||||
| -rw-r--r-- | scripts/phenotypes/delete_phenotypes.py | 173 | ||||
| -rw-r--r-- | scripts/rqtl2/phenotypes_qc.py | 7 | ||||
| -rw-r--r-- | scripts/run_qtlreaper.py | 49 |
7 files changed, 266 insertions, 42 deletions
diff --git a/scripts/cli/options.py b/scripts/cli/options.py index 70d2a27..58d3df4 100644 --- a/scripts/cli/options.py +++ b/scripts/cli/options.py @@ -44,3 +44,13 @@ def add_population_id(parser: ArgumentParser) -> ArgumentParser: type=int, help="The ID for the population to operate on.") return parser + + +def add_dataset_id(parser: ArgumentParser) -> ArgumentParser: + """Add dataset-id as a mandatory argument.""" + parser = add_population_id(parser) + parser.add_argument("dataset_id", + metavar="DATASET-ID", + type=int, + help="The ID for the dataset to operate on.") + return parser diff --git a/scripts/insert_samples.py b/scripts/insert_samples.py index fc029f9..96ae8e2 100644 --- a/scripts/insert_samples.py +++ b/scripts/insert_samples.py @@ -6,10 +6,10 @@ import argparse import traceback import MySQLdb as mdb -from redis import Redis + from gn_libs.mysqldb import database_connection -from uploader.check_connections import check_db, check_redis +from uploader.check_connections import check_db from uploader.species.models import species_by_id from uploader.population.models import population_by_id from uploader.samples.models import ( @@ -35,7 +35,6 @@ class SeparatorAction(argparse.Action): setattr(namespace, self.dest, (chr(9) if values == "\\t" else values)) def insert_samples(conn: mdb.Connection,# pylint: disable=[too-many-arguments, too-many-positional-arguments] - rconn: Redis,# pylint: disable=[unused-argument] speciesid: int, populationid: int, samplesfile: pathlib.Path, @@ -119,11 +118,6 @@ if __name__ == "__main__": help=("The character used to delimit (surround?) the value in " "each column.")) - # == Script-specific extras == - parser.add_argument("--redisuri", - help="URL to initialise connection to redis", - default="redis:///") - args = parser.parse_args() return args @@ -132,17 +126,13 @@ if __name__ == "__main__": status_code = 1 # Exit with an Exception args = cli_args() check_db(args.databaseuri) - check_redis(args.redisuri) if not args.samplesfile.exists(): logging.error("File not found: '%s'.", args.samplesfile) return 2 - with (Redis.from_url(args.redisuri, decode_responses=True) as rconn, - database_connection(args.databaseuri) as dbconn): - + with database_connection(args.databaseuri) as dbconn: try: status_code = insert_samples(dbconn, - rconn, args.speciesid, args.populationid, args.samplesfile, diff --git a/scripts/load_phenotypes_to_db.py b/scripts/load_phenotypes_to_db.py index e303bb3..31eb715 100644 --- a/scripts/load_phenotypes_to_db.py +++ b/scripts/load_phenotypes_to_db.py @@ -5,9 +5,9 @@ import json import time import logging import argparse -import datetime from pathlib import Path from zipfile import ZipFile +from datetime import datetime from typing import Any, Iterable from urllib.parse import urljoin from functools import reduce, partial @@ -198,13 +198,16 @@ save_phenotypes_n = partial(save_numeric_data, def update_auth(# pylint: disable=[too-many-locals,too-many-positional-arguments,too-many-arguments] - authserver, - token, + auth_details, + resource_details, species, population, dataset, xrefdata): """Grant the user access to their data.""" + logger.info("Updating authorisation for the data.") + logger.debug("Resource details for the authorisation: %s", resource_details) + authserver, token = auth_details _tries = 0 _delay = 1 headers = { @@ -215,14 +218,14 @@ def update_auth(# pylint: disable=[too-many-locals,too-many-positional-arguments return urljoin(authserver, endpoint) def __fetch_user_details__(): - logger.debug("… Fetching user details") + logger.info("… Fetching user details") return mrequests.get( authserveruri("/auth/user/"), headers=headers ) def __link_data__(user): - logger.debug("… linking uploaded data to user's group") + logger.info("… linking uploaded data to user's group") return mrequests.post( authserveruri("/auth/data/link/phenotype"), headers=headers, @@ -245,7 +248,7 @@ def update_auth(# pylint: disable=[too-many-locals,too-many-positional-arguments }).then(lambda ld_results: (user, ld_results)) def __fetch_phenotype_category_details__(user, linkeddata): - logger.debug("… fetching phenotype category details") + logger.info("… fetching phenotype category details") return mrequests.get( authserveruri("/auth/resource/categories"), headers=headers @@ -258,20 +261,18 @@ def update_auth(# pylint: disable=[too-many-locals,too-many-positional-arguments ) def __create_resource__(user, linkeddata, category): - logger.debug("… creating authorisation resource object") - now = datetime.datetime.now().isoformat() + logger.info("… creating authorisation resource object") return mrequests.post( authserveruri("/auth/resource/create"), headers=headers, json={ + **resource_details, "resource_category": category["resource_category_id"], - "resource_name": (f"{user['email']}—{dataset['Name']}—{now}—" - f"{len(xrefdata)} phenotypes"), "public": "off" }).then(lambda cr_results: (user, linkeddata, cr_results)) def __attach_data_to_resource__(user, linkeddata, resource): - logger.debug("… attaching data to authorisation resource object") + logger.info("… attaching data to authorisation resource object") return mrequests.post( authserveruri("/auth/resource/data/link"), headers=headers, @@ -288,8 +289,8 @@ def update_auth(# pylint: disable=[too-many-locals,too-many-positional-arguments # This is hacky. If the auth already exists, something went wrong # somewhere. # This needs investigation to recover correctly. - logger.info( - "The authorisation for the data was already set up.") + logger.error( + "Error: The authorisation for the data was already set up.") return 0 logger.error("ERROR: Updating the authorisation for the data failed.") logger.debug( @@ -461,6 +462,25 @@ if __name__ == "__main__": logging.getLogger("uploader.phenotypes.models").setLevel(log_level) + def __parse_resource_details__(meta) -> dict: + """Parse out details regarding the wrapper resource from the metadata.""" + _key_mappings_ = { + # allow both 'data_*' and 'data*' for the metadata. + "data_description": "description", + "datadescription": "description" + } + return { + "resource_name": meta.get( + "dataname", + meta.get("data_name", + "Unnamed phenotypes - " + datetime.now().isoformat())), + "resource_metadata": { + rkey: meta[mkey] + for mkey, rkey in _key_mappings_.items() if mkey in meta + } + } + + def main(): """Entry-point for this script.""" args = parse_args() @@ -516,8 +536,10 @@ if __name__ == "__main__": # Update authorisations (break this down) — maybe loop until it works? logger.info("Updating authorisation.") _job_metadata = job["metadata"] - return update_auth(_job_metadata["authserver"], - _job_metadata["token"], + + return update_auth((_job_metadata["authserver"], + _job_metadata["token"]), + __parse_resource_details__(_job_metadata), *db_results) diff --git a/scripts/phenotypes/__init__.py b/scripts/phenotypes/__init__.py new file mode 100644 index 0000000..73ad839 --- /dev/null +++ b/scripts/phenotypes/__init__.py @@ -0,0 +1 @@ +"Scripts for dealing with phenotypes." diff --git a/scripts/phenotypes/delete_phenotypes.py b/scripts/phenotypes/delete_phenotypes.py new file mode 100644 index 0000000..461f3ec --- /dev/null +++ b/scripts/phenotypes/delete_phenotypes.py @@ -0,0 +1,173 @@ +"""Delete phenotypes.""" +import sys +import logging +from pathlib import Path +from typing import Optional +from urllib.parse import urljoin +from argparse import Namespace, ArgumentParser + +import requests +from MySQLdb.cursors import DictCursor, BaseCursor + +from gn_libs.mysqldb import database_connection + +from uploader.phenotypes.models import delete_phenotypes +from scripts.cli.logging import setup_logging +from scripts.cli.options import (add_logging, + add_mariadb_uri, + add_population_id) + +logger = logging.getLogger(__name__) + +def read_xref_ids_file(filepath: Optional[Path]) -> tuple[int, ...]: + """Read the phenotypes' cross-reference IDS from file.""" + if filepath is None: + return tuple() + + logger.debug("Using file '%s' to retrieve XREF IDs for deletion.", + filepath.name) + _ids: tuple[int, ...] = tuple() + with filepath.open(mode="r") as infile: + for line in infile.readlines(): + try: + _ids += (int(line.strip()),) + except TypeError: + pass + + return _ids + + +def fetch_all_xref_ids( + cursor: BaseCursor, population_id: int) -> tuple[int, ...]: + """Fetch all cross-reference IDs.""" + cursor.execute("SELECT Id FROM PublishXRef WHERE InbredSetId=%s", + (population_id,)) + return tuple(int(row["Id"]) for row in cursor.fetchall()) + + +def update_auth( + auth_details: tuple[str, str], + species_id: int, + population_id: int, + dataset_id: int, + xref_ids: tuple[int, ...] = tuple() +): + """Update the authorisation server: remove items to delete.""" + authserver, token = auth_details + resp = requests.post( + urljoin(authserver, + (f"/auth/data/phenotypes/{species_id}/{population_id}" + f"/{dataset_id}/delete")), + timeout=(9.13, 20), + headers={ + "Authorization": f"Bearer {token}", + "Content-Type": "application/json" + }, + json={"xref_ids": xref_ids}) + resp.raise_for_status() + + +def delete_the_phenotypes( + cursor: BaseCursor, + population_id: int, + xref_ids: tuple[int, ...] = tuple()) -> int: + """Process and delete the phenotypes.""" + delete_phenotypes(cursor, population_id, xref_ids) + + return 0 + +if __name__ == "__main__": + def parse_args() -> Namespace: + """Parse CLI arguments.""" + parser = add_logging( + add_population_id( + add_mariadb_uri( + ArgumentParser( + prog="delete-phenotypes", + description=( + "Script to delete phenotypes from the database."))))) + parser.add_argument( + "dataset_id", + metavar="DATASET-ID", + type=int, + help="The dataset identifier for phenotypes to delete.") + parser.add_argument( + "auth_server_uri", + metavar="AUTH-SERVER-URI", + type=str, + help="URI to the authorisation server.") + parser.add_argument( + "auth_token", + metavar="AUTH-TOKEN", + type=str, + help=("Token to use to update the authorisation system with the " + "deletions done.")) + parser.add_argument( + "--xref_ids_file", + metavar="XREF-IDS-FILE", + type=Path, + help=("Path to a file with phenotypes cross-reference IDs to " + "delete.")) + parser.add_argument( + "--delete-all", + action="store_true", + help=("If no 'XREF-IDS-FILE' is provided, this flag determines " + "whether or not all the phenotypes for the given population " + "will be deleted.")) + return parser.parse_args() + + + def main(): + """The `delete-phenotypes` script's entry point.""" + args = parse_args() + setup_logging(logger, args.log_level.upper(), tuple()) + with (database_connection(args.db_uri) as conn, + conn.cursor(cursorclass=DictCursor) as cursor): + xref_ids = read_xref_ids_file(args.xref_ids_file) + try: + assert not (len(xref_ids) > 0 and args.delete_all) + xref_ids = (fetch_all_xref_ids(cursor, args.population_id) + if args.delete_all else xref_ids) + logger.debug("Will delete %s phenotypes and related data", + len(xref_ids)) + if len(xref_ids) == 0: + print("No cross-reference IDs were provided. Aborting.") + return 0 + + print("Updating authorisations: ", end="") + update_auth((args.auth_server_uri, args.auth_token), + args.species_id, + args.population_id, + args.dataset_id, + xref_ids) + print("OK.") + print("Deleting the data: ", end="") + delete_phenotypes(cursor, args.population_id, xref_ids=xref_ids) + print("OK.") + if args.xref_ids_file is not None: + print("Deleting temporary file: ", end="") + args.xref_ids_file.unlink() + print("OK.") + + return 0 + except AssertionError: + logger.error( + "'DELETE-ALL' and 'XREF-IDS' are mutually exclusive. " + "If you specify the list of XREF-IDS (in a file) to delete " + "and also specify to 'DELETE-ALL' phenotypes in the " + "population, we have no way of knowing what it is you want.") + return 1 + except requests.exceptions.HTTPError as _exc: + resp = _exc.response + resp_data = resp.json() + logger.debug("%s: %s", + resp_data["error"], + resp_data["error_description"], + exc_info=True) + return 1 + except Exception as _exc:# pylint: disable=[broad-exception-caught] + logger.debug("Failed while attempting to delete phenotypes.", + exc_info=True) + return 1 + + sys.exit(main()) diff --git a/scripts/rqtl2/phenotypes_qc.py b/scripts/rqtl2/phenotypes_qc.py index 72d6c83..084c876 100644 --- a/scripts/rqtl2/phenotypes_qc.py +++ b/scripts/rqtl2/phenotypes_qc.py @@ -198,7 +198,7 @@ def qc_phenocovar_file( "-", "-", (f"File {filepath.name} is missing the {heading} heading " - "in the header line."))),) + "in the header row/line."))),) def collect_errors(errors_and_linecount, line): _errs, _lc = errors_and_linecount @@ -312,8 +312,9 @@ def qc_pheno_file(# pylint: disable=[too-many-locals, too-many-arguments, too-ma "header row", "-", ", ".join(_absent), - ("The following phenotype names do not exist in any of the " - f"provided phenocovar files: ({', '.join(_absent)})"))),) + ("The following trait names/identifiers do not exist in any of " + "the provided descriptions/covariates files: " + f"({', '.join(_absent)})"))),) def collect_errors(errors_and_linecount, line): _errs, _lc = errors_and_linecount diff --git a/scripts/run_qtlreaper.py b/scripts/run_qtlreaper.py index 7d58402..2269ea6 100644 --- a/scripts/run_qtlreaper.py +++ b/scripts/run_qtlreaper.py @@ -1,10 +1,12 @@ """Script to run rust-qtlreaper and update database with results.""" +import os import sys import csv import time import secrets import logging import subprocess +import multiprocessing from pathlib import Path from functools import reduce from typing import Union, Iterator @@ -146,30 +148,55 @@ def dispatch(args: Namespace) -> int: _qtlreaper_main_output = args.working_dir.joinpath( f"main-output-{secrets.token_urlsafe(15)}.tsv")#type: ignore[attr-defined] + _qtlreaper_permu_output = args.working_dir.joinpath( + f"permu-output-{secrets.token_urlsafe(15)}.tsv") logger.debug("Main output filename: %s", _qtlreaper_main_output) with subprocess.Popen( ("qtlreaper", "--n_permutations", "1000", "--geno", _genofile, "--traits", _traitsfile, - "--main_output", _qtlreaper_main_output)) as _qtlreaper: + "--main_output", _qtlreaper_main_output, + "--permu_output", _qtlreaper_permu_output, + "--threads", str(int(1+(multiprocessing.cpu_count()/2)))), + env=({**os.environ, "RUST_BACKTRACE": "full"} + if logger.getEffectiveLevel() == logging.DEBUG + else dict(os.environ))) as _qtlreaper: while _qtlreaper.poll() is None: logger.debug("QTLReaper process running…") time.sleep(1) - results = tuple(#type: ignore[var-annotated] - max(qtls, key=lambda qtl: qtl["LRS"]) - for qtls in - reduce(__qtls_by_trait__, - parse_tsv_file(_qtlreaper_main_output), - {}).values()) - save_qtl_values_to_db(conn, results) + results = ( + tuple(#type: ignore[var-annotated] + max(qtls, key=lambda qtl: qtl["LRS"]) + for qtls in + reduce(__qtls_by_trait__, + parse_tsv_file(_qtlreaper_main_output), + {}).values()) + if _qtlreaper_main_output.exists() + else tuple()) logger.debug("Cleaning up temporary files.") - _traitsfile.unlink() - _qtlreaper_main_output.unlink() + + # short-circuits to delete file if exists + if _traitsfile.exists(): + _traitsfile.unlink() + logger.info("Deleted generated traits' file for QTLReaper.") + + if _qtlreaper_main_output.exists(): + _qtlreaper_main_output.unlink() + logger.info("Deleted QTLReaper's main output file.") + + if _qtlreaper_permu_output.exists(): + _qtlreaper_permu_output.unlink() + logger.info("Deleted QTLReaper's permutations file.") + + if _qtlreaper.returncode != 0: + return _qtlreaper.returncode + + save_qtl_values_to_db(conn, results) logger.info("Successfully computed p values for %s traits.", len(_traitsdata)) return 0 except FileNotFoundError as fnf: - logger.error(", ".join(fnf.args), exc_info=False) + logger.error(", ".join(str(arg) for arg in fnf.args), exc_info=False) except AssertionError as aserr: logger.error(", ".join(aserr.args), exc_info=False) except Exception as _exc:# pylint: disable=[broad-exception-caught] |
