Diffstat (limited to 'scripts')
 scripts/cli/options.py                  |   2
 scripts/compute_phenotype_means.py      |   2
 scripts/insert_samples.py               |  16
 scripts/load_phenotypes_to_db.py        |  17
 scripts/phenotypes/__init__.py          |   1
 scripts/phenotypes/delete_phenotypes.py | 173
 scripts/qc_on_rqtl2_bundle.py           |   9
 scripts/rqtl2/entry.py                  |   2
 scripts/rqtl2/install_genotypes.py      |   6
 scripts/rqtl2/install_phenos.py         |   7
 scripts/rqtl2/phenotypes_qc.py          |   3
 scripts/run_qtlreaper.py                |  44
 12 files changed, 225 insertions(+), 57 deletions(-)
diff --git a/scripts/cli/options.py b/scripts/cli/options.py
index 67f35dc..70d2a27 100644
--- a/scripts/cli/options.py
+++ b/scripts/cli/options.py
@@ -13,7 +13,7 @@ def add_logging(parser: ArgumentParser) -> ArgumentParser:
         type=str,
         default="INFO",
         choices=loglevels,
-        help=(f"Controls the severity of events to log. Valid values are: " +
+        help=("Controls the severity of events to log. Valid values are: " +
               ", ".join(f"'{level}'" for level in loglevels)))
     return parser
 
diff --git a/scripts/compute_phenotype_means.py b/scripts/compute_phenotype_means.py
index ef2fabc..6d39ace 100644
--- a/scripts/compute_phenotype_means.py
+++ b/scripts/compute_phenotype_means.py
@@ -51,7 +51,7 @@ def run(args) -> int:
 
 T = TypeVar("T")
 
-def comma_separated_list(val: str, itemstype: T = str) -> tuple[T, ...]:
+def comma_separated_list(val: str, itemstype: type = str) -> tuple[T, ...]:
     """Convert val into a list of items of type 'itemstype'."""
     return tuple(itemstype(item.strip()) for item in val.split(","))
 
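A note on the compute_phenotype_means.py hunk above: the old annotation used the TypeVar `T` directly as a parameter annotation with a concrete default, which type checkers reject, so the patch relaxes it to `type`. A stricter (hypothetical) alternative, not what the patch does, would tie the parameter to the return type via `Callable`:

    from typing import Callable, TypeVar

    T = TypeVar("T")

    def comma_separated_list(val: str,
                             itemstype: Callable[[str], T]) -> tuple[T, ...]:
        """Convert 'val' into a tuple of items built by calling 'itemstype'."""
        return tuple(itemstype(item.strip()) for item in val.split(","))

    assert comma_separated_list("1, 2, 3", int) == (1, 2, 3)

Dropping the `= str` default is what makes the binding of `T` fully sound; keeping it would need an overload.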
the value in " "each column.")) - # == Script-specific extras == - parser.add_argument("--redisuri", - help="URL to initialise connection to redis", - default="redis:///") - args = parser.parse_args() return args @@ -132,17 +126,13 @@ if __name__ == "__main__": status_code = 1 # Exit with an Exception args = cli_args() check_db(args.databaseuri) - check_redis(args.redisuri) if not args.samplesfile.exists(): logging.error("File not found: '%s'.", args.samplesfile) return 2 - with (Redis.from_url(args.redisuri, decode_responses=True) as rconn, - database_connection(args.databaseuri) as dbconn): - + with database_connection(args.databaseuri) as dbconn: try: status_code = insert_samples(dbconn, - rconn, args.speciesid, args.populationid, args.samplesfile, diff --git a/scripts/load_phenotypes_to_db.py b/scripts/load_phenotypes_to_db.py index e449b82..e303bb3 100644 --- a/scripts/load_phenotypes_to_db.py +++ b/scripts/load_phenotypes_to_db.py @@ -6,9 +6,9 @@ import time import logging import argparse import datetime -from typing import Any from pathlib import Path from zipfile import ZipFile +from typing import Any, Iterable from urllib.parse import urljoin from functools import reduce, partial @@ -55,7 +55,7 @@ def save_phenotypes( if control_data["phenocovar_transposed"]: logger.info("Undoing transposition of the files rows and columns.") - phenofiles = ( + phenofiles = tuple( rqtl2.transpose_csv_with_rename( _file, build_line_splitter(control_data), @@ -86,7 +86,7 @@ def __row_to_dataitems__( dataidmap: dict, pheno_name2id: dict[str, int], samples: dict -) -> tuple[dict, ...]: +) -> Iterable[dict]: samplename = sample_row["id"] return ({ @@ -134,7 +134,7 @@ def save_numeric_data(# pylint: disable=[too-many-positional-arguments,too-many- conn: mysqldb.Connection, dataidmap: dict, pheno_name2id: dict[str, int], - samples: tuple[dict, ...], + samples: dict, control_data: dict, filesdir: Path, filetype: str, @@ -311,7 +311,9 @@ def update_auth(# pylint: disable=[too-many-locals,too-many-positional-arguments ).either(__handle_error__, __handle_success__) -def load_data(conn: mysqldb.Connection, job: dict) -> int:#pylint: disable=[too-many-locals] +def load_data(# pylint: disable=[too-many-locals] + conn: mysqldb.Connection, job: dict +) -> tuple[dict, dict, dict, tuple[int, ...]]: """Load the data attached in the given job.""" _job_metadata = job["metadata"] # Steps @@ -365,9 +367,8 @@ def load_data(conn: mysqldb.Connection, job: dict) -> int:#pylint: disable=[too- "publication_id": row["publication_id"], "data_id": row["data_id"] },))) - dataidmap, pheno_name2id, _xrefs = reduce(__build_phenos_maps__, - _phenos, - ({},{}, tuple())) + dataidmap, pheno_name2id, _xrefs = reduce(# type: ignore[var-annotated] + __build_phenos_maps__, _phenos, ({},{}, tuple())) # 3. a. Fetch the strain names and IDS: create name->ID map samples = { row["Name"]: row diff --git a/scripts/phenotypes/__init__.py b/scripts/phenotypes/__init__.py new file mode 100644 index 0000000..73ad839 --- /dev/null +++ b/scripts/phenotypes/__init__.py @@ -0,0 +1 @@ +"Scripts for dealing with phenotypes." 
diff --git a/scripts/phenotypes/delete_phenotypes.py b/scripts/phenotypes/delete_phenotypes.py
new file mode 100644
index 0000000..461f3ec
--- /dev/null
+++ b/scripts/phenotypes/delete_phenotypes.py
@@ -0,0 +1,173 @@
+"""Delete phenotypes."""
+import sys
+import logging
+from pathlib import Path
+from typing import Optional
+from urllib.parse import urljoin
+from argparse import Namespace, ArgumentParser
+
+import requests
+from MySQLdb.cursors import DictCursor, BaseCursor
+
+from gn_libs.mysqldb import database_connection
+
+from uploader.phenotypes.models import delete_phenotypes
+from scripts.cli.logging import setup_logging
+from scripts.cli.options import (add_logging,
+                                 add_mariadb_uri,
+                                 add_population_id)
+
+logger = logging.getLogger(__name__)
+
+
+def read_xref_ids_file(filepath: Optional[Path]) -> tuple[int, ...]:
+    """Read the phenotypes' cross-reference IDs from file."""
+    if filepath is None:
+        return tuple()
+
+    logger.debug("Using file '%s' to retrieve XREF IDs for deletion.",
+                 filepath.name)
+    _ids: tuple[int, ...] = tuple()
+    with filepath.open(mode="r") as infile:
+        for line in infile.readlines():
+            try:
+                _ids += (int(line.strip()),)
+            except ValueError:
+                pass
+
+    return _ids
+
+
+def fetch_all_xref_ids(
+        cursor: BaseCursor, population_id: int) -> tuple[int, ...]:
+    """Fetch all cross-reference IDs."""
+    cursor.execute("SELECT Id FROM PublishXRef WHERE InbredSetId=%s",
+                   (population_id,))
+    return tuple(int(row["Id"]) for row in cursor.fetchall())
+
+
+def update_auth(
+        auth_details: tuple[str, str],
+        species_id: int,
+        population_id: int,
+        dataset_id: int,
+        xref_ids: tuple[int, ...] = tuple()
+):
+    """Update the authorisation server: remove items to delete."""
+    authserver, token = auth_details
+    resp = requests.post(
+        urljoin(authserver,
+                (f"/auth/data/phenotypes/{species_id}/{population_id}"
+                 f"/{dataset_id}/delete")),
+        timeout=(9.13, 20),
+        headers={
+            "Authorization": f"Bearer {token}",
+            "Content-Type": "application/json"
+        },
+        json={"xref_ids": xref_ids})
+    resp.raise_for_status()
+
+
+def delete_the_phenotypes(
+        cursor: BaseCursor,
+        population_id: int,
+        xref_ids: tuple[int, ...] = tuple()) -> int:
+    """Process and delete the phenotypes."""
+    delete_phenotypes(cursor, population_id, xref_ids)
+
+    return 0
+
+
+if __name__ == "__main__":
+    def parse_args() -> Namespace:
+        """Parse CLI arguments."""
+        parser = add_logging(
+            add_population_id(
+                add_mariadb_uri(
+                    ArgumentParser(
+                        prog="delete-phenotypes",
+                        description=(
+                            "Script to delete phenotypes from the database.")))))
+        parser.add_argument(
+            "dataset_id",
+            metavar="DATASET-ID",
+            type=int,
+            help="The dataset identifier for phenotypes to delete.")
+        parser.add_argument(
+            "auth_server_uri",
+            metavar="AUTH-SERVER-URI",
+            type=str,
+            help="URI to the authorisation server.")
+        parser.add_argument(
+            "auth_token",
+            metavar="AUTH-TOKEN",
+            type=str,
+            help=("Token to use to update the authorisation system with the "
+                  "deletions done."))
+        parser.add_argument(
+            "--xref_ids_file",
+            metavar="XREF-IDS-FILE",
+            type=Path,
+            help=("Path to a file with phenotypes cross-reference IDs to "
+                  "delete."))
+        parser.add_argument(
+            "--delete-all",
+            action="store_true",
+            help=("If no 'XREF-IDS-FILE' is provided, this flag determines "
+                  "whether or not all the phenotypes for the given population "
+                  "will be deleted."))
+        return parser.parse_args()
+
+
+    def main():
+        """The `delete-phenotypes` script's entry point."""
+        args = parse_args()
+        setup_logging(logger, args.log_level.upper(), tuple())
+        with (database_connection(args.db_uri) as conn,
+              conn.cursor(cursorclass=DictCursor) as cursor):
+            xref_ids = read_xref_ids_file(args.xref_ids_file)
+            try:
+                assert not (len(xref_ids) > 0 and args.delete_all)
+                xref_ids = (fetch_all_xref_ids(cursor, args.population_id)
+                            if args.delete_all else xref_ids)
+                logger.debug("Will delete %s phenotypes and related data",
+                             len(xref_ids))
+                if len(xref_ids) == 0:
+                    print("No cross-reference IDs were provided. Aborting.")
+                    return 0
+
+                print("Updating authorisations: ", end="")
+                update_auth((args.auth_server_uri, args.auth_token),
+                            args.species_id,
+                            args.population_id,
+                            args.dataset_id,
+                            xref_ids)
+                print("OK.")
+                print("Deleting the data: ", end="")
+                delete_phenotypes(cursor, args.population_id, xref_ids=xref_ids)
+                print("OK.")
+                if args.xref_ids_file is not None:
+                    print("Deleting temporary file: ", end="")
+                    args.xref_ids_file.unlink()
+                    print("OK.")
+
+                return 0
+            except AssertionError:
+                logger.error(
+                    "'DELETE-ALL' and 'XREF-IDS' are mutually exclusive. "
+                    "If you specify the list of XREF-IDS (in a file) to delete "
+                    "and also specify to 'DELETE-ALL' phenotypes in the "
+                    "population, we have no way of knowing what it is you want.")
+                return 1
+            except requests.exceptions.HTTPError as _exc:
+                resp = _exc.response
+                resp_data = resp.json()
+                logger.debug("%s: %s",
+                             resp_data["error"],
+                             resp_data["error_description"],
+                             exc_info=True)
+                return 1
+            except Exception as _exc:# pylint: disable=[broad-exception-caught]
+                logger.debug("Failed while attempting to delete phenotypes.",
+                             exc_info=True)
+                return 1
+
+    sys.exit(main())
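On `update_auth` in the new script above: `timeout=(9.13, 20)` is the standard requests (connect, read) timeout pair, and `resp.raise_for_status()` is what surfaces the `requests.exceptions.HTTPError` handled in `main()`. A standalone sketch (the URL is a placeholder, not the real auth endpoint):

    import requests

    # Fail if establishing the TCP connection takes longer than ~9 seconds,
    # or if the server takes longer than 20 seconds to start responding.
    resp = requests.get("https://auth.example.org/", timeout=(9.13, 20))
    resp.raise_for_status()  # raises requests.exceptions.HTTPError on 4xx/5xx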
" + "If you specify the list of XREF-IDS (in a file) to delete " + "and also specify to 'DELETE-ALL' phenotypes in the " + "population, we have no way of knowing what it is you want.") + return 1 + except requests.exceptions.HTTPError as _exc: + resp = _exc.response + resp_data = resp.json() + logger.debug("%s: %s", + resp_data["error"], + resp_data["error_description"], + exc_info=True) + return 1 + except Exception as _exc:# pylint: disable=[broad-exception-caught] + logger.debug("Failed while attempting to delete phenotypes.", + exc_info=True) + return 1 + + sys.exit(main()) diff --git a/scripts/qc_on_rqtl2_bundle.py b/scripts/qc_on_rqtl2_bundle.py index 0207938..4e6ef00 100644 --- a/scripts/qc_on_rqtl2_bundle.py +++ b/scripts/qc_on_rqtl2_bundle.py @@ -40,7 +40,7 @@ def add_to_errors(rconn: Redis, """Add `errors` to a given list of errors""" errs = tuple(dict(item) for item in set( [dict2tuple(old) for old in - json.loads(rconn.hget(fqjobid, key) or "[]")] + + json.loads(rconn.hget(fqjobid, key) or "[]")] +# type: ignore[arg-type] [dict2tuple({"type": type(error).__name__, **error._asdict()}) for error in errors])) rconn.hset(fqjobid, key, json.dumps(errs)) @@ -83,7 +83,8 @@ def retrieve_errors_with_progress(rconn: Redis,#pylint: disable=[too-many-locals count = 0 checked = 0 cdata = rqtl2.control_data(zfile) - rconn.hset(fqjobid, f"{filetype}-filesize", compute_filesize(zfile, filetype)) + rconn.hset( + fqjobid, f"{filetype}-filesize", str(compute_filesize(zfile, filetype))) def __update_processed__(value): nonlocal checked checked = checked + len(value) @@ -104,7 +105,7 @@ def retrieve_errors_with_progress(rconn: Redis,#pylint: disable=[too-many-locals yield error __update_processed__(value) - rconn.hset(fqjobid, f"{filetype}-linecount", count) + rconn.hset(fqjobid, f"{filetype}-linecount", count)# type: ignore[arg-type] except rqe.MissingFileException: fname = cdata.get(filetype) yield rqfe.MissingFile(filetype, fname, ( @@ -295,7 +296,7 @@ def run_qc(rconn: Redis, return 1 def __fetch_errors__(rkey: str) -> tuple: - return tuple(json.loads(rconn.hget(fqjobid, rkey) or "[]")) + return tuple(json.loads(rconn.hget(fqjobid, rkey) or "[]")) # type: ignore[arg-type] return (1 if any(( bool(__fetch_errors__(key)) diff --git a/scripts/rqtl2/entry.py b/scripts/rqtl2/entry.py index e0e00e7..7423a4b 100644 --- a/scripts/rqtl2/entry.py +++ b/scripts/rqtl2/entry.py @@ -26,7 +26,7 @@ def build_main( def main(): with (Redis.from_url(args.redisuri, decode_responses=True) as rconn, database_connection(args.databaseuri) as dbconn): - logger.setLevel(args.loglevel.upper()) + logger.setLevel(args.log_level.upper()) fqjobid = jobs.job_key(args.redisprefix, args.jobid) try: diff --git a/scripts/rqtl2/install_genotypes.py b/scripts/rqtl2/install_genotypes.py index 8762655..5e6abb0 100644 --- a/scripts/rqtl2/install_genotypes.py +++ b/scripts/rqtl2/install_genotypes.py @@ -20,7 +20,7 @@ from scripts.rqtl2.entry import build_main from scripts.rqtl2.cli_parser import add_common_arguments from scripts.cli_parser import init_cli_parser, add_global_data_arguments -__MODULE__ = "scripts.rqtl2.install_genotypes" +logger = getLogger(__name__) def insert_markers( dbconn: mdb.Connection, @@ -191,7 +191,7 @@ def install_genotypes(#pylint: disable=[too-many-locals] dbconn: mdb.Connection, fullyqualifiedjobid: str,#pylint: disable=[unused-argument] args: argparse.Namespace, - logger: Logger = getLogger(__name__) + logger: Logger = logger # pylint: disable=[redefined-outer-name] ) -> int: """Load any existing genotypes 
diff --git a/scripts/rqtl2/install_genotypes.py b/scripts/rqtl2/install_genotypes.py
index 8762655..5e6abb0 100644
--- a/scripts/rqtl2/install_genotypes.py
+++ b/scripts/rqtl2/install_genotypes.py
@@ -20,7 +20,7 @@ from scripts.rqtl2.entry import build_main
 from scripts.rqtl2.cli_parser import add_common_arguments
 from scripts.cli_parser import init_cli_parser, add_global_data_arguments
 
-__MODULE__ = "scripts.rqtl2.install_genotypes"
+logger = getLogger(__name__)
 
 def insert_markers(
         dbconn: mdb.Connection,
@@ -191,7 +191,7 @@ def install_genotypes(#pylint: disable=[too-many-locals]
         dbconn: mdb.Connection,
         fullyqualifiedjobid: str,#pylint: disable=[unused-argument]
         args: argparse.Namespace,
-        logger: Logger = getLogger(__name__)
+        logger: Logger = logger # pylint: disable=[redefined-outer-name]
 ) -> int:
     """Load any existing genotypes into the database."""
     (speciesid, populationid, datasetid, rqtl2bundle) = (
@@ -257,5 +257,5 @@ if __name__ == "__main__":
 
         return parser.parse_args()
 
-    main = build_main(cli_args(), install_genotypes, __MODULE__)
+    main = build_main(cli_args(), install_genotypes, logger)
     sys.exit(main())
diff --git a/scripts/rqtl2/install_phenos.py b/scripts/rqtl2/install_phenos.py
index 9059cd6..11ac8a4 100644
--- a/scripts/rqtl2/install_phenos.py
+++ b/scripts/rqtl2/install_phenos.py
@@ -19,7 +19,7 @@ from r_qtl import r_qtl2_qc as rqc
 
 from functional_tools import take
 
-__MODULE__ = "scripts.rqtl2.install_phenos"
+logger = getLogger(__name__)
 
 def insert_probesets(dbconn: mdb.Connection,
                      platformid: int,
@@ -101,7 +101,8 @@ def install_pheno_files(#pylint: disable=[too-many-locals]
         dbconn: mdb.Connection,
         fullyqualifiedjobid: str,#pylint: disable=[unused-argument]
         args: argparse.Namespace,
-        logger: Logger = getLogger()) -> int:
+        logger: Logger = logger # pylint: disable=[redefined-outer-name]
+) -> int:
     """Load data in `pheno` files and other related files into the database."""
     (speciesid, platformid, datasetid, rqtl2bundle) = (
         args.speciesid, args.platformid, args.datasetid, args.rqtl2bundle)
@@ -159,5 +160,5 @@ if __name__ == "__main__":
 
         return parser.parse_args()
 
-    main = build_main(cli_args(), install_pheno_files, __MODULE__)
+    main = build_main(cli_args(), install_pheno_files, logger)
     sys.exit(main())
diff --git a/scripts/rqtl2/phenotypes_qc.py b/scripts/rqtl2/phenotypes_qc.py
index 9f11f57..72d6c83 100644
--- a/scripts/rqtl2/phenotypes_qc.py
+++ b/scripts/rqtl2/phenotypes_qc.py
@@ -376,7 +376,8 @@ def run_qc(# pylint: disable=[too-many-locals]
         rconn: Redis,
         dbconn: mdb.Connection,
         fullyqualifiedjobid: str,
-        args: Namespace
+        args: Namespace,
+        logger: Logger = logger # pylint: disable=[redefined-outer-name]
 ) -> int:
     """Run quality control checks on the bundle."""
     print("Beginning the quality assurance checks.")
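The pattern these three hunks converge on: a single module-level logger, used both as the object handed to `build_main` and as the default for an injectable `logger` parameter. A minimal sketch; `do_work` is a hypothetical stand-in for the install/QC functions:

    import logging
    from logging import Logger

    logger = logging.getLogger(__name__)

    def do_work(logger: Logger = logger):  # pylint: disable=[redefined-outer-name]
        logger.debug("working…")

Because defaults are evaluated at definition time, the parameter binds the module logger exactly once, while callers such as `build_main` stay free to inject a job-specific logger.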
diff --git a/scripts/run_qtlreaper.py b/scripts/run_qtlreaper.py
index ab58203..54e5d45 100644
--- a/scripts/run_qtlreaper.py
+++ b/scripts/run_qtlreaper.py
@@ -4,11 +4,10 @@ import csv
 import time
 import secrets
 import logging
-import traceback
 import subprocess
 from pathlib import Path
-from typing import Union
 from functools import reduce
+from typing import Union, Iterator
 from argparse import Namespace, ArgumentParser
 
 from gn_libs import mysqldb
@@ -57,7 +56,7 @@ def reconcile_samples(
 def generate_qtlreaper_traits_file(
         outdir: Path,
         samples: tuple[str, ...],
-        traits_data: dict[str, Union[int, float]],
+        traits_data: tuple[dict[str, Union[int, float]], ...],
         filename_prefix: str = ""
 ) -> Path:
     """Generate a file for use with qtlreaper that contains the traits' data."""
@@ -66,7 +65,7 @@ def generate_qtlreaper_traits_file(
     _dialect.quoting=0
 
     _traitsfile = outdir.joinpath(
-        f"{filename_prefix}_{secrets.token_urlsafe(15)}.tsv")
+        f"{filename_prefix}_{secrets.token_urlsafe(15)}.tsv")#type: ignore[attr-defined]
     with _traitsfile.open(mode="w", encoding="utf-8") as outptr:
         writer = csv.DictWriter(
             outptr, fieldnames=("Trait",) + samples, dialect=_dialect)
@@ -80,14 +79,13 @@ def generate_qtlreaper_traits_file(
     return _traitsfile
 
 
-def parse_tsv_file(results_file: Path) -> list[dict]:
+def parse_tsv_file(results_file: Path) -> Iterator[dict]:
     """Parse the rust-qtlreaper output into usable python objects."""
     with results_file.open("r", encoding="utf-8") as readptr:
         _dialect = csv.unix_dialect()
         _dialect.delimiter = "\t"
         reader = csv.DictReader(readptr, dialect=_dialect)
-        for row in reader:
-            yield row
+        yield from reader
 
 
 def __qtls_by_trait__(qtls, current):
@@ -98,7 +96,8 @@ def __qtls_by_trait__(qtls, current):
     }
 
 
-def save_qtl_values_to_db(conn, qtls: dict):
+def save_qtl_values_to_db(conn, qtls: tuple[dict, ...]):
+    """Save computed QTLs to the database."""
     with conn.cursor() as cursor:
         cursor.executemany(
             "UPDATE PublishXRef SET "
@@ -132,11 +131,11 @@ def dispatch(args: Namespace) -> int:
                     ", ".join(_samples_not_in_genofile))
 
         # Fetch traits data: provided list, or all traits in db
-        _traitsdata = phenotypes_vector_data(
+        _traitsdata = tuple(phenotypes_vector_data(
             conn,
             args.species_id,
             args.population_id,
-            xref_ids=tuple(args.xref_ids)).values()
+            xref_ids=tuple(args.xref_ids)).values())
         logger.debug("Successfully got traits data. Generating the QTLReaper's traits file…")
         _traitsfile = generate_qtlreaper_traits_file(
             args.working_dir,
@@ -146,7 +145,7 @@ def dispatch(args: Namespace) -> int:
         logger.debug("QTLReaper's Traits file: %s", _traitsfile)
 
         _qtlreaper_main_output = args.working_dir.joinpath(
-            f"main-output-{secrets.token_urlsafe(15)}.tsv")
+            f"main-output-{secrets.token_urlsafe(15)}.tsv")#type: ignore[attr-defined]
         logger.debug("Main output filename: %s", _qtlreaper_main_output)
         with subprocess.Popen(
                 ("qtlreaper",
@@ -157,26 +156,27 @@ def dispatch(args: Namespace) -> int:
             while _qtlreaper.poll() is None:
                 logger.debug("QTLReaper process running…")
                 time.sleep(1)
-            results = tuple(max(qtls, key=lambda qtl: qtl["LRS"])
-                            for qtls in
-                            reduce(__qtls_by_trait__,
-                                   parse_tsv_file(_qtlreaper_main_output),
-                                   {}).values())
+            results = tuple(#type: ignore[var-annotated]
+                max(qtls, key=lambda qtl: qtl["LRS"])
+                for qtls in
+                reduce(__qtls_by_trait__,
+                       parse_tsv_file(_qtlreaper_main_output),
+                       {}).values())
             save_qtl_values_to_db(conn, results)
             logger.debug("Cleaning up temporary files.")
             _traitsfile.unlink()
             _qtlreaper_main_output.unlink()
             logger.info("Successfully computed p values for %s traits.",
                         len(_traitsdata))
-        exitcode = 0
+        return 0
     except FileNotFoundError as fnf:
-        logger.error(", ".join(fnf.args), exc_info=False)
+        logger.error(", ".join(str(arg) for arg in fnf.args), exc_info=False)
     except AssertionError as aserr:
         logger.error(", ".join(aserr.args), exc_info=False)
-    except Exception as _exc:
+    except Exception as _exc:# pylint: disable=[broad-exception-caught]
         logger.debug("Type of exception: %s", type(_exc))
         logger.error("General exception!", exc_info=True)
-    finally:
-        return exitcode
+
+    return exitcode
@@ -205,7 +205,7 @@ if __name__ == "__main__":
                              "in the population."))
         args = parser.parse_args()
         setup_logging(logger, args.log_level)
-
+
         return dispatch(args)
 
     sys.exit(main())
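The `results = tuple(...)` expression in `dispatch` keeps, for each trait, the QTL row with the highest LRS. The body of `__qtls_by_trait__` is elided in the hunk above, so the folding step below is an assumption consistent with how `dispatch` consumes it (the "ID" key is likewise hypothetical):

    from functools import reduce

    def __qtls_by_trait__(qtls: dict, current: dict) -> dict:
        # assumed folding step: append 'current' to its trait's group
        trait = current["ID"]
        return {**qtls, trait: qtls.get(trait, tuple()) + (current,)}

    rows = ({"ID": "t1", "LRS": 3.0}, {"ID": "t1", "LRS": 9.2},
            {"ID": "t2", "LRS": 5.5})
    results = tuple(max(group, key=lambda qtl: qtl["LRS"])
                    for group in reduce(__qtls_by_trait__, rows, {}).values())
    assert results == ({"ID": "t1", "LRS": 9.2}, {"ID": "t2", "LRS": 5.5})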