diff options
Diffstat (limited to 'scripts')
| -rw-r--r-- | scripts/cli/__init__.py | 3 | ||||
| -rw-r--r-- | scripts/cli/logging.py | 18 | ||||
| -rw-r--r-- | scripts/cli/options.py | 46 | ||||
| -rw-r--r-- | scripts/cli/validators.py | 10 | ||||
| -rw-r--r-- | scripts/compute_phenotype_means.py (renamed from scripts/compute-phenotype-means.py) | 22 | ||||
| -rw-r--r-- | scripts/insert_samples.py | 16 | ||||
| -rw-r--r-- | scripts/load_phenotypes_to_db.py | 17 | ||||
| -rw-r--r-- | scripts/phenotypes/__init__.py | 1 | ||||
| -rw-r--r-- | scripts/phenotypes/delete_phenotypes.py | 173 | ||||
| -rw-r--r-- | scripts/qc_on_rqtl2_bundle.py | 9 | ||||
| -rw-r--r-- | scripts/rqtl2/entry.py | 2 | ||||
| -rw-r--r-- | scripts/rqtl2/install_genotypes.py | 6 | ||||
| -rw-r--r-- | scripts/rqtl2/install_phenos.py | 7 | ||||
| -rw-r--r-- | scripts/rqtl2/phenotypes_qc.py | 3 | ||||
| -rw-r--r-- | scripts/run_qtlreaper.py | 211 |
15 files changed, 498 insertions, 46 deletions
diff --git a/scripts/cli/__init__.py b/scripts/cli/__init__.py new file mode 100644 index 0000000..45bbda9 --- /dev/null +++ b/scripts/cli/__init__.py @@ -0,0 +1,3 @@ +"""Package to hold CLI-specific utilities.""" + +from . import options diff --git a/scripts/cli/logging.py b/scripts/cli/logging.py new file mode 100644 index 0000000..30ecf17 --- /dev/null +++ b/scripts/cli/logging.py @@ -0,0 +1,18 @@ +"""Logging for scripts.""" +import logging + +def setup_logging( + script_logger: logging.Logger, + loglevel: str, + modules: tuple[str, ...] = tuple() +): + """Setup module-level loggers to the same log-level as the application.""" + logging.basicConfig( + encoding="utf-8", + format=("%(asctime)s — %(filename)s:%(lineno)s — %(levelname)s: " + "%(message)s"), + level=logging.INFO) + script_logger.setLevel(getattr(logging, loglevel.upper())) + effective_loglevel = logging.getLevelName(script_logger.getEffectiveLevel()) + for module in modules: + logging.getLogger(module).setLevel(effective_loglevel) diff --git a/scripts/cli/options.py b/scripts/cli/options.py new file mode 100644 index 0000000..70d2a27 --- /dev/null +++ b/scripts/cli/options.py @@ -0,0 +1,46 @@ +"""General options to be added to CLI scripts.""" +from argparse import ArgumentParser + + +def add_logging(parser: ArgumentParser) -> ArgumentParser: + """Add optional log-level option""" + loglevels = ("debug", "info", "warning", "error", "critical") + parser.add_argument( + "--log_level", + "--log-level", + "--loglevel", + metavar="LOG-LEVEL", + type=str, + default="INFO", + choices=loglevels, + help=("Controls the severity of events to log. Valid values are: " + + ", ".join(f"'{level}'" for level in loglevels))) + return parser + + +def add_mariadb_uri(parser: ArgumentParser) -> ArgumentParser: + """Add the MySQL/MariaDB URI argument.""" + parser.add_argument("db_uri", + metavar="DB-URI", + type=str, + help="MariaDB/MySQL connection URL") + return parser + + +def add_species_id(parser: ArgumentParser) -> ArgumentParser: + """Add species-id as a mandatory argument.""" + parser.add_argument("species_id", + metavar="SPECIES-ID", + type=int, + help="The species to operate on.") + return parser + + +def add_population_id(parser: ArgumentParser) -> ArgumentParser: + """Add population-id as a mandatory argument.""" + parser = add_species_id(parser) + parser.add_argument("population_id", + metavar="POPULATION-ID", + type=int, + help="The ID for the population to operate on.") + return parser diff --git a/scripts/cli/validators.py b/scripts/cli/validators.py new file mode 100644 index 0000000..6d16e4c --- /dev/null +++ b/scripts/cli/validators.py @@ -0,0 +1,10 @@ +"""CLI options validators.""" +from pathlib import Path + + +def directory_exists(val: str) -> Path: + """Check that directory path specified actually exists.""" + _dir = Path(val).absolute() + if _dir.is_dir() and _dir.exists(): + return _dir + raise FileNotFoundError(f"The path '{_dir}' MUST exist and be a directory.") diff --git a/scripts/compute-phenotype-means.py b/scripts/compute_phenotype_means.py index 3b876b7..6d39ace 100644 --- a/scripts/compute-phenotype-means.py +++ b/scripts/compute_phenotype_means.py @@ -9,6 +9,8 @@ import MySQLdb from gn_libs import mysqldb from uploader import setup_modules_logging + +from .cli_parser import add_logging_option from .load_phenotypes_to_db import update_means logger = logging.getLogger(__name__) @@ -40,19 +42,18 @@ def run(args) -> int: len(xref_ids)) return 0 _reasons = ( - "no population exists with the ID '%s'", + f"no population exists with the ID {args.population_id}", "the population exists but it has no phenotypes linked to it yet") logger.error( - ("No cross-reference IDs to run against. Likely causes are: " - + " OR ".join(_reasons) + "."), - args.population_id) + "No cross-reference IDs to run against. Likely causes are: %s", + " OR ".join(_reasons) + ".") return 1 T = TypeVar("T") -def comma_separated_list(val: str, itemstype: T = str) -> tuple[T, ...]: - """Convert val into a list of items of type 'itemstype'.""" - return tuple(itemstype(item.strip()) for item in val.split(",")) +def comma_separated_list(val: str, itemstype: type = str) -> tuple[T, ...]: + """Convert val into a list of items of type 'itemstype'.""" + return tuple(itemstype(item.strip()) for item in val.split(",")) def comma_separated_list_of_integers(val: str) -> tuple[int, ...]: @@ -80,12 +81,7 @@ if __name__ == "__main__": help=("Identifier for the InbredSet group/" "population to run means against.")) ## Optional arguments - parser.add_argument( - "--log-level", - type=str, - help="Determines what is logged out.", - choices=("debug", "info", "warning", "error", "critical"), - default="info") + parser = add_logging_option(parser) parser.add_argument( "--cross-ref-ids", type=comma_separated_list_of_integers, diff --git a/scripts/insert_samples.py b/scripts/insert_samples.py index fc029f9..96ae8e2 100644 --- a/scripts/insert_samples.py +++ b/scripts/insert_samples.py @@ -6,10 +6,10 @@ import argparse import traceback import MySQLdb as mdb -from redis import Redis + from gn_libs.mysqldb import database_connection -from uploader.check_connections import check_db, check_redis +from uploader.check_connections import check_db from uploader.species.models import species_by_id from uploader.population.models import population_by_id from uploader.samples.models import ( @@ -35,7 +35,6 @@ class SeparatorAction(argparse.Action): setattr(namespace, self.dest, (chr(9) if values == "\\t" else values)) def insert_samples(conn: mdb.Connection,# pylint: disable=[too-many-arguments, too-many-positional-arguments] - rconn: Redis,# pylint: disable=[unused-argument] speciesid: int, populationid: int, samplesfile: pathlib.Path, @@ -119,11 +118,6 @@ if __name__ == "__main__": help=("The character used to delimit (surround?) the value in " "each column.")) - # == Script-specific extras == - parser.add_argument("--redisuri", - help="URL to initialise connection to redis", - default="redis:///") - args = parser.parse_args() return args @@ -132,17 +126,13 @@ if __name__ == "__main__": status_code = 1 # Exit with an Exception args = cli_args() check_db(args.databaseuri) - check_redis(args.redisuri) if not args.samplesfile.exists(): logging.error("File not found: '%s'.", args.samplesfile) return 2 - with (Redis.from_url(args.redisuri, decode_responses=True) as rconn, - database_connection(args.databaseuri) as dbconn): - + with database_connection(args.databaseuri) as dbconn: try: status_code = insert_samples(dbconn, - rconn, args.speciesid, args.populationid, args.samplesfile, diff --git a/scripts/load_phenotypes_to_db.py b/scripts/load_phenotypes_to_db.py index e449b82..e303bb3 100644 --- a/scripts/load_phenotypes_to_db.py +++ b/scripts/load_phenotypes_to_db.py @@ -6,9 +6,9 @@ import time import logging import argparse import datetime -from typing import Any from pathlib import Path from zipfile import ZipFile +from typing import Any, Iterable from urllib.parse import urljoin from functools import reduce, partial @@ -55,7 +55,7 @@ def save_phenotypes( if control_data["phenocovar_transposed"]: logger.info("Undoing transposition of the files rows and columns.") - phenofiles = ( + phenofiles = tuple( rqtl2.transpose_csv_with_rename( _file, build_line_splitter(control_data), @@ -86,7 +86,7 @@ def __row_to_dataitems__( dataidmap: dict, pheno_name2id: dict[str, int], samples: dict -) -> tuple[dict, ...]: +) -> Iterable[dict]: samplename = sample_row["id"] return ({ @@ -134,7 +134,7 @@ def save_numeric_data(# pylint: disable=[too-many-positional-arguments,too-many- conn: mysqldb.Connection, dataidmap: dict, pheno_name2id: dict[str, int], - samples: tuple[dict, ...], + samples: dict, control_data: dict, filesdir: Path, filetype: str, @@ -311,7 +311,9 @@ def update_auth(# pylint: disable=[too-many-locals,too-many-positional-arguments ).either(__handle_error__, __handle_success__) -def load_data(conn: mysqldb.Connection, job: dict) -> int:#pylint: disable=[too-many-locals] +def load_data(# pylint: disable=[too-many-locals] + conn: mysqldb.Connection, job: dict +) -> tuple[dict, dict, dict, tuple[int, ...]]: """Load the data attached in the given job.""" _job_metadata = job["metadata"] # Steps @@ -365,9 +367,8 @@ def load_data(conn: mysqldb.Connection, job: dict) -> int:#pylint: disable=[too- "publication_id": row["publication_id"], "data_id": row["data_id"] },))) - dataidmap, pheno_name2id, _xrefs = reduce(__build_phenos_maps__, - _phenos, - ({},{}, tuple())) + dataidmap, pheno_name2id, _xrefs = reduce(# type: ignore[var-annotated] + __build_phenos_maps__, _phenos, ({},{}, tuple())) # 3. a. Fetch the strain names and IDS: create name->ID map samples = { row["Name"]: row diff --git a/scripts/phenotypes/__init__.py b/scripts/phenotypes/__init__.py new file mode 100644 index 0000000..73ad839 --- /dev/null +++ b/scripts/phenotypes/__init__.py @@ -0,0 +1 @@ +"Scripts for dealing with phenotypes." diff --git a/scripts/phenotypes/delete_phenotypes.py b/scripts/phenotypes/delete_phenotypes.py new file mode 100644 index 0000000..461f3ec --- /dev/null +++ b/scripts/phenotypes/delete_phenotypes.py @@ -0,0 +1,173 @@ +"""Delete phenotypes.""" +import sys +import logging +from pathlib import Path +from typing import Optional +from urllib.parse import urljoin +from argparse import Namespace, ArgumentParser + +import requests +from MySQLdb.cursors import DictCursor, BaseCursor + +from gn_libs.mysqldb import database_connection + +from uploader.phenotypes.models import delete_phenotypes +from scripts.cli.logging import setup_logging +from scripts.cli.options import (add_logging, + add_mariadb_uri, + add_population_id) + +logger = logging.getLogger(__name__) + +def read_xref_ids_file(filepath: Optional[Path]) -> tuple[int, ...]: + """Read the phenotypes' cross-reference IDS from file.""" + if filepath is None: + return tuple() + + logger.debug("Using file '%s' to retrieve XREF IDs for deletion.", + filepath.name) + _ids: tuple[int, ...] = tuple() + with filepath.open(mode="r") as infile: + for line in infile.readlines(): + try: + _ids += (int(line.strip()),) + except TypeError: + pass + + return _ids + + +def fetch_all_xref_ids( + cursor: BaseCursor, population_id: int) -> tuple[int, ...]: + """Fetch all cross-reference IDs.""" + cursor.execute("SELECT Id FROM PublishXRef WHERE InbredSetId=%s", + (population_id,)) + return tuple(int(row["Id"]) for row in cursor.fetchall()) + + +def update_auth( + auth_details: tuple[str, str], + species_id: int, + population_id: int, + dataset_id: int, + xref_ids: tuple[int, ...] = tuple() +): + """Update the authorisation server: remove items to delete.""" + authserver, token = auth_details + resp = requests.post( + urljoin(authserver, + (f"/auth/data/phenotypes/{species_id}/{population_id}" + f"/{dataset_id}/delete")), + timeout=(9.13, 20), + headers={ + "Authorization": f"Bearer {token}", + "Content-Type": "application/json" + }, + json={"xref_ids": xref_ids}) + resp.raise_for_status() + + +def delete_the_phenotypes( + cursor: BaseCursor, + population_id: int, + xref_ids: tuple[int, ...] = tuple()) -> int: + """Process and delete the phenotypes.""" + delete_phenotypes(cursor, population_id, xref_ids) + + return 0 + +if __name__ == "__main__": + def parse_args() -> Namespace: + """Parse CLI arguments.""" + parser = add_logging( + add_population_id( + add_mariadb_uri( + ArgumentParser( + prog="delete-phenotypes", + description=( + "Script to delete phenotypes from the database."))))) + parser.add_argument( + "dataset_id", + metavar="DATASET-ID", + type=int, + help="The dataset identifier for phenotypes to delete.") + parser.add_argument( + "auth_server_uri", + metavar="AUTH-SERVER-URI", + type=str, + help="URI to the authorisation server.") + parser.add_argument( + "auth_token", + metavar="AUTH-TOKEN", + type=str, + help=("Token to use to update the authorisation system with the " + "deletions done.")) + parser.add_argument( + "--xref_ids_file", + metavar="XREF-IDS-FILE", + type=Path, + help=("Path to a file with phenotypes cross-reference IDs to " + "delete.")) + parser.add_argument( + "--delete-all", + action="store_true", + help=("If no 'XREF-IDS-FILE' is provided, this flag determines " + "whether or not all the phenotypes for the given population " + "will be deleted.")) + return parser.parse_args() + + + def main(): + """The `delete-phenotypes` script's entry point.""" + args = parse_args() + setup_logging(logger, args.log_level.upper(), tuple()) + with (database_connection(args.db_uri) as conn, + conn.cursor(cursorclass=DictCursor) as cursor): + xref_ids = read_xref_ids_file(args.xref_ids_file) + try: + assert not (len(xref_ids) > 0 and args.delete_all) + xref_ids = (fetch_all_xref_ids(cursor, args.population_id) + if args.delete_all else xref_ids) + logger.debug("Will delete %s phenotypes and related data", + len(xref_ids)) + if len(xref_ids) == 0: + print("No cross-reference IDs were provided. Aborting.") + return 0 + + print("Updating authorisations: ", end="") + update_auth((args.auth_server_uri, args.auth_token), + args.species_id, + args.population_id, + args.dataset_id, + xref_ids) + print("OK.") + print("Deleting the data: ", end="") + delete_phenotypes(cursor, args.population_id, xref_ids=xref_ids) + print("OK.") + if args.xref_ids_file is not None: + print("Deleting temporary file: ", end="") + args.xref_ids_file.unlink() + print("OK.") + + return 0 + except AssertionError: + logger.error( + "'DELETE-ALL' and 'XREF-IDS' are mutually exclusive. " + "If you specify the list of XREF-IDS (in a file) to delete " + "and also specify to 'DELETE-ALL' phenotypes in the " + "population, we have no way of knowing what it is you want.") + return 1 + except requests.exceptions.HTTPError as _exc: + resp = _exc.response + resp_data = resp.json() + logger.debug("%s: %s", + resp_data["error"], + resp_data["error_description"], + exc_info=True) + return 1 + except Exception as _exc:# pylint: disable=[broad-exception-caught] + logger.debug("Failed while attempting to delete phenotypes.", + exc_info=True) + return 1 + + sys.exit(main()) diff --git a/scripts/qc_on_rqtl2_bundle.py b/scripts/qc_on_rqtl2_bundle.py index 0207938..4e6ef00 100644 --- a/scripts/qc_on_rqtl2_bundle.py +++ b/scripts/qc_on_rqtl2_bundle.py @@ -40,7 +40,7 @@ def add_to_errors(rconn: Redis, """Add `errors` to a given list of errors""" errs = tuple(dict(item) for item in set( [dict2tuple(old) for old in - json.loads(rconn.hget(fqjobid, key) or "[]")] + + json.loads(rconn.hget(fqjobid, key) or "[]")] +# type: ignore[arg-type] [dict2tuple({"type": type(error).__name__, **error._asdict()}) for error in errors])) rconn.hset(fqjobid, key, json.dumps(errs)) @@ -83,7 +83,8 @@ def retrieve_errors_with_progress(rconn: Redis,#pylint: disable=[too-many-locals count = 0 checked = 0 cdata = rqtl2.control_data(zfile) - rconn.hset(fqjobid, f"{filetype}-filesize", compute_filesize(zfile, filetype)) + rconn.hset( + fqjobid, f"{filetype}-filesize", str(compute_filesize(zfile, filetype))) def __update_processed__(value): nonlocal checked checked = checked + len(value) @@ -104,7 +105,7 @@ def retrieve_errors_with_progress(rconn: Redis,#pylint: disable=[too-many-locals yield error __update_processed__(value) - rconn.hset(fqjobid, f"{filetype}-linecount", count) + rconn.hset(fqjobid, f"{filetype}-linecount", count)# type: ignore[arg-type] except rqe.MissingFileException: fname = cdata.get(filetype) yield rqfe.MissingFile(filetype, fname, ( @@ -295,7 +296,7 @@ def run_qc(rconn: Redis, return 1 def __fetch_errors__(rkey: str) -> tuple: - return tuple(json.loads(rconn.hget(fqjobid, rkey) or "[]")) + return tuple(json.loads(rconn.hget(fqjobid, rkey) or "[]")) # type: ignore[arg-type] return (1 if any(( bool(__fetch_errors__(key)) diff --git a/scripts/rqtl2/entry.py b/scripts/rqtl2/entry.py index e0e00e7..7423a4b 100644 --- a/scripts/rqtl2/entry.py +++ b/scripts/rqtl2/entry.py @@ -26,7 +26,7 @@ def build_main( def main(): with (Redis.from_url(args.redisuri, decode_responses=True) as rconn, database_connection(args.databaseuri) as dbconn): - logger.setLevel(args.loglevel.upper()) + logger.setLevel(args.log_level.upper()) fqjobid = jobs.job_key(args.redisprefix, args.jobid) try: diff --git a/scripts/rqtl2/install_genotypes.py b/scripts/rqtl2/install_genotypes.py index 8762655..5e6abb0 100644 --- a/scripts/rqtl2/install_genotypes.py +++ b/scripts/rqtl2/install_genotypes.py @@ -20,7 +20,7 @@ from scripts.rqtl2.entry import build_main from scripts.rqtl2.cli_parser import add_common_arguments from scripts.cli_parser import init_cli_parser, add_global_data_arguments -__MODULE__ = "scripts.rqtl2.install_genotypes" +logger = getLogger(__name__) def insert_markers( dbconn: mdb.Connection, @@ -191,7 +191,7 @@ def install_genotypes(#pylint: disable=[too-many-locals] dbconn: mdb.Connection, fullyqualifiedjobid: str,#pylint: disable=[unused-argument] args: argparse.Namespace, - logger: Logger = getLogger(__name__) + logger: Logger = logger # pylint: disable=[redefined-outer-name] ) -> int: """Load any existing genotypes into the database.""" (speciesid, populationid, datasetid, rqtl2bundle) = ( @@ -257,5 +257,5 @@ if __name__ == "__main__": return parser.parse_args() - main = build_main(cli_args(), install_genotypes, __MODULE__) + main = build_main(cli_args(), install_genotypes, logger) sys.exit(main()) diff --git a/scripts/rqtl2/install_phenos.py b/scripts/rqtl2/install_phenos.py index 9059cd6..11ac8a4 100644 --- a/scripts/rqtl2/install_phenos.py +++ b/scripts/rqtl2/install_phenos.py @@ -19,7 +19,7 @@ from r_qtl import r_qtl2_qc as rqc from functional_tools import take -__MODULE__ = "scripts.rqtl2.install_phenos" +logger = getLogger(__name__) def insert_probesets(dbconn: mdb.Connection, platformid: int, @@ -101,7 +101,8 @@ def install_pheno_files(#pylint: disable=[too-many-locals] dbconn: mdb.Connection, fullyqualifiedjobid: str,#pylint: disable=[unused-argument] args: argparse.Namespace, - logger: Logger = getLogger()) -> int: + logger: Logger = logger # pylint: disable=[redefined-outer-name] +) -> int: """Load data in `pheno` files and other related files into the database.""" (speciesid, platformid, datasetid, rqtl2bundle) = ( args.speciesid, args.platformid, args.datasetid, args.rqtl2bundle) @@ -159,5 +160,5 @@ if __name__ == "__main__": return parser.parse_args() - main = build_main(cli_args(), install_pheno_files, __MODULE__) + main = build_main(cli_args(), install_pheno_files, logger) sys.exit(main()) diff --git a/scripts/rqtl2/phenotypes_qc.py b/scripts/rqtl2/phenotypes_qc.py index 9f11f57..72d6c83 100644 --- a/scripts/rqtl2/phenotypes_qc.py +++ b/scripts/rqtl2/phenotypes_qc.py @@ -376,7 +376,8 @@ def run_qc(# pylint: disable=[too-many-locals] rconn: Redis, dbconn: mdb.Connection, fullyqualifiedjobid: str, - args: Namespace + args: Namespace, + logger: Logger = logger # pylint: disable=[redefined-outer-name] ) -> int: """Run quality control checks on the bundle.""" print("Beginning the quality assurance checks.") diff --git a/scripts/run_qtlreaper.py b/scripts/run_qtlreaper.py new file mode 100644 index 0000000..54e5d45 --- /dev/null +++ b/scripts/run_qtlreaper.py @@ -0,0 +1,211 @@ +"""Script to run rust-qtlreaper and update database with results.""" +import sys +import csv +import time +import secrets +import logging +import subprocess +from pathlib import Path +from functools import reduce +from typing import Union, Iterator +from argparse import Namespace, ArgumentParser + +from gn_libs import mysqldb + +from uploader.phenotypes.models import phenotypes_vector_data +from uploader.population.models import population_by_species_and_id +from uploader.samples.models import samples_by_species_and_population + +from scripts.cli.logging import setup_logging +from scripts.cli.validators import directory_exists +from scripts.cli.options import add_logging, add_mariadb_uri, add_population_id + +logger = logging.getLogger(__name__) + + +def retrieve_genotype_file(genotypes_dir: Path, population_code: str) -> Path: + """Retrieves the genotype file""" + _genofile = genotypes_dir.joinpath(f"{population_code}.geno") + if _genofile.exists(): + return _genofile + raise FileNotFoundError(f"Could not find the genotype file '{population_code}.geno'") + + +def samples_from_genofile(genofile: Path) -> tuple[str, ...]: + """Read samples from the genotype file.""" + with genofile.open(mode="r", encoding="utf-8") as inptr: + while True: + line = inptr.readline() + if (line.startswith("#") # comment line + or line.startswith("@") # allele? heterozygosity? + or line.strip() == "" # empty line + ): + continue + return tuple(line.strip().split("\t")[4:]) + + +def reconcile_samples( + genosamples: tuple[str, ...], + dbsamples: tuple[str, ...] +) -> tuple[tuple[str, ...], tuple[str, ...]]: + """merge samples in genosamples and dbsamples and retain order in genosamples.""" + in_db_not_geno = set(dbsamples).difference(genosamples) + return genosamples, tuple(in_db_not_geno) + + +def generate_qtlreaper_traits_file( + outdir: Path, + samples: tuple[str, ...], + traits_data: tuple[dict[str, Union[int, float]], ...], + filename_prefix: str = "" +) -> Path: + """Generate a file for use with qtlreaper that contains the traits' data.""" + _dialect = csv.unix_dialect() + _dialect.delimiter="\t" + _dialect.quoting=0 + + _traitsfile = outdir.joinpath( + f"{filename_prefix}_{secrets.token_urlsafe(15)}.tsv")#type: ignore[attr-defined] + with _traitsfile.open(mode="w", encoding="utf-8") as outptr: + writer = csv.DictWriter( + outptr, fieldnames=("Trait",) + samples, dialect=_dialect) + writer.writeheader() + for row in traits_data: + writer.writerow({ + "Trait": row["xref_id"], + **{sample: row.get(sample, "") for sample in samples} + }) + + return _traitsfile + + +def parse_tsv_file(results_file: Path) -> Iterator[dict]: + """Parse the rust-qtlreaper output into usable python objects.""" + with results_file.open("r", encoding="utf-8") as readptr: + _dialect = csv.unix_dialect() + _dialect.delimiter = "\t" + reader = csv.DictReader(readptr, dialect=_dialect) + yield from reader + + +def __qtls_by_trait__(qtls, current): + """Organise QTL results by trait""" + return { + **qtls, + current["ID"]: qtls.get(current["ID"], tuple()) + (current,) + } + + +def save_qtl_values_to_db(conn, qtls: tuple[dict, ...]): + """Save computed QTLs to the database.""" + with conn.cursor() as cursor: + cursor.executemany( + "UPDATE PublishXRef SET " + "Locus=%(Locus)s, LRS=%(LRS)s, additive=%(Additive)s " + "WHERE Id=%(ID)s", + qtls) + + +def dispatch(args: Namespace) -> int: + """Dispatch the actual logic.""" + exitcode = 1 + with mysqldb.database_connection(args.db_uri) as conn: + try: + population = population_by_species_and_id(conn, args.species_id, args.population_id) + assert population, (f"No population with ID '{args.population_id} for " + f"species with ID '{args.species_id}'.") + _genofile = retrieve_genotype_file(args.genotypes_dir, population["Name"]) + logger.debug("Genotype file: %s", _genofile) + samples, _samples_not_in_genofile = reconcile_samples( + samples_from_genofile(_genofile), + tuple( + sample["Name"] for sample in + samples_by_species_and_population( + conn, args.species_id, args.population_id))) + if len(_samples_not_in_genofile) > 0: + logger.warning( + "Ignoring %d samples that are in the database but not in " + "the provided genotype file.", + len(_samples_not_in_genofile)) + logger.debug("Ignored the following samples: %s", + ", ".join(_samples_not_in_genofile)) + + # Fetch traits data: provided list, or all traits in db + _traitsdata = tuple(phenotypes_vector_data( + conn, + args.species_id, + args.population_id, + xref_ids=tuple(args.xref_ids)).values()) + logger.debug("Successfully got traits data. Generating the QTLReaper's traits file…") + _traitsfile = generate_qtlreaper_traits_file( + args.working_dir, + samples, + _traitsdata, + filename_prefix="qtlreaper_input_traits_file") + logger.debug("QTLReaper's Traits file: %s", _traitsfile) + + _qtlreaper_main_output = args.working_dir.joinpath( + f"main-output-{secrets.token_urlsafe(15)}.tsv")#type: ignore[attr-defined] + logger.debug("Main output filename: %s", _qtlreaper_main_output) + with subprocess.Popen( + ("qtlreaper", + "--n_permutations", "1000", + "--geno", _genofile, + "--traits", _traitsfile, + "--main_output", _qtlreaper_main_output)) as _qtlreaper: + while _qtlreaper.poll() is None: + logger.debug("QTLReaper process running…") + time.sleep(1) + results = tuple(#type: ignore[var-annotated] + max(qtls, key=lambda qtl: qtl["LRS"]) + for qtls in + reduce(__qtls_by_trait__, + parse_tsv_file(_qtlreaper_main_output), + {}).values()) + save_qtl_values_to_db(conn, results) + logger.debug("Cleaning up temporary files.") + _traitsfile.unlink() + _qtlreaper_main_output.unlink() + logger.info("Successfully computed p values for %s traits.", len(_traitsdata)) + return 0 + except FileNotFoundError as fnf: + logger.error(", ".join(str(arg) for arg in fnf.args), exc_info=False) + except AssertionError as aserr: + logger.error(", ".join(aserr.args), exc_info=False) + except Exception as _exc:# pylint: disable=[broad-exception-caught] + logger.debug("Type of exception: %s", type(_exc)) + logger.error("General exception!", exc_info=True) + + return exitcode + + +if __name__ == "__main__": + def main(): + """run_qtlreaper.py: entry point.""" + parser = add_logging(add_population_id(add_mariadb_uri( + ArgumentParser("run_qtlreaper")))) + parser.add_argument( + "genotypes_dir", + metavar="GENOTYPES-DIRECTORY", + type=directory_exists, + help="Path to directory with the genotypes.") + parser.add_argument( + "working_dir", + metavar="WORKING-DIRECTORY", + type=directory_exists, + help="Directory where the script will write temporary files.") + parser.add_argument( + "xref_ids", + metavar="CROSS-REFERENCE-IDS", + type=int, + nargs="*", + help=("Optional list of specific cross-reference IDs to narrow down" + " to. If provided, QTLReaper will only run against them. " + "If NOT provided, QTLReaper will run against all the traits " + "in the population.")) + args = parser.parse_args() + setup_logging(logger, args.log_level) + + return dispatch(args) + + sys.exit(main()) |
