about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--scripts/rqtl2/phenotypes_qc.py134
1 files changed, 134 insertions, 0 deletions
diff --git a/scripts/rqtl2/phenotypes_qc.py b/scripts/rqtl2/phenotypes_qc.py
new file mode 100644
index 0000000..6a804df
--- /dev/null
+++ b/scripts/rqtl2/phenotypes_qc.py
@@ -0,0 +1,134 @@
+"""Run quality control on phenotypes-specific files in the bundle."""
+import sys
+from pathlib import Path
+from argparse import ArgumentParser
+from logging import Logger, getLogger, StreamHandler
+
+import MySQLdb as mdb
+
+from r_qtl import r_qtl2 as rqtl2
+from r_qtl import r_qtl2_qc as rqc
+from r_qtl import exceptions as rqe
+
+from scripts.cli_parser import init_cli_parser
+from scripts.rqtl2.entry import build_main
+from scripts.rqtl2.cli_parser import add_bundle_argument
+
+def validate(phenobundle: Path, logger: Logger) -> dict:
+    """Check that the bundle is generally valid"""
+    try:
+        rqc.validate_bundle(phenobundle)
+    except rqe.RQTLError as rqtlerr:
+        # logger.error("Bundle file validation failed!", exc_info=True)
+        return {
+            "skip": True,
+            "logger": logger,
+            "phenobundle": phenobundle,
+            "errors": (" ".join(rqtlerr.args),)
+        }
+    return {
+        "errors": tuple(),
+        "skip": False,
+        "phenobundle": phenobundle,
+        "logger": logger
+    }
+
+
+def check_for_mandatory_pheno_keys(
+        phenobundle: Path,
+        logger: Logger,
+        **kwargs
+) -> dict:
+    """Check that the mandatory keys exist for phenotypes."""
+    if kwargs.get("skip", False):
+        return {
+            **kwargs,
+            "logger": logger,
+            "phenobundle": phenobundle
+        }
+
+    _mandatory_keys = ("pheno", "phenocovar")
+    _cdata = rqtl2.read_control_file(phenobundle)
+    _errors = kwargs.get("errors", tuple()) + tuple(
+        f"Expected '{key}' file(s) are not declared in the bundle."
+        for key in _mandatory_keys if key not in _cdata.keys())
+    return {
+        **kwargs,
+        "logger": logger,
+        "phenobundle": phenobundle,
+        "errors": _errors,
+        "skip": len(_errors) > 0
+    }
+
+
+def check_for_averages_files(
+        phenobundle: Path,
+        logger: Logger,
+        **kwargs
+) -> dict:
+    """Check that averages files appear together"""
+    if kwargs.get("skip", False):
+        return {
+            **kwargs,
+            "logger": logger,
+            "phenobundle": phenobundle
+        }
+
+    _together = (("phenose", "phenonum"), ("phenonum", "phenose"))
+    _cdata = rqtl2.read_control_file(phenobundle)
+    _errors = kwargs.get("errors", tuple()) + tuple(
+        f"'{first}' is defined in the control file but there is no "
+        f"corresponding '{second}'"
+        for first, second in _together
+        if ((first in _cdata.keys()) and (second not in _cdata.keys())))
+    return {
+        **kwargs,
+        "logger": logger,
+        "phenobundle": phenobundle,
+        "errors": _errors,
+        "skip": len(_errors) > 0
+    }
+
+
+def run_qc(dbconn: mdb.Connection, phenobundle: Path, logger: Logger) -> int:
+    """Run quality control checks on the bundle."""
+    results = check_for_averages_files(
+        **check_for_mandatory_pheno_keys(
+            **validate(phenobundle, logger)))
+    errors = results.get("errors", tuple())
+    if len(errors) > 0:
+        logger.error("We found the following errors:\n%s",
+                     "\n".join(f" - {error}" for error in errors))
+        return 1
+    # TODO: Run QC on actual values
+    #       Steps:
+    #       - Extract file to specific directory
+    #       - For every pheno, phenocovar, phenose, phenonum file, undo
+    #         transposition where relevant
+    #       - Check that `description` and `units` is present in phenocovar for
+    #         all phenotypes
+    #       - Check all phenotypes in pheno files exist in phenocovar files
+    #       - Check all numeric values
+    raise NotImplementedError("WIP!")
+
+
+if __name__ == "__main__":
+
+    def cli_args():
+        """Process command-line arguments for `install_phenos`"""
+        parser = add_bundle_argument(init_cli_parser(
+            program="PhenotypesQC",
+            description=(
+                "Perform Quality Control checks on a phenotypes bundle file")))
+        return parser.parse_args()
+
+    _logger = getLogger("phenotypes_qc")
+    _logger.addHandler(StreamHandler(stream=sys.stderr))
+
+    main = build_main(
+        cli_args(),
+        lambda dbconn, args: run_qc(dbconn, args.rqtl2bundle, _logger),
+        _logger,
+        "DEBUG")
+    sys.exit(main())
+