From ad5610fa54038ddf39db89103636883b53f9afbc Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Thu, 17 Oct 2024 14:25:18 -0500 Subject: Undo transpose for any transposed files To reduce the complexity involved in the processing of the files, we undo any transposition of the CSV files for those files that are marked as transposed. --- scripts/rqtl2/phenotypes_qc.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/scripts/rqtl2/phenotypes_qc.py b/scripts/rqtl2/phenotypes_qc.py index 438d4da..e495a97 100644 --- a/scripts/rqtl2/phenotypes_qc.py +++ b/scripts/rqtl2/phenotypes_qc.py @@ -3,6 +3,7 @@ import sys import tempfile from pathlib import Path from zipfile import ZipFile +import multiprocessing as mproc from logging import Logger, getLogger, StreamHandler import MySQLdb as mdb @@ -15,6 +16,7 @@ from uploader.files import sha256_digest_over_file from scripts.rqtl2.entry import build_main from scripts.rqtl2.cli_parser import add_bundle_argument from scripts.cli_parser import init_cli_parser, add_global_data_arguments +from scripts.rqtl2.bundleutils import build_line_joiner, build_line_splitter def validate(phenobundle: Path, logger: Logger) -> dict: """Check that the bundle is generally valid""" @@ -98,6 +100,19 @@ def extract_bundle(bundle: Path, workdir: Path) -> tuple[Path, tuple[Path, ...]] extractiondir = workdir.joinpath( f"{sha256_digest_over_file(bundle)}_phenotype_qc_{bundle.name}") return extractiondir, rqtl2.extract(zfile, extractiondir) + + +def undo_transpose(filetype: str, cdata: dict, extractiondir): + """Undo transposition of all files of type `filetype` in the bundle.""" + if len(cdata.get(filetype, [])) > 0 and cdata.get(f"{filetype}_transposed", False): + files = (extractiondir.joinpath(_file) for _file in cdata[filetype]) + for _file in files: + rqtl2.transpose_csv_with_rename( + _file, + build_line_splitter(cdata), + build_line_joiner(cdata)) + + def run_qc(# pylint: disable=[too-many-arguments] dbconn: 
mdb.Connection, phenobundle: Path, @@ -119,8 +134,16 @@ def run_qc(# pylint: disable=[too-many-arguments] # Steps: # - Extract file to specific directory extractiondir, *_bundlefiles = extract_bundle(phenobundle, workingdir) + # - For every pheno, phenocovar, phenose, phenonum file, undo # transposition where relevant + cdata = rqtl2.control_data(extractiondir) + with mproc.Pool(mproc.cpu_count() - 1) as pool: + pool.starmap( + undo_transpose, + ((ftype, cdata, extractiondir) + for ftype in ("pheno", "phenocovar", "phenose", "phenonum"))) + # - Check that `description` and `units` is present in phenocovar for # all phenotypes # - Check all phenotypes in pheno files exist in phenocovar files -- cgit v1.2.3