diff options
author | Frederick Muriuki Muriithi | 2024-10-17 14:25:18 -0500 |
---|---|---|
committer | Frederick Muriuki Muriithi | 2024-10-17 14:39:43 -0500 |
commit | ad5610fa54038ddf39db89103636883b53f9afbc (patch) | |
tree | f507a2cfbd2969b4cdc070b5e7fb60989855bf70 | |
parent | 35b8a52071390bdce17453cd6b197b18b349b5e9 (diff) | |
download | gn-uploader-ad5610fa54038ddf39db89103636883b53f9afbc.tar.gz |
Undo transpose for any transposed files
To reduce the complexity involved in the processing of the files, we
undo any transposition of the CSV files for those files that are
marked as transposed.
-rw-r--r-- | scripts/rqtl2/phenotypes_qc.py | 23 |
1 file changed, 23 insertions, 0 deletions
diff --git a/scripts/rqtl2/phenotypes_qc.py b/scripts/rqtl2/phenotypes_qc.py index 438d4da..e495a97 100644 --- a/scripts/rqtl2/phenotypes_qc.py +++ b/scripts/rqtl2/phenotypes_qc.py @@ -3,6 +3,7 @@ import sys import tempfile from pathlib import Path from zipfile import ZipFile +import multiprocessing as mproc from logging import Logger, getLogger, StreamHandler import MySQLdb as mdb @@ -15,6 +16,7 @@ from uploader.files import sha256_digest_over_file from scripts.rqtl2.entry import build_main from scripts.rqtl2.cli_parser import add_bundle_argument from scripts.cli_parser import init_cli_parser, add_global_data_arguments +from scripts.rqtl2.bundleutils import build_line_joiner, build_line_splitter def validate(phenobundle: Path, logger: Logger) -> dict: """Check that the bundle is generally valid""" @@ -98,6 +100,19 @@ def extract_bundle(bundle: Path, workdir: Path) -> tuple[Path, tuple[Path, ...]] extractiondir = workdir.joinpath( f"{sha256_digest_over_file(bundle)}_phenotype_qc_{bundle.name}") return extractiondir, rqtl2.extract(zfile, extractiondir) + + +def undo_transpose(filetype: str, cdata: dict, extractiondir): + """Undo transposition of all files of type `filetype` in thebundle.""" + if len(cdata.get(filetype, [])) > 0 and cdata.get(f"{filetype}_transposed", False): + files = (extractiondir.joinpath(_file) for _file in cdata[filetype]) + for _file in files: + rqtl2.transpose_csv_with_rename( + _file, + build_line_splitter(cdata), + build_line_joiner(cdata)) + + def run_qc(# pylint: disable=[too-many-arguments] dbconn: mdb.Connection, phenobundle: Path, @@ -119,8 +134,16 @@ def run_qc(# pylint: disable=[too-many-arguments] # Steps: # - Extract file to specific directory extractiondir, *_bundlefiles = extract_bundle(phenobundle, workingdir) + # - For every pheno, phenocovar, phenose, phenonum file, undo # transposition where relevant + cdata = rqtl2.control_data(extractiondir) + with mproc.Pool(mproc.cpu_count() - 1) as pool: + pool.starmap( + undo_transpose, 
+ ((ftype, cdata, extractiondir) + for ftype in ("pheno", "phenocovar", "phenose", "phenonum"))) + # - Check that `description` and `units` is present in phenocovar for # all phenotypes # - Check all phenotypes in pheno files exist in phenocovar files |