about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Frederick Muriuki Muriithi, 2024-10-17 14:25:18 -0500
committer: Frederick Muriuki Muriithi, 2024-10-17 14:39:43 -0500
commit: ad5610fa54038ddf39db89103636883b53f9afbc (patch)
tree: f507a2cfbd2969b4cdc070b5e7fb60989855bf70
parent: 35b8a52071390bdce17453cd6b197b18b349b5e9 (diff)
download: gn-uploader-ad5610fa54038ddf39db89103636883b53f9afbc.tar.gz
Undo transpose for any transposed files
To reduce the complexity involved in the processing of the files, we undo any transposition of the CSV files for those files that are marked as transposed.
-rw-r--r-- scripts/rqtl2/phenotypes_qc.py 23
1 files changed, 23 insertions, 0 deletions
diff --git a/scripts/rqtl2/phenotypes_qc.py b/scripts/rqtl2/phenotypes_qc.py
index 438d4da..e495a97 100644
--- a/scripts/rqtl2/phenotypes_qc.py
+++ b/scripts/rqtl2/phenotypes_qc.py
@@ -3,6 +3,7 @@ import sys
import tempfile
from pathlib import Path
from zipfile import ZipFile
+import multiprocessing as mproc
from logging import Logger, getLogger, StreamHandler
import MySQLdb as mdb
@@ -15,6 +16,7 @@ from uploader.files import sha256_digest_over_file
from scripts.rqtl2.entry import build_main
from scripts.rqtl2.cli_parser import add_bundle_argument
from scripts.cli_parser import init_cli_parser, add_global_data_arguments
+from scripts.rqtl2.bundleutils import build_line_joiner, build_line_splitter
def validate(phenobundle: Path, logger: Logger) -> dict:
"""Check that the bundle is generally valid"""
@@ -98,6 +100,19 @@ def extract_bundle(bundle: Path, workdir: Path) -> tuple[Path, tuple[Path, ...]]
extractiondir = workdir.joinpath(
f"{sha256_digest_over_file(bundle)}_phenotype_qc_{bundle.name}")
return extractiondir, rqtl2.extract(zfile, extractiondir)
+
+
def undo_transpose(filetype: str, cdata: dict, extractiondir: Path) -> None:
    """Undo transposition of all files of type `filetype` in the bundle.

    Does nothing unless `cdata` lists at least one file under `filetype` and
    its `<filetype>_transposed` entry is truthy.

    Parameters:
        filetype: Control-data key naming the kind of file to process; the
            caller in this file passes "pheno", "phenocovar", "phenose" and
            "phenonum".
        cdata: The bundle's control data. Read here for the file names under
            `filetype`, for the `<filetype>_transposed` flag, and to build
            the line splitter and joiner.
        extractiondir: Directory the bundle was extracted into; every file
            name is resolved against it with `joinpath`.
    """
    filenames = cdata.get(filetype)
    if not filenames or not cdata.get(f"{filetype}_transposed", False):
        return
    if isinstance(filenames, str):
        # A lone file name must not be iterated character by character.
        # NOTE(review): `rqtl2.control_data` may already normalise this to a
        # sequence -- confirm; if so this guard is purely defensive.
        filenames = (filenames,)

    # Both helpers take only `cdata`, which does not change between files,
    # so build them once instead of once per file.
    splitter = build_line_splitter(cdata)
    joiner = build_line_joiner(cdata)
    for filename in filenames:
        rqtl2.transpose_csv_with_rename(
            extractiondir.joinpath(filename), splitter, joiner)
+
+
def run_qc(# pylint: disable=[too-many-arguments]
dbconn: mdb.Connection,
phenobundle: Path,
@@ -119,8 +134,16 @@ def run_qc(# pylint: disable=[too-many-arguments]
# Steps:
# - Extract file to specific directory
extractiondir, *_bundlefiles = extract_bundle(phenobundle, workingdir)
+
# - For every pheno, phenocovar, phenose, phenonum file, undo
# transposition where relevant
+ cdata = rqtl2.control_data(extractiondir)
+ with mproc.Pool(mproc.cpu_count() - 1) as pool:
+ pool.starmap(
+ undo_transpose,
+ ((ftype, cdata, extractiondir)
+ for ftype in ("pheno", "phenocovar", "phenose", "phenonum")))
+
# - Check that `description` and `units` are present in phenocovar for
# all phenotypes
# - Check all phenotypes in pheno files exist in phenocovar files