From ad5610fa54038ddf39db89103636883b53f9afbc Mon Sep 17 00:00:00 2001
From: Frederick Muriuki Muriithi
Date: Thu, 17 Oct 2024 14:25:18 -0500
Subject: Undo transpose for any transposed files

To reduce the complexity involved in the processing of the files, we
undo any transposition of the CSV files for those files that are
marked as transposed.
---
 scripts/rqtl2/phenotypes_qc.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/scripts/rqtl2/phenotypes_qc.py b/scripts/rqtl2/phenotypes_qc.py
index 438d4da..e495a97 100644
--- a/scripts/rqtl2/phenotypes_qc.py
+++ b/scripts/rqtl2/phenotypes_qc.py
@@ -3,6 +3,7 @@ import sys
 import tempfile
 from pathlib import Path
 from zipfile import ZipFile
+import multiprocessing as mproc
 from logging import Logger, getLogger, StreamHandler
 
 import MySQLdb as mdb
@@ -15,6 +16,7 @@ from uploader.files import sha256_digest_over_file
 from scripts.rqtl2.entry import build_main
 from scripts.rqtl2.cli_parser import add_bundle_argument
 from scripts.cli_parser import init_cli_parser, add_global_data_arguments
+from scripts.rqtl2.bundleutils import build_line_joiner, build_line_splitter
 
 def validate(phenobundle: Path, logger: Logger) -> dict:
     """Check that the bundle is generally valid"""
@@ -98,6 +100,19 @@ def extract_bundle(bundle: Path, workdir: Path) -> tuple[Path, tuple[Path, ...]]
         extractiondir = workdir.joinpath(
             f"{sha256_digest_over_file(bundle)}_phenotype_qc_{bundle.name}")
         return extractiondir, rqtl2.extract(zfile, extractiondir)
+
+
+def undo_transpose(filetype: str, cdata: dict, extractiondir):
+    """Undo transposition of all files of type `filetype` in the bundle."""
+    if len(cdata.get(filetype, [])) > 0 and cdata.get(f"{filetype}_transposed", False):
+        files = (extractiondir.joinpath(_file) for _file in cdata[filetype])
+        for _file in files:
+            rqtl2.transpose_csv_with_rename(
+                _file,
+                build_line_splitter(cdata),
+                build_line_joiner(cdata))
+
+
 def run_qc(# pylint: disable=[too-many-arguments]
         dbconn: mdb.Connection,
         phenobundle: Path,
@@ -119,8 +134,16 @@ def run_qc(# pylint: disable=[too-many-arguments]
     #       Steps:
     #       - Extract file to specific directory
     extractiondir, *_bundlefiles = extract_bundle(phenobundle, workingdir)
+
     #       - For every pheno, phenocovar, phenose, phenonum file, undo
     #         transposition where relevant
+    cdata = rqtl2.control_data(extractiondir)
+    with mproc.Pool(mproc.cpu_count() - 1) as pool:
+        pool.starmap(
+            undo_transpose,
+            ((ftype, cdata, extractiondir)
+             for ftype in ("pheno", "phenocovar", "phenose", "phenonum")))
+
     #       - Check that `description` and `units` is present in phenocovar for
     #         all phenotypes
     #       - Check all phenotypes in pheno files exist in phenocovar files
-- 
cgit v1.2.3