about summary refs log tree commit diff
diff options
context:
space:
mode:
authorFrederick Muriuki Muriithi2025-05-19 13:14:44 -0500
committerFrederick Muriuki Muriithi2025-05-19 14:34:54 -0500
commit7b8e548e3440414fbe4abb57f8111105107009b2 (patch)
tree64daa717b34dd2f34f48111d127af5ec5123e663
parente153eb4a0aeeff28cf838b63074ac53bc2164bb3 (diff)
downloadgn-uploader-7b8e548e3440414fbe4abb57f8111105107009b2.tar.gz
Save numeric/computational data for phenotypes.
-rw-r--r--scripts/load_phenotypes_to_db.py154
1 files changed, 139 insertions, 15 deletions
diff --git a/scripts/load_phenotypes_to_db.py b/scripts/load_phenotypes_to_db.py
index 82324fc..4d90291 100644
--- a/scripts/load_phenotypes_to_db.py
+++ b/scripts/load_phenotypes_to_db.py
@@ -10,6 +10,7 @@ from MySQLdb.cursors import Cursor, DictCursor
 from gn_libs import jobs, mysqldb, sqlite3
 
 from r_qtl import r_qtl2 as rqtl2
+from uploader.phenotypes.models import save_phenotypes_data
 from uploader.publications.models import create_new_publications
 from uploader.samples.models import samples_by_species_and_population
 
@@ -63,9 +64,9 @@ def save_phenotypes(
     if control_data["phenocovar_transposed"]:
         logger.info("Undoing transposition of the files rows and columns.")
         phenofiles = (
-            transpose_csv_with_rename(
-                _file
-                build_line_splitter(control_data)
+            rqtl2.transpose_csv_with_rename(
+                _file,
+                build_line_splitter(control_data),
                 build_line_joiner(control_data))
             for _file in control_data["phenocovar"])
 
@@ -89,19 +90,145 @@ def __fetch_next_dataid__(conn: mysqldb.Connection) -> int:
         return int(cursor.fetchone()) + 1
 
 
-def save_phenotypes_data(conn: mysqldb.Connection, dataidmap, samples, datafiles):
+def save_pheno_data(
+        conn: mysqldb.Connection,
+        dataidmap: dict,
+        samples: dict,
+        control_data: dict
+):
     """Read the `datafiles` and save the data in the database."""
-    pass
+    phenofiles = control_data["pheno"]
+    if len(phenofiles) <= 0:
+        return tuple()
+
+    if control_data["pheno_transposed"]:
+        logger.info("Undoing transposition of the files rows and columns.")
+        phenofiles = (
+            rqtl2.transpose_csv_with_rename(
+                _file,
+                build_line_splitter(control_data),
+                build_line_joiner(control_data))
+            for _file in control_data["pheno"])
+
+    _headers = rqtl2.read_csv_file_headers(control_data["pheno"][0],
+                                           control_data["pheno_transposed"],
+                                           control_data["sep"],
+                                           control_data["comment.char"])
 
+    def __row_to_data_items__(row):
+        return tuple(
+            {
+                "phenotype_id": row["phenotype_id"],
+                "data_id": dataidmap[row["phenotype_id"]]["data_id"],
+                "sample_name": samplename,
+                "sample_id": samples[samplename]["Id"],
+                "value": value
+            }
+            for samplename, value in row.items()
+            if samplename in samples.keys())
+
+    return save_phenotypes_data(
+        conn,
+        "PublishData",
+        (item for row_items in
+         (__row_to_data_items__(dict(zip(_headers, line)))
+          for filecontent in (rqtl2.read_csv_file(path) for path in phenofiles)
+          for idx, line in enumerate(filecontent) if idx != 0)
+         for item in row_items))
 
-def save_phenotype_se(conn: mysqldb.Connection, dataidmap, samples, sefiles):
+
+def save_phenotype_se(
+        conn: mysqldb.Connection,
+        dataidmap: dict,
+        samples: dict,
+        control_data: dict
+):
     """Read the `sefiles` and save the data in the database."""
-    pass
+    sefiles = control_data["phenose"]
+    if len(sefiles) <= 0:
+        return tuple()
 
+    if control_data["phenose_transposed"]:
+        logger.info("Undoing transposition of the files rows and columns.")
+        sefiles = (
+            rqtl2.transpose_csv_with_rename(
+                _file,
+                build_line_splitter(control_data),
+                build_line_joiner(control_data))
+            for _file in control_data["phenose"])
 
-def save_phenotype_n(conn: mysqldb.Connection, dataidmap, samples, nfiles):
+    _headers = rqtl2.read_csv_file_headers(control_data["phenose"][0],
+                                           control_data["phenose_transposed"],
+                                           control_data["sep"],
+                                           control_data["comment.char"])
+
+    def __row_to_data_items__(row):
+        return tuple(
+            {
+                "phenotype_id": row["phenotype_id"],
+                "data_id": dataidmap[row["phenotype_id"]]["data_id"],
+                "sample_name": samplename,
+                "sample_id": samples[samplename]["Id"],
+                "error": value
+            }
+            for samplename, value in row.items()
+            if samplename in samples.keys())
+
+    return save_phenotypes_data(
+        conn,
+        "PublishSE",
+        (item for row_items in
+         (__row_to_data_items__(dict(zip(_headers, line)))
+          for filecontent in (rqtl2.read_csv_file(path) for path in sefiles)
+          for idx, line in enumerate(filecontent) if idx != 0)
+         for item in row_items))
+
+
+def save_phenotype_n(
+        conn: mysqldb.Connection,
+        dataidmap: dict,
+        samples: dict,
+        control_data: dict
+):
     """Read the `nfiles` and save the data in the database."""
-    pass
+    sefiles = control_data["phenonum"]
+    if len(sefiles) <= 0:
+        return tuple()
+
+    if control_data["phenonum_transposed"]:
+        logger.info("Undoing transposition of the files rows and columns.")
+        sefiles = (
+            rqtl2.transpose_csv_with_rename(
+                _file,
+                build_line_splitter(control_data),
+                build_line_joiner(control_data))
+            for _file in control_data["phenonum"])
+
+    _headers = rqtl2.read_csv_file_headers(control_data["phenonum"][0],
+                                           control_data["phenonum_transposed"],
+                                           control_data["sep"],
+                                           control_data["comment.char"])
+
+    def __row_to_data_items__(row):
+        return tuple(
+            {
+                "phenotype_id": row["phenotype_id"],
+                "data_id": dataidmap[row["phenotype_id"]]["data_id"],
+                "sample_name": samplename,
+                "sample_id": samples[samplename]["Id"],
+                "count": value
+            }
+            for samplename, value in row.items()
+            if samplename in samples.keys())
+
+    return save_phenotypes_data(
+        conn,
+        "NStrain",
+        (item for row_items in
+         (__row_to_data_items__(dict(zip(_headers, line)))
+          for filecontent in (rqtl2.read_csv_file(path) for path in sefiles)
+          for idx, line in enumerate(filecontent) if idx != 0)
+         for item in row_items))
 
 
 def cross_reference_phenotypes_publications_and_data(
@@ -145,17 +272,14 @@ def load_data(conn, job):
         for row in samples_by_species_and_population(
                 conn, species["SpeciesId"], population["PopulationId"])}
     #    b. Save all the data items (DataIds are vibes), return new IDs
-    data = save_phenotypes_data(
-        cursor, dataidmap, samples, , _control_data["pheno"])
+    data = save_pheno_data(conn, dataidmap, samples, _control_data)
     # 4. Cross-reference Phenotype, Publication, and PublishData in PublishXRef
     xrefs = cross_reference_phenotypes_publications_and_data(
         cursor, __merge_map_with_publications__(dataidmap))
     # 5. If standard errors and N exist, save them too
     #    (use IDs returned in `3. b.` above).
-    data_se = save_phenotypes_data(
-        cursor, dataidmap, samples, , _control_data["phenose"])
-    data_n = save_phenotypes_n(
-        cursor, dataidmap, samples, , _control_data["phenonum"])
+    data_se = save_phenotype_se(conn, dataidmap, samples, _control_data)
+    data_n = save_phenotype_n(conn, dataidmap, samples, _control_data)
     # 6. If entirely new data, update authorisations (break this down)
     update_auth(_user, _species, _population, _dataset, _phenos)
     return 0