aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFrederick Muriuki Muriithi2025-05-19 13:14:44 -0500
committerFrederick Muriuki Muriithi2025-05-19 14:34:54 -0500
commit7b8e548e3440414fbe4abb57f8111105107009b2 (patch)
tree64daa717b34dd2f34f48111d127af5ec5123e663
parente153eb4a0aeeff28cf838b63074ac53bc2164bb3 (diff)
downloadgn-uploader-7b8e548e3440414fbe4abb57f8111105107009b2.tar.gz
Save numeric/computational data for phenotypes.
-rw-r--r--scripts/load_phenotypes_to_db.py154
1 files changed, 139 insertions, 15 deletions
diff --git a/scripts/load_phenotypes_to_db.py b/scripts/load_phenotypes_to_db.py
index 82324fc..4d90291 100644
--- a/scripts/load_phenotypes_to_db.py
+++ b/scripts/load_phenotypes_to_db.py
@@ -10,6 +10,7 @@ from MySQLdb.cursors import Cursor, DictCursor
from gn_libs import jobs, mysqldb, sqlite3
from r_qtl import r_qtl2 as rqtl2
+from uploader.phenotypes.models import save_phenotypes_data
from uploader.publications.models import create_new_publications
from uploader.samples.models import samples_by_species_and_population
@@ -63,9 +64,9 @@ def save_phenotypes(
if control_data["phenocovar_transposed"]:
logger.info("Undoing transposition of the files rows and columns.")
phenofiles = (
- transpose_csv_with_rename(
- _file
- build_line_splitter(control_data)
+ rqtl2.transpose_csv_with_rename(
+ _file,
+ build_line_splitter(control_data),
build_line_joiner(control_data))
for _file in control_data["phenocovar"])
@@ -89,19 +90,145 @@ def __fetch_next_dataid__(conn: mysqldb.Connection) -> int:
return int(cursor.fetchone()) + 1
-def save_phenotypes_data(conn: mysqldb.Connection, dataidmap, samples, datafiles):
+def save_pheno_data(
+ conn: mysqldb.Connection,
+ dataidmap: dict,
+ samples: tuple[dict, ...],
+ control_data: dict
+):
"""Read the `datafiles` and save the data in the database."""
- pass
+ phenofiles = control_data["pheno"]
+ if len(phenofiles) <= 0:
+ return tuple()
+
+ if control_data["pheno_transposed"]:
+ logger.info("Undoing transposition of the files rows and columns.")
+ phenofiles = (
+ rqtl2.transpose_csv_with_rename(
+ _file,
+ build_line_splitter(control_data),
+ build_line_joiner(control_data))
+ for _file in control_data["pheno"])
+
+ _headers = rqtl2.read_csv_file_headers(control_data["pheno"][0],
+ control_data["pheno_transposed"],
+ control_data["sep"],
+ control_data["comment.char"])
+ def __row_to_data_items__(row):
+ return tuple(
+ {
+ "phenotype_id": row["phenotype_id"],
+ "data_id": dataidmap[row["phenotype_id"]]["data_id"],
+ "sample_name": samplename,
+ "sample_id": samples[samplename]["Id"],
+ "value": value
+ }
+ for samplename, value in row.items()
+ if samplename in samples.keys())
+
+    return save_phenotypes_data(
+        conn,
+        "PublishData",
+        (item for row in
+         (__row_to_data_items__(dict(zip(_headers, line))) for filecontent
+          in (rqtl2.read_csv_file(path) for path in phenofiles)
+          for idx, line in enumerate(filecontent)
+          if idx != 0)
+         for item in row))
-def save_phenotype_se(conn: mysqldb.Connection, dataidmap, samples, sefiles):
+
+def save_phenotype_se(
+ conn: mysqldb.Connection,
+ dataidmap: dict,
+ samples: tuple[dict, ...],
+ control_data: dict
+):
"""Read the `sefiles` and save the data in the database."""
- pass
+ sefiles = control_data["phenose"]
+ if len(sefiles) <= 0:
+ return tuple()
+ if control_data["phenose_transposed"]:
+ logger.info("Undoing transposition of the files rows and columns.")
+ sefiles = (
+ rqtl2.transpose_csv_with_rename(
+ _file,
+ build_line_splitter(control_data),
+ build_line_joiner(control_data))
+ for _file in control_data["phenose"])
-def save_phenotype_n(conn: mysqldb.Connection, dataidmap, samples, nfiles):
+ _headers = rqtl2.read_csv_file_headers(control_data["phenose"][0],
+ control_data["phenose_transposed"],
+ control_data["sep"],
+ control_data["comment.char"])
+
+ def __row_to_data_items__(row):
+ return tuple(
+ {
+ "phenotype_id": row["phenotype_id"],
+ "data_id": dataidmap[row["phenotype_id"]]["data_id"],
+ "sample_name": samplename,
+ "sample_id": samples[samplename]["Id"],
+ "error": value
+ }
+ for samplename, value in row.items()
+ if samplename in samples.keys())
+
+    return save_phenotypes_data(
+        conn,
+        "PublishSE",
+        (item for row in
+         (__row_to_data_items__(dict(zip(_headers, line))) for filecontent
+          in (rqtl2.read_csv_file(path) for path in sefiles)
+          for idx, line in enumerate(filecontent)
+          if idx != 0)
+         for item in row))
+
+
+def save_phenotype_n(
+ conn: mysqldb.Connection,
+ dataidmap: dict,
+ samples: tuple[dict, ...],
+ control_data: dict
+):
"""Read the `nfiles` and save the data in the database."""
- pass
+ sefiles = control_data["phenonum"]
+ if len(sefiles) <= 0:
+ return tuple()
+
+ if control_data["phenonum_transposed"]:
+ logger.info("Undoing transposition of the files rows and columns.")
+ sefiles = (
+ rqtl2.transpose_csv_with_rename(
+ _file,
+ build_line_splitter(control_data),
+ build_line_joiner(control_data))
+ for _file in control_data["phenonum"])
+
+ _headers = rqtl2.read_csv_file_headers(control_data["phenonum"][0],
+ control_data["phenonum_transposed"],
+ control_data["sep"],
+ control_data["comment.char"])
+
+ def __row_to_data_items__(row):
+ return tuple(
+ {
+ "phenotype_id": row["phenotype_id"],
+ "data_id": dataidmap[row["phenotype_id"]]["data_id"],
+ "sample_name": samplename,
+ "sample_id": samples[samplename]["Id"],
+ "count": value
+ }
+ for samplename, value in row.items()
+ if samplename in samples.keys())
+
+    return save_phenotypes_data(
+        conn,
+        "NStrain",
+        (item for row in
+         (__row_to_data_items__(dict(zip(_headers, line))) for filecontent
+          in (rqtl2.read_csv_file(path) for path in sefiles)
+          for idx, line in enumerate(filecontent)
+          if idx != 0)
+         for item in row))
def cross_reference_phenotypes_publications_and_data(
@@ -145,17 +272,14 @@ def load_data(conn, job):
for row in samples_by_species_and_population(
conn, species["SpeciesId"], population["PopulationId"])}
# b. Save all the data items (DataIds are vibes), return new IDs
- data = save_phenotypes_data(
- cursor, dataidmap, samples, , _control_data["pheno"])
+ data = save_pheno_data(conn, dataidmap, samples, _control_data)
# 4. Cross-reference Phenotype, Publication, and PublishData in PublishXRef
xrefs = cross_reference_phenotypes_publications_and_data(
cursor, __merge_map_with_publications__(dataidmap))
# 5. If standard errors and N exist, save them too
# (use IDs returned in `3. b.` above).
- data_se = save_phenotypes_data(
- cursor, dataidmap, samples, , _control_data["phenose"])
- data_n = save_phenotypes_n(
- cursor, dataidmap, samples, , _control_data["phenonum"])
+    data_se = save_phenotype_se(conn, dataidmap, samples, _control_data)
+    data_n = save_phenotype_n(conn, dataidmap, samples, _control_data)
# 6. If entirely new data, update authorisations (break this down)
update_auth(_user, _species, _population, _dataset, _phenos)
return 0