diff options
-rw-r--r-- | scripts/load_phenotypes_to_db.py | 154 |
1 files changed, 139 insertions, 15 deletions
diff --git a/scripts/load_phenotypes_to_db.py b/scripts/load_phenotypes_to_db.py
index 82324fc..4d90291 100644
--- a/scripts/load_phenotypes_to_db.py
+++ b/scripts/load_phenotypes_to_db.py
@@ -10,6 +10,7 @@
 from MySQLdb.cursors import Cursor, DictCursor
 from gn_libs import jobs, mysqldb, sqlite3
 from r_qtl import r_qtl2 as rqtl2
+from uploader.phenotypes.models import save_phenotypes_data
 from uploader.publications.models import create_new_publications
 from uploader.samples.models import samples_by_species_and_population
@@ -63,9 +64,9 @@ def save_phenotypes(
     if control_data["phenocovar_transposed"]:
         logger.info("Undoing transposition of the files rows and columns.")
         phenofiles = (
-            transpose_csv_with_rename(
-                _file
-                build_line_splitter(control_data)
+            rqtl2.transpose_csv_with_rename(
+                _file,
+                build_line_splitter(control_data),
                 build_line_joiner(control_data))
             for _file in control_data["phenocovar"])
@@ -89,19 +90,145 @@ def __fetch_next_dataid__(conn: mysqldb.Connection) -> int:
         return int(cursor.fetchone()) + 1
 
 
-def save_phenotypes_data(conn: mysqldb.Connection, dataidmap, samples, datafiles):
+def save_pheno_data(
+        conn: mysqldb.Connection,
+        dataidmap: dict,
+        samples: dict,
+        control_data: dict
+):
     """Read the `datafiles` and save the data in the database."""
-    pass
+    phenofiles = control_data["pheno"]
+    if len(phenofiles) <= 0:
+        return tuple()
+
+    if control_data["pheno_transposed"]:
+        logger.info("Undoing transposition of the files rows and columns.")
+        phenofiles = (
+            rqtl2.transpose_csv_with_rename(
+                _file,
+                build_line_splitter(control_data),
+                build_line_joiner(control_data))
+            for _file in control_data["pheno"])
+
+    _headers = rqtl2.read_csv_file_headers(control_data["pheno"][0],
+                                           control_data["pheno_transposed"],
+                                           control_data["sep"],
+                                           control_data["comment.char"])
+
+    def __row_to_data_items__(row):
+        # NOTE(review): assumes each data row carries a "phenotype_id"
+        # column present in `dataidmap` — confirm against the pheno layout.
+        return tuple(
+            {
+                "phenotype_id": row["phenotype_id"],
+                "data_id": dataidmap[row["phenotype_id"]]["data_id"],
+                "sample_name": samplename,
+                "sample_id": samples[samplename]["Id"],
+                "value": value
+            }
+            for samplename, value in row.items()
+            if samplename in samples.keys())
+
+    return save_phenotypes_data(
+        conn,
+        "PublishData",
+        (item for item in
+         (__row_to_data_items__(dict(zip(_headers, line))) for filecontent
+          in (rqtl2.read_csv_file(path) for path in phenofiles)
+          for idx, line in enumerate(filecontent)
+          if idx != 0)))
+
 
-def save_phenotype_se(conn: mysqldb.Connection, dataidmap, samples, sefiles):
+def save_phenotype_se(
+        conn: mysqldb.Connection,
+        dataidmap: dict,
+        samples: dict,
+        control_data: dict
+):
     """Read the `sefiles` and save the data in the database."""
-    pass
+    sefiles = control_data["phenose"]
+    if len(sefiles) <= 0:
+        return tuple()
+
+    if control_data["phenose_transposed"]:
+        logger.info("Undoing transposition of the files rows and columns.")
+        sefiles = (
+            rqtl2.transpose_csv_with_rename(
+                _file,
+                build_line_splitter(control_data),
+                build_line_joiner(control_data))
+            for _file in control_data["phenose"])
+
+    _headers = rqtl2.read_csv_file_headers(control_data["phenose"][0],
+                                           control_data["phenose_transposed"],
+                                           control_data["sep"],
+                                           control_data["comment.char"])
+
+    def __row_to_data_items__(row):
+        return tuple(
+            {
+                "phenotype_id": row["phenotype_id"],
+                "data_id": dataidmap[row["phenotype_id"]]["data_id"],
+                "sample_name": samplename,
+                "sample_id": samples[samplename]["Id"],
+                "error": value
+            }
+            for samplename, value in row.items()
+            if samplename in samples.keys())
+
+    return save_phenotypes_data(
+        conn,
+        "PublishSE",
+        (item for item in
+         (__row_to_data_items__(dict(zip(_headers, line))) for filecontent
+          in (rqtl2.read_csv_file(path) for path in sefiles)
+          for idx, line in enumerate(filecontent)
+          if idx != 0)))
+
 
-def save_phenotype_n(conn: mysqldb.Connection, dataidmap, samples, nfiles):
+def save_phenotype_n(
+        conn: mysqldb.Connection,
+        dataidmap: dict,
+        samples: dict,
+        control_data: dict
+):
     """Read the `nfiles` and save the data in the database."""
-    pass
+    nfiles = control_data["phenonum"]
+    if len(nfiles) <= 0:
+        return tuple()
+
+    if control_data["phenonum_transposed"]:
+        logger.info("Undoing transposition of the files rows and columns.")
+        nfiles = (
+            rqtl2.transpose_csv_with_rename(
+                _file,
+                build_line_splitter(control_data),
+                build_line_joiner(control_data))
+            for _file in control_data["phenonum"])
+
+    _headers = rqtl2.read_csv_file_headers(control_data["phenonum"][0],
+                                           control_data["phenonum_transposed"],
+                                           control_data["sep"],
+                                           control_data["comment.char"])
+
+    def __row_to_data_items__(row):
+        return tuple(
+            {
+                "phenotype_id": row["phenotype_id"],
+                "data_id": dataidmap[row["phenotype_id"]]["data_id"],
+                "sample_name": samplename,
+                "sample_id": samples[samplename]["Id"],
+                "count": value
+            }
+            for samplename, value in row.items()
+            if samplename in samples.keys())
+
+    return save_phenotypes_data(
+        conn,
+        "NStrain",
+        (item for item in
+         (__row_to_data_items__(dict(zip(_headers, line))) for filecontent
+          in (rqtl2.read_csv_file(path) for path in nfiles)
+          for idx, line in enumerate(filecontent)
+          if idx != 0)))
 
 
 def cross_reference_phenotypes_publications_and_data(
@@ -145,17 +272,14 @@ def load_data(conn, job):
         for row in samples_by_species_and_population(
             conn, species["SpeciesId"], population["PopulationId"])}
     # b. Save all the data items (DataIds are vibes), return new IDs
-    data = save_phenotypes_data(
-        cursor, dataidmap, samples, , _control_data["pheno"])
+    data = save_pheno_data(conn, dataidmap, samples, _control_data)
     # 4. Cross-reference Phenotype, Publication, and PublishData in PublishXRef
     xrefs = cross_reference_phenotypes_publications_and_data(
         cursor, __merge_map_with_publications__(dataidmap))
     # 5. If standard errors and N exist, save them too
     # (use IDs returned in `3. b.` above).
-    data_se = save_phenotypes_data(
-        cursor, dataidmap, samples, , _control_data["phenose"])
-    data_n = save_phenotypes_n(
-        cursor, dataidmap, samples, , _control_data["phenonum"])
+    data_se = save_phenotype_se(conn, dataidmap, samples, _control_data)
+    data_n = save_phenotype_n(conn, dataidmap, samples, _control_data)
     # 6. If entirely new data, update authorisations (break this down)
     update_auth(_user, _species, _population, _dataset, _phenos)
     return 0