diff options
-rw-r--r-- | scripts/load_phenotypes_to_db.py | 29 |
1 file changed, 15 insertions, 14 deletions
diff --git a/scripts/load_phenotypes_to_db.py b/scripts/load_phenotypes_to_db.py index 980aa94..9ba91a0 100644 --- a/scripts/load_phenotypes_to_db.py +++ b/scripts/load_phenotypes_to_db.py @@ -26,18 +26,18 @@ logger = logging.getLogger(__name__) def save_phenotypes( cursor: mysqldb.Connection, - control_data: dict[str, Any] + control_data: dict[str, Any], + filesdir: Path ) -> tuple[dict, ...]: """Read `phenofiles` and save the phenotypes therein.""" logger.info("Saving new phenotypes.") - logger.debug("Processing %s 'pheno' files.", len(phenofiles)) ## TODO: Replace with something like this: ## # phenofiles = control_data["phenocovar"] + control_data.get( # "gn-metadata", {}).get("pheno", []) # # This is meant to load (and merge) data from the "phenocovar" and # "gn-metadata -> pheno" files into a single collection of phenotypes. - phenofiles = control_data["phenocovar"] + phenofiles = tuple(filesdir.joinpath(_file) for _file in control_data["phenocovar"]) if len(phenofiles) <= 0: return tuple() @@ -48,9 +48,9 @@ def save_phenotypes( _file, build_line_splitter(control_data), build_line_joiner(control_data)) - for _file in control_data["phenocovar"]) + for _file in phenofiles) - _headers = rqtl2.read_csv_file_headers(control_data["phenocovar"][0], + _headers = rqtl2.read_csv_file_headers(phenofiles[0], control_data["phenocovar_transposed"], control_data["sep"], control_data["comment.char"]) @@ -74,10 +74,12 @@ def save_pheno_data( conn: mysqldb.Connection, dataidmap: dict, samples: tuple[dict, ...], - control_data: dict + control_data: dict, + filesdir: Path ): """Read the `datafiles` and save the data in the database.""" - phenofiles = control_data["pheno"] + phenofiles = tuple( + filesdir.joinpath(_file) for _file in control_data["pheno"]) if len(phenofiles) <= 0: return tuple() @@ -88,9 +90,9 @@ def save_pheno_data( _file, build_line_splitter(control_data), build_line_joiner(control_data)) - for _file in control_data["pheno"]) + for _file in phenofiles) - 
_headers = rqtl2.read_csv_file_headers(control_data["pheno"][0], + _headers = rqtl2.read_csv_file_headers(phenofiles[0], control_data["pheno_transposed"], control_data["sep"], control_data["comment.char"]) @@ -251,8 +253,7 @@ def load_data(conn, job): with ZipFile(str(bundle), "r") as zfile: _files = rqtl2.extract(zfile, _outdir) logger.info("Saving basic phenotype data.") - - _phenos = save_phenotypes(cursor, _control_data) + _phenos = save_phenotypes(conn, _control_data, _outdir) dataidmap = { row["phenotype_id"]: { "population_id": population["Id"], @@ -268,14 +269,14 @@ def load_data(conn, job): for row in samples_by_species_and_population( conn, species["SpeciesId"], population["PopulationId"])} # b. Save all the data items (DataIds are vibes), return new IDs - data = save_pheno_data(conn, dataidmap, samples, _control_data) + data = save_pheno_data(conn, dataidmap, samples, _control_data, _outdir) # 4. Cross-reference Phenotype, Publication, and PublishData in PublishXRef xrefs = cross_reference_phenotypes_publications_and_data( conn, tuple(dataidmap.values())) # 5. If standard errors and N exist, save them too # (use IDs returned in `3. b.` above). - data_se = save_phenotypes_se(conn, dataidmap, samples, _control_data) - data_n = save_phenotypes_n(conn, dataidmap, samples, _control_data) + data_se = save_phenotypes_se(conn, dataidmap, samples, _control_data, _outdir) + data_n = save_phenotypes_n(conn, dataidmap, samples, _control_data, _outdir) # 6. If entirely new data, update authorisations (break this down) update_auth(_user, _species, _population, _dataset, _phenos) return 0 |