From d8ca7f5c916eadb40fb8e4ea55a853f46f065ae4 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 28 May 2025 16:10:11 -0500 Subject: Pass path to bundle extraction dir, and make paths relative to it. Pass the path where the bundle was extracted to the function doing the work. Make the paths for the files being used be relative to the extraction directory thus passed. --- scripts/load_phenotypes_to_db.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'scripts') diff --git a/scripts/load_phenotypes_to_db.py b/scripts/load_phenotypes_to_db.py index 980aa94..9ba91a0 100644 --- a/scripts/load_phenotypes_to_db.py +++ b/scripts/load_phenotypes_to_db.py @@ -26,18 +26,18 @@ logger = logging.getLogger(__name__) def save_phenotypes( cursor: mysqldb.Connection, - control_data: dict[str, Any] + control_data: dict[str, Any], + filesdir: Path ) -> tuple[dict, ...]: """Read `phenofiles` and save the phenotypes therein.""" logger.info("Saving new phenotypes.") - logger.debug("Processing %s 'pheno' files.", len(phenofiles)) ## TODO: Replace with something like this: ## # phenofiles = control_data["phenocovar"] + control_data.get( # "gn-metadata", {}).get("pheno", []) # # This is meant to load (and merge) data from the "phenocovar" and # "gn-metadata -> pheno" files into a single collection of phenotypes. - phenofiles = control_data["phenocovar"] + phenofiles = tuple(filesdir.joinpath(_file) for _file in control_data["phenocovar"]) if len(phenofiles) <= 0: return tuple() @@ -48,9 +48,9 @@ def save_phenotypes( _file, build_line_splitter(control_data), build_line_joiner(control_data)) - for _file in control_data["phenocovar"]) + for _file in phenofiles) - _headers = rqtl2.read_csv_file_headers(control_data["phenocovar"][0], + _headers = rqtl2.read_csv_file_headers(phenofiles[0], control_data["phenocovar_transposed"], control_data["sep"], control_data["comment.char"]) @@ -74,10 +74,12 @@ def save_pheno_data( conn: mysqldb.Connection, dataidmap: dict, samples: tuple[dict, ...], - control_data: dict + control_data: dict, + filesdir: Path ): """Read the `datafiles` and save the data in the database.""" - phenofiles = control_data["pheno"] + phenofiles = tuple( + filesdir.joinpath(_file) for file in control_data["pheno"]) if len(phenofiles) <= 0: return tuple() @@ -88,9 +90,9 @@ def save_pheno_data( _file, build_line_splitter(control_data), build_line_joiner(control_data)) - for _file in control_data["pheno"]) + for _file in phenofiles) - _headers = rqtl2.read_csv_file_headers(control_data["pheno"][0], + _headers = rqtl2.read_csv_file_headers(phenofiles[0], control_data["pheno_transposed"], control_data["sep"], control_data["comment.char"]) @@ -251,8 +253,7 @@ def load_data(conn, job): with ZipFile(str(bundle), "r") as zfile: _files = rqtl2.extract(zfile, _outdir) logger.info("Saving basic phenotype data.") - - _phenos = save_phenotypes(cursor, _control_data) + _phenos = save_phenotypes(conn, _control_data, _outdir) dataidmap = { row["phenotype_id"]: { "population_id": population["Id"], @@ -268,14 +269,14 @@ def load_data(conn, job): for row in samples_by_species_and_population( conn, species["SpeciesId"], population["PopulationId"])} # b. Save all the data items (DataIds are vibes), return new IDs - data = save_pheno_data(conn, dataidmap, samples, _control_data) + data = save_pheno_data(conn, dataidmap, samples, _control_data, _outdir) # 4. Cross-reference Phenotype, Publication, and PublishData in PublishXRef xrefs = cross_reference_phenotypes_publications_and_data( conn, tuple(dataidmap.values())) # 5. If standard errors and N exist, save them too # (use IDs returned in `3. b.` above). - data_se = save_phenotypes_se(conn, dataidmap, samples, _control_data) - data_n = save_phenotypes_n(conn, dataidmap, samples, _control_data) + data_se = save_phenotypes_se(conn, dataidmap, samples, _control_data, _outdir) + data_n = save_phenotypes_n(conn, dataidmap, samples, _control_data, _outdir) # 6. If entirely new data, update authorisations (break this down) update_auth(_user, _species, _population, _dataset, _phenos) return 0 -- cgit 1.4.1