diff options
author | Frederick Muriuki Muriithi | 2025-05-28 16:10:11 -0500 |
---|---|---|
committer | Frederick Muriuki Muriithi | 2025-05-28 16:10:11 -0500 |
commit | d8ca7f5c916eadb40fb8e4ea55a853f46f065ae4 (patch) | |
tree | 30520773cfd9b7905eea2ac8a3631d61552aa207 | |
parent | 9f2ddd603444f9a27bdebca2fd4435c6d5bb7411 (diff) | |
download | gn-uploader-d8ca7f5c916eadb40fb8e4ea55a853f46f065ae4.tar.gz |
Pass path to bundle extraction dir, and make paths relative to it.
Pass the path where the bundle was extracted to the function doing the
work, and make the paths of the files being used relative to that
extraction directory.
-rw-r--r-- | scripts/load_phenotypes_to_db.py | 29 |
1 file changed, 15 insertions, 14 deletions
diff --git a/scripts/load_phenotypes_to_db.py b/scripts/load_phenotypes_to_db.py index 980aa94..9ba91a0 100644 --- a/scripts/load_phenotypes_to_db.py +++ b/scripts/load_phenotypes_to_db.py @@ -26,18 +26,18 @@ logger = logging.getLogger(__name__) def save_phenotypes( cursor: mysqldb.Connection, - control_data: dict[str, Any] + control_data: dict[str, Any], + filesdir: Path ) -> tuple[dict, ...]: """Read `phenofiles` and save the phenotypes therein.""" logger.info("Saving new phenotypes.") - logger.debug("Processing %s 'pheno' files.", len(phenofiles)) ## TODO: Replace with something like this: ## # phenofiles = control_data["phenocovar"] + control_data.get( # "gn-metadata", {}).get("pheno", []) # # This is meant to load (and merge) data from the "phenocovar" and # "gn-metadata -> pheno" files into a single collection of phenotypes. - phenofiles = control_data["phenocovar"] + phenofiles = tuple(filesdir.joinpath(_file) for _file in control_data["phenocovar"]) if len(phenofiles) <= 0: return tuple() @@ -48,9 +48,9 @@ def save_phenotypes( _file, build_line_splitter(control_data), build_line_joiner(control_data)) - for _file in control_data["phenocovar"]) + for _file in phenofiles) - _headers = rqtl2.read_csv_file_headers(control_data["phenocovar"][0], + _headers = rqtl2.read_csv_file_headers(phenofiles[0], control_data["phenocovar_transposed"], control_data["sep"], control_data["comment.char"]) @@ -74,10 +74,12 @@ def save_pheno_data( conn: mysqldb.Connection, dataidmap: dict, samples: tuple[dict, ...], - control_data: dict + control_data: dict, + filesdir: Path ): """Read the `datafiles` and save the data in the database.""" - phenofiles = control_data["pheno"] + phenofiles = tuple( + filesdir.joinpath(_file) for _file in control_data["pheno"]) if len(phenofiles) <= 0: return tuple() @@ -88,9 +90,9 @@ def save_pheno_data( _file, build_line_splitter(control_data), build_line_joiner(control_data)) - for _file in control_data["pheno"]) + for _file in phenofiles) - 
_headers = rqtl2.read_csv_file_headers(control_data["pheno"][0], + _headers = rqtl2.read_csv_file_headers(phenofiles[0], control_data["pheno_transposed"], control_data["sep"], control_data["comment.char"]) @@ -251,8 +253,7 @@ def load_data(conn, job): with ZipFile(str(bundle), "r") as zfile: _files = rqtl2.extract(zfile, _outdir) logger.info("Saving basic phenotype data.") - - _phenos = save_phenotypes(cursor, _control_data) + _phenos = save_phenotypes(conn, _control_data, _outdir) dataidmap = { row["phenotype_id"]: { "population_id": population["Id"], @@ -268,14 +269,14 @@ def load_data(conn, job): for row in samples_by_species_and_population( conn, species["SpeciesId"], population["PopulationId"])} # b. Save all the data items (DataIds are vibes), return new IDs - data = save_pheno_data(conn, dataidmap, samples, _control_data) + data = save_pheno_data(conn, dataidmap, samples, _control_data, _outdir) # 4. Cross-reference Phenotype, Publication, and PublishData in PublishXRef xrefs = cross_reference_phenotypes_publications_and_data( conn, tuple(dataidmap.values())) # 5. If standard errors and N exist, save them too # (use IDs returned in `3. b.` above). - data_se = save_phenotypes_se(conn, dataidmap, samples, _control_data) - data_n = save_phenotypes_n(conn, dataidmap, samples, _control_data) + data_se = save_phenotypes_se(conn, dataidmap, samples, _control_data, _outdir) + data_n = save_phenotypes_n(conn, dataidmap, samples, _control_data, _outdir) # 6. If entirely new data, update authorisations (break this down) update_auth(_user, _species, _population, _dataset, _phenos) return 0 |