aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--scripts/load_phenotypes_to_db.py29
1 file changed, 15 insertions, 14 deletions
diff --git a/scripts/load_phenotypes_to_db.py b/scripts/load_phenotypes_to_db.py
index 980aa94..9ba91a0 100644
--- a/scripts/load_phenotypes_to_db.py
+++ b/scripts/load_phenotypes_to_db.py
@@ -26,18 +26,18 @@ logger = logging.getLogger(__name__)
def save_phenotypes(
cursor: mysqldb.Connection,
- control_data: dict[str, Any]
+ control_data: dict[str, Any],
+ filesdir: Path
) -> tuple[dict, ...]:
"""Read `phenofiles` and save the phenotypes therein."""
logger.info("Saving new phenotypes.")
- logger.debug("Processing %s 'pheno' files.", len(phenofiles))
## TODO: Replace with something like this: ##
# phenofiles = control_data["phenocovar"] + control_data.get(
# "gn-metadata", {}).get("pheno", [])
#
# This is meant to load (and merge) data from the "phenocovar" and
# "gn-metadata -> pheno" files into a single collection of phenotypes.
- phenofiles = control_data["phenocovar"]
+ phenofiles = tuple(filesdir.joinpath(_file) for _file in control_data["phenocovar"])
if len(phenofiles) <= 0:
return tuple()
@@ -48,9 +48,9 @@ def save_phenotypes(
_file,
build_line_splitter(control_data),
build_line_joiner(control_data))
- for _file in control_data["phenocovar"])
+ for _file in phenofiles)
- _headers = rqtl2.read_csv_file_headers(control_data["phenocovar"][0],
+ _headers = rqtl2.read_csv_file_headers(phenofiles[0],
control_data["phenocovar_transposed"],
control_data["sep"],
control_data["comment.char"])
@@ -74,10 +74,12 @@ def save_pheno_data(
conn: mysqldb.Connection,
dataidmap: dict,
samples: tuple[dict, ...],
- control_data: dict
+ control_data: dict,
+ filesdir: Path
):
"""Read the `datafiles` and save the data in the database."""
- phenofiles = control_data["pheno"]
+ phenofiles = tuple(
+ filesdir.joinpath(_file) for _file in control_data["pheno"])
if len(phenofiles) <= 0:
return tuple()
@@ -88,9 +90,9 @@ def save_pheno_data(
_file,
build_line_splitter(control_data),
build_line_joiner(control_data))
- for _file in control_data["pheno"])
+ for _file in phenofiles)
- _headers = rqtl2.read_csv_file_headers(control_data["pheno"][0],
+ _headers = rqtl2.read_csv_file_headers(phenofiles[0],
control_data["pheno_transposed"],
control_data["sep"],
control_data["comment.char"])
@@ -251,8 +253,7 @@ def load_data(conn, job):
with ZipFile(str(bundle), "r") as zfile:
_files = rqtl2.extract(zfile, _outdir)
logger.info("Saving basic phenotype data.")
-
- _phenos = save_phenotypes(cursor, _control_data)
+ _phenos = save_phenotypes(conn, _control_data, _outdir)
dataidmap = {
row["phenotype_id"]: {
"population_id": population["Id"],
@@ -268,14 +269,14 @@ def load_data(conn, job):
for row in samples_by_species_and_population(
conn, species["SpeciesId"], population["PopulationId"])}
# b. Save all the data items (DataIds are vibes), return new IDs
- data = save_pheno_data(conn, dataidmap, samples, _control_data)
+ data = save_pheno_data(conn, dataidmap, samples, _control_data, _outdir)
# 4. Cross-reference Phenotype, Publication, and PublishData in PublishXRef
xrefs = cross_reference_phenotypes_publications_and_data(
conn, tuple(dataidmap.values()))
# 5. If standard errors and N exist, save them too
# (use IDs returned in `3. b.` above).
- data_se = save_phenotypes_se(conn, dataidmap, samples, _control_data)
- data_n = save_phenotypes_n(conn, dataidmap, samples, _control_data)
+ data_se = save_phenotypes_se(conn, dataidmap, samples, _control_data, _outdir)
+ data_n = save_phenotypes_n(conn, dataidmap, samples, _control_data, _outdir)
# 6. If entirely new data, update authorisations (break this down)
update_auth(_user, _species, _population, _dataset, _phenos)
return 0