diff options
-rw-r--r-- | scripts/load_phenotypes_to_db.py | 108 |
1 files changed, 54 insertions, 54 deletions
diff --git a/scripts/load_phenotypes_to_db.py b/scripts/load_phenotypes_to_db.py
index d48084e..753494b 100644
--- a/scripts/load_phenotypes_to_db.py
+++ b/scripts/load_phenotypes_to_db.py
@@ -38,28 +38,31 @@ def save_publications(
     return _publications
 
 
-def save_phenotypes(cursor: Cursor, phenofiles):
+def save_phenotypes(
+        conn: mysqldb.Connection,
+        control_data: dict[str, Any]
+) -> tuple[dict, ...]:
     """Read `phenofiles` and save the phenotypes therein."""
     pass
 
 
-def save_phenotypes_data(cursor: Cursor, dataidmap, samples, datafiles):
+def save_phenotypes_data(conn: mysqldb.Connection, dataidmap, samples, datafiles):
     """Read the `datafiles` and save the data in the database."""
     pass
 
 
-def save_phenotype_se(cursor: Cursor, dataidmap, samples, sefiles):
+def save_phenotype_se(conn: mysqldb.Connection, dataidmap, samples, sefiles):
     """Read the `sefiles` and save the data in the database."""
     pass
 
 
-def save_phenotype_n(cursor: Cursor, dataidmap, samples, nfiles):
+def save_phenotype_n(conn: mysqldb.Connection, dataidmap, samples, nfiles):
     """Read the `nfiles` and save the data in the database."""
     pass
 
 
 def cross_reference_phenotypes_publications_and_data(
-        cursor: Cursor, xref_data: tuple[dict, ...]):
+        conn: mysqldb.Connection, xref_data: tuple[dict, ...]):
     """Crossreference the phenotypes, publication and data."""
     pass
 
@@ -67,56 +70,53 @@ def cross_reference_phenotypes_publications_and_data(
 def load_data(conn, job):
     """Load the data attached in the given job."""
     _job_metadata = json.loads(job["job-metadata"])
-    with conn.cursor(cursorclass=DictCursor) as cursor:
-        # Steps
-        # 0. Read data from the files: can be multiple files per type
-        #
-        # 1. Save all new phenotypes:
-        #    -> return phenotype IDs
-        _control_data = rqtl.control_data(job["job-metadata"]["bundle-file"])
-        logger.info("Saving basic phenotype data.")
-
-        _phenos = save_phenotypes(cursor, _control_data)
-        _next_data_id = fetch_next_dataid(...)
-        dataidmap = {
-            row["phenotype_id"]: {
-                "phenotype_id": row["phenotype_id"],
-                "data_id": _nextid
-            }
-            for _nextid, row in enumerate(_phenos, start=_next_data_id)
+    # Steps
+    # 0. Read data from the files: can be multiple files per type
+    #
+    # 1. Save all new phenotypes:
+    #    -> return phenotype IDs
+    _control_data = rqtl.control_data(_job_metadata["bundle-file"])
+    logger.info("Saving basic phenotype data.")
+
+    _phenos = save_phenotypes(conn, _control_data)
+    _next_data_id = fetch_next_dataid(...)
+    dataidmap = {
+        row["phenotype_id"]: {
+            "phenotype_id": row["phenotype_id"],
+            "data_id": _nextid
         }
-        # 2. Save any new publications (in multi-file bundle):
-        #    -> return publication IDS
-        publications = publications + save_publications(
-            cursor,
-            _control_data.get(
-                "metadata", {}).get(
-                    "publications"),
-            _job_metadata.get("publicationid"))
-        _pubidmap = {
-            # TODO: Map the pheno ids to the publication ids
-        }
-        # 3. a. Fetch the strain names and IDS: create name->ID map
-        samples = samples_by_species_and_population(
-            # from uploader.samples.models import samples_by_species_and_population
-            conn, species["SpeciesId"], population["PopulationId"])
-        #    b. Save all the data items (DataIds are vibes), return new IDs
-        data = save_phenotypes_data(
-            cursor, dataidmap, samples, , _control_data["pheno"])
-        #    c. If standard errors and N exist, save them too
-        #       (use IDs returned in `b` above).
-        data_se = save_phenotypes_data(
-            cursor, dataidmap, samples, , _control_data["phenose"])
-        data_n = save_phenotypes_n(
-            cursor, dataidmap, samples, , _control_data["phenonum"])
-        # 4. Cross-reference Phenotype, Publication, and PublishData in PublishXRef
-        xrefs = cross_reference_phenotypes_publications_and_data(
-            cursor, __merge_map_with_publications__(dataidmap))
-        # 5. If entirely new data, update authorisations (break this down)
-        update_auth(_user, _species, _population, _dataset, _phenos)
-        return 0
-
-    return 1
+        for _nextid, row in enumerate(_phenos, start=_next_data_id)
+    }
+    # 2. Save any new publications (in multi-file bundle):
+    #    -> return publication IDS
+    publications = save_publications(
+        conn,
+        _control_data.get(
+            "metadata", {}).get(
+                "publications"),
+        _job_metadata.get("publicationid"))
+    _pubidmap = {
+        # TODO: Map the pheno ids to the publication ids
+    }
+    # 3. a. Fetch the strain names and IDS: create name->ID map
+    samples = samples_by_species_and_population(
+        # from uploader.samples.models import samples_by_species_and_population
+        conn, species["SpeciesId"], population["PopulationId"])
+    #    b. Save all the data items (DataIds are vibes), return new IDs
+    data = save_phenotypes_data(
+        conn, dataidmap, samples, _control_data["pheno"])
+    #    c. If standard errors and N exist, save them too
+    #       (use IDs returned in `b` above).
+    data_se = save_phenotype_se(
+        conn, dataidmap, samples, _control_data["phenose"])
+    data_n = save_phenotype_n(
+        conn, dataidmap, samples, _control_data["phenonum"])
+    # 4. Cross-reference Phenotype, Publication, and PublishData in PublishXRef
+    xrefs = cross_reference_phenotypes_publications_and_data(
+        conn, __merge_map_with_publications__(dataidmap))
+    # 5. If entirely new data, update authorisations (break this down)
+    update_auth(_user, _species, _population, _dataset, _phenos)
+    return 0
 
 
 if __name__ == "__main__":