 scripts/load_phenotypes_to_db.py | 108 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 107 insertions(+), 1 deletion(-)
diff --git a/scripts/load_phenotypes_to_db.py b/scripts/load_phenotypes_to_db.py
index fa710a6..5c792f0 100644
--- a/scripts/load_phenotypes_to_db.py
+++ b/scripts/load_phenotypes_to_db.py
@@ -3,6 +3,8 @@ import logging
import argparse
from pathlib import Path
+from MySQLdb.cursors import Cursor, DictCursor
+
+# Assumed import paths for the helpers used below; adjust to the actual
+# modules they live in.
+from r_qtl import r_qtl2 as rqtl
+from uploader.samples.models import samples_by_species_and_population
+
from gn_libs import jobs, mysqldb, sqlite3
logging.basicConfig(
@@ -10,9 +12,90 @@ logging.basicConfig(
logger = logging.getLogger(__name__)
+def save_publications(cursor: Cursor, pubfiles):
+ """Read the `pubfiles` and save the publications therein."""
+ # Check for PubMed IDs, perhaps?
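+    # (e.g. skip or reuse records whose PubMed ID already exists in the
+    # `Publication` table; exact column names to be confirmed)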
+ pass
+
+
+def save_phenotypes(cursor: Cursor, phenofiles):
+ """Read `phenofiles` and save the phenotypes therein."""
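+    # Expected to return the saved phenotype records with their new IDs;
+    # `load_data` below builds `dataidmap` from the returned rows.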
+ pass
+
+
+def save_phenotypes_data(cursor: Cursor, dataidmap, samples, datafiles):
+ """Read the `datafiles` and save the data in the database."""
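+    # Presumably saves (DataId, StrainId, value) rows into `PublishData`;
+    # exact table/columns to be confirmed.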
+ pass
+
+
+def save_phenotype_se(cursor: Cursor, dataidmap, samples, sefiles):
+ """Read the `sefiles` and save the data in the database."""
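+    # Presumably saves the standard errors into `PublishSE`.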
+ pass
+
+
+def save_phenotype_n(cursor: Cursor, dataidmap, samples, nfiles):
+ """Read the `nfiles` and save the data in the database."""
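+    # Presumably saves the sample counts into `NStrain`.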
+ pass
+
+
+def cross_reference_phenotypes_publications_and_data(
+ cursor: Cursor, xref_data: tuple[dict, ...]):
+    """Cross-reference the phenotypes, publications and data."""
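+    # i.e. insert rows into `PublishXRef` tying each phenotype to its
+    # publication and its data id (step 4 in `load_data` below).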
+ pass
+
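+
+# A minimal sketch, not part of the original code: the name is taken from the
+# call in `load_data` below, and the body assumes the `SELECT MAX(DataId)
+# FROM PublishXRef;` approach described in the locking notes in `__main__`.
+def fetch_next_dataid(cursor: DictCursor) -> int:
+    """Return the next available `DataId` value."""
+    # `DataId` is sequential but not AUTO_INCREMENT, hence MAX(...)+1,
+    # computed while the tables are locked.
+    cursor.execute("SELECT MAX(DataId) AS last_data_id FROM PublishXRef")
+    return int(cursor.fetchone()["last_data_id"] or 0) + 1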
+
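+
+# A hedged sketch of the merge step used in `load_data` below: attach each
+# phenotype's publication id to its `dataidmap` entry. The two-argument
+# signature and the "publication_id" key are assumptions.
+def __merge_map_with_publications__(
+        dataidmap: dict, pubidmap: dict) -> tuple[dict, ...]:
+    """Merge `dataidmap` entries with their publication IDs."""
+    return tuple({**row, "publication_id": pubidmap.get(phenoid)}
+                 for phenoid, row in dataidmap.items())
+
+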
def load_data(conn, job):
"""Load the data attached in the given job."""
- pass
+ with conn.cursor(cursorclass=DictCursor) as cursor:
+ # Steps
+ # 0. Read data from the files: can be multiple files per type
+ #
+ # 1. Save all new phenotypes:
+ # -> return phenotype IDs
+ _control_data = rqtl.control_data(job["job-metadata"]["bundle-file"])
+ logger.info("Saving basic phenotype data.")
+ _phenos = save_phenotypes(cursor, _control_data["pheno"])
+        _next_data_id = fetch_next_dataid(cursor)
+ dataidmap = {
+ row["phenotype_id"]: {
+ "phenotype_id": row["phenotype_id"],
+ "data_id": _nextid
+ }
+ for _nextid, row in enumerate(_phenos, start=_next_data_id)
+ }
+ # 2. Save any new publications (in multi-file bundle):
+        #    -> return publication IDs
+ logger.info("Saving publications.")
+ # -> check whether the publication-id exists?
+ # -> perhaps setup the bundle with the appropriate publications
+ # -> gn-data -> (phenotypes-se, phenotypes-n)
+ # -> gn-metadata -> (pheno, geno)
+ publication = save_publications(
+ cursor, _control_data.get("metadata", {}).get("publications"))
+ _pubidmap = {
+ # TODO: Map the pheno ids to the publication ids
+ }
+        # 3. a. Fetch the strain names and IDs: create a name-to-ID map
+        #       NOTE: `species` and `population` are not yet defined here;
+        #       presumably they will be looked up from the job metadata.
+        samples = samples_by_species_and_population(
+            conn, species["SpeciesId"], population["PopulationId"])
+        # b. Save all the data items (DataIds are sequential, not
+        #    auto-increment); return the new IDs
+        data = save_phenotypes_data(
+            cursor, dataidmap, samples, _control_data["pheno"])
+ # c. If standard errors and N exist, save them too
+ # (use IDs returned in `b` above).
+        data_se = save_phenotype_se(
+            cursor, dataidmap, samples, _control_data["phenose"])
+        data_n = save_phenotype_n(
+            cursor, dataidmap, samples, _control_data["phenonum"])
+ # 4. Cross-reference Phenotype, Publication, and PublishData in PublishXRef
+        xrefs = cross_reference_phenotypes_publications_and_data(
+            cursor, __merge_map_with_publications__(dataidmap, _pubidmap))
+ # 5. If entirely new data, update authorisations (break this down)
+ update_auth(_user, _species, _population, _dataset, _phenos)
+ return 0
+
+ return 1
if __name__ == "__main__":
@@ -46,6 +129,26 @@ if __name__ == "__main__":
with (mysqldb.database_connection(args.db_uri) as conn,
sqlite3.connection(args.jobs_db_path) as jobs_conn):
+    # Lock PublishXRef/PublishData/PublishSE/NStrain here. Why?
+    # The `DataId` values are sequential but not auto-increment, and
+    # `PublishXRef`.`DataId` cannot be converted to AUTO_INCREMENT; the next
+    # id comes from `SELECT MAX(DataId) FROM PublishXRef;`, so the locks
+    # prevent two concurrent loaders from claiming the same id.
+    # How do you check for a table lock?
+    # https://oracle-base.com/articles/mysql/mysql-identify-locked-tables
+    # `SHOW OPEN TABLES LIKE 'Publish%';`
+ logger.debug(
+ ("Locking database tables for the connection:"
+ "\n\t- %s\n\t- %s\n\t- %s\n\t- %s\n"),
+ "PublishXRef",
+ "PublishData",
+ "PublishSE",
+ "NStrain")
+    with conn.cursor() as cursor:
+        # Lock the tables to avoid race conditions. (MySQLdb connections do
+        # not expose `execute()`; statements go through a cursor.)
+        cursor.execute(
+            "LOCK TABLES "
+            "PublishXRef WRITE, "
+            "PublishData WRITE, "
+            "PublishSE WRITE, "
+            "NStrain WRITE")
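+    # Optional sanity check (a sketch): in the output of SHOW OPEN TABLES,
+    # `In_use` > 0 marks tables currently locked by some connection.
+    with conn.cursor(cursorclass=DictCursor) as cursor:
+        cursor.execute("SHOW OPEN TABLES LIKE 'Publish%'")
+        logger.debug("Open tables: %s", cursor.fetchall())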
try:
return load_data(conn, jobs.job(jobs_conn, args.job_id))
except jobs.jobs.JobNotFound as _jne:
@@ -55,6 +158,9 @@ if __name__ == "__main__":
exc_info=True,
stack_info=True)
+    # NOTE: on the success path `load_data` returns from inside the `try`
+    # above, so we only reach this point on failure; the locks are released
+    # anyway when the connection closes.
+    logger.debug("Unlocking all database tables.")
+    with conn.cursor() as cursor:
+        cursor.execute("UNLOCK TABLES")
+
return 1