author     Frederick Muriuki Muriithi  2025-05-12 13:33:56 -0500
committer  Frederick Muriuki Muriithi  2025-05-12 13:33:56 -0500
commit     eef0ed319f24946f174e9e2adee71a753df12163 (patch)
tree       7efaab1ce0345532bdc2037bdb640feb3ef4abde
parent     ff82ac219ba8be0810f4789989a7a67b0e42b877 (diff)
download   gn-uploader-eef0ed319f24946f174e9e2adee71a753df12163.tar.gz
Wireframe saving the data into the database.
-rw-r--r--  scripts/load_phenotypes_to_db.py  108
1 file changed, 107 insertions, 1 deletion
diff --git a/scripts/load_phenotypes_to_db.py b/scripts/load_phenotypes_to_db.py
index fa710a6..5c792f0 100644
--- a/scripts/load_phenotypes_to_db.py
+++ b/scripts/load_phenotypes_to_db.py
@@ -3,6 +3,8 @@ import logging
 import argparse
 from pathlib import Path
 
+from MySQLdb.cursors import Cursor, DictCursor
+
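+# NOTE: `load_data` below calls `rqtl.control_data`, but `rqtl` is not
+# imported here; it is presumably this project's R/qtl2 helper module
+# (the exact import is an assumption).
+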
 from gn_libs import jobs, mysqldb, sqlite3
 
 logging.basicConfig(
@@ -10,9 +12,90 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)
 
 
+def save_publications(cursor: Cursor, pubfiles):
+    """Read the `pubfiles` and save the publications therein."""
+    # Check for PubMed IDs, perhaps?
+    pass
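+    # A possible sketch, assuming the GeneNetwork `Publication` table and its
+    # `PubMed_ID` column (hypothetical, for illustration only):
+    #
+    #     for pub in publications:  # rows parsed from `pubfiles`
+    #         cursor.execute("SELECT Id FROM Publication WHERE PubMed_ID=%s",
+    #                        (pub["pubmed_id"],))
+    #         if cursor.fetchone() is None:
+    #             cursor.execute(
+    #                 "INSERT INTO Publication(PubMed_ID, Title, Authors, Year) "
+    #                 "VALUES(%s, %s, %s, %s)",
+    #                 (pub["pubmed_id"], pub["title"], pub["authors"], pub["year"]))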
+
+
+def save_phenotypes(cursor: Cursor, phenofiles):
+    """Read `phenofiles` and save the phenotypes therein."""
+    pass
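+    # A hypothetical sketch, assuming the GeneNetwork `Phenotype` table; the
+    # row keys here are illustrative:
+    #
+    #     cursor.executemany(
+    #         "INSERT INTO Phenotype(Original_description, Units) VALUES(%s, %s)",
+    #         tuple((pheno["description"], pheno["units"])
+    #               for pheno in phenotypes))  # rows parsed from `phenofiles`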
+
+
+def save_phenotypes_data(cursor: Cursor, dataidmap, samples, datafiles):
+    """Read the `datafiles` and save the data in the database."""
+    pass
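+    # A hypothetical sketch, assuming `PublishData(Id, StrainId, value)` where
+    # `Id` is the `data_id` computed in `dataidmap`:
+    #
+    #     cursor.executemany(
+    #         "INSERT INTO PublishData(Id, StrainId, value) VALUES(%s, %s, %s)",
+    #         tuple((dataidmap[phenoid]["data_id"], samples[name]["Id"], value)
+    #               for phenoid, name, value in data_rows))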
+
+
+def save_phenotype_se(cursor: Cursor, dataidmap, samples, sefiles):
+    """Read the `sefiles` and save the data in the database."""
+    pass
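+    # Presumably analogous to `save_phenotypes_data`, but writing to the
+    # `PublishSE(DataId, StrainId, error)` table (assumption).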
+
+
+def save_phenotype_n(cursor: Cursor, dataidmap, samples, nfiles):
+    """Read the `nfiles` and save the data in the database."""
+    pass
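+    # Presumably analogous to `save_phenotypes_data`, but writing to the
+    # `NStrain(DataId, StrainId, count)` table (assumption).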
+
+
+def cross_reference_phenotypes_publications_and_data(
+        cursor: Cursor, xref_data: tuple[dict, ...]):
+    """Crossreference the phenotypes, publication and data."""
+    pass
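+    # A hypothetical sketch, assuming the GeneNetwork `PublishXRef` table; the
+    # keys of the `xref_data` dicts are illustrative:
+    #
+    #     cursor.executemany(
+    #         "INSERT INTO PublishXRef(Id, InbredSetId, PhenotypeId, "
+    #         "PublicationId, DataId) VALUES(%s, %s, %s, %s, %s)",
+    #         tuple((xref["xref_id"], xref["population_id"], xref["phenotype_id"],
+    #                xref["publication_id"], xref["data_id"])
+    #               for xref in xref_data))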
+
+
 def load_data(conn, job):
     """Load the data attached in the given job."""
-    pass
+    with conn.cursor(cursorclass=DictCursor) as cursor:
+        # Steps
+        # 0. Read data from the files: can be multiple files per type
+        #
+        # 1. Save all new phenotypes:
+        #     -> return phenotype IDs
+        _control_data = rqtl.control_data(job["job-metadata"]["bundle-file"])
+        logger.info("Saving basic phenotype data.")
+        _phenos = save_phenotypes(cursor, _control_data["pheno"])
+        _next_data_id = fetch_next_dataid(...)
+        dataidmap = {
+            row["phenotype_id"]: {
+                "phenotype_id": row["phenotype_id"],
+                "data_id": _nextid
+            }
+            for _nextid, row in enumerate(_phenos, start=_next_data_id)
+        }
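+        # `fetch_next_dataid` is not defined in this patch; a minimal sketch
+        # (assumption, relying on the table locks taken below) might be:
+        #
+        #     cursor.execute("SELECT MAX(DataId) AS mdid FROM PublishXRef")
+        #     _next_data_id = (cursor.fetchone()["mdid"] or 0) + 1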
+        # 2. Save any new publications (in multi-file bundle):
+        #     -> return publication IDs
+        logger.info("Saving publications.")
+        # -> check whether the publication-id exists?
+        # -> perhaps setup the bundle with the appropriate publications
+        # -> gn-data -> (phenotypes-se, phenotypes-n)
+        # -> gn-metadata -> (pheno, geno)
+        publication = save_publications(
+            cursor, _control_data.get("metadata", {}).get("publications"))
+        _pubidmap = {
+            # TODO: Map the pheno ids to the publication ids
+        }
+        # 3. a. Fetch the strain names and IDs: create a name->ID map
+        samples = samples_by_species_and_population(
+            # from uploader.samples.models import samples_by_species_and_population
+            conn, species["SpeciesId"], population["PopulationId"])
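+        #       `samples` is assumed to map each sample/strain name to its
+        #       database row, e.g. {name: {"Id": ...}, ...}.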
+        #    b. Save all the data items (DataIds are assigned manually, not
+        #       auto-incremented); return the new IDs
+        data = save_phenotypes_data(
+            cursor, dataidmap, samples, _control_data["pheno"])
+        #    c. If standard errors and N exist, save them too
+        #       (use IDs returned in `b` above).
+        data_se = save_phenotype_se(
+            cursor, dataidmap, samples, _control_data["phenose"])
+        data_n = save_phenotype_n(
+            cursor, dataidmap, samples, _control_data["phenonum"])
+        # 4. Cross-reference Phenotype, Publication, and PublishData in PublishXRef
+        xrefs = cross_reference_phenotypes_publications_and_data(
+            cursor, __merge_map_with_publications__(dataidmap))
+        # 5. If entirely new data, update authorisations (break this down)
+        update_auth(_user, _species, _population, _dataset, _phenos)
+        return 0
 
 
 if __name__ == "__main__":
@@ -46,6 +129,26 @@ if __name__ == "__main__":
 
         with (mysqldb.database_connection(args.db_uri) as conn,
               sqlite3.connection(args.jobs_db_path) as jobs_conn):
+            # Lock the PublishXRef/PublishData/PublishSE/NStrain tables here. Why?
+            #     The `DataId` values are sequential but not auto-increment, and
+            #     `PublishXRef`.`DataId` cannot be converted to AUTO_INCREMENT, so
+            #     the next value must be computed with
+            #     `SELECT MAX(DataId) FROM PublishXRef;` while the locks are held,
+            #     to keep two concurrent loads from claiming the same `DataId`.
+            #     How do you check for a table lock?
+            #     https://oracle-base.com/articles/mysql/mysql-identify-locked-tables
+            #     `SHOW OPEN TABLES LIKE 'Publish%';`
+            logger.debug(
+                ("Locking database tables for the connection:"
+                 "\n\t- %s\n\t- %s\n\t- %s\n\t- %s\n"),
+                "PublishXRef",
+                "PublishData",
+                "PublishSE",
+                "NStrain")
+            # Lock the tables to avoid race conditions (go through a cursor:
+            # MySQLdb connections do not provide an `execute` method).
+            with conn.cursor() as cursor:
+                cursor.execute(
+                    "LOCK TABLES "
+                    "PublishXRef WRITE, "
+                    "PublishData WRITE, "
+                    "PublishSE WRITE, "
+                    "NStrain WRITE")
             try:
                 return load_data(conn, jobs.job(jobs_conn, args.job_id))
             except jobs.jobs.JobNotFound as _jne:
@@ -55,6 +158,9 @@ if __name__ == "__main__":
                              exc_info=True,
                              stack_info=True)
 
+            logger.debug("Unlocking all database tables.")
+            conn.execute("UNLOCK TABLES")
+
         return 1