about summary refs log tree commit diff
diff options
context:
space:
mode:
authorFrederick Muriuki Muriithi2025-08-25 10:00:38 -0500
committerFrederick Muriuki Muriithi2025-08-25 10:00:38 -0500
commit87186314c2431381390595063487eb6a2718a1e4 (patch)
treebedf42caaacb2bf75855f1ff6646025064053c3c
parent148c71a817fc3507594bb07bae770c4498cb7d78 (diff)
downloadgn-uploader-87186314c2431381390595063487eb6a2718a1e4.tar.gz
Create and cross-reference phenotypes
Update `create_new_phenotypes` function to create the phenotypes and
cross-reference them to their populations, publications, and groups in
one transaction to ensure consistency.
-rw-r--r--uploader/phenotypes/models.py135
1 files changed, 110 insertions, 25 deletions
diff --git a/uploader/phenotypes/models.py b/uploader/phenotypes/models.py
index deaf991..b30f624 100644
--- a/uploader/phenotypes/models.py
+++ b/uploader/phenotypes/models.py
@@ -322,41 +322,126 @@ def __pre_process_phenotype_data__(row):
 
 
 def create_new_phenotypes(conn: mdb.Connection,
+                          population_id: int,
+                          publication_id: int,
                           phenotypes: Iterable[dict]) -> tuple[dict, ...]:
-    """Add entirely new phenotypes to the database."""
+    """Add entirely new phenotypes to the database. WARNING: Not thread-safe."""
     _phenos = tuple()
     with conn.cursor(cursorclass=DictCursor) as cursor:
+        def make_next_id(idcol, table):
+            cursor.execute(f"SELECT MAX({idcol}) AS last_id FROM {table}")
+            _last_id = int(cursor.fetchone()["last_id"])
+            def __next_id__():
+                _next_id = _last_id + 1
+                while True:
+                    yield _next_id
+                    _next_id = _next_id + 1
+
+            return __next_id__
+
+        ### Bottleneck: Everything below makes this function not         ###
+        ###   thread-safe because we have to retrieve the last IDs from  ###
+        ###   the database and increment those to compute the next IDs.  ###
+        ###   This is an unfortunate result from the current schema that ###
+        ###   has a cross-reference table that requires that a phenotype ###
+        ###   be linked to an existing publication, and have data IDs to ###
+        ###   link to that phenotype's data.                             ###
+        ###   The fact that the IDs are sequential also compounds the    ###
+        ###   bottleneck.                                                ###
+        ###
+        ###   For extra safety, ensure the following tables are locked   ###
+        ###   for `WRITE`:                                               ###
+        ###   - PublishXRef                                              ###
+        ###   - Phenotype                                                ###
+        ###   - PublishXRef                                              ###
+        __next_xref_id = make_next_id("Id", "PublishXRef")()
+        __next_pheno_id__ = make_next_id("Id", "Phenotype")()
+        __next_data_id__ = make_next_id("DataId", "PublishXRef")()
+
+        def __build_params_and_prepubabbrevs__(acc, row):
+            processed = __pre_process_phenotype_data__(row)
+            return (
+                acc[0] + ({
+                    **processed,
+                    "population_id": population_id,
+                    "publication_id": publication_id,
+                    "phenotype_id": next(__next_pheno_id__),
+                    "xref_id": next(__next_xref_id),
+                    "data_id": next(__next_data_id__)
+                },),
+                acc[1] + (processed["pre_publication_abbreviation"],))
         while True:
             batch = take(phenotypes, 1000)
             if len(batch) == 0:
                 break
 
+            params, abbrevs = reduce(__build_params_and_prepubabbrevs__,
+                                     batch,
+                                     (tuple(), tuple()))
+            # Check for uniqueness for all "Pre_publication_description" values
+            abbrevs_paramsstr = ", ".join(["%s"] * len(abbrevs))
+            _query = ("SELECT PublishXRef.PhenotypeId, Phenotype.* "
+                      "FROM PublishXRef "
+                      "INNER JOIN Phenotype "
+                      "ON PublishXRef.PhenotypeId=Phenotype.Id "
+                      "WHERE PublishXRef.InbredSetId=%s "
+                      "AND Phenotype.Pre_publication_abbreviation IN "
+                      f"({abbrevs_paramsstr})")
+            cursor.execute(_query,
+                           ((population_id,) + abbrevs))
+            existing = tuple(row["Pre_publication_abbreviation"]
+                             for row in cursor.fetchall())
+            if len(existing) > 0:
+                # Narrow this exception, perhaps?
+                raise Exception(
+                    "Found already existing phenotypes with the following "
+                    "'Pre-publication abbreviations':\n\t"
+                    "\n\t".join(f"* {item}" for item in existing))
+
+            cursor.executemany(
+                (
+                    "INSERT INTO "
+                    "Phenotype("
+                    "Id, "
+                    "Pre_publication_description, "
+                    "Post_publication_description, "
+                    "Original_description, "
+                    "Units, "
+                    "Pre_publication_abbreviation, "
+                    "Post_publication_abbreviation, "
+                    "Authorized_Users"
+                    ")"
+                    "VALUES ("
+                    "%(phenotype_id)s, "
+                    "%(pre_publication_description)s, "
+                    "%(post_publication_description)s, "
+                    "%(original_description)s, "
+                    "%(units)s, "
+                    "%(pre_publication_abbreviation)s, "
+                    "%(post_publication_abbreviation)s, "
+                    "'robwilliams'"
+                    ")"),
+                params)
+            _comments = f"Created at {datetime.now().isoformat()}"
             cursor.executemany(
-                ("INSERT INTO "
-                 "Phenotype("
-                 "Pre_publication_description, Post_publication_description, "
-                 "Original_description, Units, Pre_publication_abbreviation, "
-                 "Post_publication_abbreviation, Authorized_Users"
+                ("INSERT INTO PublishXRef("
+                 "Id, "
+                 "InbredSetId, "
+                 "PhenotypeId, "
+                 "PublicationId, "
+                 "DataId, "
+                 "comments"
                  ")"
-                 "VALUES (%s, %s, %s, %s, %s, %s, 'robwilliams')"),
-                tuple((row["description"],
-                       row["description"],
-                       row["description"],
-                       row["units"],
-                       row["id"],
-                       row["id"])
-                      for row in batch))
-            paramstr = ", ".join(["%s"] * len(batch))
-            cursor.execute(
-                "SELECT * FROM Phenotype WHERE Pre_publication_abbreviation IN "
-                f"({paramstr})",
-                tuple(item["id"] for item in batch))
-            _phenos = _phenos + tuple({
-                "phenotype_id": row["Id"],
-                "id": row["Pre_publication_abbreviation"],
-                "description": row["Original_description"],
-                "units": row["Units"]
-            } for row in cursor.fetchall())
+                 "VALUES("
+                 "%(xref_id)s, "
+                 "%(population_id)s, "
+                 "%(phenotype_id)s, "
+                 "%(publication_id)s, "
+                 "%(data_id)s, "
+                 f"'{_comments}'"
+                 ")"),
+                params)
+            _phenos = _phenos + params
 
     return _phenos