diff options
author | Frederick Muriuki Muriithi | 2025-08-25 10:00:38 -0500 |
---|---|---|
committer | Frederick Muriuki Muriithi | 2025-08-25 10:00:38 -0500 |
commit | 87186314c2431381390595063487eb6a2718a1e4 (patch) | |
tree | bedf42caaacb2bf75855f1ff6646025064053c3c | |
parent | 148c71a817fc3507594bb07bae770c4498cb7d78 (diff) | |
download | gn-uploader-87186314c2431381390595063487eb6a2718a1e4.tar.gz |
Create and cross-reference phenotypes
Update `create_new_phenotypes` function to create the phenotypes and cross-reference them to their populations, publications, and groups in one transaction to ensure consistency.
-rw-r--r-- | uploader/phenotypes/models.py | 135 |
1 file changed, 110 insertions, 25 deletions
def create_new_phenotypes(conn: mdb.Connection,
                          population_id: int,
                          publication_id: int,
                          phenotypes: Iterable[dict]) -> tuple[dict, ...]:
    """Add entirely new phenotypes to the database.

    Creates the `Phenotype` rows and cross-references them (via
    `PublishXRef`) to their population and publication in one pass, in
    batches of 1000.

    WARNING: Not thread-safe — the next IDs are computed from `MAX(...)`
    queries and incremented in Python, so concurrent callers could allocate
    the same IDs.  For extra safety, ensure the following tables are locked
    for `WRITE` before calling:
      - PublishXRef
      - Phenotype

    :param conn: Open database connection.
    :param population_id: `InbredSet` (population) the phenotypes belong to.
    :param publication_id: `Publication` the phenotypes are linked to.
    :param phenotypes: Iterable of raw phenotype rows to insert.
    :returns: Tuple of parameter dicts describing the inserted phenotypes.
    :raises ValueError: if any phenotype's pre-publication abbreviation
        already exists for the given population.
    """
    _phenos = tuple()
    with conn.cursor(cursorclass=DictCursor) as cursor:
        def make_next_id(idcol, table):
            """Return a generator function yielding IDs after MAX(idcol)."""
            cursor.execute(f"SELECT MAX({idcol}) AS last_id FROM {table}")
            # MAX(...) is NULL on an empty table — start counting from zero
            # in that case rather than crashing on int(None).
            _last_id = int(cursor.fetchone()["last_id"] or 0)

            def __next_id__():
                _next_id = _last_id + 1
                while True:
                    yield _next_id
                    _next_id = _next_id + 1

            return __next_id__

        ### Bottleneck: Everything below makes this function not ###
        ### thread-safe because we have to retrieve the last IDs from ###
        ### the database and increment those to compute the next IDs. ###
        ### This is an unfortunate result from the current schema that ###
        ### has a cross-reference table that requires that a phenotype ###
        ### be linked to an existing publication, and have data IDs to ###
        ### link to that phenotype's data. ###
        ### The fact that the IDs are sequential also compounds the ###
        ### bottleneck. ###
        __next_xref_id__ = make_next_id("Id", "PublishXRef")()
        __next_pheno_id__ = make_next_id("Id", "Phenotype")()
        __next_data_id__ = make_next_id("DataId", "PublishXRef")()

        def __build_params_and_prepubabbrevs__(acc, row):
            """Fold one raw row into (insert-params, abbreviations)."""
            processed = __pre_process_phenotype_data__(row)
            return (
                acc[0] + ({
                    **processed,
                    "population_id": population_id,
                    "publication_id": publication_id,
                    "phenotype_id": next(__next_pheno_id__),
                    "xref_id": next(__next_xref_id__),
                    "data_id": next(__next_data_id__)
                },),
                acc[1] + (processed["pre_publication_abbreviation"],))

        while True:
            batch = take(phenotypes, 1000)
            if len(batch) == 0:
                break
            params, abbrevs = reduce(__build_params_and_prepubabbrevs__,
                                     batch,
                                     (tuple(), tuple()))
            # Check for uniqueness for all "Pre_publication_abbreviation"
            # values within this population before inserting anything.
            abbrevs_paramsstr = ", ".join(["%s"] * len(abbrevs))
            _query = ("SELECT PublishXRef.PhenotypeId, Phenotype.* "
                      "FROM PublishXRef "
                      "INNER JOIN Phenotype "
                      "ON PublishXRef.PhenotypeId=Phenotype.Id "
                      "WHERE PublishXRef.InbredSetId=%s "
                      "AND Phenotype.Pre_publication_abbreviation IN "
                      f"({abbrevs_paramsstr})")
            cursor.execute(_query,
                           ((population_id,) + abbrevs))
            existing = tuple(row["Pre_publication_abbreviation"]
                             for row in cursor.fetchall())
            if len(existing) > 0:
                # BUGFIX: the previous code relied on adjacent string
                # literals next to `"\n\t".join(...)`; Python concatenates
                # the literals FIRST, making the entire message prefix the
                # join separator.  Use an explicit `+` so the message reads
                # as intended.  Also narrowed `Exception` to `ValueError`.
                raise ValueError(
                    "Found already existing phenotypes with the following "
                    "'Pre-publication abbreviations':\n\t"
                    + "\n\t".join(f"* {item}" for item in existing))

            cursor.executemany(
                (
                    "INSERT INTO "
                    "Phenotype("
                    "Id, "
                    "Pre_publication_description, "
                    "Post_publication_description, "
                    "Original_description, "
                    "Units, "
                    "Pre_publication_abbreviation, "
                    "Post_publication_abbreviation, "
                    "Authorized_Users"
                    ")"
                    "VALUES ("
                    "%(phenotype_id)s, "
                    "%(pre_publication_description)s, "
                    "%(post_publication_description)s, "
                    "%(original_description)s, "
                    "%(units)s, "
                    "%(pre_publication_abbreviation)s, "
                    "%(post_publication_abbreviation)s, "
                    "'robwilliams'"
                    ")"),
                params)
            _comments = f"Created at {datetime.now().isoformat()}"
            # Parameterize the comment like every other value instead of
            # interpolating it into the SQL with an f-string.
            cursor.executemany(
                ("INSERT INTO PublishXRef("
                 "Id, "
                 "InbredSetId, "
                 "PhenotypeId, "
                 "PublicationId, "
                 "DataId, "
                 "comments"
                 ")"
                 "VALUES("
                 "%(xref_id)s, "
                 "%(population_id)s, "
                 "%(phenotype_id)s, "
                 "%(publication_id)s, "
                 "%(data_id)s, "
                 "%(comments)s"
                 ")"),
                tuple({**param, "comments": _comments} for param in params))
            _phenos = _phenos + params
    return _phenos