#!/usr/bin/env python3 """ WARNING This script is to be run only one time to update the schema for the `CaseAttribute` and the `CaseAttributeXRefNew` tables. Running the script more than once has no useful purpose, and will just litter your database schema with `BACKUP_CaseAttribute*` tables. DESCRIPTION The script makes the following schema updates For the `CaseAttribute` table: * Adds `InbredSetId` column: links each case attribute to a population * Rename `Id` to `CaseAttributeId`: Makes it explicit what the ID is for, and helps simplify queries with joins against this table For the `CaseAttributeXRefNew` table: * Reorganise order of columns. For out of date databases (e.g. small db), the script will also: * Ppdate the character set for the `InbredSet` and `Strain` tables to utf8mb4. * Change the database engine for the `InbredSet` and `Strain` tables to InnoDB. TABLE BACKUPS The script will backup the `CaseAttribute` and `CaseAttributeXRefNew` tables. The backup table names take the form: BACKUP_<tablename>_<timestr> Where <tablename> is one of `CaseAttribute` or `CaseAttributeXRefNew` and <timestr> is a string indicating the date and time the script was run (in local timezone for system the script is run on). USAGE python3 update-case-attribute-tables-20230818 SQL_URI The `SQL_URI` argument is mandatory and is of the form: mysql://<username>:<password>@<host>[:<port>]/<database> """ import os import sys import time import random import traceback from datetime import datetime from urllib.parse import urlparse import click import MySQLdb as mdb from MySQLdb.cursors import DictCursor # from gn3.db_utils import database_connection def convert_to_innodb(cursor, table): """Convert `table` to InnoDB Engine.""" cursor.execute(f"SHOW CREATE TABLE {table};") res = cursor.fetchone() tblname, tblcreate = res["Table"], res["Create Table"] engine_charset = { key: val for key, val in (item.split("=") for item in (item for item in tblcreate.split("\n")[-1].split(" ") if item.startswith("ENGINE=") or item.startswith("CHARSET=")))} if engine_charset.get("CHARSET") != "utf8mb4": cursor.execute( f"ALTER TABLE {table} CONVERT TO CHARACTER SET utf8mb4") if engine_charset["ENGINE"] == "MyISAM": cursor.execute(f"ALTER TABLE {table} ENGINE=InnoDB") def table_exists(cursor, table: str) -> bool: """Check whether a table exists.""" cursor.execute(f"SHOW TABLES") return table in tuple(tuple(row.values())[0] for row in cursor.fetchall()) def table_has_field(cursor, table: str, field: str) -> bool: """Check whether `table` has `field`.""" cursor.execute(f"DESC {table}") return field in tuple(row["Field"] for row in cursor.fetchall()) def cleanup_inbred_set_schema(cursor): """ Clean up the InbredSet schema to prevent issues with ForeignKey constraints. """ cursor.execute("SELECT Id, InbredSetId FROM InbredSet " "WHERE InbredSetId IS NULL OR InbredSetId = ''") fixed_nulls =tuple({"Id": row[0], "InbredSetId": row[0]} for row in cursor.fetchall()) if len(fixed_nulls) > 0: cursor.executemany( "UPDATE InbredSet SET InbredSetId=%(InbredSetId)s " "WHERE Id=%(Id)s", fixed_nulls) cursor.execute("""ALTER TABLE InbredSet CHANGE COLUMN InbredSetId InbredSetId INT(5) UNSIGNED NOT NULL""") def create_temp_case_attributes_table(cursor): """Create the `CaseAttributeTemp` table.""" case_attribute_exists = table_exists(cursor, "CaseAttribute") if (not case_attribute_exists or (case_attribute_exists and not table_has_field(cursor, "CaseAttribute", "InbredSet"))): cursor.execute( """CREATE TABLE IF NOT EXISTS CaseAttributeTemp( InbredSetId INT(5) UNSIGNED NOT NULL, CaseAttributeId INT(5) UNSIGNED NOT NULL, Name VARCHAR(30) NOT NULL, Description VARCHAR(250) NOT NULL, -- FOREIGN KEY(InbredSetId) REFERENCES InbredSet(InbredSetId) -- ON DELETE RESTRICT ON UPDATE CASCADE, PRIMARY KEY(InbredSetId, CaseAttributeId) ) ENGINE=InnoDB CHARSET=utf8mb4;""") def create_temp_case_attributes_xref_table(cursor): """Create the `CaseAttributeXRefNewTemp` table.""" if table_exists(cursor, "CaseAttributeTemp"): cursor.execute( """CREATE TABLE IF NOT EXISTS CaseAttributeXRefNewTemp( InbredSetId INT(5) UNSIGNED NOT NULL, StrainId INT(20) UNSIGNED NOT NULL, CaseAttributeId INT(5) UNSIGNED NOT NULL, Value VARCHAR(100) NOT NULL, -- FOREIGN KEY(InbredSetId) REFERENCES InbredSet(InbredSetId) -- ON UPDATE CASCADE ON DELETE RESTRICT, -- FOREIGN KEY(StrainId) REFERENCES Strain(Id) -- ON UPDATE CASCADE ON DELETE RESTRICT, -- FOREIGN KEY (CaseAttributeId) REFERENCES CaseAttribute(CaseAttributeId) -- ON UPDATE CASCADE ON DELETE RESTRICT, PRIMARY KEY(InbredSetId, StrainId, CaseAttributeId) ) ENGINE=InnoDB CHARSET=utf8mb4;""") def fetch_case_attribute_data(cursor, limit: int = 1000): """Fetch case attribute data.""" offset = 0 while True: cursor.execute( "SELECT " "caxrn.StrainId, caxrn.CaseAttributeId, caxrn.Value, " "ca.Name AS CaseAttributeName, " "ca.Description AS CaseAttributeDescription, iset.InbredSetId " "FROM " "CaseAttribute AS ca INNER JOIN CaseAttributeXRefNew AS caxrn " "ON ca.Id=caxrn.CaseAttributeId " "INNER JOIN " "StrainXRef AS sxr " "ON caxrn.StrainId=sxr.StrainId " "INNER JOIN " "InbredSet AS iset " "ON sxr.InbredSetId=iset.InbredSetId " "WHERE " "caxrn.value != 'x' " "AND caxrn.value IS NOT NULL " f"LIMIT {limit} OFFSET {offset}") results = cursor.fetchall() if len(results) <= 0: break yield results offset = offset + len(results) def copy_data(cursor): """Copy data from existing tables into new temp tables.""" if table_exists(cursor, "CaseAttributeTemp") and table_exists(cursor, "CaseAttributeXRefNewTemp"): for bunch_of_data in fetch_case_attribute_data(cursor): ca_data = tuple({key: val for key, val in item} for item in set( (("InbredSetId", item["InbredSetId"]), ("StrainId", item["StrainId"]), ("CaseAttributeId", item["CaseAttributeId"]), ("Name", item["CaseAttributeName"]), ("Description", (item["CaseAttributeDescription"] or item["CaseAttributeName"])), ("Value", item["Value"])) for item in bunch_of_data)) cursor.executemany( "INSERT INTO " "CaseAttributeTemp(" "InbredSetId, CaseAttributeId, Name, Description) " "VALUES(" "%(InbredSetId)s, %(CaseAttributeId)s, %(Name)s, " "%(Description)s) " "ON DUPLICATE KEY UPDATE Name=VALUES(Name)", ca_data) cursor.executemany( "INSERT INTO " "CaseAttributeXRefNewTemp(" "InbredSetId, StrainId, CaseAttributeId, Value) " "VALUES(" "%(InbredSetId)s, %(StrainId)s, %(CaseAttributeId)s, %(Value)s) " "ON DUPLICATE KEY UPDATE `Value`=VALUES(`Value`)", bunch_of_data) time.sleep(random.randint(5, 20)) def rename_table(cursor, table, newtable): """Rename `table` to `newtable`.""" cursor.execute(f"ALTER TABLE {table} RENAME TO {newtable}") def parse_db_url(sql_uri: str) -> tuple: """function to parse SQL_URI env variable note:there\ is a default value for SQL_URI so a tuple result is\ always expected""" parsed_db = urlparse(sql_uri) return ( parsed_db.hostname, parsed_db.username, parsed_db.password, parsed_db.path[1:], parsed_db.port) @click.command(help="Update DB schema for Case-Attributes") @click.argument("sql_uri") def main(sql_uri: str) -> None: # for innodb: `SET autocommit=0` to prevent releasing locks immediately # after next commit. # See https://dev.mysql.com/doc/refman/8.0/en/lock-tables.html host, user, passwd, db_name, port = parse_db_url(sql_uri) conn = mdb.connect( db=db_name, user=user, passwd=passwd or '', host=host, port=port or 3306) try: cursor = conn.cursor(cursorclass=DictCursor) convert_to_innodb(cursor, "InbredSet") convert_to_innodb(cursor, "Strain") cleanup_inbred_set_schema(cursor) create_temp_case_attributes_table(cursor) create_temp_case_attributes_xref_table(cursor) copy_data(cursor) timestr = datetime.now().isoformat().replace( "-", "").replace(":", "_").replace(".", "__") rename_table(cursor, "CaseAttribute", f"BACKUP_CaseAttribute_{timestr}") rename_table(cursor, "CaseAttributeXRefNew", f"BACKUP_CaseAttributeXRefNew_{timestr}") rename_table(cursor, "CaseAttributeTemp", "CaseAttribute") rename_table(cursor, "CaseAttributeXRefNewTemp", "CaseAttributeXRefNew") conn.commit() except Exception as _exc: print(traceback.format_exc(), file=sys.stderr) conn.rollback() finally: conn.close() if __name__ == "__main__": # pylint: disable=no-value-for-parameter main()