aboutsummaryrefslogtreecommitdiff
path: root/scripts/update-case-attribute-tables-20230818
blob: 70e8c1a9e7ec2e4d3783b967c067281de1829059 (about) (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
#!/usr/bin/env python3

"""
WARNING
    This script is to be run only one time to update the schema for the
    `CaseAttribute` and the `CaseAttributeXRefNew` tables.
    Running the script more than once has no useful purpose, and will just
    litter your database schema with `BACKUP_CaseAttribute*` tables.

DESCRIPTION
    The script makes the following schema updates

    For the `CaseAttribute` table:
    * Adds `InbredSetId` column: links each case attribute to a population
    * Rename `Id` to `CaseAttributeId`: Makes it explicit what the ID is for,
      and helps simplify queries with joins against this table

    For the `CaseAttributeXRefNew` table:
    * Reorganise order of columns.

    For out of date databases (e.g. small db), the script will also:
    * Ppdate the character set for the `InbredSet` and `Strain` tables to
      utf8mb4.
    * Change the database engine for the `InbredSet` and `Strain` tables to
      InnoDB.

TABLE BACKUPS
    The script will backup the `CaseAttribute` and `CaseAttributeXRefNew` tables.
    The backup table names take the form:

      BACKUP_<tablename>_<timestr>

    Where <tablename> is one of `CaseAttribute` or `CaseAttributeXRefNew` and
    <timestr> is a string indicating the date and time the script was run (in
    local timezone for system the script is run on).

USAGE
    python3 update-case-attribute-tables-20230818 SQL_URI

    The `SQL_URI` argument is mandatory and is of the form:

      mysql://<username>:<password>@<host>[:<port>]/<database>
"""
import os
import sys
import time
import random
import traceback
from datetime import datetime
from urllib.parse import urlparse

import click
import MySQLdb as mdb
from MySQLdb.cursors import DictCursor

# from gn3.db_utils import database_connection

def convert_to_innodb(cursor, table):
    """Convert `table` to InnoDB Engine."""
    cursor.execute(f"SHOW CREATE TABLE {table};")
    res = cursor.fetchone()
    tblname, tblcreate = res["Table"], res["Create Table"]
    engine_charset = {
        key: val for key, val in
        (item.split("=") for item in
         (item for item in tblcreate.split("\n")[-1].split(" ")
          if item.startswith("ENGINE=") or item.startswith("CHARSET=")))}
    if engine_charset.get("CHARSET") != "utf8mb4":
        cursor.execute(
            f"ALTER TABLE {table} CONVERT TO CHARACTER SET utf8mb4")
    if engine_charset["ENGINE"] == "MyISAM":
        cursor.execute(f"ALTER TABLE {table} ENGINE=InnoDB")


def table_exists(cursor, table: str) -> bool:
    """Check whether a table exists."""
    cursor.execute(f"SHOW TABLES")
    return table in tuple(tuple(row.values())[0] for row in cursor.fetchall())

def table_has_field(cursor, table: str, field: str) -> bool:
    """Check whether `table` has `field`."""
    cursor.execute(f"DESC {table}")
    return field in tuple(row["Field"] for row in cursor.fetchall())

def cleanup_inbred_set_schema(cursor):
    """
    Clean up the InbredSet schema to prevent issues with ForeignKey constraints.
    """
    cursor.execute("SELECT Id, InbredSetId FROM InbredSet "
                   "WHERE InbredSetId IS NULL OR InbredSetId = ''")
    fixed_nulls =tuple({"Id": row[0], "InbredSetId": row[0]}
                       for row in cursor.fetchall())
    if len(fixed_nulls) > 0:
        cursor.executemany(
            "UPDATE InbredSet SET InbredSetId=%(InbredSetId)s "
            "WHERE Id=%(Id)s",
            fixed_nulls)

    cursor.execute("""ALTER TABLE InbredSet
    CHANGE COLUMN InbredSetId InbredSetId INT(5) UNSIGNED NOT NULL""")

def create_temp_case_attributes_table(cursor):
    """Create the `CaseAttributeTemp` table."""
    case_attribute_exists = table_exists(cursor, "CaseAttribute")
    if (not case_attribute_exists
        or (case_attribute_exists
            and not table_has_field(cursor, "CaseAttribute", "InbredSet"))):
        cursor.execute(
            """CREATE TABLE IF NOT EXISTS CaseAttributeTemp(
              InbredSetId INT(5) UNSIGNED NOT NULL,
              CaseAttributeId INT(5) UNSIGNED NOT NULL,
              Name VARCHAR(30) NOT NULL,
              Description VARCHAR(250) NOT NULL,
              -- FOREIGN KEY(InbredSetId) REFERENCES InbredSet(InbredSetId)
              --   ON DELETE RESTRICT ON UPDATE CASCADE,
              PRIMARY KEY(InbredSetId, CaseAttributeId)
            ) ENGINE=InnoDB CHARSET=utf8mb4;""")

def create_temp_case_attributes_xref_table(cursor):
    """Create the `CaseAttributeXRefNewTemp` table."""
    if table_exists(cursor, "CaseAttributeTemp"):
        cursor.execute(
            """CREATE TABLE IF NOT EXISTS CaseAttributeXRefNewTemp(
              InbredSetId INT(5) UNSIGNED NOT NULL,
              StrainId INT(20) UNSIGNED NOT NULL,
              CaseAttributeId INT(5) UNSIGNED NOT NULL,
              Value VARCHAR(100) NOT NULL,
              -- FOREIGN KEY(InbredSetId) REFERENCES InbredSet(InbredSetId)
              --   ON UPDATE CASCADE ON DELETE RESTRICT,
              -- FOREIGN KEY(StrainId) REFERENCES Strain(Id)
              --   ON UPDATE CASCADE ON DELETE RESTRICT,
              -- FOREIGN KEY (CaseAttributeId) REFERENCES CaseAttribute(CaseAttributeId)
              --   ON UPDATE CASCADE ON DELETE RESTRICT,
              PRIMARY KEY(InbredSetId, StrainId, CaseAttributeId)
            ) ENGINE=InnoDB CHARSET=utf8mb4;""")

def fetch_case_attribute_data(cursor, limit: int = 1000):
    """Fetch case attribute data."""
    offset = 0
    while True:
        cursor.execute(
            "SELECT "
            "caxrn.StrainId, caxrn.CaseAttributeId, caxrn.Value, "
            "ca.Name AS CaseAttributeName, "
            "ca.Description AS CaseAttributeDescription, iset.InbredSetId "
            "FROM "
            "CaseAttribute AS ca INNER JOIN CaseAttributeXRefNew AS caxrn "
            "ON ca.Id=caxrn.CaseAttributeId "
            "INNER JOIN "
            "StrainXRef AS sxr "
            "ON caxrn.StrainId=sxr.StrainId "
            "INNER JOIN "
            "InbredSet AS iset "
            "ON sxr.InbredSetId=iset.InbredSetId "
            "WHERE "
            "caxrn.value != 'x' "
            "AND caxrn.value IS NOT NULL "
            f"LIMIT {limit} OFFSET {offset}")
        results = cursor.fetchall()
        if len(results) <= 0:
            break
        yield results
        offset = offset + len(results)

def copy_data(cursor):
    """Copy data from existing tables into new temp tables."""
    if table_exists(cursor, "CaseAttributeTemp") and table_exists(cursor, "CaseAttributeXRefNewTemp"):
        for bunch_of_data in fetch_case_attribute_data(cursor):
            ca_data = tuple({key: val for key, val in item} for item in set(
                (("InbredSetId", item["InbredSetId"]),
                 ("StrainId", item["StrainId"]),
                 ("CaseAttributeId", item["CaseAttributeId"]),
                 ("Name", item["CaseAttributeName"]),
                 ("Description", (item["CaseAttributeDescription"]
                                  or item["CaseAttributeName"])),
                 ("Value", item["Value"]))
                for item in bunch_of_data))
            cursor.executemany(
                "INSERT INTO "
                "CaseAttributeTemp("
                "InbredSetId, CaseAttributeId, Name, Description) "
                "VALUES("
                "%(InbredSetId)s, %(CaseAttributeId)s, %(Name)s, "
                "%(Description)s) "
                "ON DUPLICATE KEY UPDATE Name=VALUES(Name)",
                ca_data)
            cursor.executemany(
                "INSERT INTO "
                "CaseAttributeXRefNewTemp("
                "InbredSetId, StrainId, CaseAttributeId, Value) "
                "VALUES("
                "%(InbredSetId)s, %(StrainId)s, %(CaseAttributeId)s, %(Value)s) "
                "ON DUPLICATE KEY UPDATE `Value`=VALUES(`Value`)",
                bunch_of_data)
            time.sleep(random.randint(5, 20))

def rename_table(cursor, table, newtable):
    """Rename `table` to `newtable`."""
    cursor.execute(f"ALTER TABLE {table} RENAME TO {newtable}")

def parse_db_url(sql_uri: str) -> tuple:
    """function to parse SQL_URI env variable note:there\
    is a default value for SQL_URI so a tuple result is\
    always expected"""
    parsed_db = urlparse(sql_uri)
    return (
        parsed_db.hostname, parsed_db.username, parsed_db.password,
        parsed_db.path[1:], parsed_db.port)

@click.command(help="Update DB schema for Case-Attributes")
@click.argument("sql_uri")
def main(sql_uri: str) -> None:
    # for innodb: `SET autocommit=0` to prevent releasing locks immediately
    # after next commit.
    # See https://dev.mysql.com/doc/refman/8.0/en/lock-tables.html
    host, user, passwd, db_name, port = parse_db_url(sql_uri)
    conn = mdb.connect(
        db=db_name, user=user, passwd=passwd or '', host=host, port=port or 3306)
    try:
        cursor = conn.cursor(cursorclass=DictCursor)
        convert_to_innodb(cursor, "InbredSet")
        convert_to_innodb(cursor, "Strain")
        cleanup_inbred_set_schema(cursor)
        create_temp_case_attributes_table(cursor)
        create_temp_case_attributes_xref_table(cursor)
        copy_data(cursor)
        timestr = datetime.now().isoformat().replace(
            "-", "").replace(":", "_").replace(".", "__")
        rename_table(cursor, "CaseAttribute", f"BACKUP_CaseAttribute_{timestr}")
        rename_table(cursor, "CaseAttributeXRefNew", f"BACKUP_CaseAttributeXRefNew_{timestr}")
        rename_table(cursor, "CaseAttributeTemp", "CaseAttribute")
        rename_table(cursor, "CaseAttributeXRefNewTemp", "CaseAttributeXRefNew")
        conn.commit()
    except Exception as _exc:
        print(traceback.format_exc(), file=sys.stderr)
        conn.rollback()
    finally:
        conn.close()

if __name__ == "__main__":
    # pylint: disable=no-value-for-parameter
    main()