diff options
-rw-r--r-- | scripts/load_phenotypes_to_db.py | 40 | ||||
-rw-r--r-- | uploader/phenotypes/models.py | 56 |
2 files changed, 76 insertions, 20 deletions
diff --git a/scripts/load_phenotypes_to_db.py b/scripts/load_phenotypes_to_db.py index de06d70..d2d1d2c 100644 --- a/scripts/load_phenotypes_to_db.py +++ b/scripts/load_phenotypes_to_db.py @@ -18,7 +18,8 @@ from uploader.samples.models import samples_by_species_and_population from uploader.phenotypes.models import ( dataset_by_id, save_phenotypes_data, - create_new_phenotypes) + create_new_phenotypes, + quick_save_phenotypes_data) from uploader.publications.models import ( create_new_publications, fetch_publication_by_id) @@ -157,16 +158,33 @@ def save_numeric_data( build_line_joiner(control_data)) for _file in phenofiles) - return save_phenotypes_data( - conn, - table, - __build_dataitems__( - filetype, - phenofiles, - control_data, - samples, - dataidmap, - pheno_name2id)) + try: + logger.debug("Attempt quick save with `LOAD … INFILE`.") + return quick_save_phenotypes_data( + conn, + table, + __build_dataitems__( + filetype, + phenofiles, + control_data, + samples, + dataidmap, + pheno_name2id), + filesdir) + except Exception as _exc: + logger.debug("Could not use `LOAD … INFILE`, using raw query", + exc_info=True) + import time;time.sleep(60) + return save_phenotypes_data( + conn, + table, + __build_dataitems__( + filetype, + phenofiles, + control_data, + samples, + dataidmap, + pheno_name2id)) save_pheno_data = partial(save_numeric_data, diff --git a/uploader/phenotypes/models.py b/uploader/phenotypes/models.py index e47a016..20b8e77 100644 --- a/uploader/phenotypes/models.py +++ b/uploader/phenotypes/models.py @@ -1,5 +1,7 @@ """Database and utility functions for phenotypes.""" import logging +import tempfile +from pathlib import Path from functools import reduce from datetime import datetime from typing import Optional, Iterable @@ -13,6 +15,16 @@ from gn_libs.mysqldb import debug_query logger = logging.getLogger(__name__) +__PHENO_DATA_TABLES__ = { + "PublishData": { + "table": "PublishData", "valueCol": "value", "DataIdCol": "Id"}, + "PublishSE": { + "table": "PublishSE", "valueCol": "error", "DataIdCol": "DataId"}, + "NStrain": { + "table": "NStrain", "valueCol": "count", "DataIdCol": "DataId"} +} + + def datasets_by_population( conn: mdb.Connection, species_id: int, @@ -326,14 +338,7 @@ def save_phenotypes_data( data: Iterable[dict] ) -> int: """Save new phenotypes data into the database.""" - _table_details = { - "PublishData": { - "table": "PublishData", "valueCol": "value", "DataIdCol": "Id"}, - "PublishSE": { - "table": "PublishSE", "valueCol": "error", "DataIdCol": "DataId"}, - "NStrain": { - "table": "PublishData", "valueCol": "count", "DataIdCol": "DataId"} - }[table] + _table_details = __PHENO_DATA_TABLES__[table] with conn.cursor(cursorclass=DictCursor) as cursor: _count = 0 while True: @@ -347,7 +352,7 @@ def save_phenotypes_data( (f"INSERT INTO {_table_details['table']}" f"({_table_details['DataIdCol']}, StrainId, {_table_details['valueCol']}) " "VALUES " - f"(%(data_id)s, %(sample_id)s, %({_table_details['valueCol']})s) "), + f"(%(data_id)s, %(sample_id)s, %(value)s) "), tuple(batch)) debug_query(cursor, logger) _count = _count + len(batch) @@ -355,3 +360,36 @@ def save_phenotypes_data( logger.debug("Saved a total of %s data rows", _count) return _count + + +def quick_save_phenotypes_data( + conn: mdb.Connection, + table: str, + dataitems: Iterable[dict], + tmpdir: Path +) -> int: + """Save data items to the database, but using """ + _table_details = __PHENO_DATA_TABLES__[table] + with (tempfile.NamedTemporaryFile( + prefix=f"{table}_data", mode="wt", dir=tmpdir) as tmpfile, + conn.cursor(cursorclass=DictCursor) as cursor): + _count = 0 + console.debug("Write data rows to text file.") + for row in dataitems: + tmpfile.write( + f'{row["data_id"]}\t{row["sample_id"]}\t{row["value"]}\n') + _count = _count + 1 + tmpfile.flush() + + console.debug("Load text file into database (table: %s)", + _table_details["table"]) + cursor.execute( + f"LOAD DATA LOCAL INFILE '{tmpfile.name}' " + f"INTO TABLE {_table_details['table']} " + "(" + f"{_table_details['DataIdCol']}, " + "StrainId, " + f"{_table_details['valueCol']}" + ")") + debug_query(cursor, logger) + return _count |