aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--scripts/load_phenotypes_to_db.py40
-rw-r--r--uploader/phenotypes/models.py56
2 files changed, 76 insertions, 20 deletions
diff --git a/scripts/load_phenotypes_to_db.py b/scripts/load_phenotypes_to_db.py
index de06d70..d2d1d2c 100644
--- a/scripts/load_phenotypes_to_db.py
+++ b/scripts/load_phenotypes_to_db.py
@@ -18,7 +18,8 @@ from uploader.samples.models import samples_by_species_and_population
from uploader.phenotypes.models import (
dataset_by_id,
save_phenotypes_data,
- create_new_phenotypes)
+ create_new_phenotypes,
+ quick_save_phenotypes_data)
from uploader.publications.models import (
create_new_publications,
fetch_publication_by_id)
@@ -157,16 +158,33 @@ def save_numeric_data(
build_line_joiner(control_data))
for _file in phenofiles)
- return save_phenotypes_data(
- conn,
- table,
- __build_dataitems__(
- filetype,
- phenofiles,
- control_data,
- samples,
- dataidmap,
- pheno_name2id))
+ try:
+ logger.debug("Attempt quick save with `LOAD … INFILE`.")
+ return quick_save_phenotypes_data(
+ conn,
+ table,
+ __build_dataitems__(
+ filetype,
+ phenofiles,
+ control_data,
+ samples,
+ dataidmap,
+ pheno_name2id),
+ filesdir)
+ except Exception as _exc:
+ logger.debug("Could not use `LOAD … INFILE`, using raw query",
+ exc_info=True)
+            # NOTE(review): removed stray debug artifact `import time;time.sleep(60)`
+ return save_phenotypes_data(
+ conn,
+ table,
+ __build_dataitems__(
+ filetype,
+ phenofiles,
+ control_data,
+ samples,
+ dataidmap,
+ pheno_name2id))
save_pheno_data = partial(save_numeric_data,
diff --git a/uploader/phenotypes/models.py b/uploader/phenotypes/models.py
index e47a016..20b8e77 100644
--- a/uploader/phenotypes/models.py
+++ b/uploader/phenotypes/models.py
@@ -1,5 +1,7 @@
"""Database and utility functions for phenotypes."""
import logging
+import tempfile
+from pathlib import Path
from functools import reduce
from datetime import datetime
from typing import Optional, Iterable
@@ -13,6 +15,16 @@ from gn_libs.mysqldb import debug_query
logger = logging.getLogger(__name__)
+__PHENO_DATA_TABLES__ = {
+ "PublishData": {
+ "table": "PublishData", "valueCol": "value", "DataIdCol": "Id"},
+ "PublishSE": {
+ "table": "PublishSE", "valueCol": "error", "DataIdCol": "DataId"},
+ "NStrain": {
+ "table": "NStrain", "valueCol": "count", "DataIdCol": "DataId"}
+}
+
+
def datasets_by_population(
conn: mdb.Connection,
species_id: int,
@@ -326,14 +338,7 @@ def save_phenotypes_data(
data: Iterable[dict]
) -> int:
"""Save new phenotypes data into the database."""
- _table_details = {
- "PublishData": {
- "table": "PublishData", "valueCol": "value", "DataIdCol": "Id"},
- "PublishSE": {
- "table": "PublishSE", "valueCol": "error", "DataIdCol": "DataId"},
- "NStrain": {
- "table": "PublishData", "valueCol": "count", "DataIdCol": "DataId"}
- }[table]
+ _table_details = __PHENO_DATA_TABLES__[table]
with conn.cursor(cursorclass=DictCursor) as cursor:
_count = 0
while True:
@@ -347,7 +352,7 @@ def save_phenotypes_data(
(f"INSERT INTO {_table_details['table']}"
f"({_table_details['DataIdCol']}, StrainId, {_table_details['valueCol']}) "
"VALUES "
- f"(%(data_id)s, %(sample_id)s, %({_table_details['valueCol']})s) "),
+ f"(%(data_id)s, %(sample_id)s, %(value)s) "),
tuple(batch))
debug_query(cursor, logger)
_count = _count + len(batch)
@@ -355,3 +360,36 @@ def save_phenotypes_data(
logger.debug("Saved a total of %s data rows", _count)
return _count
+
+
+def quick_save_phenotypes_data(
+ conn: mdb.Connection,
+ table: str,
+ dataitems: Iterable[dict],
+ tmpdir: Path
+) -> int:
+ """Save data items to the database, but using """
+ _table_details = __PHENO_DATA_TABLES__[table]
+ with (tempfile.NamedTemporaryFile(
+ prefix=f"{table}_data", mode="wt", dir=tmpdir) as tmpfile,
+ conn.cursor(cursorclass=DictCursor) as cursor):
+ _count = 0
+        logger.debug("Write data rows to text file.")
+ for row in dataitems:
+ tmpfile.write(
+ f'{row["data_id"]}\t{row["sample_id"]}\t{row["value"]}\n')
+ _count = _count + 1
+ tmpfile.flush()
+
+        logger.debug("Load text file into database (table: %s)",
+                     _table_details["table"])
+ cursor.execute(
+ f"LOAD DATA LOCAL INFILE '{tmpfile.name}' "
+ f"INTO TABLE {_table_details['table']} "
+ "("
+ f"{_table_details['DataIdCol']}, "
+ "StrainId, "
+ f"{_table_details['valueCol']}"
+ ")")
+ debug_query(cursor, logger)
+ return _count