From 73be3e40455e106d061ad13928bf66840bf314ed Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Tue, 18 May 2021 03:33:49 -0400
Subject: Notes on SQL query

---
 docs/performance/slow-probesetdata-sql.org | 318 +++++++++++++++++++++++++++++
 1 file changed, 318 insertions(+)
 create mode 100644 docs/performance/slow-probesetdata-sql.org

(limited to 'docs')

diff --git a/docs/performance/slow-probesetdata-sql.org b/docs/performance/slow-probesetdata-sql.org
new file mode 100644
index 0000000..85858d1
--- /dev/null
+++ b/docs/performance/slow-probesetdata-sql.org
@@ -0,0 +1,318 @@
+* Slow ProbeSetData SQL query
+
+** The case
+
+We were looking to optimize a query for correlations. On Penguin2 (standard
+drives, RAID5) the query took 1 hour. On Tux01 (solid state NVME) it took 6
+minutes. Adding an index for StrainId (see explorations below) reduced that
+query to 3 minutes - which is kinda acceptable. The real problem, however is
+that this is a quadratic search - so it will get worse quickly - and we need
+to solve it.
+
+So what does ProbeSetData contain?
+
+#+BEGIN_SRC SQL
+select * from ProbeSetData limit 5;
++----+----------+-------+
+| Id | StrainId | value |
++----+----------+-------+
+|  1 |        1 | 5.742 |
+|  1 |        2 | 5.006 |
+|  1 |        3 | 6.079 |
+|  1 |        4 | 6.414 |
+|  1 |        5 | 4.885 |
++----+----------+-------+
+#+END_SRC
+
+You can see Id is sectioned in the file (and there are not that many Ids) but
+StrainId is *distributed* through the database file and some 'StrainIds' match
+many data points. Id stands for Dataset and StrainId really means Measurement(!)
+
+Our query looked for 1,236,088 measurement distributed over a 53Gb file (and
+an even larger index file). Turns out the full table is read many many times
+over for one particular query pivoting on strainid...
+
+We have the following options:
+
+1. Reorder the table
+2. Use column based storage
+3. Use compression
+4. Use a different storage layout
+
+*** Reorder the table
+
+We could reorder the table on StrainID which would make this search much
+faster but it would many common (dataset) queries slower. So, that is not a
+great idea. One thing we could try is create a copy of the first table.
+
+*** Use column based storage
+
+Column-based storage works when you need a subset of the data in the table. In
+this case it won't help much because, even though the pivot itself would be
+faster, we still traverse all data to get IDs and values.
+
+*** Use compression
+
+Compression reduces the size on disk and may be beneficial. Real life metrics
+on the internet don't show that much improvement, but we could try native
+compression and/or ZFS.
+
+*** Use a different storage layout
+
+My prediction is that we can not get around this.
+
+
+** Exploration
+
+In the following steps I reduce the complex case to a simple case explaining
+the performance bottleneck we are seeing today. I did not add comments, but
+you can see what I did.
+
+#+BEGIN_SRC SQL
+MariaDB [db_webqtl]> select * from ProbeSetXRef limit 2;
++------------------+------------+--------+------------+--------------------+------------+------------------+---------------------+------------+------------------+--------+--------------------+------+
+| ProbeSetFreezeId | ProbeSetId | DataId | Locus_old  | LRS_old            | pValue_old | mean             | se                  | Locus      | LRS              | pValue | additive           | h2   |
++------------------+------------+--------+------------+--------------------+------------+------------------+---------------------+------------+------------------+--------+--------------------+------+
+|                1 |          1 |      1 | 10.095.400 |   13.3971627898894 |      0.163 | 5.48794285714286 | 0.08525787814808819 | rs13480619 |  12.590069931048 |  0.269 |        -0.28515625 | NULL |
+|                1 |          2 |      2 | D15Mit189  | 10.042057464356201 |      0.431 | 9.90165714285714 |  0.0374686634976217 | rs29535974 | 10.5970737900941 |  0.304 | -0.116783333333333 | NULL |
++------------------+------------+--------+------------+--------------------+------------+------------------+---------------------+------------+------------------+--------+--------------------+------+
+2 rows in set (0.001 sec)
+#+END_SRC
+
+#+BEGIN_SRC SQL
+MariaDB [db_webqtl]> SELECT ProbeSet.Name,T4.value FROM (ProbeSet, ProbeSetXRef, ProbeSetFreeze)  left join ProbeSetData as T4 on T4.Id = ProbeSetXRef.DataId
+    ->                         and T4.StrainId=4
+    -> limit 5;
++------+-------+
+| Name | value |
++------+-------+
+| NULL | 6.414 |
+| NULL | 6.414 |
+| NULL | 6.414 |
+| NULL | 6.414 |
+| NULL | 6.414 |
++------+-------+
+#+END_SRC
+
+#+BEGIN_SRC SQL
+MariaDB [db_webqtl]> SELECT ProbeSet.Name,T4.value,T5.value FROM (ProbeSet, ProbeSetXRef, ProbeSetFreeze)
+left join ProbeSetData as T4 on T4.Id = ProbeSetXRef.DataId and T4.StrainId=4
+left join ProbeSetData as T5 on T5.Id = ProbeSetXRef.DataId and T5.StrainId=5
+WHERE ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id
+and ProbeSetFreeze.Name = 'UMUTAffyExon_0209_RMA'
+and ProbeSet.Id = ProbeSetXRef.ProbeSetId
+order by ProbeSet.Id  limit 5;
+
++---------+---------+
+| Name    | value   |
++---------+---------+
+| 4331726 | 5.52895 |
+| 5054239 | 6.29465 |
+| 4642578 | 9.13706 |
+| 4398221 | 6.77672 |
+| 5543360 | 4.30016 |
++---------+---------+
+#+END_SRC
+
+#+BEGIN_SRC SQL
+SELECT ProbeSet.Name,T4.value FROM (ProbeSet, ProbeSetXRef, ProbeSetFreeze)
+left join ProbeSetData as T4 on T4.Id = ProbeSetXRef.DataId and T4.StrainId=4
+WHERE ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id
+and ProbeSetFreeze.Name = 'UMUTAffyExon_0209_RMA'
+and ProbeSet.Id = ProbeSetXRef.ProbeSetId
+order by ProbeSet.Id ;
+
+1236087 rows in set (19.173 sec)
+#+END_SRC
+
+#+BEGIN_SRC SQL
+SELECT ProbeSet.Name,T4.value FROM (ProbeSet, ProbeSetXRef, ProbeSetFreeze)
+left join ProbeSetData as T4 on T4.StrainId=4 and T4.Id = ProbeSetXRef.DataId
+WHERE ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id
+and ProbeSetFreeze.Name = 'UMUTAffyExon_0209_RMA'
+and ProbeSet.Id = ProbeSetXRef.ProbeSetId
+order by ProbeSet.Id ;
+
+1236087 rows in set (19.173 sec)
+#+END_SRC
+
+#+BEGIN_SRC SQL
+SELECT ProbeSet.Name FROM (ProbeSet, ProbeSetXRef, ProbeSetFreeze)
+WHERE ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id
+and ProbeSetFreeze.Name = 'UMUTAffyExon_0209_RMA'
+and ProbeSet.Id = ProbeSetXRef.ProbeSetId
+order by ProbeSet.Id ;
+#+END_SRC
+
+Find all the probeset 'names' (probe sequence included) for one dataset:
+
+#+BEGIN_SRC SQL
+SELECT count(DISTINCT ProbeSet.Name) FROM (ProbeSet, ProbeSetXRef, ProbeSetFreeze)          WHERE ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id                       and ProbeSetFreeze.Name = 'UMUTAffyExon_0209_RMA'            and ProbeSet.Id = ProbeSetXRef.ProbeSetId                                   order by ProbeSet.Id;
++-------------------------------+
+| count(DISTINCT ProbeSet.Name) |
++-------------------------------+
+|                       1236087 |
++-------------------------------+
+#+END_SRC
+
+Now for each of those probesets:
+
+#+BEGIN_SRC SQL
+SELECT ProbeSet.Name,T4.value FROM (ProbeSet, ProbeSetXRef)
+left join ProbeSetData as T4 on T4.StrainId=4 and T4.Id = ProbeSetXRef.DataId
+WHERE ProbeSet.Id = ProbeSetXRef.ProbeSetId
+order by ProbeSet.Id limit 5;
+#+END_SRC
+
+ProbeSetXRef contains the p-values:
+
+#+BEGIN_SRC SQL
+select * from ProbeSetXRef limit 5;
++------------------+------------+--------+------------+--------------------+------------+-------------------+---------------------+------------+------------------+--------+--------------------+------+
+| ProbeSetFreezeId | ProbeSetId | DataId | Locus_old  | LRS_old            | pValue_old | mean              | se                  | Locus      | LRS              | pValue | additive           | h2   |
++------------------+------------+--------+------------+--------------------+------------+-------------------+---------------------+------------+------------------+--------+--------------------+------+
+|                1 |          1 |      1 | 10.095.400 |   13.3971627898894 |      0.163 |  5.48794285714286 | 0.08525787814808819 | rs13480619 |  12.590069931048 |  0.269 |        -0.28515625 | NULL |
+|                1 |          2 |      2 | D15Mit189  | 10.042057464356201 |      0.431 |  9.90165714285714 |  0.0374686634976217 | rs29535974 | 10.5970737900941 |  0.304 | -0.116783333333333 | NULL |
+#+END_SRC
+
+
+#+BEGIN_SRC SQL
+SELECT count(T4.value) FROM (ProbeSet, ProbeSetXRef)
+left join ProbeSetData as T4 on T4.StrainId=4 and T4.Id = ProbeSetXRef.DataId
+WHERE ProbeSet.Id = ProbeSetXRef.ProbeSetId ;
+#+END_SRC
+
+
+#+BEGIN_SRC SQL
+SELECT count(T4.value) FROM (ProbeSet, ProbeSetXRef) left join ProbeSetData as T4 on T4.StrainId=4 limit 5;
+#+END_SRC
+
+#+BEGIN_SRC SQL
+select value from (ProbeSetData) where StrainId=4 limit 5;
+#+END_SRC
+
+So, this is the sloooow baby:
+
+#+BEGIN_SRC SQL
+select count(id) from (ProbeSetData) where StrainId=4;
+
+| ProbeSetData |          0 | DataId   |            2 | StrainId    | A         |  4852908856 |     NULL | NULL   |      | BTREE      |         |               |
+
+-rw-rw---- 1 mysql mysql  53G Mar  3 23:49 ProbeSetData.MYD
+-rw-rw---- 1 mysql mysql  66G Mar  4 03:00 ProbeSetData.MYI
+#+END_SRC
+
+#+BEGIN_SRC SQL
+create index strainid on ProbeSetData(StrainId);
+Stage: 1 of 2 'Copy to tmp table'   8.77% of stage done
+Stage: 2 of 2 'Enabling keys'      0% of stage done
+#+END_SRC
+
+#+BEGIN_SRC SQL
+MariaDB [db_webqtl]> create index strainid on ProbeSetData(StrainId);
+Query OK, 5111384047 rows affected (2 hours 56 min 25.807 sec)
+Records: 5111384047  Duplicates: 0  Warnings: 0
+#+END_SRC
+
+#+BEGIN_SRC SQL
+MariaDB [db_webqtl]> select count(id) from (ProbeSetData) where StrainId=4;
+
++-----------+
+| count(id) |
++-----------+
+|  14267545 |
++-----------+
+1 row in set (19.707 sec)
+#+END_SRC
+
+
+#+BEGIN_SRC SQL
+MariaDB [db_webqtl]> select count(*) from ProbeSetData where strainid = 140;
++----------+
+| count(*) |
++----------+
+| 10717771 |
++----------+
+1 row in set (10.161 sec)
+#+END_SRC
+
+#+BEGIN_SRC SQL
+MariaDB [db_webqtl]> select count(*) from ProbeSetData where strainid = 140 and id=4;
++----------+
+| count(*) |
++----------+
+|        0 |
++----------+
+1 row in set (0.000 sec)
+#+END_SRC
+
+#+BEGIN_SRC SQL
+MariaDB [db_webqtl]> select count(*) from ProbeSetData where strainid = 4 and id=4;
++----------+
+| count(*) |
++----------+
+|        1 |
++----------+
+1 row in set (0.000 sec)
+#+END_SRC
+
+
+#+BEGIN_SRC SQL
+select id from ProbeSetFreeze where id=1;
+
+WHERE ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id
+and ProbeSetFreeze.Name = 'UMUTAffyExon_0209_RMA'
+and ProbeSet.Id = ProbeSetXRef.ProbeSetId
+order by ProbeSet.Id  limit 5;
+#+END_SRC
+
+#+BEGIN_SRC SQL
+select count(ProbeSetId) from ProbeSetXRef where ProbeSetFreezeId=1;
++-------------------+
+| count(ProbeSetId) |
++-------------------+
+|             12422 |
++-------------------+
+1 row in set (0.006 sec)
+#+END_SRC
+
+
+#+BEGIN_SRC SQL
+select count(ProbeSetId) from (ProbeSetXRef,ProbeSetFreeze) where
+ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id
+and ProbeSetFreeze.Name = 'UMUTAffyExon_0209_RMA';
+#+END_SRC
+
+#+BEGIN_SRC SQL
+MariaDB [db_webqtl]> select count(ProbeSetId) from (ProbeSetXRef,ProbeSetFreeze) where
+    -> ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id
+    -> and ProbeSetFreeze.Name = 'UMUTAffyExon_0209_RMA';
++-------------------+
+| count(ProbeSetId) |
++-------------------+
+|           1236087 |
++-------------------+
+1 row in set (0.594 sec)
+#+END_SRC
+
+ProbeSetXRef.ProbeSetFreezeId is 206, so
+
+#+BEGIN_SRC SQL
+MariaDB [db_webqtl]> select count(ProbeSetId) from (ProbeSetXRef) where ProbeSetXRef.ProbeSetFreezeId = 206;
++-------------------+
+| count(ProbeSetId) |
++-------------------+
+|           1236087 |
++-------------------+
+1 row in set (0.224 sec)
+#+END_SRC
+
+#+BEGIN_SRC SQL
+MariaDB [db_webqtl]> select count(*) from ProbeSetData where strainid = 1 and id=4;
++----------+
+| count(*) |
++----------+
+|        1 |
++----------+
+1 row in set (0.000 sec)
+#+END_SRC
-- 
cgit 1.4.1