aboutsummaryrefslogtreecommitdiff
path: root/docs/performance/slow-probesetdata-sql.org
diff options
context:
space:
mode:
authorPjotr Prins2021-05-19 02:30:28 -0400
committerPjotr Prins2021-05-19 02:30:28 -0400
commit92584ab4cd23edf309927aad3af7a08e8b8f8b08 (patch)
tree141a5508f7d413a761b454f85a17411dbb537b8e /docs/performance/slow-probesetdata-sql.org
parente174551e624e8d9df5d9f143fb0571c78994940e (diff)
downloadgenenetwork3-92584ab4cd23edf309927aad3af7a08e8b8f8b08.tar.gz
New SQL approach
Diffstat (limited to 'docs/performance/slow-probesetdata-sql.org')
-rw-r--r--docs/performance/slow-probesetdata-sql.org86
1 files changed, 84 insertions, 2 deletions
diff --git a/docs/performance/slow-probesetdata-sql.org b/docs/performance/slow-probesetdata-sql.org
index 3b7ed03..3a0a2d7 100644
--- a/docs/performance/slow-probesetdata-sql.org
+++ b/docs/performance/slow-probesetdata-sql.org
@@ -45,7 +45,7 @@ We have the following options:
We could reorder the table on StrainID which would make this search much faster
but it would many common (dataset) queries slower. So, that is not a great
idea. One thing we could try is add a copy of the first table. Not exactly
-elegant but a quick fix for sure. We'll need an embedded procedure to keep it
+elegant but a quick fix for sure. We'll need an embedded procedure to keep it
up-to-data.
*** Use column based storage
@@ -96,6 +96,23 @@ MariaDB [db_webqtl]> SELECT ProbeSet.Name,T4.value FROM (ProbeSet, ProbeSetXRef,
+------+-------+
#+END_SRC
+
+#+BEGIN_SRC SQL
+MariaDB [db_webqtl]> SELECT ProbeSet.Name,T4.value FROM (ProbeSet, ProbeSetXRef)
+ left join ProbeSetData as T4 on T4.StrainId=4 and T4.Id = ProbeSetXRef.DataId
+ WHERE ProbeSet.Id = ProbeSetXRef.ProbeSetId order by ProbeSet.Id limit 5;
++-----------+-------+
+| Name | value |
++-----------+-------+
+| 100001_at | 6.414 |
+| 100001_at | 6.414 |
+| 100001_at | 6.414 |
+| 100001_at | 8.414 |
+| 100001_at | 8.414 |
++-----------+-------+
+5 rows in set (20.064 sec)
+#+END_SRC SQL
+
#+BEGIN_SRC SQL
MariaDB [db_webqtl]> SELECT ProbeSet.Name,T4.value,T5.value FROM (ProbeSet, ProbeSetXRef, ProbeSetFreeze)
left join ProbeSetData as T4 on T4.Id = ProbeSetXRef.DataId and T4.StrainId=4
@@ -319,6 +336,69 @@ MariaDB [db_webqtl]> select count(*) from ProbeSetData where strainid = 1 and id
1 row in set (0.000 sec)
#+END_SRC
+Now this query is fast because it traverses the ProbeSetData table only once and uses id as a starting point:
+
+#+BEGIN_SRC SQL
+MariaDB [db_webqtl]> select count(*) from (ProbeSetData,ProbeSetXRef) where ProbeSetXRef.ProbeSetFreezeId = 206 and id=ProbeSetId;
++----------+
+| count(*) |
++----------+
+| 10699448 |
++----------+
+1 row in set (4.429 sec)
+#+END_SRC
+
+#+BEGIN_SRC SQL
+select id,strainid,value from (ProbeSetData,ProbeSetXRef) where
+ ProbeSetXRef.ProbeSetFreezeId = 206 and id=ProbeSetId
+limit 5;
++--------+----------+-------+
+| id | strainid | value |
++--------+----------+-------+
+| 225088 | 1 | 7.33 |
+| 225088 | 2 | 7.559 |
+| 225088 | 3 | 7.84 |
+| 225088 | 4 | 7.835 |
+| 225088 | 5 | 7.652 |
++--------+----------+-------+
+5 rows in set (0.001 sec)
+#+END_SRC
+
+
+#+BEGIN_SRC SQL
+select id,strainid,value from (ProbeSetData,ProbeSetXRef) where ProbeSetXRef.ProbeSetFreezeId = 206 and id=ProbeSetId and strainid=4 limit 5;
++--------+----------+-------+
+| id | strainid | value |
++--------+----------+-------+
+| 225088 | 4 | 7.835 |
+| 225089 | 4 | 9.595 |
+| 225090 | 4 | 8.982 |
+| 225091 | 4 | 8.153 |
+| 225092 | 4 | 7.111 |
++--------+----------+-------+
+5 rows in set (0.000 sec)
+#+END_SRC
+
+#+BEGIN_SRC SQL
+MariaDB [db_webqtl]> select ProbeSet.name,strainid,probesetfreezeid,value from (ProbeSet,ProbeSetData,ProbeSetFreeze,ProbeSetXRef) where ProbeSetXRef.ProbeSetFreezeId = 206 and ProbeSetData.id=ProbeSetId and (strainid=4 or strainid=5) and ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id and ProbeSet.Id = ProbeSetXRef.ProbeSetId limit 5;
++---------+----------+------------------+-------+
+| name | strainid | probesetfreezeid | value |
++---------+----------+------------------+-------+
+| 4331726 | 4 | 206 | 7.835 |
+| 5054239 | 4 | 206 | 9.595 |
+| 4642578 | 4 | 206 | 8.982 |
+| 4398221 | 4 | 206 | 8.153 |
+| 5543360 | 4 | 206 | 7.111 |
++---------+----------+------------------+-------+
+5 rows in set (2.174 sec)
+#+END_SRC
+
+
+No more joins and super fast!!
+
+*** TODO check if the values match the original query.
+
+
** Original query
This is the original query generated by GN2 that takes 1 hour on
@@ -327,7 +407,9 @@ traverses the full 53GB database table (and even larger index) for each of
them.
#+BEGIN_SRC SQL
-SELECT ProbeSet.Name,T4.value, T5.value, T6.value, T7.value, T8.value, T9.value, T10.value, T11.value, T12.value, T13.value, T14.value, T15.value, T16.value, T17.value, T18.value, T19.value, T20.value, T21.value, T22.value, T23.value, T24.value, T25.value, T26.value, T28.value, T29.value, T30.value, T31.value, T33.value, T35.value, T36.value, T37.value, T39.value, T98.value, T100.value, T103.value FROM (ProbeSet, ProbeSetXRef, ProbeSetFreeze)
+SELECT ProbeSet.Name,T4.value, T5.value, T6.value, T7.value, T8.value, T9.value, T10.value, T11.value, T12.value, T13.value, T14.value, T15.value, T16.value, T17.value, T18.value, T19.value, T20.value, T21.value, T22.value, T23.value, T24.value, T25.value, T26.value, T28.value,
+ T29.value, T30.value, T31.value, T33.value, T35.value, T36.value, T37.value, T39.value,
+ T98.value, T100.value, T103.value FROM (ProbeSet, ProbeSetXRef, ProbeSetFreeze)
left join ProbeSetData as T4 on T4.Id = ProbeSetXRef.DataId and T4.StrainId=4
left join ProbeSetData as T5 on T5.Id = ProbeSetXRef.DataId and T5.StrainId=5
left join ProbeSetData as T6 on T6.Id = ProbeSetXRef.DataId and T6.StrainId=6