From 92584ab4cd23edf309927aad3af7a08e8b8f8b08 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Wed, 19 May 2021 02:30:28 -0400 Subject: New SQL approach --- docs/performance/slow-probesetdata-sql.org | 86 +++++++++++++++++++++++++++++- 1 file changed, 84 insertions(+), 2 deletions(-) (limited to 'docs') diff --git a/docs/performance/slow-probesetdata-sql.org b/docs/performance/slow-probesetdata-sql.org index 3b7ed03..3a0a2d7 100644 --- a/docs/performance/slow-probesetdata-sql.org +++ b/docs/performance/slow-probesetdata-sql.org @@ -45,7 +45,7 @@ We have the following options: We could reorder the table on StrainID which would make this search much faster but it would many common (dataset) queries slower. So, that is not a great idea. One thing we could try is add a copy of the first table. Not exactly -elegant but a quick fix for sure. We'll need an embedded procedure to keep it +elegant but a quick fix for sure. We'll need an embedded procedure to keep it up-to-data. *** Use column based storage @@ -96,6 +96,23 @@ MariaDB [db_webqtl]> SELECT ProbeSet.Name,T4.value FROM (ProbeSet, ProbeSetXRef, +------+-------+ #+END_SRC + +#+BEGIN_SRC SQL +MariaDB [db_webqtl]> SELECT ProbeSet.Name,T4.value FROM (ProbeSet, ProbeSetXRef) + left join ProbeSetData as T4 on T4.StrainId=4 and T4.Id = ProbeSetXRef.DataId + WHERE ProbeSet.Id = ProbeSetXRef.ProbeSetId order by ProbeSet.Id limit 5; ++-----------+-------+ +| Name | value | ++-----------+-------+ +| 100001_at | 6.414 | +| 100001_at | 6.414 | +| 100001_at | 6.414 | +| 100001_at | 8.414 | +| 100001_at | 8.414 | ++-----------+-------+ +5 rows in set (20.064 sec) +#+END_SRC SQL + #+BEGIN_SRC SQL MariaDB [db_webqtl]> SELECT ProbeSet.Name,T4.value,T5.value FROM (ProbeSet, ProbeSetXRef, ProbeSetFreeze) left join ProbeSetData as T4 on T4.Id = ProbeSetXRef.DataId and T4.StrainId=4 @@ -319,6 +336,69 @@ MariaDB [db_webqtl]> select count(*) from ProbeSetData where strainid = 1 and id 1 row in set (0.000 sec) #+END_SRC +Now this query is fast because it traverses the ProbeSetData table only once and uses id as a starting point: + +#+BEGIN_SRC SQL +MariaDB [db_webqtl]> select count(*) from (ProbeSetData,ProbeSetXRef) where ProbeSetXRef.ProbeSetFreezeId = 206 and id=ProbeSetId; ++----------+ +| count(*) | ++----------+ +| 10699448 | ++----------+ +1 row in set (4.429 sec) +#+END_SRC + +#+BEGIN_SRC SQL +select id,strainid,value from (ProbeSetData,ProbeSetXRef) where + ProbeSetXRef.ProbeSetFreezeId = 206 and id=ProbeSetId +limit 5; ++--------+----------+-------+ +| id | strainid | value | ++--------+----------+-------+ +| 225088 | 1 | 7.33 | +| 225088 | 2 | 7.559 | +| 225088 | 3 | 7.84 | +| 225088 | 4 | 7.835 | +| 225088 | 5 | 7.652 | ++--------+----------+-------+ +5 rows in set (0.001 sec) +#+END_SRC + + +#+BEGIN_SRC SQL +select id,strainid,value from (ProbeSetData,ProbeSetXRef) where ProbeSetXRef.ProbeSetFreezeId = 206 and id=ProbeSetId and strainid=4 limit 5; ++--------+----------+-------+ +| id | strainid | value | ++--------+----------+-------+ +| 225088 | 4 | 7.835 | +| 225089 | 4 | 9.595 | +| 225090 | 4 | 8.982 | +| 225091 | 4 | 8.153 | +| 225092 | 4 | 7.111 | ++--------+----------+-------+ +5 rows in set (0.000 sec) +#+END_SRC + +#+BEGIN_SRC SQL +MariaDB [db_webqtl]> select ProbeSet.name,strainid,probesetfreezeid,value from (ProbeSet,ProbeSetData,ProbeSetFreeze,ProbeSetXRef) where ProbeSetXRef.ProbeSetFreezeId = 206 and ProbeSetData.id=ProbeSetId and (strainid=4 or strainid=5) and ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id and ProbeSet.Id = ProbeSetXRef.ProbeSetId limit 5; ++---------+----------+------------------+-------+ +| name | strainid | probesetfreezeid | value | ++---------+----------+------------------+-------+ +| 4331726 | 4 | 206 | 7.835 | +| 5054239 | 4 | 206 | 9.595 | +| 4642578 | 4 | 206 | 8.982 | +| 4398221 | 4 | 206 | 8.153 | +| 5543360 | 4 | 206 | 7.111 | ++---------+----------+------------------+-------+ +5 rows in set (2.174 sec) +#+END_SRC + + +No more joins and super fast!! + +*** TODO check if the values match the original query. + + ** Original query This is the original query generated by GN2 that takes 1 hour on @@ -327,7 +407,9 @@ traverses the full 53GB database table (and even larger index) for each of them. #+BEGIN_SRC SQL -SELECT ProbeSet.Name,T4.value, T5.value, T6.value, T7.value, T8.value, T9.value, T10.value, T11.value, T12.value, T13.value, T14.value, T15.value, T16.value, T17.value, T18.value, T19.value, T20.value, T21.value, T22.value, T23.value, T24.value, T25.value, T26.value, T28.value, T29.value, T30.value, T31.value, T33.value, T35.value, T36.value, T37.value, T39.value, T98.value, T100.value, T103.value FROM (ProbeSet, ProbeSetXRef, ProbeSetFreeze) +SELECT ProbeSet.Name,T4.value, T5.value, T6.value, T7.value, T8.value, T9.value, T10.value, T11.value, T12.value, T13.value, T14.value, T15.value, T16.value, T17.value, T18.value, T19.value, T20.value, T21.value, T22.value, T23.value, T24.value, T25.value, T26.value, T28.value, + T29.value, T30.value, T31.value, T33.value, T35.value, T36.value, T37.value, T39.value, + T98.value, T100.value, T103.value FROM (ProbeSet, ProbeSetXRef, ProbeSetFreeze) left join ProbeSetData as T4 on T4.Id = ProbeSetXRef.DataId and T4.StrainId=4 left join ProbeSetData as T5 on T5.Id = ProbeSetXRef.DataId and T5.StrainId=5 left join ProbeSetData as T6 on T6.Id = ProbeSetXRef.DataId and T6.StrainId=6 -- cgit v1.2.3