New SQL approach

author: Pjotr Prins 2021-05-19 02:30:28 -0400
committer: Pjotr Prins 2021-05-19 02:30:28 -0400
commit: 92584ab4cd23edf309927aad3af7a08e8b8f8b08 (patch)
tree: 141a5508f7d413a761b454f85a17411dbb537b8e /docs
parent: e174551e624e8d9df5d9f143fb0571c78994940e (diff)
download: genenetwork3-92584ab4cd23edf309927aad3af7a08e8b8f8b08.tar.gz
1 files changed, 84 insertions, 2 deletions
diff --git a/docs/performance/slow-probesetdata-sql.org b/docs/performance/slow-probesetdata-sql.org
index 3b7ed03..3a0a2d7 100644
--- a/docs/performance/slow-probesetdata-sql.org
+++ b/docs/performance/slow-probesetdata-sql.org
@@ -45,7 +45,7 @@ We have the following options:
 We could reorder the table on StrainID which would make this search much faster
 but it would many common (dataset) queries slower. So, that is not a great
 idea. One thing we could try is add a copy of the first table. Not exactly
-elegant but a quick fix for sure. We'll need an embedded procedure to keep it 
+elegant but a quick fix for sure. We'll need an embedded procedure to keep it
 up-to-data.
 
 *** Use column based storage
@@ -96,6 +96,23 @@ MariaDB [db_webqtl]> SELECT ProbeSet.Name,T4.value FROM (ProbeSet, ProbeSetXRef,
 +------+-------+
 #+END_SRC
 
+
+#+BEGIN_SRC SQL
+MariaDB [db_webqtl]> SELECT ProbeSet.Name,T4.value FROM (ProbeSet, ProbeSetXRef)
+  left join ProbeSetData as T4 on T4.StrainId=4 and T4.Id = ProbeSetXRef.DataId
+  WHERE ProbeSet.Id = ProbeSetXRef.ProbeSetId order by ProbeSet.Id limit 5;
++-----------+-------+
+| Name      | value |
++-----------+-------+
+| 100001_at | 6.414 |
+| 100001_at | 6.414 |
+| 100001_at | 6.414 |
+| 100001_at | 8.414 |
+| 100001_at | 8.414 |
++-----------+-------+
+5 rows in set (20.064 sec)
+#+END_SRC SQL
+
 #+BEGIN_SRC SQL
 MariaDB [db_webqtl]> SELECT ProbeSet.Name,T4.value,T5.value FROM (ProbeSet, ProbeSetXRef, ProbeSetFreeze)
 left join ProbeSetData as T4 on T4.Id = ProbeSetXRef.DataId and T4.StrainId=4
@@ -319,6 +336,69 @@ MariaDB [db_webqtl]> select count(*) from ProbeSetData where strainid = 1 and id
 1 row in set (0.000 sec)
 #+END_SRC
 
+Now this query is fast because it traverses the ProbeSetData table only once and uses id as a starting point:
+
+#+BEGIN_SRC SQL
+MariaDB [db_webqtl]>  select count(*) from (ProbeSetData,ProbeSetXRef) where ProbeSetXRef.ProbeSetFreezeId = 206 and id=ProbeSetId;
++----------+
+| count(*) |
++----------+
+| 10699448 |
++----------+
+1 row in set (4.429 sec)
+#+END_SRC
+
+#+BEGIN_SRC SQL
+select id,strainid,value from (ProbeSetData,ProbeSetXRef) where
+  ProbeSetXRef.ProbeSetFreezeId = 206 and id=ProbeSetId
+limit 5;
++--------+----------+-------+
+| id     | strainid | value |
++--------+----------+-------+
+| 225088 |        1 |  7.33 |
+| 225088 |        2 | 7.559 |
+| 225088 |        3 |  7.84 |
+| 225088 |        4 | 7.835 |
+| 225088 |        5 | 7.652 |
++--------+----------+-------+
+5 rows in set (0.001 sec)
+#+END_SRC
+
+
+#+BEGIN_SRC SQL
+select id,strainid,value from (ProbeSetData,ProbeSetXRef) where   ProbeSetXRef.ProbeSetFreezeId = 206 and id=ProbeSetId   and strainid=4 limit 5;
++--------+----------+-------+
+| id     | strainid | value |
++--------+----------+-------+
+| 225088 |        4 | 7.835 |
+| 225089 |        4 | 9.595 |
+| 225090 |        4 | 8.982 |
+| 225091 |        4 | 8.153 |
+| 225092 |        4 | 7.111 |
++--------+----------+-------+
+5 rows in set (0.000 sec)
+#+END_SRC
+
+#+BEGIN_SRC SQL
+MariaDB [db_webqtl]> select ProbeSet.name,strainid,probesetfreezeid,value from (ProbeSet,ProbeSetData,ProbeSetFreeze,ProbeSetXRef) where   ProbeSetXRef.ProbeSetFreezeId = 206 and ProbeSetData.id=ProbeSetId   and (strainid=4 or strainid=5) and ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id and ProbeSet.Id = ProbeSetXRef.ProbeSetId limit 5;
++---------+----------+------------------+-------+
+| name    | strainid | probesetfreezeid | value |
++---------+----------+------------------+-------+
+| 4331726 |        4 |              206 | 7.835 |
+| 5054239 |        4 |              206 | 9.595 |
+| 4642578 |        4 |              206 | 8.982 |
+| 4398221 |        4 |              206 | 8.153 |
+| 5543360 |        4 |              206 | 7.111 |
++---------+----------+------------------+-------+
+5 rows in set (2.174 sec)
+#+END_SRC
+
+
+No more joins and super fast!!
+
+*** TODO check if the values match the original query.
+
+
 ** Original query
 
 This is the original query generated by GN2 that takes 1 hour on
@@ -327,7 +407,9 @@ traverses the full 53GB database table (and even larger index) for each of
 them.
 
 #+BEGIN_SRC SQL
-SELECT ProbeSet.Name,T4.value, T5.value, T6.value, T7.value, T8.value, T9.value, T10.value, T11.value, T12.value, T13.value, T14.value, T15.value, T16.value, T17.value, T18.value, T19.value, T20.value, T21.value, T22.value, T23.value, T24.value, T25.value, T26.value, T28.value, T29.value, T30.value, T31.value, T33.value, T35.value, T36.value, T37.value, T39.value, T98.value, T100.value, T103.value FROM (ProbeSet, ProbeSetXRef, ProbeSetFreeze)
+SELECT ProbeSet.Name,T4.value, T5.value, T6.value, T7.value, T8.value, T9.value, T10.value, T11.value, T12.value, T13.value, T14.value, T15.value, T16.value, T17.value, T18.value, T19.value, T20.value, T21.value, T22.value, T23.value, T24.value, T25.value, T26.value, T28.value,
+  T29.value, T30.value, T31.value, T33.value, T35.value, T36.value, T37.value, T39.value,
+  T98.value, T100.value, T103.value FROM (ProbeSet, ProbeSetXRef, ProbeSetFreeze)
                         left join ProbeSetData as T4 on T4.Id = ProbeSetXRef.DataId and T4.StrainId=4
                         left join ProbeSetData as T5 on T5.Id = ProbeSetXRef.DataId and T5.StrainId=5
                         left join ProbeSetData as T6 on T6.Id = ProbeSetXRef.DataId and T6.StrainId=6
author	Pjotr Prins	2021-05-19 02:30:28 -0400
committer	Pjotr Prins	2021-05-19 02:30:28 -0400
commit	92584ab4cd23edf309927aad3af7a08e8b8f8b08 (patch)
tree	141a5508f7d413a761b454f85a17411dbb537b8e /docs
parent	e174551e624e8d9df5d9f143fb0571c78994940e (diff)
download	genenetwork3-92584ab4cd23edf309927aad3af7a08e8b8f8b08.tar.gz