aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPjotr Prins2021-05-19 07:36:28 -0400
committerPjotr Prins2021-05-19 07:36:28 -0400
commitafdddb23851465203cd5dab26300b09a5cd6d405 (patch)
tree06fa64cf07fbacd05ba44b477151bc89b44fc9c6
parentd8c887c5be5b0dbf108cca6121d805b93311ee91 (diff)
downloadgenenetwork3-afdddb23851465203cd5dab26300b09a5cd6d405.tar.gz
More SQL optimization trials
-rw-r--r--docs/performance/slow-probesetdata-sql.org148
1 files changed, 144 insertions, 4 deletions
diff --git a/docs/performance/slow-probesetdata-sql.org b/docs/performance/slow-probesetdata-sql.org
index 5ef4699..919eb37 100644
--- a/docs/performance/slow-probesetdata-sql.org
+++ b/docs/performance/slow-probesetdata-sql.org
@@ -64,6 +64,18 @@ compression and/or ZFS.
My prediction is that we can not get around this.
+** Result
+
+The final query is shown at the bottom of 'Exploration'. It takes 2 minutes
+on Penguin2 and Tux01. Essentially I took out the joins (which parsed the same
+table repeatedly) and added an index. The trick is to keep minimizing the query.
+
+The 2 minute query will do for now and it probably is no longer quadratic.
+
+We can probably improve things in the future by changing the way ProbeSetData
+is stored.
+
+
** Exploration
In the following steps I reduce the complex case to a simple case explaining
@@ -401,8 +413,6 @@ limit 5;
No more joins and super fast!!
-*** TODO check if the values match the original query.
-
#+BEGIN_SRC SQL
SELECT ProbeSet.Name,T4.StrainID,Probesetfreezeid,T4.value
FROM (ProbeSet, ProbeSetXRef, ProbeSetFreeze)
@@ -495,9 +505,8 @@ Now you can see the difference.
select ProbeSetData.id,ProbeSet.name,strainid,probesetfreezeid,value from (ProbeSet,ProbeSetData,ProbeSetFreeze,ProbeSetXRef)
where ProbeSetData.id=ProbeSetXRef.DataId
and (strainid=4 or strainid=5)
- and ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id
+ and ProbeSetXRef.ProbeSetFreezeId = 206
and ProbeSet.Id = ProbeSetXRef.ProbeSetId
- and ProbeSetFreeze.Name = 'UMUTAffyExon_0209_RMA'
and ProbeSet.name = '4331726'
limit 5;
@@ -510,6 +519,137 @@ select ProbeSetData.id,ProbeSet.name,strainid,probesetfreezeid,value from (Probe
It worked!
+BUT IT IS SLOW for the full query. This is now due to this table being huge
+and DataID distributed through the table (sigh!).
+
+#+BEGIN_SRC SQL
+MariaDB [db_webqtl]> select count(*) from ProbeSetXRef;
++----------+
+| count(*) |
++----------+
+| 47713039 |
++----------+
+1 row in set (0.000 sec)
+#+END_SRC
+
+
+#+BEGIN_SRC SQL
+select ProbeSetData.id,ProbeSet.name,strainid,probesetfreezeid,value from (ProbeSet,ProbeSetData,ProbeSetFreeze,ProbeSetXRef)
+ where
+ ProbeSetXRef.ProbeSetFreezeId = 206
+ and ProbeSet.Id = ProbeSetXRef.ProbeSetId
+ and ProbeSetFreeze.Id = 206
+ and ProbeSetData.id=ProbeSetXRef.DataId
+ and (strainid=4 or strainid=5)
+ limit 5;
+
++----------+---------+----------+------------------+---------+
+| id | name | strainid | probesetfreezeid | value |
++----------+---------+----------+------------------+---------+
+| 38549183 | 4304920 | 4 | 206 | 4.97269 |
+| 38549184 | 4304921 | 4 | 206 | 6.25133 |
+| 38549185 | 4304922 | 4 | 206 | 6.03701 |
+| 38549186 | 4304923 | 4 | 206 | 9.10316 |
+| 38549187 | 4304925 | 4 | 206 | 8.90826 |
++----------+---------+----------+------------------+---------+
+5 rows in set (23.995 sec)
+
+select count(ProbeSetId) from ProbeSetXRef where ProbeSetFreezeId = 206;
++-------------------+
+| count(ProbeSetId) |
++-------------------+
+| 1236087 |
++-------------------+
+
+select count(ProbeSet.name) from (ProbeSet,ProbeSetXRef)
+ where ProbeSetFreezeId = 206
+ and ProbeSet.Id = ProbeSetXRef.ProbeSetId;
++----------------------+
+| count(ProbeSet.name) |
++----------------------+
+| 1236087 |
++----------------------+
+1 row in set (7.126 sec)
+
+select ProbeSet.name,ProbeSetData.id from (ProbeSet,ProbeSetXRef,ProbeSetData)
+ where ProbeSetFreezeId = 206
+ and ProbeSet.Id = ProbeSetXRef.ProbeSetId
+ and ProbeSetData.id = ProbeSetXRef.DataId
+;
+
+MariaDB [db_webqtl]> select count(ProbeSetData.id) from (ProbeSet,ProbeSetXRef,ProbeSetData) where ProbeSetFreezeId = 206 and ProbeSet.Id = ProbeSetXRef.ProbeSetId and ProbeSetData.id = ProbeSetXRef.DataId;
++------------------------+
+| count(ProbeSetData.id) |
++------------------------+
+| 114956091 |
++------------------------+
+1 row in set (35.836 sec)
+
+select count(*) from (ProbeSet,ProbeSetXRef,ProbeSetData) where ProbeSetFreezeId = 206 and ProbeSet.Id = ProbeSetXRef.ProbeSetId and ProbeSetData.id = ProbeSetXRef.DataId ;
++-----------+
+| count(*) |
++-----------+
+| 114956091 |
++-----------+
+1 row in set (35.392 sec)
+
+
+select ProbeSet.name,ProbeSetData.value from (ProbeSet,ProbeSetXRef,ProbeSetData)
+ where ProbeSetFreezeId = 206
+ and ProbeSet.Id = ProbeSetXRef.ProbeSetId
+ and ProbeSetData.id = ProbeSetXRef.DataId
+;
+
+select count(value) from (ProbeSet,ProbeSetXRef,ProbeSetData) where ProbeSetFreezeId = 206 and ProbeSet.Id = ProbeSetXRef.ProbeSetId and ProbeSetData.id = ProbeSetXRef.DataId and strainid=5 ;
++--------------+
+| count(value) |
++--------------+
+| 1236087 |
++--------------+
+1 row in set (14.819 sec)
+
+
+select count(value) from (ProbeSet,ProbeSetXRef,ProbeSetData)
+where ProbeSetFreezeId = 206
+ and ProbeSet.Id = ProbeSetXRef.ProbeSetId
+ and ProbeSetData.id = ProbeSetXRef.DataId
+ and ((strainid>=4 and strainid<=31) or strainid in (33,35,36,37,39,98,100,103))
+;
+
++--------------+
+| count(value) |
++--------------+
+| 43263045 |
++--------------+
+1 row in set (1 min 40.498 sec)
+
+select name,strainid,value from (ProbeSet,ProbeSetXRef,ProbeSetData)
+where ProbeSetFreezeId = 206
+ and ProbeSet.Id = ProbeSetXRef.ProbeSetId
+ and ProbeSetData.id = ProbeSetXRef.DataId
+ and ((strainid>=4 and strainid<=31) or strainid in (33,35,36,37,39,98,100,103))
+limit 5;
++---------+----------+---------+
+| name | strainid | value |
++---------+----------+---------+
+| 4331726 | 4 | 5.52895 |
+| 4331726 | 5 | 6.76158 |
+| 4331726 | 6 | 6.06911 |
+| 4331726 | 7 | 6.24858 |
+| 4331726 | 8 | 6.36076 |
++---------+----------+---------+
+
+select name,strainid,value from (ProbeSet,ProbeSetXRef,ProbeSetData)
+where ProbeSetFreezeId = 206
+ and ProbeSet.Id = ProbeSetXRef.ProbeSetId
+ and ProbeSetData.id = ProbeSetXRef.DataId
+ and ((strainid>=4 and strainid<=31) or strainid in (33,35,36,37,39,98,100,103))
+;
+
+#+END_SRC
+
+The final query works in 2.2 minutes on both Penguin2 and Tux01.
+
** Original query
This is the original query generated by GN2 that takes 1 hour on