From 764136ff18bd355f7b9dbe91f870919b0a17322e Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Fri, 24 Jun 2016 06:51:28 +0000 Subject: Log: and document SQL calls --- doc/database.org | 152 ++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 135 insertions(+), 17 deletions(-) (limited to 'doc/database.org') diff --git a/doc/database.org b/doc/database.org index 2221c4fb..753179e6 100644 --- a/doc/database.org +++ b/doc/database.org @@ -7,7 +7,7 @@ number of the BSD's and the Human liver dataset (GSE9588). * GeneNetwork database -** Estimated table sizes +** Estimated table sizes select table_name,round(((data_length + index_length) / 1024 / 1024), 2) `Size in MB` from information_schema.TABLES where table_schema = "db_webqtl" order by data_length; @@ -172,24 +172,24 @@ This table is being used by both GN1 and GN2 from the trait pages! gn1/web/webqtl/showTrait/ShowTraitPage.py: query = "SELECT count(id) FROM AccessLog WHERE ip_address = %s and \ gn1/web/webqtl/showTrait/ShowTraitPage.py: self.cursor.execute("insert into AccessLog(accesstime,ip_address) values(Now(),%s)" ,user_ip) gn1/web/webqtl/textUI/cmdClass.py: query = """SELECT count(id) FROM AccessLog WHERE ip_address = %s AND UNIX_TIMESTAMP()-UNIX_TIMESTAMP(accesstime)<86400""" -gn1/web/webqtl/textUI/cmdClass.py: query = """INSERT INTO AccessLog(accesstime,ip_address) values(Now(),%s)""" +gn1/web/webqtl/textUI/cmdClass.py: query = """INSERT INTO AccessLog(accesstime,ip_address) values(Now(),%s)""" gn2/wqflask/wqflask/show_trait/show_trait_page.py: query = "SELECT count(id) FROM AccessLog WHERE ip_address = %s and \ gn2/wqflask/wqflask/show_trait/show_trait_page.py: self.cursor.execute("insert into AccessLog(accesstime,ip_address) values(Now(),%s)", user_ip) When looking at the code in GN1 and GN2 it restricts the daily use of the trait data page (set to 1,000 - whoever reaches that?). Unlike mentioned in the schema description, this table does *not* keep track -of cookies. +of cookies. From the code it looks like GN2 uses a mixture of Redis and sqlalchemy to keep track of logged in sessions (see -gn2/wqflask/wqflask/user_manager.py) and cookies through a user_uuid in +gn2/wqflask/wqflask/user_manager.py) and cookies through a user_uuid in model.py. In gn2/wqflask/wqflask/templates/collections/view_anonymous.html it show_trait_page appears to be loaded (need to check). -** AvgMethod +** AvgMethod Probesetfreeze refers to AvgMethod @@ -235,7 +235,7 @@ Hooked in API (URL encoding) GN2 only (see menu bar) -** Ensembl* +** Ensembl* Probe information @@ -321,15 +321,15 @@ Indel Snp browser (variant browser Gn1) ** Info* -Infra system PhP +Infra system PhP -Data Info button +Data Info button Infosystem users has separate entries -Also Investigators, User, Organizations, +Also Investigators, User, Organizations, -** LCorrRamin3 +** LCorrRamin3 Lit. Correlations Prof. Ramin @@ -376,7 +376,7 @@ This table contains names, full descriptions, and short symbols for traits and phenotype used primarily in the Published Phenotypes databases. -Contains 10k rows, March 2016, of which 5000 are for the BXDs). +Contains 10k rows, March 2016, of which 5000 are for the BXDs). | Id | Pre_publication_description | Post_publication_description | Original_description | Units | Pre_publication_abbreviation | Post_publication_abbreviation | Lab_code | Submitter | Owner | Authorized_Users | +----+-----------------------------+----------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+------------------------------+-------------------------------+----------+-------------+-------+------------------+ @@ -493,7 +493,7 @@ select count(*) from ProbeSet limit 5; ** ProbeSetData -Probedata - main molecular data. Probesets, metabolome, +Probedata - main molecular data. Probesets, metabolome, Almost all important molecular assay data is in this table including probe set data, RNA-seq data, proteomic data, and metabolomic @@ -599,7 +599,7 @@ select * from ProbeSetFreeze limit 5; Note that the following unlimited search is very slow: -select max(value) from ProbeSetData; +select max(value) from ProbeSetData; +------------+ | max(value) | @@ -617,12 +617,12 @@ Suggestions on the schema page: "StrainId" should be "CaseId" or "SampleId". -"ProbeSetData" should probably be "AssayData" or something more neutral. +"ProbeSetData" should probably be "AssayData" or something more neutral. *** Comments I think the ProbeSetData table should be generalized to a 'phenotypes' -table with an 'sample_id' column and a 'value' column. +table with an 'sample_id' column and a 'value' column. A new table 'samples' will link each sample against an 'experiment', an 'individual' and which in turn can link to a 'strain'. @@ -697,7 +697,7 @@ select * from Species; +----+-----------+----------------------+----------------+----------------------+-------------------------+------------+---------+ -** InbredSet +** InbredSet Menu @@ -723,10 +723,128 @@ User selection - retained ** UserPrivilege -** Vlookup +** Vlookup * Fetching Data +** Menu + +What happens at the main search menu can be emulated with curl. + +*** Search Mouse mRNA from HC_M2_0606_P dataset + +: curl "http://localhost:5003/search?species=mouse&group=BXD&ppocampus+mRNA&dataset=HC_M2_0606_P&search_terms_or=&search_terms_and=MEAN%3D%2815+16%29+LRS%3D%2823+46%29+&FormID=searchResult" + +the first SQL query checks whether a dataset is confidential or not: + +INFO:base.data_set:.sql: geno_mrna_confidentiality: u'SELECT Id, Name, +FullName, confidentiality,AuthorisedUsers FROM ProbeSetFreeze WHERE +Name = "HC_M2_0606_P"' + +The following is executed once per search: + +INFO:db.call:.sql: retrieve_other_names: + SELECT ProbeSetFreeze.Id, ProbeSetFreeze.Name, ProbeSetFreeze.FullName, ProbeSetFreeze.ShortName, ProbeSetFreeze.DataScale, Tissue.Name + FROM ProbeSetFreeze, ProbeFreeze, Tissue + WHERE ProbeSetFreeze.public > 0 AND + ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id AND + ProbeFreeze.TissueId = Tissue.Id AND + (ProbeSetFreeze.Name = 'HC_M2_0606_P' OR ProbeSetFreeze.FullName = 'HC_M2_0606_P' OR ProbeSetFreeze.ShortName = 'HC_M2_0606_P') + +INFO:db.call:.sql: (112, 'HC_M2_0606_P', 'Hippocampus Consortium M430v2 (Jun06) PDNN', 'Hippocampus M430v2 BXD 06/06 PDNN', 'log2', 'Hippocampus mRNA') + +INFO:db.call:.sql: __init__: + SELECT + InbredSet.Name, InbredSet.Id + FROM + InbredSet, ProbeSetFreeze, ProbeFreeze + WHERE + ProbeFreeze.InbredSetId = InbredSet.Id AND + ProbeFreeze.Id = ProbeSetFreeze.ProbeFreezeId AND + ProbeSetFreeze.Name = "HC_M2_0606_P" + +INFO:db.call:.sql: ('BXD', 1) + +INFO:base.species:.sql: __init__: + Select + Chr_Length.Name, Chr_Length.OrderId, Length from Chr_Length, InbredSet + where + Chr_Length.SpeciesId = InbredSet.SpeciesId AND + InbredSet.Name = 'BXD' + Order by OrderId + +And the following are executed for every trait: + +INFO:wqflask.do_search:.sql: execute: SELECT distinct ProbeSet.Name as TNAME, 0 as thistable, ProbeSetXRef.Mean as TMEAN, ProbeSetXRef.LRS as TLRS, ProbeSetXRef.PVALUE as TPVALUE, ProbeSet.Chr_num as TCHR_NUM, ProbeSet.Mb as TMB, ProbeSet.Symbol as TSYMBOL, ProbeSet.name_num as TNAME_NUM FROM ProbeSetXRef, ProbeSet WHERE (( ProbeSetXRef.mean > 15.0 and ProbeSetXRef.mean < 16.0 )AND( ProbeSetXRef.LRS > 23.0 and ProbeSetXRef.LRS < 46.0 )) and ProbeSet.Id = ProbeSetXRef.ProbeSetId and ProbeSetXRef.ProbeSetFreezeId = 112 ORDER BY ProbeSet.symbol ASC + +INFO:base.trait:.sql: retrieve_info: SELECT ProbeSet.name, + ProbeSet.symbol, ProbeSet.description, + ProbeSet.probe_target_description, ProbeSet.chr, + ProbeSet.mb, ProbeSet.alias, ProbeSet.geneid, + ProbeSet.genbankid, ProbeSet.unigeneid, + ProbeSet.omim, ProbeSet.refseq_transcriptid, + ProbeSet.blatseq, ProbeSet.targetseq, + ProbeSet.chipid, ProbeSet.comments, + ProbeSet.strand_probe, ProbeSet.strand_gene, + ProbeSet.probe_set_target_region, + ProbeSet.probe_set_specificity, + ProbeSet.probe_set_blat_score, + ProbeSet.probe_set_blat_mb_start, + ProbeSet.probe_set_blat_mb_end, + ProbeSet.probe_set_strand, + ProbeSet.probe_set_note_by_rw, ProbeSet.flag + + FROM ProbeSet, ProbeSetFreeze, ProbeSetXRef + WHERE + ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id AND + ProbeSetXRef.ProbeSetId = ProbeSet.Id AND + ProbeSetFreeze.Name = 'HC_M2_0606_P' AND + ProbeSet.Name = '1443823_s_at' + +INFO:base.trait:.sql: retrieve_info: + SELECT + HomologeneId + FROM + Homologene, Species, InbredSet + WHERE + Homologene.GeneId =98660 AND + InbredSet.Name = 'BXD' AND + InbredSet.SpeciesId = Species.Id AND + Species.TaxonomyId = Homologene.TaxonomyId + +INFO:base.trait:.sql: retrieve_info: + SELECT + ProbeSetXRef.Locus, ProbeSetXRef.LRS, ProbeSetXRef.pValue, ProbeSetXRef.mean, ProbeSetXRef.additive + FROM + ProbeSetXRef, ProbeSet + WHERE + ProbeSetXRef.ProbeSetId = ProbeSet.Id AND + ProbeSet.Name = "1443823_s_at" AND + ProbeSetXRef.ProbeSetFreezeId =112 + +INFO:base.trait:.sql: retrieve_info: + select Geno.Chr, Geno.Mb from Geno, Species + where Species.Name = 'mouse' and + Geno.Name = 'NES13033186' and + Geno.SpeciesId = Species.Id + + +and finally mean and chromosome positions are fetched a second time +for every trait with + +INFO:base.data_set:.sql: get_trait_info: select ProbeSetXRef.mean from ProbeSetXRef, ProbeSet + where ProbeSetXRef.ProbeSetFreezeId = 112 and + ProbeSet.Id = ProbeSetXRef.ProbeSetId and + ProbeSet.Name = '1443823_s_at' + +INFO:base.data_set:.sql: get_trait_info: + select Geno.Chr, Geno.Mb from Geno, Species + where Species.Name = 'mouse' and + Geno.Name = 'NES13033186' and + Geno.SpeciesId = Species.Id + +(that is a bug!). + ** Fetch phenotypes To get at phenotype data ProbeSetData is the main table (almost all -- cgit v1.2.3