From 6ea9c44c442791b6140c25c9f4edc7a92fb25c57 Mon Sep 17 00:00:00 2001 From: Alexander_Kabui Date: Sun, 14 Aug 2022 23:39:25 +0300 Subject: init:perform top n sample for probes --- wqflask/wqflask/correlation/rust_correlation.py | 124 +++++++++++++++++++++++- 1 file changed, 119 insertions(+), 5 deletions(-) diff --git a/wqflask/wqflask/correlation/rust_correlation.py b/wqflask/wqflask/correlation/rust_correlation.py index 3628f549..94720f54 100644 --- a/wqflask/wqflask/correlation/rust_correlation.py +++ b/wqflask/wqflask/correlation/rust_correlation.py @@ -1,6 +1,9 @@ """module contains integration code for rust-gn3""" import json from functools import reduce +from flask import g +from utility.db_tools import mescape +from utility.db_tools import create_in_clause from wqflask.correlation.correlation_functions import get_trait_symbol_and_tissue_values from wqflask.correlation.correlation_gn3_api import create_target_this_trait from wqflask.correlation.correlation_gn3_api import lit_for_trait_list @@ -12,6 +15,106 @@ from gn3.computations.rust_correlation import parse_tissue_corr_data from gn3.db_utils import database_connector + + +def chunk_dataset(dataset,steps,name): + + results = [] + + query = """ + SELECT ProbeSetXRef.DataId,ProbeSet.Name + FROM ProbeSet, ProbeSetXRef, ProbeSetFreeze + WHERE ProbeSetFreeze.Name = '{}' AND + ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id AND + ProbeSetXRef.ProbeSetId = ProbeSet.Id + """.format(name) + + traits_name_dict = dict(g.db.execute(query).fetchall()) + + + for i in range(0, len(dataset), steps): + matrix = list(dataset[i:i + steps]) + trait_name = traits_name_dict[matrix[0][0]] + + strains = [trait_name] + [str(value) for (trait_name, strain, value) in matrix] + results.append(",".join(strains)) + + breakpoint() + return results + + +def compute_top_n_sample(start_vars, dataset, trait_list): + """only if dataset is of type probeset""" + + + + + def __fetch_sample_ids__(samples_vals, samples_group): + + + all_samples = json.loads(samples_vals) + sample_data = get_sample_corr_data( + sample_type=samples_group, all_samples=all_samples, + dataset_samples=dataset.group.all_samples_ordered()) + + + with database_connector() as conn: + + curr = conn.cursor() + + curr.execute( + """ + SELECT Strain.Name, Strain.Id FROM Strain, Species + WHERE Strain.Name IN {} + and Strain.SpeciesId=Species.Id + and Species.name = '{}' + """.format(create_in_clause(list(sample_data.keys())), + *mescape(dataset.group.species)) + + ) + + return dict(curr.fetchall()) + + + + + + + + + + + + ty = __fetch_sample_ids__(start_vars["sample_vals"], start_vars["corr_samples_group"]) + + + + with database_connector() as conn: + + curr = conn.cursor() + + curr.execute( + + """ + SELECT * from ProbeSetData + where StrainID in {} + and id in (SELECT ProbeSetXRef.DataId + FROM (ProbeSet, ProbeSetXRef, ProbeSetFreeze) + WHERE ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id + and ProbeSetFreeze.Name = '{}' + and ProbeSet.Name in {} + and ProbeSet.Id = ProbeSetXRef.ProbeSetId) + """.format(create_in_clause(list(ty.values())),dataset.name,create_in_clause(trait_list)) + + + ) + + + + + return chunk_dataset(list(curr.fetchall()),len(ty.values()),dataset.name) + + def compute_top_n_lit(corr_results, this_dataset, this_trait) -> dict: (this_trait_geneid, geneid_dict, species) = do_lit_correlation( this_trait, this_dataset) @@ -69,6 +172,7 @@ def merge_results(dict_a: dict, dict_b: dict, dict_c: dict) -> list[dict]: } return [__merge__(tname, tcorrs) for tname, tcorrs in dict_a.items()] + def __compute_sample_corr__( start_vars: dict, corr_type: str, method: str, n_top: int, target_trait_info: tuple): @@ -86,11 +190,11 @@ def __compute_sample_corr__( r = ",".join(lts) target_data.append(r) - return run_correlation( target_data, list(sample_data.values()), method, ",", corr_type, n_top) + def __compute_tissue_corr__( start_vars: dict, corr_type: str, method: str, n_top: int, target_trait_info: tuple): @@ -111,6 +215,7 @@ def __compute_tissue_corr__( return run_correlation(data[1], data[0], method, ",", "tissue") return {} + def __compute_lit_corr__( start_vars: dict, corr_type: str, method: str, n_top: int, target_trait_info: tuple): @@ -127,6 +232,7 @@ def __compute_lit_corr__( species=species, gene_id=this_trait_geneid) return {} + def compute_correlation_rust( start_vars: dict, corr_type: str, method: str = "pearson", n_top: int = 500, compute_all: bool = False): @@ -135,7 +241,7 @@ def compute_correlation_rust( (this_dataset, this_trait, target_dataset, sample_data) = ( target_trait_info) - ## Replace this with `match ...` once we hit Python 3.10 + # Replace this with `match ...` once we hit Python 3.10 corr_type_fns = { "sample": __compute_sample_corr__, "tissue": __compute_tissue_corr__, @@ -143,15 +249,23 @@ def compute_correlation_rust( } results = corr_type_fns[corr_type]( start_vars, corr_type, method, n_top, target_trait_info) - ## END: Replace this with `match ...` once we hit Python 3.10 + # END: Replace this with `match ...` once we hit Python 3.10 top_tissue_results = {} top_lit_results = {} + + + results = compute_top_n_sample(start_vars,target_dataset,list(results.keys())) + + + + breakpoint() + if compute_all: # example compute of compute both correlation top_tissue_results = compute_top_n_tissue( - this_dataset,this_trait,results,method) - top_lit_results = compute_top_n_lit(results,this_dataset,this_trait) + this_dataset, this_trait, results, method) + top_lit_results = compute_top_n_lit(results, this_dataset, this_trait) return { "correlation_results": merge_results( -- cgit v1.2.3 From cda1370d5712ae3c756215ef848dedc99cd5504d Mon Sep 17 00:00:00 2001 From: Alexander_Kabui Date: Mon, 15 Aug 2022 00:47:21 +0300 Subject: add default values for num overlap,corr_coeff --- wqflask/wqflask/correlation/show_corr_results.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/wqflask/wqflask/correlation/show_corr_results.py b/wqflask/wqflask/correlation/show_corr_results.py index 1c391386..f5fdd9b3 100644 --- a/wqflask/wqflask/correlation/show_corr_results.py +++ b/wqflask/wqflask/correlation/show_corr_results.py @@ -121,9 +121,9 @@ def correlation_json_for_table(correlation_data, this_trait, this_dataset, targe results_dict['dataset'] = target_dataset['name'] results_dict['hmac'] = hmac.data_hmac( '{}:{}'.format(target_trait['name'], target_dataset['name'])) - results_dict['sample_r'] = f"{float(trait['corr_coefficient']):.3f}" - results_dict['num_overlap'] = trait['num_overlap'] - results_dict['sample_p'] = f"{float(trait['p_value']):.3e}" + results_dict['sample_r'] = f"{float(trait.get('corr_coefficient',0.0)):.3f}" + results_dict['num_overlap'] = trait.get('num_overlap',0) + results_dict['sample_p'] = f"{float(trait.get('p_value',0)):.3e}" if target_dataset['type'] == "ProbeSet": results_dict['symbol'] = target_trait['symbol'] results_dict['description'] = "N/A" -- cgit v1.2.3 From fa8ef3e466e3919648e1d4cf9c38ed30328fc7a6 Mon Sep 17 00:00:00 2001 From: Alexander_Kabui Date: Tue, 16 Aug 2022 12:11:36 +0300 Subject: minor fixes for computing all correlations --- wqflask/wqflask/correlation/rust_correlation.py | 69 ++++++++++++++----------- 1 file changed, 38 insertions(+), 31 deletions(-) diff --git a/wqflask/wqflask/correlation/rust_correlation.py b/wqflask/wqflask/correlation/rust_correlation.py index 94720f54..2a2ad4a0 100644 --- a/wqflask/wqflask/correlation/rust_correlation.py +++ b/wqflask/wqflask/correlation/rust_correlation.py @@ -39,15 +39,14 @@ def chunk_dataset(dataset,steps,name): strains = [trait_name] + [str(value) for (trait_name, strain, value) in matrix] results.append(",".join(strains)) - breakpoint() return results def compute_top_n_sample(start_vars, dataset, trait_list): - """only if dataset is of type probeset""" - - + """check if dataset is of type probeset""" + if dataset.type!= "Probeset": + return {} def __fetch_sample_ids__(samples_vals, samples_group): @@ -73,19 +72,9 @@ def compute_top_n_sample(start_vars, dataset, trait_list): ) - return dict(curr.fetchall()) - - - - - + return (sample_data,dict(curr.fetchall())) - - - - - - ty = __fetch_sample_ids__(start_vars["sample_vals"], start_vars["corr_samples_group"]) + (sample_data,sample_ids) = __fetch_sample_ids__(start_vars["sample_vals"], start_vars["corr_samples_group"]) @@ -93,6 +82,8 @@ def compute_top_n_sample(start_vars, dataset, trait_list): curr = conn.cursor() + #fetching strain data in bulk + curr.execute( """ @@ -104,15 +95,14 @@ def compute_top_n_sample(start_vars, dataset, trait_list): and ProbeSetFreeze.Name = '{}' and ProbeSet.Name in {} and ProbeSet.Id = ProbeSetXRef.ProbeSetId) - """.format(create_in_clause(list(ty.values())),dataset.name,create_in_clause(trait_list)) + """.format(create_in_clause(list(sample_ids.values())),dataset.name,create_in_clause(trait_list)) ) + corr_data = chunk_dataset(list(curr.fetchall()),len(sample_ids.values()),dataset.name) - - - return chunk_dataset(list(curr.fetchall()),len(ty.values()),dataset.name) + return run_correlation(corr_data,list(sample_data.values()),"pearson",",") def compute_top_n_lit(corr_results, this_dataset, this_trait) -> dict: @@ -170,7 +160,10 @@ def merge_results(dict_a: dict, dict_b: dict, dict_c: dict) -> list[dict]: **dict_c.get(trait_name, {}) } } - return [__merge__(tname, tcorrs) for tname, tcorrs in dict_a.items()] + results = [__merge__(tname, tcorrs) for tname, tcorrs in dict_a.items()] + + + return results def __compute_sample_corr__( @@ -249,27 +242,41 @@ def compute_correlation_rust( } results = corr_type_fns[corr_type]( start_vars, corr_type, method, n_top, target_trait_info) + # END: Replace this with `match ...` once we hit Python 3.10 - top_tissue_results = {} - top_lit_results = {} + top_a = top_b = {} - results = compute_top_n_sample(start_vars,target_dataset,list(results.keys())) + if compute_all: + if corr_type == "sample": + top_a = compute_top_n_tissue( + this_dataset, this_trait, results, method) + + top_b = compute_top_n_lit(results, this_dataset, this_trait) - breakpoint() - if compute_all: - # example compute of compute both correlation - top_tissue_results = compute_top_n_tissue( + elif corr_type == "lit": + + #currently fails for lit + + top_a = compute_top_n_sample(start_vars,target_dataset,list(results.keys())) + top_b = compute_top_n_tissue( this_dataset, this_trait, results, method) - top_lit_results = compute_top_n_lit(results, this_dataset, this_trait) - return { + else: + + top_a = compute_top_n_sample(start_vars,target_dataset,list(results.keys())) + + top_b = compute_top_n_lit(results, this_dataset, this_trait) + + + + return { "correlation_results": merge_results( - results, top_tissue_results, top_lit_results), + results, top_a, top_b), "this_trait": this_trait.name, "target_dataset": start_vars['corr_dataset'], "return_results": n_top -- cgit v1.2.3 From 56928f087caaa7021a34a1186d0316a9e35e3e9c Mon Sep 17 00:00:00 2001 From: Alexander_Kabui Date: Tue, 16 Aug 2022 12:26:35 +0300 Subject: remove flask g object --- wqflask/wqflask/correlation/rust_correlation.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/wqflask/wqflask/correlation/rust_correlation.py b/wqflask/wqflask/correlation/rust_correlation.py index 2a2ad4a0..ce1b5fda 100644 --- a/wqflask/wqflask/correlation/rust_correlation.py +++ b/wqflask/wqflask/correlation/rust_correlation.py @@ -1,7 +1,6 @@ """module contains integration code for rust-gn3""" import json from functools import reduce -from flask import g from utility.db_tools import mescape from utility.db_tools import create_in_clause from wqflask.correlation.correlation_functions import get_trait_symbol_and_tissue_values @@ -29,8 +28,12 @@ def chunk_dataset(dataset,steps,name): ProbeSetXRef.ProbeSetId = ProbeSet.Id """.format(name) - traits_name_dict = dict(g.db.execute(query).fetchall()) + with database_connector() as conn: + curr = conn.cursor() + curr.execute(query) + + traits_name_dict = curr.fetchall() for i in range(0, len(dataset), steps): matrix = list(dataset[i:i + steps]) @@ -45,7 +48,7 @@ def chunk_dataset(dataset,steps,name): def compute_top_n_sample(start_vars, dataset, trait_list): """check if dataset is of type probeset""" - if dataset.type!= "Probeset": + if dataset.type.lower()!= "probeset": return {} def __fetch_sample_ids__(samples_vals, samples_group): @@ -248,6 +251,8 @@ def compute_correlation_rust( top_a = top_b = {} + compute_all = True + if compute_all: if corr_type == "sample": -- cgit v1.2.3 From a07e80862e210dfaee9a42645306bd5bdcd8a911 Mon Sep 17 00:00:00 2001 From: Alexander_Kabui Date: Tue, 16 Aug 2022 12:41:21 +0300 Subject: index error fix --- wqflask/wqflask/correlation/rust_correlation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wqflask/wqflask/correlation/rust_correlation.py b/wqflask/wqflask/correlation/rust_correlation.py index ce1b5fda..0bbf6533 100644 --- a/wqflask/wqflask/correlation/rust_correlation.py +++ b/wqflask/wqflask/correlation/rust_correlation.py @@ -33,7 +33,7 @@ def chunk_dataset(dataset,steps,name): curr.execute(query) - traits_name_dict = curr.fetchall() + traits_name_dict = dict(curr.fetchall()) for i in range(0, len(dataset), steps): matrix = list(dataset[i:i + steps]) -- cgit v1.2.3 From 3b40fea60efa96380e0fd4d2c8d0f8d021ca9447 Mon Sep 17 00:00:00 2001 From: Alexander_Kabui Date: Tue, 16 Aug 2022 12:49:10 +0300 Subject: enable compute all for test --- wqflask/wqflask/correlation/rust_correlation.py | 5 +---- wqflask/wqflask/views.py | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/wqflask/wqflask/correlation/rust_correlation.py b/wqflask/wqflask/correlation/rust_correlation.py index 0bbf6533..95354994 100644 --- a/wqflask/wqflask/correlation/rust_correlation.py +++ b/wqflask/wqflask/correlation/rust_correlation.py @@ -163,10 +163,9 @@ def merge_results(dict_a: dict, dict_b: dict, dict_c: dict) -> list[dict]: **dict_c.get(trait_name, {}) } } - results = [__merge__(tname, tcorrs) for tname, tcorrs in dict_a.items()] + return [__merge__(tname, tcorrs) for tname, tcorrs in dict_a.items()] - return results def __compute_sample_corr__( @@ -251,8 +250,6 @@ def compute_correlation_rust( top_a = top_b = {} - compute_all = True - if compute_all: if corr_type == "sample": diff --git a/wqflask/wqflask/views.py b/wqflask/wqflask/views.py index 2e13451d..e054cd49 100644 --- a/wqflask/wqflask/views.py +++ b/wqflask/wqflask/views.py @@ -876,7 +876,7 @@ def test_corr_compute_page(): correlation_results = compute_correlation_rust(start_vars, start_vars["corr_type"], start_vars['corr_sample_method'], - int(start_vars.get("corr_return_results", 500))) + int(start_vars.get("corr_return_results", 500)),True) correlation_results = set_template_vars(request.form, correlation_results) -- cgit v1.2.3 From 9fd3c438c2197c35e0560ae45f249359c57f9a3d Mon Sep 17 00:00:00 2001 From: Alexander_Kabui Date: Tue, 16 Aug 2022 14:26:24 +0300 Subject: linter fixes --- wqflask/wqflask/correlation/rust_correlation.py | 63 ++++++++++++------------- 1 file changed, 30 insertions(+), 33 deletions(-) diff --git a/wqflask/wqflask/correlation/rust_correlation.py b/wqflask/wqflask/correlation/rust_correlation.py index 95354994..7d796e70 100644 --- a/wqflask/wqflask/correlation/rust_correlation.py +++ b/wqflask/wqflask/correlation/rust_correlation.py @@ -3,7 +3,8 @@ import json from functools import reduce from utility.db_tools import mescape from utility.db_tools import create_in_clause -from wqflask.correlation.correlation_functions import get_trait_symbol_and_tissue_values +from wqflask.correlation.correlation_functions\ + import get_trait_symbol_and_tissue_values from wqflask.correlation.correlation_gn3_api import create_target_this_trait from wqflask.correlation.correlation_gn3_api import lit_for_trait_list from wqflask.correlation.correlation_gn3_api import do_lit_correlation @@ -14,9 +15,7 @@ from gn3.computations.rust_correlation import parse_tissue_corr_data from gn3.db_utils import database_connector - - -def chunk_dataset(dataset,steps,name): +def chunk_dataset(dataset, steps, name): results = [] @@ -39,7 +38,8 @@ def chunk_dataset(dataset,steps,name): matrix = list(dataset[i:i + steps]) trait_name = traits_name_dict[matrix[0][0]] - strains = [trait_name] + [str(value) for (trait_name, strain, value) in matrix] + strains = [trait_name] + [str(value) + for (trait_name, strain, value) in matrix] results.append(",".join(strains)) return results @@ -48,18 +48,16 @@ def chunk_dataset(dataset,steps,name): def compute_top_n_sample(start_vars, dataset, trait_list): """check if dataset is of type probeset""" - if dataset.type.lower()!= "probeset": - return {} + if dataset.type.lower() != "probeset": + return {} def __fetch_sample_ids__(samples_vals, samples_group): - all_samples = json.loads(samples_vals) sample_data = get_sample_corr_data( sample_type=samples_group, all_samples=all_samples, dataset_samples=dataset.group.all_samples_ordered()) - with database_connector() as conn: curr = conn.cursor() @@ -75,21 +73,20 @@ def compute_top_n_sample(start_vars, dataset, trait_list): ) - return (sample_data,dict(curr.fetchall())) - - (sample_data,sample_ids) = __fetch_sample_ids__(start_vars["sample_vals"], start_vars["corr_samples_group"]) - + return (sample_data, dict(curr.fetchall())) + (sample_data, sample_ids) = __fetch_sample_ids__( + start_vars["sample_vals"], start_vars["corr_samples_group"]) with database_connector() as conn: curr = conn.cursor() - #fetching strain data in bulk + # fetching strain data in bulk curr.execute( - """ + """ SELECT * from ProbeSetData where StrainID in {} and id in (SELECT ProbeSetXRef.DataId @@ -98,21 +95,25 @@ def compute_top_n_sample(start_vars, dataset, trait_list): and ProbeSetFreeze.Name = '{}' and ProbeSet.Name in {} and ProbeSet.Id = ProbeSetXRef.ProbeSetId) - """.format(create_in_clause(list(sample_ids.values())),dataset.name,create_in_clause(trait_list)) + """.format(create_in_clause(list(sample_ids.values())), dataset.name, create_in_clause(trait_list)) ) - corr_data = chunk_dataset(list(curr.fetchall()),len(sample_ids.values()),dataset.name) + corr_data = chunk_dataset(list(curr.fetchall()), len( + sample_ids.values()), dataset.name) - return run_correlation(corr_data,list(sample_data.values()),"pearson",",") + return run_correlation(corr_data, + list(sample_data.values()), + "pearson", ",") def compute_top_n_lit(corr_results, this_dataset, this_trait) -> dict: (this_trait_geneid, geneid_dict, species) = do_lit_correlation( this_trait, this_dataset) - geneid_dict = {trait_name: geneid for (trait_name, geneid) in geneid_dict.items() if + geneid_dict = {trait_name: geneid for (trait_name, geneid) + in geneid_dict.items() if corr_results.get(trait_name)} with database_connector() as conn: return reduce( @@ -166,8 +167,6 @@ def merge_results(dict_a: dict, dict_b: dict, dict_c: dict) -> list[dict]: return [__merge__(tname, tcorrs) for tname, tcorrs in dict_a.items()] - - def __compute_sample_corr__( start_vars: dict, corr_type: str, method: str, n_top: int, target_trait_info: tuple): @@ -247,7 +246,6 @@ def compute_correlation_rust( # END: Replace this with `match ...` once we hit Python 3.10 - top_a = top_b = {} if compute_all: @@ -255,28 +253,27 @@ def compute_correlation_rust( if corr_type == "sample": top_a = compute_top_n_tissue( - this_dataset, this_trait, results, method) - - top_b = compute_top_n_lit(results, this_dataset, this_trait) + this_dataset, this_trait, results, method) + top_b = compute_top_n_lit(results, this_dataset, this_trait) elif corr_type == "lit": - #currently fails for lit + # currently fails for lit - top_a = compute_top_n_sample(start_vars,target_dataset,list(results.keys())) - top_b = compute_top_n_tissue( - this_dataset, this_trait, results, method) + top_a = compute_top_n_sample( + start_vars, target_dataset, list(results.keys())) + top_b = compute_top_n_tissue( + this_dataset, this_trait, results, method) else: - top_a = compute_top_n_sample(start_vars,target_dataset,list(results.keys())) + top_a = compute_top_n_sample( + start_vars, target_dataset, list(results.keys())) top_b = compute_top_n_lit(results, this_dataset, this_trait) - - - return { + return { "correlation_results": merge_results( results, top_a, top_b), "this_trait": this_trait.name, -- cgit v1.2.3 From 840b35b6b93092d46c2862b50c4bcb7d66a5e90f Mon Sep 17 00:00:00 2001 From: Alexander_Kabui Date: Tue, 16 Aug 2022 14:37:04 +0300 Subject: rename boolean variables --- wqflask/wqflask/correlation/rust_correlation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wqflask/wqflask/correlation/rust_correlation.py b/wqflask/wqflask/correlation/rust_correlation.py index 7d796e70..10513563 100644 --- a/wqflask/wqflask/correlation/rust_correlation.py +++ b/wqflask/wqflask/correlation/rust_correlation.py @@ -229,7 +229,7 @@ def __compute_lit_corr__( def compute_correlation_rust( start_vars: dict, corr_type: str, method: str = "pearson", - n_top: int = 500, compute_all: bool = False): + n_top: int = 500, should_compute_all: bool = False): """function to compute correlation""" target_trait_info = create_target_this_trait(start_vars) (this_dataset, this_trait, target_dataset, sample_data) = ( @@ -248,7 +248,7 @@ def compute_correlation_rust( top_a = top_b = {} - if compute_all: + if should_compute_all: if corr_type == "sample": -- cgit v1.2.3