From 86ead9b3d823e46350b5566b197463b5fdc46102 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Thu, 24 Feb 2022 19:27:28 +0300
Subject: init replace rpy2 for pca

---
 .../wqflask/correlation_matrix/show_corr_matrix.py | 28 ++++++++++------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/wqflask/wqflask/correlation_matrix/show_corr_matrix.py b/wqflask/wqflask/correlation_matrix/show_corr_matrix.py
index e7b16e77..9462f973 100644
--- a/wqflask/wqflask/correlation_matrix/show_corr_matrix.py
+++ b/wqflask/wqflask/correlation_matrix/show_corr_matrix.py
@@ -23,8 +23,6 @@ import math
 import random
 import string
 
-import rpy2.robjects as ro
-from rpy2.robjects.packages import importr
 
 import numpy as np
 import scipy
@@ -38,6 +36,12 @@ from utility import helper_functions
 from utility import corr_result_helpers
 from utility.redis_tools import get_redis_conn
+
+
+from gn3.computations.principal_component_analysis import compute_pca
+
+from gn3.computations.principal_component_analysis import process_factor_loadings_tdata
+
 
 
 Redis = get_redis_conn()
 THIRTY_DAYS = 60 * 60 * 24 * 30
@@ -174,7 +178,7 @@ class CorrelationMatrix:
                 self.pca_trait_ids = []
                 pca = self.calculate_pca(
                     list(range(len(self.traits))), corr_eigen_value, corr_eigen_vectors)
-                self.loadings_array = self.process_loadings()
+                self.loadings_array = process_factor_loadings_tdata(self.loadings,len(self.trait_list))
             else:
                 self.pca_works = "False"
         except:
@@ -188,18 +192,12 @@ class CorrelationMatrix:
             sample_data=self.sample_data,)
 
     def calculate_pca(self, cols, corr_eigen_value, corr_eigen_vectors):
-        base = importr('base')
-        stats = importr('stats')
-
-        corr_results_to_list = ro.FloatVector(
-            [item for sublist in self.pca_corr_results for item in sublist])
-
-        m = ro.r.matrix(corr_results_to_list, nrow=len(cols))
-        eigen = base.eigen(m)
-        pca = stats.princomp(m, cor="TRUE")
-        self.loadings = pca.rx('loadings')
-        self.scores = pca.rx('scores')
-        self.scale = pca.rx('scale')
+
+
+        pca = compute_pca(self.pca_corr_results)
+
+        self.loadings = pca["components"]
+        self.scores = pca["scores"]
 
         trait_array = zScore(self.trait_data_array)
         trait_array_vectors = np.dot(corr_eigen_vectors, trait_array)
--
cgit v1.2.3
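Note on this first commit: the rpy2/R path (base.eigen plus stats.princomp run through an embedded R interpreter) is swapped for gn3's compute_pca, and the caller now reads the factor loadings and scores out of a plain dict (pca["components"], pca["scores"]). The gn3 implementation itself is not part of this series, so the snippet below is only an illustrative sketch of an equivalent computation using scikit-learn; apart from the two dict keys the diff relies on, every name in it is an assumption rather than gn3's actual code.

    # Illustrative sketch only -- not gn3's implementation.
    import numpy as np
    from sklearn import preprocessing
    from sklearn.decomposition import PCA


    def compute_pca_sketch(corr_matrix):
        """PCA over a traits-by-traits correlation matrix.

        Returns the two keys the calling code relies on: "components"
        (factor loadings, one row per principal component) and "scores".
        """
        corr_matrix = np.asarray(corr_matrix)
        # scaling the input first plays roughly the role of R's
        # princomp(..., cor="TRUE") in the removed rpy2 code
        scaled = preprocessing.scale(corr_matrix)
        pca = PCA()
        scores = pca.fit_transform(scaled)
        return {"components": pca.components_, "scores": scores}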
From 0ee723d14957c01162a67f4f6b99a25d43908b5b Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Thu, 24 Feb 2022 22:19:00 +0300
Subject: remove redundant functions and code

---
 .../wqflask/correlation_matrix/show_corr_matrix.py | 70 ++--------------------
 1 file changed, 6 insertions(+), 64 deletions(-)

diff --git a/wqflask/wqflask/correlation_matrix/show_corr_matrix.py b/wqflask/wqflask/correlation_matrix/show_corr_matrix.py
index 9462f973..bcd73436 100644
--- a/wqflask/wqflask/correlation_matrix/show_corr_matrix.py
+++ b/wqflask/wqflask/correlation_matrix/show_corr_matrix.py
@@ -41,6 +41,7 @@ from utility.redis_tools import get_redis_conn
 from gn3.computations.principal_component_analysis import compute_pca
 
 from gn3.computations.principal_component_analysis import process_factor_loadings_tdata
+from gn3.computations.principal_component_analysis import generate_pca_traits_vals
 
 Redis = get_redis_conn()
 THIRTY_DAYS = 60 * 60 * 24 * 30
@@ -169,15 +170,12 @@ class CorrelationMatrix:
         self.pca_works = "False"
         try:
-            corr_result_eigen = np.linalg.eig(np.array(self.pca_corr_results))
-            corr_eigen_value, corr_eigen_vectors = sortEigenVectors(
-                corr_result_eigen)
 
             if self.do_PCA == True:
                 self.pca_works = "True"
                 self.pca_trait_ids = []
                 pca = self.calculate_pca(
-                    list(range(len(self.traits))), corr_eigen_value, corr_eigen_vectors)
+                    list(range(len(self.traits))))
                 self.loadings_array = process_factor_loadings_tdata(self.loadings,len(self.trait_list))
             else:
                 self.pca_works = "False"
@@ -191,7 +189,7 @@ class CorrelationMatrix:
             samples=self.all_sample_list,
             sample_data=self.sample_data,)
 
-    def calculate_pca(self, cols, corr_eigen_value, corr_eigen_vectors):
+    def calculate_pca(self, cols):
 
 
         pca = compute_pca(self.pca_corr_results)
@@ -199,8 +197,9 @@ class CorrelationMatrix:
         self.loadings = pca["components"]
         self.scores = pca["scores"]
 
-        trait_array = zScore(self.trait_data_array)
-        trait_array_vectors = np.dot(corr_eigen_vectors, trait_array)
+        trait_array_vectors = generate_pca_traits_vals(self.trait_data_array,self.pca_corr_results)
+
+
 
         pca_traits = []
         for i, vector in enumerate(trait_array_vectors):
@@ -231,21 +230,6 @@ class CorrelationMatrix:
 
         return pca
 
-    def process_loadings(self):
-        loadings_array = []
-        loadings_row = []
-        for i in range(len(self.trait_list)):
-            loadings_row = []
-            if len(self.trait_list) > 2:
-                the_range = 3
-            else:
-                the_range = 2
-            for j in range(the_range):
-                position = i + len(self.trait_list) * j
-                loadings_row.append(self.loadings[0][position])
-            loadings_array.append(loadings_row)
-        return loadings_array
-
 
 def export_corr_matrix(corr_results):
     corr_matrix_filename = "corr_matrix_" + \
@@ -285,45 +269,3 @@ def export_corr_matrix(corr_results):
 
     return corr_matrix_filename, matrix_export_path
-
-
-def zScore(trait_data_array):
-    NN = len(trait_data_array[0])
-    if NN < 10:
-        return trait_data_array
-    else:
-        i = 0
-        for data in trait_data_array:
-            N = len(data)
-            S = reduce(lambda x, y: x + y, data, 0.)
-            SS = reduce(lambda x, y: x + y * y, data, 0.)
-            mean = S / N
-            var = SS - S * S / N
-            stdev = math.sqrt(var / (N - 1))
-            if stdev == 0:
-                stdev = 1e-100
-            data2 = [(x - mean) / stdev for x in data]
-            trait_data_array[i] = data2
-            i += 1
-        return trait_data_array
-
-
-def sortEigenVectors(vector):
-    try:
-        eigenValues = vector[0].tolist()
-        eigenVectors = vector[1].T.tolist()
-        combines = []
-        i = 0
-        for item in eigenValues:
-            combines.append([eigenValues[i], eigenVectors[i]])
-            i += 1
-        sorted(combines, key=cmp_to_key(webqtlUtil.cmpEigenValue))
-        A = []
-        B = []
-        for item in combines:
-            A.append(item[0])
-            B.append(item[1])
-        sum = reduce(lambda x, y: x + y, A, 0.0)
-        A = [x * 100.0 / sum for x in A]
-        return [A, B]
-    except:
-        return []
--
cgit v1.2.3
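Note on "remove redundant functions and code": everything deleted here (process_loadings, zScore, sortEigenVectors and the numpy eigendecomposition in __init__) is functionality that now sits behind gn3's process_factor_loadings_tdata and generate_pca_traits_vals. The gn3 sources are not shown in this series; the sketch below simply replays what the deleted helpers did, so it is a reasonable guess at what generate_pca_traits_vals computes, not its actual body, and the _sketch name is hypothetical.

    # Illustrative sketch based on the zScore/sortEigenVectors code removed above.
    import numpy as np
    from scipy import stats


    def generate_pca_traits_vals_sketch(trait_data_array, corr_matrix):
        trait_data = np.asarray(trait_data_array, dtype=float)
        # z-score each trait's sample values, as the removed zScore() did
        # (that helper used an N - 1 denominator and skipped the scaling
        # entirely when there were fewer than 10 samples; omitted here)
        z_scored = stats.zscore(trait_data, axis=1, ddof=1)
        # eigendecomposition of the trait correlation matrix, as the removed
        # np.linalg.eig + sortEigenVectors pair did
        _eigen_values, eigen_vectors = np.linalg.eig(np.asarray(corr_matrix))
        # project the z-scored trait values onto the eigenvectors; each row of
        # the result is one candidate PCA trait
        return np.dot(eigen_vectors.T, z_scored)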
From d5d0e0f3c271e056057c0311083ab3684ccc4386 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Fri, 25 Feb 2022 17:33:19 +0300
Subject: integrating generating temp dataset for pca

---
 .../wqflask/correlation_matrix/show_corr_matrix.py | 63 +++++++++++-----------
 1 file changed, 32 insertions(+), 31 deletions(-)

diff --git a/wqflask/wqflask/correlation_matrix/show_corr_matrix.py b/wqflask/wqflask/correlation_matrix/show_corr_matrix.py
index bcd73436..d5ec738b 100644
--- a/wqflask/wqflask/correlation_matrix/show_corr_matrix.py
+++ b/wqflask/wqflask/correlation_matrix/show_corr_matrix.py
@@ -19,7 +19,6 @@
 # This module is used by GeneNetwork project (www.genenetwork.org)
 
 import datetime
-import math
 import random
 import string
 
@@ -29,9 +28,6 @@ import scipy
 
 from base import data_set
 from base.webqtlConfig import GENERATED_TEXT_DIR
-from functools import reduce
-from functools import cmp_to_key
-from utility import webqtlUtil
 from utility import helper_functions
 from utility import corr_result_helpers
 from utility.redis_tools import get_redis_conn
@@ -42,6 +38,8 @@ from gn3.computations.principal_component_analysis import compute_pca
 
 from gn3.computations.principal_component_analysis import process_factor_loadings_tdata
 from gn3.computations.principal_component_analysis import generate_pca_traits_vals
+from gn3.computations.principal_component_analysis import generate_pca_temp_dataset
+from gn3.computations.principal_component_analysis import cache_pca_dataset
 
 Redis = get_redis_conn()
 THIRTY_DAYS = 60 * 60 * 24 * 30
@@ -171,11 +169,11 @@ class CorrelationMatrix:
         self.pca_works = "False"
         try:
+
             if self.do_PCA == True:
                 self.pca_works = "True"
                 self.pca_trait_ids = []
-                pca = self.calculate_pca(
-                    list(range(len(self.traits))))
+                pca = self.calculate_pca()
                 self.loadings_array = process_factor_loadings_tdata(self.loadings,len(self.trait_list))
             else:
                 self.pca_works = "False"
@@ -189,7 +187,7 @@ class CorrelationMatrix:
             samples=self.all_sample_list,
             sample_data=self.sample_data,)
 
-    def calculate_pca(self, cols):
+    def calculate_pca(self):
 
 
         pca = compute_pca(self.pca_corr_results)
@@ -197,36 +195,37 @@ class CorrelationMatrix:
         self.loadings = pca["components"]
         self.scores = pca["scores"]
 
-        trait_array_vectors = generate_pca_traits_vals(self.trait_data_array,self.pca_corr_results)
-
-
-
-        pca_traits = []
-        for i, vector in enumerate(trait_array_vectors):
-            # ZS: Check if below check is necessary
-            # if corr_eigen_value[i-1] > 100.0/len(self.trait_list):
-            pca_traits.append((vector * -1.0).tolist())
 
         this_group_name = self.trait_list[0][1].group.name
         temp_dataset = data_set.create_dataset(
             dataset_name="Temp", dataset_type="Temp", group_name=this_group_name)
         temp_dataset.group.get_samplelist()
-        for i, pca_trait in enumerate(pca_traits):
-            trait_id = "PCA" + str(i + 1) + "_" + temp_dataset.group.species + "_" + \
-                this_group_name + "_" + datetime.datetime.now().strftime("%m%d%H%M%S")
-            this_vals_string = ""
-            position = 0
-            for sample in temp_dataset.group.all_samples_ordered():
-                if sample in self.shared_samples_list:
-                    this_vals_string += str(pca_trait[position])
-                    this_vals_string += " "
-                    position += 1
-                else:
-                    this_vals_string += "x "
-            this_vals_string = this_vals_string[:-1]
-            Redis.set(trait_id, this_vals_string, ex=THIRTY_DAYS)
-            self.pca_trait_ids.append(trait_id)
+
+        species = temp_dataset.group.species
+
+        group =this_group_name
+
+        trait_data_array = self.trait_data_array
+
+        pca_corr = self.pca_corr_results
+
+        sample_list = temp_dataset.group.all_samples_ordered()
+
+
+        shared = self.shared_samples_list
+
+        dt_time = datetime.datetime.now().strftime("%m%d%H%M%S")
+
+
+
+        results = generate_pca_temp_dataset(species = species, group= group,traits_data = self.trait_data_array,corr_array = self.pca_corr_results,dataset_samples = sample_list, shared_samples=shared,create_time=dt_time)
+
+
+
+        cache_pca_dataset(Redis,THIRTY_DAYS,results)
+
+        self.pca_trait_ids = list(results.keys())
 
         return pca
@@ -269,3 +268,5 @@ def export_corr_matrix(corr_results):
 
 
     return corr_matrix_filename, matrix_export_path
+
+
--
cgit v1.2.3
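Note on "integrating generating temp dataset for pca": the per-component loop that built "PCA<n>_<species>_<group>_<timestamp>" trait ids, serialized sample values ("x" for samples outside the shared list) and wrote them to Redis with a thirty-day expiry is handed over to gn3's generate_pca_temp_dataset and cache_pca_dataset. Those functions live in gn3 and are not shown here; the sketch below is inferred from the inline code this commit removes. It leaves out the step in which the real helper derives the per-component values from traits_data and corr_array, and the _sketch names are hypothetical.

    # Illustrative sketch based on the inline Redis loop removed above.
    def generate_pca_temp_dataset_sketch(species, group, pca_traits,
                                         dataset_samples, shared_samples,
                                         create_time):
        """Map generated trait ids to space-separated sample-value strings.

        The real gn3 helper also receives the raw trait data and correlation
        matrix and derives `pca_traits` itself; that step is omitted here.
        """
        pca_trait_dict = {}
        for i, component_vals in enumerate(pca_traits):
            trait_id = f"PCA{i + 1}_{species}_{group}_{create_time}"
            vals, position = [], 0
            for sample in dataset_samples:
                if sample in shared_samples:
                    vals.append(str(component_vals[position]))
                    position += 1
                else:
                    vals.append("x")  # sample has no value for this trait
            pca_trait_dict[trait_id] = " ".join(vals)
        return pca_trait_dict


    def cache_pca_dataset_sketch(redis_conn, exp_time, pca_trait_dict):
        """Store each generated trait under its id with an expiry."""
        for trait_id, sample_string in pca_trait_dict.items():
            redis_conn.set(trait_id, sample_string, ex=exp_time)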
From 78d8ed64a072851ddc58281553dfc9806c25b332 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Sun, 27 Feb 2022 10:21:46 +0300
Subject: code refactoring

---
 .../wqflask/correlation_matrix/show_corr_matrix.py | 50 ++++++----------------
 1 file changed, 14 insertions(+), 36 deletions(-)

diff --git a/wqflask/wqflask/correlation_matrix/show_corr_matrix.py b/wqflask/wqflask/correlation_matrix/show_corr_matrix.py
index d5ec738b..499a4e13 100644
--- a/wqflask/wqflask/correlation_matrix/show_corr_matrix.py
+++ b/wqflask/wqflask/correlation_matrix/show_corr_matrix.py
@@ -33,7 +33,6 @@ from utility import corr_result_helpers
 from utility.redis_tools import get_redis_conn
 
 
-
 from gn3.computations.principal_component_analysis import compute_pca
 
 from gn3.computations.principal_component_analysis import process_factor_loadings_tdata
@@ -44,6 +43,7 @@ from gn3.computations.principal_component_analysis import cache_pca_dataset
 Redis = get_redis_conn()
 THIRTY_DAYS = 60 * 60 * 24 * 30
 
+
 class CorrelationMatrix:
 
     def __init__(self, start_vars):
@@ -54,7 +54,6 @@ class CorrelationMatrix:
 
         self.all_sample_list = []
         self.traits = []
-        self.insufficient_shared_samples = False
         self.do_PCA = True
         # ZS: Getting initial group name before verifying all traits are in the same group in the following loop
         this_group = self.trait_list[0][1].group.name
@@ -169,12 +168,12 @@ class CorrelationMatrix:
         self.pca_works = "False"
         try:
-
             if self.do_PCA == True:
                 self.pca_works = "True"
                 self.pca_trait_ids = []
                 pca = self.calculate_pca()
-                self.loadings_array = process_factor_loadings_tdata(self.loadings,len(self.trait_list))
+                self.loadings_array = process_factor_loadings_tdata(
+                    self.loadings, len(self.trait_list))
             else:
                 self.pca_works = "False"
         except:
@@ -189,43 +188,25 @@ class CorrelationMatrix:
 
     def calculate_pca(self):
 
-
         pca = compute_pca(self.pca_corr_results)
 
         self.loadings = pca["components"]
         self.scores = pca["scores"]
 
-
         this_group_name = self.trait_list[0][1].group.name
         temp_dataset = data_set.create_dataset(
             dataset_name="Temp", dataset_type="Temp", group_name=this_group_name)
         temp_dataset.group.get_samplelist()
 
+        pca_dataset = generate_pca_temp_dataset(species=temp_dataset.group.species, group=this_group_name,
+                                                traits_data=self.trait_data_array, corr_array=self.pca_corr_results,
+                                                dataset_samples=temp_dataset.group.all_samples_ordered(),
+                                                shared_samples=self.shared_samples_list,
+                                                create_time=datetime.datetime.now().strftime("%m%d%H%M%S"))
 
-        species = temp_dataset.group.species
-
-        group =this_group_name
-
-        trait_data_array = self.trait_data_array
-
-        pca_corr = self.pca_corr_results
-
-        sample_list = temp_dataset.group.all_samples_ordered()
-
-
-        shared = self.shared_samples_list
-
-        dt_time = datetime.datetime.now().strftime("%m%d%H%M%S")
-
-
-
-        results = generate_pca_temp_dataset(species = species, group= group,traits_data = self.trait_data_array,corr_array = self.pca_corr_results,dataset_samples = sample_list, shared_samples=shared,create_time=dt_time)
-
-
-
-        cache_pca_dataset(Redis,THIRTY_DAYS,results)
+        cache_pca_dataset(Redis, THIRTY_DAYS, pca_dataset)
 
-        self.pca_trait_ids = list(results.keys())
+        self.pca_trait_ids = list(pca_dataset.keys())
 
         return pca
@@ -242,11 +223,11 @@ def export_corr_matrix(corr_results):
         output_file.write("\n")
     output_file.write("Correlation ")
     for i, item in enumerate(corr_results[0]):
-        output_file.write("Trait" + str(i + 1) + ": " + \
+        output_file.write("Trait" + str(i + 1) + ": " +
                           str(item[0].dataset.name) + "::" + str(item[0].name) + "\t")
     output_file.write("\n")
     for i, row in enumerate(corr_results):
-        output_file.write("Trait" + str(i + 1) + ": " + \
+        output_file.write("Trait" + str(i + 1) + ": " +
                           str(row[0][0].dataset.name) + "::" + str(row[0][0].name) + "\t")
         for item in row:
             output_file.write(str(item[1]) + "\t")
@@ -256,17 +237,14 @@ def export_corr_matrix(corr_results):
         output_file.write("\n")
     output_file.write("N ")
     for i, item in enumerate(corr_results[0]):
-        output_file.write("Trait" + str(i) + ": " + \
+        output_file.write("Trait" + str(i) + ": " +
                           str(item[0].dataset.name) + "::" + str(item[0].name) + "\t")
     output_file.write("\n")
     for i, row in enumerate(corr_results):
-        output_file.write("Trait" + str(i) + ": " + \
+        output_file.write("Trait" + str(i) + ": " +
                           str(row[0][0].dataset.name) + "::" + str(row[0][0].name) + "\t")
         for item in row:
             output_file.write(str(item[2]) + "\t")
         output_file.write("\n")
 
     return corr_matrix_filename, matrix_export_path
-
-
-
--
cgit v1.2.3
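Note on "code refactoring": besides folding the intermediate species/group/shared/dt_time variables into a single keyword-argument call, this commit drops the trailing backslashes in export_corr_matrix. They were redundant: the concatenation sits inside an open output_file.write(...) parenthesis, so the expression can wrap without explicit continuation, as in this minimal standalone example (the file name and strings are illustrative only).

    # Implicit line continuation inside parentheses; no backslash needed.
    with open("corr_matrix_example.txt", "w") as output_file:
        output_file.write("Trait" + str(1) + ": " +
                          "example_dataset" + "::" + "example_trait" + "\t")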
From aed325dcf84629bc26809eae6d537f81dcc40cf7 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Tue, 8 Mar 2022 17:33:27 +0300
Subject: make fixes;variable names and kwargs

---
 .../wqflask/correlation_matrix/show_corr_matrix.py | 29 +++++++++++-----------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/wqflask/wqflask/correlation_matrix/show_corr_matrix.py b/wqflask/wqflask/correlation_matrix/show_corr_matrix.py
index 499a4e13..9b4cb2eb 100644
--- a/wqflask/wqflask/correlation_matrix/show_corr_matrix.py
+++ b/wqflask/wqflask/correlation_matrix/show_corr_matrix.py
@@ -33,12 +33,11 @@ from utility import corr_result_helpers
 from utility.redis_tools import get_redis_conn
 
 
-from gn3.computations.principal_component_analysis import compute_pca
+from gn3.computations.pca import compute_pca
 
-from gn3.computations.principal_component_analysis import process_factor_loadings_tdata
-from gn3.computations.principal_component_analysis import generate_pca_traits_vals
-from gn3.computations.principal_component_analysis import generate_pca_temp_dataset
-from gn3.computations.principal_component_analysis import cache_pca_dataset
+from gn3.computations.pca import process_factor_loadings_tdata
+from gn3.computations.pca import generate_pca_temp_traits
+from gn3.computations.pca import cache_pca_dataset
 
 Redis = get_redis_conn()
 THIRTY_DAYS = 60 * 60 * 24 * 30
@@ -168,12 +167,12 @@ class CorrelationMatrix:
         self.pca_works = "False"
         try:
-            if self.do_PCA == True:
+            if self.do_PCA:
                 self.pca_works = "True"
                 self.pca_trait_ids = []
                 pca = self.calculate_pca()
                 self.loadings_array = process_factor_loadings_tdata(
-                    self.loadings, len(self.trait_list))
+                    factor_loadings=self.loadings, traits_num=len(self.trait_list))
             else:
                 self.pca_works = "False"
         except:
@@ -198,15 +197,17 @@ class CorrelationMatrix:
             dataset_name="Temp", dataset_type="Temp", group_name=this_group_name)
         temp_dataset.group.get_samplelist()
 
-        pca_dataset = generate_pca_temp_dataset(species=temp_dataset.group.species, group=this_group_name,
-                                                traits_data=self.trait_data_array, corr_array=self.pca_corr_results,
-                                                dataset_samples=temp_dataset.group.all_samples_ordered(),
-                                                shared_samples=self.shared_samples_list,
-                                                create_time=datetime.datetime.now().strftime("%m%d%H%M%S"))
+        pca_temp_traits = generate_pca_temp_traits(species=temp_dataset.group.species, group=this_group_name,
+                                                   traits_data=self.trait_data_array, corr_array=self.pca_corr_results,
+                                                   dataset_samples=temp_dataset.group.all_samples_ordered(),
+                                                   shared_samples=self.shared_samples_list,
+                                                   create_time=datetime.datetime.now().strftime("%m%d%H%M%S"))
 
-        cache_pca_dataset(Redis, THIRTY_DAYS, pca_dataset)
+
+        cache_pca_dataset(redis_conn=get_redis_conn(
+        ), exp_days=60 * 60 * 24 * 30, pca_trait_dict=pca_temp_traits)
 
-        self.pca_trait_ids = list(pca_dataset.keys())
+        self.pca_trait_ids = list(pca_temp_traits.keys())
 
         return pca
--
cgit v1.2.3
From 6359dc2bf8973991072634e6a2b8d6a8a038166a Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Tue, 8 Mar 2022 17:39:22 +0300
Subject: remove global variables;pep8 formatting

---
 .../wqflask/correlation_matrix/show_corr_matrix.py | 24 +++++++++------------
 1 file changed, 10 insertions(+), 14 deletions(-)

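Note on "make fixes;variable names and kwargs": the helpers now come from gn3.computations.pca rather than gn3.computations.principal_component_analysis, generate_pca_temp_dataset becomes generate_pca_temp_traits, and every gn3 call is made with keyword arguments. Condensed, the PCA path now reads as below; the function names, argument names and dict keys all appear in the diffs above, while the input values are stand-ins for whatever the correlation page computed and assume gn3 is importable.

    # Condensed view of the calls as of this commit; inputs are placeholders.
    import datetime

    from gn3.computations.pca import (cache_pca_dataset, compute_pca,
                                      generate_pca_temp_traits,
                                      process_factor_loadings_tdata)
    from utility.redis_tools import get_redis_conn

    pca_corr_results = [[1.0, 0.5], [0.5, 1.0]]            # trait correlation matrix
    trait_data_array = [[1.2, 3.4, 5.6], [2.1, 4.3, 6.5]]  # per-trait sample values
    dataset_samples = ["S1", "S2", "S3", "S4"]
    shared_samples = ["S1", "S2", "S3"]

    pca = compute_pca(pca_corr_results)                    # {"components", "scores"}
    loadings_array = process_factor_loadings_tdata(
        factor_loadings=pca["components"], traits_num=len(pca_corr_results))
    pca_temp_traits = generate_pca_temp_traits(
        species="mouse", group="BXD", traits_data=trait_data_array,
        corr_array=pca_corr_results, dataset_samples=dataset_samples,
        shared_samples=shared_samples,
        create_time=datetime.datetime.now().strftime("%m%d%H%M%S"))
    cache_pca_dataset(redis_conn=get_redis_conn(),
                      exp_days=60 * 60 * 24 * 30,
                      pca_trait_dict=pca_temp_traits)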
diff --git a/wqflask/wqflask/correlation_matrix/show_corr_matrix.py b/wqflask/wqflask/correlation_matrix/show_corr_matrix.py
index 9b4cb2eb..88d62045 100644
--- a/wqflask/wqflask/correlation_matrix/show_corr_matrix.py
+++ b/wqflask/wqflask/correlation_matrix/show_corr_matrix.py
@@ -21,27 +21,23 @@ import datetime
 import random
 import string
-
-
 import numpy as np
 import scipy
 
-from base import data_set
+from base.data_set import create_dataset
 from base.webqtlConfig import GENERATED_TEXT_DIR
-from utility import helper_functions
-from utility import corr_result_helpers
+
+
+from utility.helper_functions import get_trait_db_obs
+from utility.corr_result_helpers import normalize_values
 from utility.redis_tools import get_redis_conn
 
 from gn3.computations.pca import compute_pca
-
 from gn3.computations.pca import process_factor_loadings_tdata
 from gn3.computations.pca import generate_pca_temp_traits
 from gn3.computations.pca import cache_pca_dataset
 
-Redis = get_redis_conn()
-THIRTY_DAYS = 60 * 60 * 24 * 30
-
 
 class CorrelationMatrix:
@@ -49,7 +45,7 @@ class CorrelationMatrix:
         trait_db_list = [trait.strip() for trait in start_vars['trait_list'].split(',')]
 
-        helper_functions.get_trait_db_obs(self, trait_db_list)
+        get_trait_db_obs(self, trait_db_list)
 
         self.all_sample_list = []
         self.traits = []
@@ -117,7 +113,7 @@ class CorrelationMatrix:
                     if sample in self.shared_samples_list:
                         self.shared_samples_list.remove(sample)
 
-                this_trait_vals, target_vals, num_overlap = corr_result_helpers.normalize_values(
+                this_trait_vals, target_vals, num_overlap = normalize_values(
                     this_trait_vals, target_vals)
 
                 if num_overlap < self.lowest_overlap:
@@ -193,8 +189,9 @@ class CorrelationMatrix:
         self.scores = pca["scores"]
 
         this_group_name = self.trait_list[0][1].group.name
-        temp_dataset = data_set.create_dataset(
-            dataset_name="Temp", dataset_type="Temp", group_name=this_group_name)
+        temp_dataset = create_dataset(
+            dataset_name="Temp", dataset_type="Temp",
+            group_name=this_group_name)
         temp_dataset.group.get_samplelist()
 
         pca_temp_traits = generate_pca_temp_traits(species=temp_dataset.group.species, group=this_group_name,
@@ -203,7 +200,6 @@ class CorrelationMatrix:
                                                    shared_samples=self.shared_samples_list,
                                                    create_time=datetime.datetime.now().strftime("%m%d%H%M%S"))
 
-
         cache_pca_dataset(redis_conn=get_redis_conn(
         ), exp_days=60 * 60 * 24 * 30, pca_trait_dict=pca_temp_traits)
--
cgit v1.2.3
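Note on the final commit: the module-level Redis connection and THIRTY_DAYS constant are gone (the expiry is passed directly to cache_pca_dataset and the connection is obtained where it is used), and the remaining helpers become direct from-imports. For completeness, reading a cached PCA temp trait back follows the storage format established by the earlier inline loop: space-separated values with "x" marking samples outside the shared list. The trait id below is hypothetical, and the decode step is only needed if the client returns bytes.

    # Sketch: fetching a cached PCA temp trait back out of Redis.
    from utility.redis_tools import get_redis_conn

    redis_conn = get_redis_conn()
    raw = redis_conn.get("PCA1_mouse_BXD_0308173327")  # hypothetical trait id
    if raw is not None:
        text = raw.decode() if isinstance(raw, bytes) else raw
        values = [None if val == "x" else float(val) for val in text.split()]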