aboutsummaryrefslogtreecommitdiff
path: root/gn2/wqflask/correlation_matrix
diff options
context:
space:
mode:
Diffstat (limited to 'gn2/wqflask/correlation_matrix')
-rw-r--r--gn2/wqflask/correlation_matrix/__init__.py0
-rw-r--r--gn2/wqflask/correlation_matrix/show_corr_matrix.py259
2 files changed, 259 insertions, 0 deletions
diff --git a/gn2/wqflask/correlation_matrix/__init__.py b/gn2/wqflask/correlation_matrix/__init__.py
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/gn2/wqflask/correlation_matrix/__init__.py
diff --git a/gn2/wqflask/correlation_matrix/show_corr_matrix.py b/gn2/wqflask/correlation_matrix/show_corr_matrix.py
new file mode 100644
index 00000000..f7eb0b4c
--- /dev/null
+++ b/gn2/wqflask/correlation_matrix/show_corr_matrix.py
@@ -0,0 +1,259 @@
+# Copyright (C) University of Tennessee Health Science Center, Memphis, TN.
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU Affero General Public License
+# as published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the GNU Affero General Public License for more details.
+#
+# This program is available from Source Forge: at GeneNetwork Project
+# (sourceforge.net/projects/genenetwork/).
+#
+# Contact Dr. Robert W. Williams at rwilliams@uthsc.edu
+#
+#
+# This module is used by GeneNetwork project (www.genenetwork.org)
+
+import datetime
+import random
+import string
+import numpy as np
+import scipy
+
+from gn2.base.data_set import create_dataset
+from gn2.base.webqtlConfig import GENERATED_TEXT_DIR
+
+
+from gn2.utility.helper_functions import get_trait_db_obs
+from gn2.utility.corr_result_helpers import normalize_values
+from gn2.utility.redis_tools import get_redis_conn
+
+
+from gn3.computations.pca import compute_pca
+from gn3.computations.pca import process_factor_loadings_tdata
+from gn3.computations.pca import generate_pca_temp_traits
+from gn3.computations.pca import cache_pca_dataset
+from gn3.computations.pca import generate_scree_plot_data
+
+
+class CorrelationMatrix:
+
+ def __init__(self, start_vars):
+ trait_db_list = [trait.strip()
+ for trait in start_vars['trait_list'].split(',')]
+
+ get_trait_db_obs(self, trait_db_list)
+
+ self.all_sample_list = []
+ self.traits = []
+ self.do_PCA = True
+ # ZS: Getting initial group name before verifying all traits are in the same group in the following loop
+ this_group = self.trait_list[0][1].group.name
+ for trait_db in self.trait_list:
+ this_group = trait_db[1].group.name
+ this_trait = trait_db[0]
+ self.traits.append(this_trait)
+ this_sample_data = this_trait.data
+
+ for sample in this_sample_data:
+ if sample not in self.all_sample_list:
+ self.all_sample_list.append(sample)
+
+ self.sample_data = []
+ for trait_db in self.trait_list:
+ this_trait = trait_db[0]
+ this_sample_data = this_trait.data
+
+ this_trait_vals = []
+ for sample in self.all_sample_list:
+ if sample in this_sample_data:
+ this_trait_vals.append(this_sample_data[sample].value)
+ else:
+ this_trait_vals.append('')
+ self.sample_data.append(this_trait_vals)
+
+ # Shouldn't do PCA if there are more traits than observations/samples
+ if len(this_trait_vals) < len(self.trait_list) or len(self.trait_list) < 3:
+ self.do_PCA = False
+
+ # ZS: Variable set to the lowest overlapping samples in order to notify user, or 8, whichever is lower (since 8 is when we want to display warning)
+ self.lowest_overlap = 8
+
+ self.corr_results = []
+ self.pca_corr_results = []
+ self.scree_data = []
+ self.shared_samples_list = self.all_sample_list
+ for trait_db in self.trait_list:
+ this_trait = trait_db[0]
+ this_db = trait_db[1]
+
+ this_db_samples = this_db.group.all_samples_ordered()
+ this_sample_data = this_trait.data
+
+ corr_result_row = []
+ pca_corr_result_row = []
+ is_spearman = False # ZS: To determine if it's above or below the diagonal
+ for target in self.trait_list:
+ target_trait = target[0]
+ target_db = target[1]
+ target_samples = target_db.group.all_samples_ordered()
+ target_sample_data = target_trait.data
+
+ this_trait_vals = []
+ target_vals = []
+ for index, sample in enumerate(target_samples):
+ if (sample in this_sample_data) and (sample in target_sample_data):
+ sample_value = this_sample_data[sample].value
+ target_sample_value = target_sample_data[sample].value
+ this_trait_vals.append(sample_value)
+ target_vals.append(target_sample_value)
+ else:
+ if sample in self.shared_samples_list:
+ self.shared_samples_list.remove(sample)
+
+ this_trait_vals, target_vals, num_overlap = normalize_values(
+ this_trait_vals, target_vals)
+
+ if num_overlap < self.lowest_overlap:
+ self.lowest_overlap = num_overlap
+ if num_overlap < 2:
+ corr_result_row.append([target_trait, 0, num_overlap])
+ pca_corr_result_row.append(0)
+ else:
+ pearson_r, pearson_p = scipy.stats.pearsonr(
+ this_trait_vals, target_vals)
+ if is_spearman == False:
+ sample_r, sample_p = pearson_r, pearson_p
+ if sample_r > 0.999:
+ is_spearman = True
+ else:
+ sample_r, sample_p = scipy.stats.spearmanr(
+ this_trait_vals, target_vals)
+
+ corr_result_row.append(
+ [target_trait, sample_r, num_overlap])
+ pca_corr_result_row.append(pearson_r)
+
+ self.corr_results.append(corr_result_row)
+ self.pca_corr_results.append(pca_corr_result_row)
+
+ self.export_filename, self.export_filepath = export_corr_matrix(
+ self.corr_results)
+
+ self.trait_data_array = []
+ for trait_db in self.trait_list:
+ this_trait = trait_db[0]
+ this_db = trait_db[1]
+ this_db_samples = this_db.group.all_samples_ordered()
+ this_sample_data = this_trait.data
+
+ this_trait_vals = []
+ for index, sample in enumerate(this_db_samples):
+ if (sample in this_sample_data) and (sample in self.shared_samples_list):
+ sample_value = this_sample_data[sample].value
+ this_trait_vals.append(sample_value)
+ self.trait_data_array.append(this_trait_vals)
+
+ groups = []
+ for sample in self.all_sample_list:
+ groups.append(1)
+
+ self.pca_works = "False"
+ try:
+
+ if self.do_PCA:
+ self.pca_works = "True"
+ self.pca_trait_ids = []
+ pca = self.calculate_pca()
+ self.loadings_array = process_factor_loadings_tdata(
+ factor_loadings=self.loadings, traits_num=len(self.trait_list))
+
+ else:
+ self.pca_works = "False"
+ except:
+ self.pca_works = "False"
+
+ self.js_data = dict(traits=[trait.name for trait in self.traits],
+ groups=groups,
+ scree_data = self.scree_data,
+ cols=list(range(len(self.traits))),
+ rows=list(range(len(self.traits))),
+ samples=self.all_sample_list,
+ sample_data=self.sample_data,)
+
+ def calculate_pca(self):
+
+ pca = compute_pca(self.pca_corr_results)
+
+ self.loadings = pca["components"]
+ self.scores = pca["scores"]
+ self.pca_obj = pca["pca"]
+
+ this_group_name = self.trait_list[0][1].group.name
+ temp_dataset = create_dataset(
+ dataset_name="Temp", dataset_type="Temp",
+ group_name=this_group_name)
+ temp_dataset.group.get_samplelist(redis_conn=get_redis_conn())
+
+ pca_temp_traits = generate_pca_temp_traits(species=temp_dataset.group.species, group=this_group_name,
+ traits_data=self.trait_data_array, corr_array=self.pca_corr_results,
+ dataset_samples=temp_dataset.group.all_samples_ordered(),
+ shared_samples=self.shared_samples_list,
+ create_time=datetime.datetime.now().strftime("%m%d%H%M%S"))
+
+ cache_pca_dataset(redis_conn=get_redis_conn(
+ ), exp_days=60 * 60 * 24 * 30, pca_trait_dict=pca_temp_traits)
+
+ self.pca_trait_ids = list(pca_temp_traits.keys())
+
+ x_coord, y_coord = generate_scree_plot_data(
+ list(self.pca_obj.explained_variance_ratio_))
+
+ self.scree_data = {
+ "x_coord": x_coord,
+ "y_coord": y_coord
+ }
+ return pca
+
+
+def export_corr_matrix(corr_results):
+ corr_matrix_filename = "corr_matrix_" + \
+ ''.join(random.choice(string.ascii_uppercase + string.digits)
+ for _ in range(6))
+ matrix_export_path = "{}{}.csv".format(
+ GENERATED_TEXT_DIR, corr_matrix_filename)
+ with open(matrix_export_path, "w+") as output_file:
+ output_file.write(
+ "Time/Date: " + datetime.datetime.now().strftime("%x / %X") + "\n")
+ output_file.write("\n")
+ output_file.write("Correlation ")
+ for i, item in enumerate(corr_results[0]):
+ output_file.write("Trait" + str(i + 1) + ": " +
+ str(item[0].dataset.name) + "::" + str(item[0].name) + "\t")
+ output_file.write("\n")
+ for i, row in enumerate(corr_results):
+ output_file.write("Trait" + str(i + 1) + ": " +
+ str(row[0][0].dataset.name) + "::" + str(row[0][0].name) + "\t")
+ for item in row:
+ output_file.write(str(item[1]) + "\t")
+ output_file.write("\n")
+
+ output_file.write("\n")
+ output_file.write("\n")
+ output_file.write("N ")
+ for i, item in enumerate(corr_results[0]):
+ output_file.write("Trait" + str(i) + ": " +
+ str(item[0].dataset.name) + "::" + str(item[0].name) + "\t")
+ output_file.write("\n")
+ for i, row in enumerate(corr_results):
+ output_file.write("Trait" + str(i) + ": " +
+ str(row[0][0].dataset.name) + "::" + str(row[0][0].name) + "\t")
+ for item in row:
+ output_file.write(str(item[2]) + "\t")
+ output_file.write("\n")
+
+ return corr_matrix_filename, matrix_export_path