From 34e4933de5a1cd444abe618fcfd93b424bf3442e Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Tue, 20 Apr 2021 01:38:26 +0300
Subject: refactor code for iterating mrna tissue data

---
 wqflask/base/mrna_assay_tissue_data.py | 39 +++++++++++++++++++++++++++++-----
 1 file changed, 34 insertions(+), 5 deletions(-)

(limited to 'wqflask/base')

diff --git a/wqflask/base/mrna_assay_tissue_data.py b/wqflask/base/mrna_assay_tissue_data.py
index f1929518..0220d73b 100644
--- a/wqflask/base/mrna_assay_tissue_data.py
+++ b/wqflask/base/mrna_assay_tissue_data.py
@@ -6,6 +6,7 @@ from utility import db_tools
 from utility import Bunch
 
 from utility.db_tools import escape
+from gn3.db_utils import database_connector
 
 
 from utility.logger import getLogger
@@ -44,16 +45,42 @@ class MrnaAssayTissueData(object):
                 and t.Mean = x.maxmean;
                     '''.format(in_clause)
 
-        results = g.db.execute(query).fetchall()
 
-        lower_symbols = []
+        # lower_symbols = []
+        lower_symbols = {}
         for gene_symbol in gene_symbols:
+            # lower_symbols[gene_symbol.lower()] = True
             if gene_symbol != None:
-                lower_symbols.append(gene_symbol.lower())
-
+                lower_symbols[gene_symbol.lower()] = True
+
+        import time
+        # initial_time = time.time()
+        # conn,cursor = database_connector()
+        # cursor.execute(query)
+        # for result in cursor.fetchall():
+        #     symbol = result[0]
+        #     self.data[symbol].gene_id = result[1]
+        #     self.data[symbol].data_id = result[2]
+        #     self.data[symbol].chr = result[3]
+        #     self.data[symbol].mb = result[4]
+        #     self.data[symbol].description = result[5]
+        #     self.data[symbol].probe_target_description = result[6]
+
+
+        # print("my loop takes>>>>",time.time()-initial_time)
+        # conn.close()
+        # r
+
+        # takes 5 seconds
+        initial_time = time.time()
+        results = list(g.db.execute(query).fetchall())
         for result in results:
             symbol = result[0]
-            if symbol.lower() in lower_symbols:
+            # if  symbol  is not None
+            # exists = lower_symbols.get(symbol.lower())
+            # if symbol.lower() in lower_symbols:
+            if symbol  is not None and lower_symbols.get(symbol.lower()):
+
                 symbol = symbol.lower()
 
                 self.data[symbol].gene_id = result.GeneId
@@ -62,6 +89,7 @@ class MrnaAssayTissueData(object):
                 self.data[symbol].mb = result.Mb
                 self.data[symbol].description = result.description
                 self.data[symbol].probe_target_description = result.Probe_Target_Description
+        print("time taken in the loop is",time.time()-initial_time)
 
     ###########################################################################
     #Input: cursor, symbolList (list), dataIdDict(Dict)
@@ -82,6 +110,7 @@ class MrnaAssayTissueData(object):
                        WHERE TissueProbeSetData.Id IN {} and
                              TissueProbeSetXRef.DataId = TissueProbeSetData.Id""".format(db_tools.create_in_clause(id_list))
 
+
             results = g.db.execute(query).fetchall()
             for result in results:
                 if result.Symbol.lower() not in symbol_values_dict:
-- 
cgit v1.2.3


From 1b0566d7c9779b979d20c350f66d5628fb55eba6 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Fri, 23 Apr 2021 23:22:46 +0300
Subject: debugging for fetching probe data

---
 wqflask/base/data_set.py                           | 51 ++++++++++++++++++++--
 wqflask/wqflask/correlation/correlation_gn3_api.py |  2 +-
 wqflask/wqflask/views.py                           |  3 ++
 3 files changed, 51 insertions(+), 5 deletions(-)

(limited to 'wqflask/base')

diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index 178234fe..468c4da0 100644
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -115,7 +115,8 @@ Publish or ProbeSet. E.g.
             except:
                 pass
 
-            self.redis_instance.set("dataset_structure", json.dumps(self.datasets))
+            self.redis_instance.set(
+                "dataset_structure", json.dumps(self.datasets))
 
     def set_dataset_key(self, t, name):
         """If name is not in the object's dataset dictionary, set it, and update
@@ -154,10 +155,12 @@ Publish or ProbeSet. E.g.
         if t in ['pheno', 'other_pheno']:
             group_name = name.replace("Publish", "")
 
-        results = g.db.execute(sql_query_mapping[t].format(group_name)).fetchone()
+        results = g.db.execute(
+            sql_query_mapping[t].format(group_name)).fetchone()
         if results:
             self.datasets[name] = dataset_name_mapping[t]
-            self.redis_instance.set("dataset_structure", json.dumps(self.datasets))
+            self.redis_instance.set(
+                "dataset_structure", json.dumps(self.datasets))
             return True
 
         return None
@@ -169,7 +172,8 @@ Publish or ProbeSet. E.g.
                 # This has side-effects, with the end result being a truth-y value
                 if(self.set_dataset_key(t, name)):
                     break
-        return self.datasets.get(name, None)  # Return None if name has not been set
+        # Return None if name has not been set
+        return self.datasets.get(name, None)
 
 
 # Do the intensive work at startup one time only
@@ -651,6 +655,43 @@ class DataSet(object):
                 "Dataset {} is not yet available in GeneNetwork.".format(self.name))
             pass
 
+    def fetch_probe_trait_data(self, sample_list=None):
+        if sample_list:
+            self.samplelist = sample_list
+        else:
+            self.samplelist = self.group.samplelist
+
+        if self.group.parlist != None and self.group.f1list != None:
+            if (self.group.parlist + self.group.f1list) in self.samplelist:
+                self.samplelist += self.group.parlist + self.group.f1list
+
+        query = """
+            SELECT Strain.Name, Strain.Id FROM Strain, Species
+            WHERE Strain.Name IN {}
+            and Strain.SpeciesId=Species.Id
+            and Species.name = '{}'
+            """.format(create_in_clause(self.samplelist), *mescape(self.group.species))
+        logger.sql(query)
+        results = dict(g.db.execute(query).fetchall())
+        sample_ids = [results[item] for item in self.samplelist]
+
+        query = """SELECT * from ProbeSetData WHERE Id in ( SELECT ProbeSetXRef.DataId FROM (ProbeSet, ProbeSetXRef, ProbeSetFreeze) WHERE ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id  and ProbeSetFreeze.Name = 'HC_M2_0606_P'  and ProbeSet.Id = ProbeSetXRef.ProbeSetId  order by ProbeSet.Id )    and  StrainId in ({})""".format(
+            ",".join(str(sample_id) for sample_id in sample_ids))
+
+        results = g.db.execute(query).fetchall()
+
+        # with conn:
+        #     cursor = conn.cursor()
+        #     cursor.execute(query)
+        #     results = cursor.fetchall()
+        trait_data = {}
+        for trait_id, StrainId, value in results:
+            if trait_id in trait_data:
+                trait_data[trait_id].append(value)
+            else:
+                trait_data[trait_id] = [value]
+        self.trait_data = trait_data
+
     def get_trait_data(self, sample_list=None):
         if sample_list:
             self.samplelist = sample_list
@@ -670,6 +711,7 @@ class DataSet(object):
         logger.sql(query)
         results = dict(g.db.execute(query).fetchall())
         sample_ids = [results[item] for item in self.samplelist]
+        print("the number of sample ids are", len(sample_ids))
 
         # MySQL limits the number of tables that can be used in a join to 61,
         # so we break the sample ids into smaller chunks
@@ -720,6 +762,7 @@ class DataSet(object):
             trait_sample_data.append(results)
 
         trait_count = len(trait_sample_data[0])
+        print("the trait count is >>>", trait_count)
         self.trait_data = collections.defaultdict(list)
 
         # put all of the separate data together into a dictionary where the keys are
diff --git a/wqflask/wqflask/correlation/correlation_gn3_api.py b/wqflask/wqflask/correlation/correlation_gn3_api.py
index e7394647..51bf5fb5 100644
--- a/wqflask/wqflask/correlation/correlation_gn3_api.py
+++ b/wqflask/wqflask/correlation/correlation_gn3_api.py
@@ -78,7 +78,7 @@ def compute_correlation(start_vars, method="pearson"):
         # }
         sample_data = process_samples(
             start_vars, this_dataset.group.samplelist)
-        target_dataset.get_trait_data(list(sample_data.keys()))
+        target_dataset.fetch_probe_trait_data(list(sample_data.keys()))
         this_trait = retrieve_sample_data(this_trait, this_dataset)
 
         print("Creating dataset and trait took", time.time()-initial_time)
diff --git a/wqflask/wqflask/views.py b/wqflask/wqflask/views.py
index 072db466..2c239425 100644
--- a/wqflask/wqflask/views.py
+++ b/wqflask/wqflask/views.py
@@ -881,7 +881,10 @@ def network_graph_page():
 def corr_compute_page():
     logger.info("In corr_compute, request.form is:", pf(request.form))
     logger.info(request.url)
+    import time
+    initial_time = time.time()
     correlation_results = compute_correlation(request.form)
+    print(">>>>Time taken by this endpoint",time.time()-initial_time)
     return render_template("demo_correlation_page.html",correlation_results=correlation_results[1:20])
 
 @app.route("/corr_matrix", methods=('POST',))
-- 
cgit v1.2.3


From 067d27460965aaf1ceaa863a315a0c7dbc47ae02 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Mon, 26 Apr 2021 17:05:06 +0300
Subject: fix:remove debug statements and commented code

---
 wqflask/base/mrna_assay_tissue_data.py             | 25 ---------
 wqflask/wqflask/correlation/correlation_gn3_api.py | 60 +++-------------------
 2 files changed, 8 insertions(+), 77 deletions(-)

(limited to 'wqflask/base')

diff --git a/wqflask/base/mrna_assay_tissue_data.py b/wqflask/base/mrna_assay_tissue_data.py
index 0220d73b..5a64afb2 100644
--- a/wqflask/base/mrna_assay_tissue_data.py
+++ b/wqflask/base/mrna_assay_tissue_data.py
@@ -52,33 +52,9 @@ class MrnaAssayTissueData(object):
             # lower_symbols[gene_symbol.lower()] = True
             if gene_symbol != None:
                 lower_symbols[gene_symbol.lower()] = True
-
-        import time
-        # initial_time = time.time()
-        # conn,cursor = database_connector()
-        # cursor.execute(query)
-        # for result in cursor.fetchall():
-        #     symbol = result[0]
-        #     self.data[symbol].gene_id = result[1]
-        #     self.data[symbol].data_id = result[2]
-        #     self.data[symbol].chr = result[3]
-        #     self.data[symbol].mb = result[4]
-        #     self.data[symbol].description = result[5]
-        #     self.data[symbol].probe_target_description = result[6]
-
-
-        # print("my loop takes>>>>",time.time()-initial_time)
-        # conn.close()
-        # r
-
-        # takes 5 seconds
-        initial_time = time.time()
         results = list(g.db.execute(query).fetchall())
         for result in results:
             symbol = result[0]
-            # if  symbol  is not None
-            # exists = lower_symbols.get(symbol.lower())
-            # if symbol.lower() in lower_symbols:
             if symbol  is not None and lower_symbols.get(symbol.lower()):
 
                 symbol = symbol.lower()
@@ -89,7 +65,6 @@ class MrnaAssayTissueData(object):
                 self.data[symbol].mb = result.Mb
                 self.data[symbol].description = result.description
                 self.data[symbol].probe_target_description = result.Probe_Target_Description
-        print("time taken in the loop is",time.time()-initial_time)
 
     ###########################################################################
     #Input: cursor, symbolList (list), dataIdDict(Dict)
diff --git a/wqflask/wqflask/correlation/correlation_gn3_api.py b/wqflask/wqflask/correlation/correlation_gn3_api.py
index c945f699..3c21a850 100644
--- a/wqflask/wqflask/correlation/correlation_gn3_api.py
+++ b/wqflask/wqflask/correlation/correlation_gn3_api.py
@@ -63,9 +63,6 @@ def sample_for_trait_lists(corr_results, target_dataset, this_trait, this_datase
         "trait_sample_data": sample_data,
         "trait_id": start_vars["trait_id"]
     }
-    # trait_lists = dict([(list(corr_result)[0],True) for corr_result in corr_results])
-    # target_dataset.trait_data =list(filter(lambda dict_obj: dict_obj.keys()[
-    #                  0] in corr_results_traits, target_dataset_data))
     results = map_shared_keys_to_values(
         target_dataset.samplelist, target_dataset.trait_data)
     correlation_results = compute_all_sample_correlation(corr_method="pearson",
@@ -77,33 +74,15 @@ def sample_for_trait_lists(corr_results, target_dataset, this_trait, this_datase
 
 
 def tissue_for_trait_lists(corr_results, this_dataset, target_dataset, this_trait):
-    # # print(corr_results[0])--
-    # [{"awsdsd_at": {'corr_coeffient': 0.49714692782257336, 'p_value': 1.872077762359228e-05, 'num_overlap': 67}}]
-
-    print("creating trait_lists")
-    # corr_results = corr_results[0::]
     trait_lists = dict([(list(corr_result)[0], True)
                         for corr_result in corr_results])
-    print("finished creating trait_list")
-
     traits_symbol_dict = this_dataset.retrieve_genes("Symbol")
-    print("Retrieved symbol dict")
-    print("creating dict here>>>>>>>>>")
-    import time
-    init_time = time.time()
     traits_symbol_dict = dict({trait_name: symbol for (
         trait_name, symbol) in traits_symbol_dict.items() if trait_lists.get(trait_name)})
-    print("time taken to create this max dict is>>>>", time.time()-init_time)
-    print("finished creatinf the dict")
-    print("Fetching tissue datas")
     primary_tissue_data, target_tissue_data = get_tissue_correlation_input(
         this_trait, traits_symbol_dict)
-    print("finihsed>>>>>>>>>>>>>>>>>>")
-    print("Calling experimental_compute_all_tissue_correlation")
     corr_results = experimental_compute_all_tissue_correlation(
         primary_tissue_dict=primary_tissue_data, target_tissues_data=target_tissue_data, corr_method="pearson")
-    # print('finished calling this tissue reuslts',corr_results)
-
     return corr_results
 
 
@@ -123,22 +102,14 @@ def compute_correlation(start_vars, method="pearson"):
     corr_input_data = {}
 
     if corr_type == "sample":
-        import time
-        initial_time = time.time()
-        # corr_input_data = {
-        #     "target_dataset": target_dataset.trait_data,
-        #     "target_samplelist": target_dataset.samplelist,
-        #     "trait_data": {
-        #         "trait_sample_data": sample_data,
-        #         "trait_id": start_vars["trait_id"]
-        #     }
-        # }
+        
         sample_data = process_samples(
             start_vars, this_dataset.group.samplelist)
+        initial_time = time.time()
         target_dataset.get_trait_data(list(sample_data.keys()))
         this_trait = retrieve_sample_data(this_trait, this_dataset)
+        print("Creating target dataset and trait took", time.time()-initial_time)
 
-        print("Creating dataset and trait took", time.time()-initial_time)
 
         this_trait_data = {
             "trait_sample_data": sample_data,
@@ -151,15 +122,9 @@ def compute_correlation(start_vars, method="pearson"):
                                                              this_trait=this_trait_data,
                                                              target_dataset=results)
 
-        print("computedd>>>>>>>>>>>>>")
-
         print("doing sample correlation took", time.time()-initial_time)
-
-        other_results_time = time.time()
-        other_results = tissue_for_trait_lists(
-            correlation_results, this_dataset, target_dataset, this_trait)
-        print(">>>time taken for this is", time.time()-other_results_time)
-
+        # other_results = tissue_for_trait_lists(
+        #     correlation_results, this_dataset, target_dataset, this_trait)
         # requests_url = f"{GN3_CORRELATION_API}/sample_x/{method}"
         return correlation_results
 
@@ -177,17 +142,9 @@ def compute_correlation(start_vars, method="pearson"):
                                                                           target_tissues_data=corr_input_data[
             "target_tissues_dict"],
             corr_method=method)
-        print("correlation y took", time.time()-initial_time)
-        # initial_time = time.time()
-        # correlation_results = compute_all_tissue_correlation(primary_tissue_dict=corr_input_data["primary_tissue"],
-        #                                                      target_tissues_data=corr_input_data["target_tissues_dict"],
-        #                                                      corr_method=method)
-        # print("time taken for compute tissue is", time.time()-initial_time)
-
-        # requests_url = f"{GN3_CORRELATION_API}/tissue_corr/{method}"
-
-        sample_results = sample_for_trait_lists(
-            correlation_results, target_dataset, this_trait, this_dataset, start_vars)
+        print("computing tissue took >>>>", time.time()-initial_time)
+        # sample_results = sample_for_trait_lists(
+        #     correlation_results, target_dataset, this_trait, this_dataset, start_vars)
         return correlation_results
 
     elif corr_type == "lit":
@@ -203,7 +160,6 @@ def compute_correlation(start_vars, method="pearson"):
                 species=species, gene_id=this_trait_geneid)
 
         return lit_corr_results
-        print("the time taken is", time.time()-initial_time)
         # requests_url = f"{GN3_CORRELATION_API}/lit_corr/{species}/{this_trait_geneid}"
         # corr_input_data = geneid_dict
     # corr_results = requests.post(requests_url, json=corr_input_data)
-- 
cgit v1.2.3


From 27538980f93c1d72b0b2d76151312f3fbce4c9a5 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Mon, 10 May 2021 08:24:42 +0300
Subject: add previous endpoint for correlation

---
 wqflask/base/data_set.py | 37 -------------------------------------
 wqflask/wqflask/views.py | 13 ++++++++-----
 2 files changed, 8 insertions(+), 42 deletions(-)

(limited to 'wqflask/base')

diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index 468c4da0..d0f5e6f2 100644
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -655,42 +655,7 @@ class DataSet(object):
                 "Dataset {} is not yet available in GeneNetwork.".format(self.name))
             pass
 
-    def fetch_probe_trait_data(self, sample_list=None):
-        if sample_list:
-            self.samplelist = sample_list
-        else:
-            self.samplelist = self.group.samplelist
-
-        if self.group.parlist != None and self.group.f1list != None:
-            if (self.group.parlist + self.group.f1list) in self.samplelist:
-                self.samplelist += self.group.parlist + self.group.f1list
-
-        query = """
-            SELECT Strain.Name, Strain.Id FROM Strain, Species
-            WHERE Strain.Name IN {}
-            and Strain.SpeciesId=Species.Id
-            and Species.name = '{}'
-            """.format(create_in_clause(self.samplelist), *mescape(self.group.species))
-        logger.sql(query)
-        results = dict(g.db.execute(query).fetchall())
-        sample_ids = [results[item] for item in self.samplelist]
-
-        query = """SELECT * from ProbeSetData WHERE Id in ( SELECT ProbeSetXRef.DataId FROM (ProbeSet, ProbeSetXRef, ProbeSetFreeze) WHERE ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id  and ProbeSetFreeze.Name = 'HC_M2_0606_P'  and ProbeSet.Id = ProbeSetXRef.ProbeSetId  order by ProbeSet.Id )    and  StrainId in ({})""".format(
-            ",".join(str(sample_id) for sample_id in sample_ids))
 
-        results = g.db.execute(query).fetchall()
-
-        # with conn:
-        #     cursor = conn.cursor()
-        #     cursor.execute(query)
-        #     results = cursor.fetchall()
-        trait_data = {}
-        for trait_id, StrainId, value in results:
-            if trait_id in trait_data:
-                trait_data[trait_id].append(value)
-            else:
-                trait_data[trait_id] = [value]
-        self.trait_data = trait_data
 
     def get_trait_data(self, sample_list=None):
         if sample_list:
@@ -711,7 +676,6 @@ class DataSet(object):
         logger.sql(query)
         results = dict(g.db.execute(query).fetchall())
         sample_ids = [results[item] for item in self.samplelist]
-        print("the number of sample ids are", len(sample_ids))
 
         # MySQL limits the number of tables that can be used in a join to 61,
         # so we break the sample ids into smaller chunks
@@ -762,7 +726,6 @@ class DataSet(object):
             trait_sample_data.append(results)
 
         trait_count = len(trait_sample_data[0])
-        print("the trait count is >>>", trait_count)
         self.trait_data = collections.defaultdict(list)
 
         # put all of the separate data together into a dictionary where the keys are
diff --git a/wqflask/wqflask/views.py b/wqflask/wqflask/views.py
index b042a211..19779651 100644
--- a/wqflask/wqflask/views.py
+++ b/wqflask/wqflask/views.py
@@ -881,11 +881,14 @@ def network_graph_page():
 def corr_compute_page():
     logger.info("In corr_compute, request.form is:", pf(request.form))
     logger.info(request.url)
-    import time
-    initial_time = time.time()
-    correlation_results = compute_correlation(request.form)
-    print(">>>>Time taken by this endpoint",time.time()-initial_time)
-    return render_template("test_correlation_page.html",correlation_results=correlation_results)
+    template_vars = show_corr_results.CorrelationResults(request.form)
+    return render_template("correlation_page.html", **template_vars.__dict__)
+
+    # To test the new correlation API, replace the two lines above with the ones below (re-add the removed `import time` / `initial_time` setup first).
+
+    # correlation_results = compute_correlation(request.form)
+    # print(">>>>Time taken by this endpoint",time.time()-initial_time)
+    # return render_template("test_correlation_page.html",correlation_results=correlation_results)
 
 @app.route("/corr_matrix", methods=('POST',))
 def corr_matrix_page():
-- 
cgit v1.2.3