From 36044483d365a907a9da6ad8a7b3f0dfb0a918e2 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Thu, 12 Aug 2021 17:21:44 +0300 Subject: Initialise heatmap generation module Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/heatmaps/heatmaps.py: Initialise the module with some code to be used to test out plotly features on the command-line. * guix.scm: Add `python-plotly` and `python-pandas` as dependencies. --- gn3/heatmaps/heatmaps.py | 54 ++++++++++++++++++++++++++++++++++++++++++++++++ guix.scm | 5 ++++- 2 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 gn3/heatmaps/heatmaps.py diff --git a/gn3/heatmaps/heatmaps.py b/gn3/heatmaps/heatmaps.py new file mode 100644 index 0000000..3bf7917 --- /dev/null +++ b/gn3/heatmaps/heatmaps.py @@ -0,0 +1,54 @@ +import random +import plotly.express as px + +#### Remove these #### + +heatmap_dir = "heatmap_images" + +def generate_random_data(data_stop: float = 2, width: int = 10, height: int = 30): + """ + This is mostly a utility function to be used to generate random data, useful + for development of the heatmap generation code, without access to the actual + database data. + """ + return [[random.uniform(0,data_stop) for i in range(0, width)] + for j in range(0, height)] + +def heatmap_x_axis_names(): + return [ + "UCLA_BXDBXH_CARTILAGE_V2::ILM103710672", + "UCLA_BXDBXH_CARTILAGE_V2::ILM2260338", + "UCLA_BXDBXH_CARTILAGE_V2::ILM3140576", + "UCLA_BXDBXH_CARTILAGE_V2::ILM5670577", + "UCLA_BXDBXH_CARTILAGE_V2::ILM2070121", + "UCLA_BXDBXH_CARTILAGE_V2::ILM103990541", + "UCLA_BXDBXH_CARTILAGE_V2::ILM1190722", + "UCLA_BXDBXH_CARTILAGE_V2::ILM6590722", + "UCLA_BXDBXH_CARTILAGE_V2::ILM4200064", + "UCLA_BXDBXH_CARTILAGE_V2::ILM3140463"] +#### END: Remove these #### + +# Grey + Blue + Red +def generate_heatmap(): + rows = 20 + data = generate_random_data(height=rows) + y = (["%s"%x for x in range(1, rows+1)][:-1] + ["X"]) #replace last item with x for now + fig = px.imshow( + data, + x=heatmap_x_axis_names(), + y=y, + width=500) + fig.update_traces(xtype="array") + fig.update_traces(ytype="array") + # fig.update_traces(xgap=10) + fig.update_xaxes( + visible=True, + title_text="Traits", + title_font_size=16) + fig.update_layout( + coloraxis_colorscale=[ + [0.0, '#3B3B3B'], [0.4999999999999999, '#ABABAB'], + [0.5, '#F5DE11'], [1.0, '#FF0D00']]) + + fig.write_html("%s/%s"%(heatmap_dir, "test_image.html")) + return fig diff --git a/guix.scm b/guix.scm index f94fe1a..729d089 100644 --- a/guix.scm +++ b/guix.scm @@ -38,6 +38,7 @@ (gn packages python) (gnu packages base) (gnu packages check) + (gnu packages graph) (gnu packages cran) (gnu packages databases) (gnu packages statistics) @@ -101,7 +102,9 @@ ,python-sqlalchemy-stubs) ("r-optparse" ,r-optparse) ("r-qtl" ,r-qtl) - ("r-stringi" ,r-stringi))) + ("r-stringi" ,r-stringi) + ("python-plotly" ,python-plotly) + ("python-pandas" ,python-pandas))) (build-system python-build-system) (home-page "https://github.com/genenetwork/genenetwork3") (synopsis "GeneNetwork3 API for data science and machine learning.") -- cgit v1.2.3 From 5f56ac39c60b345e1a135c75f4bf35f8e881f4d6 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Tue, 17 Aug 2021 08:44:20 +0300 Subject: Fix errors: add in missing parenthesis Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Call the `cursor.fetchone()` function to get results. 
Without the parenthesis, the code was trying to use the function itself as the results, which was a bug, and would lead to failure. --- gn3/db/datasets.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gn3/db/datasets.py b/gn3/db/datasets.py index 53d6811..4a05499 100644 --- a/gn3/db/datasets.py +++ b/gn3/db/datasets.py @@ -25,7 +25,7 @@ def retrieve_probeset_trait_dataset_name( return dict(zip( ["dataset_id", "dataset_name", "dataset_fullname", "dataset_shortname", "dataset_datascale"], - cursor.fetchone)) + cursor.fetchone())) def retrieve_publish_trait_dataset_name( threshold: int, name: str, connection: Any): @@ -49,7 +49,7 @@ def retrieve_publish_trait_dataset_name( return dict(zip( ["dataset_id", "dataset_name", "dataset_fullname", "dataset_shortname"], - cursor.fetchone)) + cursor.fetchone())) def retrieve_geno_trait_dataset_name( threshold: int, name: str, connection: Any): @@ -73,7 +73,7 @@ def retrieve_geno_trait_dataset_name( return dict(zip( ["dataset_id", "dataset_name", "dataset_fullname", "dataset_shortname"], - cursor.fetchone)) + cursor.fetchone())) def retrieve_temp_trait_dataset_name( threshold: int, name: str, connection: Any): @@ -97,7 +97,7 @@ def retrieve_temp_trait_dataset_name( return dict(zip( ["dataset_id", "dataset_name", "dataset_fullname", "dataset_shortname"], - cursor.fetchone)) + cursor.fetchone())) def retrieve_dataset_name( trait_type: str, threshold: int, trait_name: str, dataset_name: str, -- cgit v1.2.3 From 1a9d28e6db2140cc7b3491c6dbcf4fc8cd8c09b6 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Tue, 17 Aug 2021 08:47:11 +0300 Subject: Add tests and fix errors caught with tests Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/computations/heatmap.py: fix errors * tests/unit/computations/test_heatmap.py: new tests Add new tests with the expected source data format, and expected results. 
Fix all errors that were caught by running the tests --- gn3/computations/heatmap.py | 18 +++++------ tests/unit/computations/test_heatmap.py | 54 +++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 9 deletions(-) create mode 100644 tests/unit/computations/test_heatmap.py diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py index a0e778a..8a86fe8 100644 --- a/gn3/computations/heatmap.py +++ b/gn3/computations/heatmap.py @@ -34,11 +34,11 @@ def export_trait_data( """ def __export_all_types(tdata, strain): sample_data = [] - if tdata[strain]["val"]: - sample_data.append(tdata[strain]["val"]) + if tdata[strain]["value"]: + sample_data.append(tdata[strain]["value"]) if var_exists: - if tdata[strain].var: - sample_data.append(tdata[strain]["var"]) + if tdata[strain]["variance"]: + sample_data.append(tdata[strain]["variance"]) else: sample_data.append(None) if n_exists: @@ -58,15 +58,15 @@ def export_trait_data( def __exporter(accumulator, strain): # pylint: disable=[R0911] - if trait_data.has_key(strain): + if strain in trait_data["data"]: if dtype == "val": - return accumulator + (trait_data[strain]["val"], ) + return accumulator + (trait_data["data"][strain]["value"], ) if dtype == "var": - return accumulator + (trait_data[strain]["var"], ) + return accumulator + (trait_data["data"][strain]["variance"], ) if dtype == "N": - return trait_data[strain]["ndata"] + return accumulator + (trait_data["data"][strain]["ndata"], ) if dtype == "all": - return accumulator + __export_all_types(trait_data, strain) + return accumulator + __export_all_types(trait_data["data"], strain) raise KeyError("Type `%s` is incorrect" % dtype) if var_exists and n_exists: return accumulator + (None, None, None) diff --git a/tests/unit/computations/test_heatmap.py b/tests/unit/computations/test_heatmap.py new file mode 100644 index 0000000..78303ae --- /dev/null +++ b/tests/unit/computations/test_heatmap.py @@ -0,0 +1,54 @@ +"""Module contains tests for gn3.computations.heatmap""" +from unittest import TestCase +from gn3.computations.heatmap import export_trait_data + +strainlist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] +trait_data = {"mysqlid": 36688172, "data": {"B6cC3-1": {"strain_name": "B6cC3-1", "value": 7.51879, "variance": None, "ndata": None}, "BXD1": {"strain_name": "BXD1", "value": 7.77141, "variance": None, "ndata": None}, "BXD12": {"strain_name": "BXD12", "value": 8.39265, "variance": None, "ndata": None}, "BXD16": {"strain_name": "BXD16", "value": 8.17443, "variance": None, "ndata": None}, "BXD19": {"strain_name": "BXD19", "value": 8.30401, "variance": None, "ndata": None}, "BXD2": {"strain_name": "BXD2", "value": 7.80944, "variance": None, "ndata": None}, "BXD21": {"strain_name": "BXD21", "value": 8.93809, "variance": None, "ndata": None}, "BXD24": {"strain_name": "BXD24", "value": 7.99415, "variance": None, "ndata": None}, "BXD27": {"strain_name": "BXD27", "value": 8.12177, "variance": None, "ndata": None}, "BXD28": {"strain_name": "BXD28", "value": 7.67688, "variance": None, "ndata": None}, "BXD32": {"strain_name": "BXD32", "value": 7.79062, "variance": None, "ndata": None}, "BXD39": {"strain_name": "BXD39", "value": 8.27641, "variance": None, "ndata": None}, "BXD40": {"strain_name": "BXD40", "value": 8.18012, "variance": None, "ndata": None}, "BXD42": {"strain_name": "BXD42", "value": 7.82433, "variance": None, "ndata": None}, "BXD6": {"strain_name": "BXD6", "value": 8.09718, "variance": None, "ndata": None}, "BXH14": {"strain_name": "BXH14", "value": 
7.97475, "variance": None, "ndata": None}, "BXH19": {"strain_name": "BXH19", "value": 7.67223, "variance": None, "ndata": None}, "BXH2": {"strain_name": "BXH2", "value": 7.93622, "variance": None, "ndata": None}, "BXH22": {"strain_name": "BXH22", "value": 7.43692, "variance": None, "ndata": None}, "BXH4": {"strain_name": "BXH4", "value": 7.96336, "variance": None, "ndata": None}, "BXH6": {"strain_name": "BXH6", "value": 7.75132, "variance": None, "ndata": None}, "BXH7": {"strain_name": "BXH7", "value": 8.12927, "variance": None, "ndata": None}, "BXH8": {"strain_name": "BXH8", "value": 6.77338, "variance": None, "ndata": None}, "BXH9": {"strain_name": "BXH9", "value": 8.03836, "variance": None, "ndata": None}, "C3H/HeJ": {"strain_name": "C3H/HeJ", "value": 7.42795, "variance": None, "ndata": None}, "C57BL/6J": {"strain_name": "C57BL/6J", "value": 7.50606, "variance": None, "ndata": None}, "DBA/2J": {"strain_name": "DBA/2J", "value": 7.72588, "variance": None, "ndata": None}}} + +class TestHeatmap(TestCase): + """Class for testing heatmap computation functions""" + + def test_export_trait_data_dtype(self): + """ + Test `export_trait_data` with different values for the `dtype` keyword + argument + """ + for dtype, expected in [ + ["val", (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["var", (None, None, None, None, None, None)], + ["N", (None, None, None, None, None, None)], + ["all", (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)]]: + with self.subTest(dtype=dtype): + self.assertEqual( + export_trait_data(trait_data, strainlist, dtype=dtype), + expected) + + def test_export_trait_data_dtype_all_flags(self): + """ + Test `export_trait_data` with different values for the `dtype` keyword + argument and the different flags set up + """ + for dtype, vflag, nflag, expected in [ + ["val", False, False, (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["val", False, True, (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["val", True, False, (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["val", True, True, (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["var", False, False, (None, None, None, None, None, None)], + ["var", False, True, (None, None, None, None, None, None)], + ["var", True, False, (None, None, None, None, None, None)], + ["var", True, True, (None, None, None, None, None, None)], + ["N", False, False, (None, None, None, None, None, None)], + ["N", False, True, (None, None, None, None, None, None)], + ["N", True, False, (None, None, None, None, None, None)], + ["N", True, True, (None, None, None, None, None, None)], + ["all", False, False, (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["all", False, True, (7.51879, None, 7.77141, None, 8.39265, None, 8.17443, None, 8.30401, None, 7.80944, None)], + ["all", True, False, (7.51879, None, 7.77141, None, 8.39265, None, 8.17443, None, 8.30401, None, 7.80944, None)], + ["all", True, True, (7.51879, None, None, 7.77141, None, None, 8.39265, None, None, 8.17443, None, None, 8.30401, None, None, 7.80944, None, None)] + ]: + with self.subTest(dtype=dtype, vflag=vflag, nflag=nflag): + self.assertEqual( + export_trait_data( + trait_data, strainlist, dtype=dtype, var_exists=vflag, + n_exists=nflag), + expected) -- cgit v1.2.3 From a2f6406909951a80dc4ead809a09e8de2c15200d Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Tue, 17 Aug 2021 08:49:14 +0300 Subject: Provide top-level `riset` key-value pair Issue: 
https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Provide the expected, top-level `riset` key-value pair and eliminate the redundant key-value pair. --- gn3/db/traits.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gn3/db/traits.py b/gn3/db/traits.py index 6ea24be..1031e44 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -418,9 +418,9 @@ def retrieve_trait_info( conn) if trait_info["haveinfo"]: return { - **trait_post_processing_functions_table[trait_dataset_type](trait_info), - "db": {**trait["db"], **trait_dataset}, - "riset": trait_dataset["riset"] + **trait_post_processing_functions_table[trait_dataset_type]( + {**trait_info, "riset": trait_dataset["riset"]}), + "db": {**trait["db"], **trait_dataset} } return trait_info -- cgit v1.2.3 From 99bfda81abe76b3bb3f7034cf6cdac21c8d50726 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Tue, 17 Aug 2021 11:05:03 +0300 Subject: Make child sequence a list Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Since the `slink` function assigns values to the `listcopy` variable and its children, this commit ensures that the sequence is a list to allow for the assignment. If the child-sequence is a tuple, that would lead to an exception. --- gn3/computations/slink.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gn3/computations/slink.py b/gn3/computations/slink.py index 5953e6b..3d7a576 100644 --- a/gn3/computations/slink.py +++ b/gn3/computations/slink.py @@ -161,7 +161,7 @@ def slink(lists): try: size = len(lists) listindexcopy = list(range(size)) - listscopy = [child[:] for child in lists] + listscopy = [list(child[:]) for child in lists] init_size = size candidate = [] while init_size > 2: -- cgit v1.2.3 From d491be2057843921cc67bd1c4b1ae612d9f15d34 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Tue, 17 Aug 2021 11:42:20 +0300 Subject: Fix obvious linting errors * Fix linting errors that do not change the function of the code. 
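As a side note on the `slink` change above ("Make child sequence a list"), here is a minimal standalone sketch (not the actual `gn3.computations.slink` code) of why tuple children would break the later item assignment:

```python
# Tuples are immutable, so copying the children as-is leaves slink unable to
# assign into them; wrapping each child in list() is what the fix does.
nested_tuples = ((0, 9, 3), (9, 0, 7), (3, 7, 0))

listscopy = [list(child[:]) for child in nested_tuples]  # the fixed copy
listscopy[0][1] = 5.5                                    # assignment now works

try:
    nested_tuples[0][1] = 5.5   # assigning into a tuple raises the exception the commit mentions
except TypeError as exc:
    print(exc)                  # "'tuple' object does not support item assignment"
```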
--- gn3/api/correlation.py | 4 ++-- gn3/api/general.py | 3 ++- gn3/computations/correlations.py | 4 ++-- wsgi.py | 6 ++++-- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/gn3/api/correlation.py b/gn3/api/correlation.py index a3e366e..46121f8 100644 --- a/gn3/api/correlation.py +++ b/gn3/api/correlation.py @@ -79,7 +79,7 @@ def compute_tissue_corr(corr_method="pearson"): target_tissues_dict = tissue_input_data["target_tissues_dict"] results = compute_tissue_correlation(primary_tissue_dict=primary_tissue_dict, - target_tissues_data=target_tissues_dict, - corr_method=corr_method) + target_tissues_data=target_tissues_dict, + corr_method=corr_method) return jsonify(results) diff --git a/gn3/api/general.py b/gn3/api/general.py index 86fb7b7..69ec343 100644 --- a/gn3/api/general.py +++ b/gn3/api/general.py @@ -13,7 +13,8 @@ general = Blueprint("general", __name__) @general.route("/version") def version(): - return jsonify("1.0") + """Get API version.""" + return jsonify("1.0") @general.route("/metadata/upload/", methods=["POST"], strict_slashes=False) diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py index 1fd3213..8d76c09 100644 --- a/gn3/computations/correlations.py +++ b/gn3/computations/correlations.py @@ -341,8 +341,8 @@ def compute_all_lit_correlation(conn, trait_lists: List, def compute_tissue_correlation(primary_tissue_dict: dict, - target_tissues_data: dict, - corr_method: str): + target_tissues_data: dict, + corr_method: str): """Function acts as an abstraction for tissue_correlation_for_trait\ required input are target tissue object and primary tissue trait\ target tissues data contains the trait_symbol_dict and symbol_tissue_vals diff --git a/wsgi.py b/wsgi.py index d30bc49..0fcb573 100644 --- a/wsgi.py +++ b/wsgi.py @@ -1,9 +1,11 @@ +""" +WSGI application entry-point. +""" # import main +from gn3.app import create_app print("STARTING WSGI APP") -from gn3.app import create_app - app = create_app() if __name__ == "__main__": -- cgit v1.2.3 From 41fc5136914548710529cbed7ef370dfb5b4a5c8 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Tue, 17 Aug 2021 11:43:32 +0300 Subject: Test the clustering Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/computations/heatmap.py: Fix clustering bugs * tests/unit/computations/test_heatmap.py: Add new tests. Fix linting issues. Test and fix the clustering function. 
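The diff that follows reworks `__compute_corr` so that it compares traits by their index and turns the computed correlation into a clamped dissimilarity. That mapping reduces to the standalone sketch below; `compute_correlation` itself is not shown in this patch and is assumed here to return a `(correlation, overlap)` pair.

```python
# Sketch of the correlation-to-distance mapping used when clustering traits:
# strongly correlated traits end up close together, and the value is clamped
# so floating-point noise never produces a negative distance.
def correlation_to_distance(corr: float) -> float:
    return max(1 - corr, 0.0)

assert correlation_to_distance(1.0) == 0.0    # perfectly correlated traits
assert correlation_to_distance(0.0) == 1.0    # uncorrelated traits
assert correlation_to_distance(-0.5) == 1.5   # anti-correlated traits sit furthest apart
```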
--- gn3/computations/heatmap.py | 14 ++-- tests/unit/computations/test_heatmap.py | 109 +++++++++++++++++++++++++++++--- 2 files changed, 106 insertions(+), 17 deletions(-) diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py index 8a86fe8..3c35029 100644 --- a/gn3/computations/heatmap.py +++ b/gn3/computations/heatmap.py @@ -110,13 +110,13 @@ def cluster_traits(traits_data_list: Sequence[Dict]): https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L138-L162 """ def __compute_corr(tdata_i, tdata_j): - if tdata_j[0] < tdata_i[0]: - corr_vals = compute_correlation(tdata_i, tdata_j) - corr = corr_vals[0] - if (1 - corr) < 0: - return 0.0 - return 1 - corr - return 0.0 + if tdata_i[0] == tdata_j[0]: + return 0.0 + corr_vals = compute_correlation(tdata_i[1], tdata_j[1]) + corr = corr_vals[0] + if (1 - corr) < 0: + return 0.0 + return 1 - corr def __cluster(tdata_i): return tuple( diff --git a/tests/unit/computations/test_heatmap.py b/tests/unit/computations/test_heatmap.py index 78303ae..650cb45 100644 --- a/tests/unit/computations/test_heatmap.py +++ b/tests/unit/computations/test_heatmap.py @@ -1,9 +1,38 @@ """Module contains tests for gn3.computations.heatmap""" from unittest import TestCase -from gn3.computations.heatmap import export_trait_data +from gn3.computations.heatmap import cluster_traits, export_trait_data strainlist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] -trait_data = {"mysqlid": 36688172, "data": {"B6cC3-1": {"strain_name": "B6cC3-1", "value": 7.51879, "variance": None, "ndata": None}, "BXD1": {"strain_name": "BXD1", "value": 7.77141, "variance": None, "ndata": None}, "BXD12": {"strain_name": "BXD12", "value": 8.39265, "variance": None, "ndata": None}, "BXD16": {"strain_name": "BXD16", "value": 8.17443, "variance": None, "ndata": None}, "BXD19": {"strain_name": "BXD19", "value": 8.30401, "variance": None, "ndata": None}, "BXD2": {"strain_name": "BXD2", "value": 7.80944, "variance": None, "ndata": None}, "BXD21": {"strain_name": "BXD21", "value": 8.93809, "variance": None, "ndata": None}, "BXD24": {"strain_name": "BXD24", "value": 7.99415, "variance": None, "ndata": None}, "BXD27": {"strain_name": "BXD27", "value": 8.12177, "variance": None, "ndata": None}, "BXD28": {"strain_name": "BXD28", "value": 7.67688, "variance": None, "ndata": None}, "BXD32": {"strain_name": "BXD32", "value": 7.79062, "variance": None, "ndata": None}, "BXD39": {"strain_name": "BXD39", "value": 8.27641, "variance": None, "ndata": None}, "BXD40": {"strain_name": "BXD40", "value": 8.18012, "variance": None, "ndata": None}, "BXD42": {"strain_name": "BXD42", "value": 7.82433, "variance": None, "ndata": None}, "BXD6": {"strain_name": "BXD6", "value": 8.09718, "variance": None, "ndata": None}, "BXH14": {"strain_name": "BXH14", "value": 7.97475, "variance": None, "ndata": None}, "BXH19": {"strain_name": "BXH19", "value": 7.67223, "variance": None, "ndata": None}, "BXH2": {"strain_name": "BXH2", "value": 7.93622, "variance": None, "ndata": None}, "BXH22": {"strain_name": "BXH22", "value": 7.43692, "variance": None, "ndata": None}, "BXH4": {"strain_name": "BXH4", "value": 7.96336, "variance": None, "ndata": None}, "BXH6": {"strain_name": "BXH6", "value": 7.75132, "variance": None, "ndata": None}, "BXH7": {"strain_name": "BXH7", "value": 8.12927, "variance": None, "ndata": None}, "BXH8": {"strain_name": "BXH8", "value": 6.77338, "variance": None, "ndata": None}, "BXH9": {"strain_name": "BXH9", "value": 8.03836, "variance": None, "ndata": None}, 
"C3H/HeJ": {"strain_name": "C3H/HeJ", "value": 7.42795, "variance": None, "ndata": None}, "C57BL/6J": {"strain_name": "C57BL/6J", "value": 7.50606, "variance": None, "ndata": None}, "DBA/2J": {"strain_name": "DBA/2J", "value": 7.72588, "variance": None, "ndata": None}}} +trait_data = { + "mysqlid": 36688172, + "data": { + "B6cC3-1": {"strain_name": "B6cC3-1", "value": 7.51879, "variance": None, "ndata": None}, + "BXD1": {"strain_name": "BXD1", "value": 7.77141, "variance": None, "ndata": None}, + "BXD12": {"strain_name": "BXD12", "value": 8.39265, "variance": None, "ndata": None}, + "BXD16": {"strain_name": "BXD16", "value": 8.17443, "variance": None, "ndata": None}, + "BXD19": {"strain_name": "BXD19", "value": 8.30401, "variance": None, "ndata": None}, + "BXD2": {"strain_name": "BXD2", "value": 7.80944, "variance": None, "ndata": None}, + "BXD21": {"strain_name": "BXD21", "value": 8.93809, "variance": None, "ndata": None}, + "BXD24": {"strain_name": "BXD24", "value": 7.99415, "variance": None, "ndata": None}, + "BXD27": {"strain_name": "BXD27", "value": 8.12177, "variance": None, "ndata": None}, + "BXD28": {"strain_name": "BXD28", "value": 7.67688, "variance": None, "ndata": None}, + "BXD32": {"strain_name": "BXD32", "value": 7.79062, "variance": None, "ndata": None}, + "BXD39": {"strain_name": "BXD39", "value": 8.27641, "variance": None, "ndata": None}, + "BXD40": {"strain_name": "BXD40", "value": 8.18012, "variance": None, "ndata": None}, + "BXD42": {"strain_name": "BXD42", "value": 7.82433, "variance": None, "ndata": None}, + "BXD6": {"strain_name": "BXD6", "value": 8.09718, "variance": None, "ndata": None}, + "BXH14": {"strain_name": "BXH14", "value": 7.97475, "variance": None, "ndata": None}, + "BXH19": {"strain_name": "BXH19", "value": 7.67223, "variance": None, "ndata": None}, + "BXH2": {"strain_name": "BXH2", "value": 7.93622, "variance": None, "ndata": None}, + "BXH22": {"strain_name": "BXH22", "value": 7.43692, "variance": None, "ndata": None}, + "BXH4": {"strain_name": "BXH4", "value": 7.96336, "variance": None, "ndata": None}, + "BXH6": {"strain_name": "BXH6", "value": 7.75132, "variance": None, "ndata": None}, + "BXH7": {"strain_name": "BXH7", "value": 8.12927, "variance": None, "ndata": None}, + "BXH8": {"strain_name": "BXH8", "value": 6.77338, "variance": None, "ndata": None}, + "BXH9": {"strain_name": "BXH9", "value": 8.03836, "variance": None, "ndata": None}, + "C3H/HeJ": {"strain_name": "C3H/HeJ", "value": 7.42795, "variance": None, "ndata": None}, + "C57BL/6J": {"strain_name": "C57BL/6J", "value": 7.50606, "variance": None, "ndata": None}, + "DBA/2J": {"strain_name": "DBA/2J", "value": 7.72588, "variance": None, "ndata": None}}} class TestHeatmap(TestCase): """Class for testing heatmap computation functions""" @@ -29,10 +58,14 @@ class TestHeatmap(TestCase): argument and the different flags set up """ for dtype, vflag, nflag, expected in [ - ["val", False, False, (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], - ["val", False, True, (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], - ["val", True, False, (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], - ["val", True, True, (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["val", False, False, + (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["val", False, True, + (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["val", True, False, + (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["val", True, True, + (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 
7.80944)], ["var", False, False, (None, None, None, None, None, None)], ["var", False, True, (None, None, None, None, None, None)], ["var", True, False, (None, None, None, None, None, None)], @@ -41,10 +74,17 @@ class TestHeatmap(TestCase): ["N", False, True, (None, None, None, None, None, None)], ["N", True, False, (None, None, None, None, None, None)], ["N", True, True, (None, None, None, None, None, None)], - ["all", False, False, (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], - ["all", False, True, (7.51879, None, 7.77141, None, 8.39265, None, 8.17443, None, 8.30401, None, 7.80944, None)], - ["all", True, False, (7.51879, None, 7.77141, None, 8.39265, None, 8.17443, None, 8.30401, None, 7.80944, None)], - ["all", True, True, (7.51879, None, None, 7.77141, None, None, 8.39265, None, None, 8.17443, None, None, 8.30401, None, None, 7.80944, None, None)] + ["all", False, False, + (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["all", False, True, + (7.51879, None, 7.77141, None, 8.39265, None, 8.17443, None, + 8.30401, None, 7.80944, None)], + ["all", True, False, + (7.51879, None, 7.77141, None, 8.39265, None, 8.17443, None, + 8.30401, None, 7.80944, None)], + ["all", True, True, + (7.51879, None, None, 7.77141, None, None, 8.39265, None, None, + 8.17443, None, None, 8.30401, None, None, 7.80944, None, None)] ]: with self.subTest(dtype=dtype, vflag=vflag, nflag=nflag): self.assertEqual( @@ -52,3 +92,52 @@ class TestHeatmap(TestCase): trait_data, strainlist, dtype=dtype, var_exists=vflag, n_exists=nflag), expected) + + def test_cluster_traits(self): + """ + Test that the clustering is working as expected. + """ + traits_data_list = [ + (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944), + (6.1427, 6.50588, 7.73705, 6.68328, 7.49293, 7.27398), + (8.4211, 8.30581, 9.24076, 8.51173, 9.18455, 8.36077), + (10.0904, 10.6509, 9.36716, 9.91202, 8.57444, 10.5731), + (10.188, 9.76652, 9.54813, 9.05074, 9.52319, 9.10505), + (6.74676, 7.01029, 7.54169, 6.48574, 7.01427, 7.26815), + (6.39359, 6.85321, 5.78337, 7.11141, 6.22101, 6.16544), + (6.84118, 7.08432, 7.59844, 7.08229, 7.26774, 7.24991), + (9.45215, 10.6943, 8.64719, 10.1592, 7.75044, 8.78615), + (7.04737, 6.87185, 7.58586, 6.92456, 6.84243, 7.36913)] + self.assertEqual( + cluster_traits(traits_data_list), + ((0.0, 0.20337048635536847, 0.16381088984330505, 1.7388553629398245, + 1.5025235756329178, 0.6952839500255574, 1.271661230252733, + 0.2100487290977544, 1.4699690641062024, 0.7934461515867415), + (0.20337048635536847, 0.0, 0.2198321044997198, 1.5753041735592204, + 1.4815755944537086, 0.26087293140686374, 1.6939790104301427, + 0.06024619831474998, 1.7430082449189215, 0.4497104244247795), + (0.16381088984330505, 0.2198321044997198, 0.0, 1.9073926868549234, + 1.0396738891139845, 0.5278328671176757, 1.6275069061182947, + 0.2636503792482082, 1.739617877037615, 0.7127042590637039), + (1.7388553629398245, 1.5753041735592204, 1.9073926868549234, 0.0, + 0.9936846292920328, 1.1169999189889366, 0.6007483980555253, + 1.430209221053372, 0.25879514152086425, 0.9313185954797953), + (1.5025235756329178, 1.4815755944537086, 1.0396738891139845, + 0.9936846292920328, 0.0, 1.027827186339337, 1.1441743109173244, + 1.4122477962364253, 0.8968250491499363, 1.1683723389247052), + (0.6952839500255574, 0.26087293140686374, 0.5278328671176757, + 1.1169999189889366, 1.027827186339337, 0.0, 1.8420471110023269, + 0.19179284676938602, 1.4875072385631605, 0.23451785425383564), + (1.271661230252733, 1.6939790104301427, 1.6275069061182947, + 
0.6007483980555253, 1.1441743109173244, 1.8420471110023269, 0.0, + 1.6540234785929928, 0.2140799896286565, 1.7413442197913358), + (0.2100487290977544, 0.06024619831474998, 0.2636503792482082, + 1.430209221053372, 1.4122477962364253, 0.19179284676938602, + 1.6540234785929928, 0.0, 1.5225640692832796, 0.33370067057028485), + (1.4699690641062024, 1.7430082449189215, 1.739617877037615, + 0.25879514152086425, 0.8968250491499363, 1.4875072385631605, + 0.2140799896286565, 1.5225640692832796, 0.0, 1.3256191648260216), + (0.7934461515867415, 0.4497104244247795, 0.7127042590637039, + 0.9313185954797953, 1.1683723389247052, 0.23451785425383564, + 1.7413442197913358, 0.33370067057028485, 1.3256191648260216, + 0.0))) -- cgit v1.2.3 From ded960e3d32e4d7ebe590deda27fc47175be73d9 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Fri, 20 Aug 2021 13:21:31 +0300 Subject: Add tests for ordering and implement function Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/computations/heatmap.py: implement new ordering function * tests/unit/computations/test_heatmap.py: add new tests Implement the ordering function to migrate the setup of the `neworder` variable from GN1 to GN3. This migration is incomplete, since there is dependence on the return from the `web.webqtl.heatmap.Heatmap.draw` function in form of the `d_1` variable in some of the paths. The thing is, this `d_1` variable, and the `xoffset` variable seem to be used for laying out things on the drawn heatmap, and might actually end up not being needed for the new system using plotly, which has other ways of laying out things on the drawing. For now though, this commit "shims" the presence of these values until when the use of these variables is confirmed as present or absent in the new GN3 system. --- gn3/computations/heatmap.py | 28 ++++++++++++++++++++++++++++ tests/unit/computations/test_heatmap.py | 25 ++++++++++++++++++++++++- 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py index 3c35029..1c86261 100644 --- a/gn3/computations/heatmap.py +++ b/gn3/computations/heatmap.py @@ -175,3 +175,31 @@ def heatmap_data(formd, search_result, conn: Any): "traits_list": traits_list, "traits_data_list": traits_data_list } + +def compute_heatmap_order( + slink_data, xoffset: int = 40, neworder: tuple = tuple()): + """ + Compute the data used for drawing the heatmap proper from `slink_data`. + + This function tries to reproduce the creation and update of the `neworder` + variable in + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L120 + and in the `web.webqtl.heatmap.Heatmap.draw` function in GN1 + """ + d_1 = (0, 0, 0) # returned from self.draw in lines 391 and 399. 
This is just a placeholder + + def __order_maker(norder, slnk_dt): + print("norder:{}, slnk_dt:{}".format(norder, slnk_dt)) + if isinstance(slnk_dt[0], int) and isinstance(slnk_dt[1], int): + return norder + ( + (xoffset+20, slnk_dt[0]), (xoffset + 40, slnk_dt[1])) + + if isinstance(slnk_dt[0], int): + return norder + ((xoffset + 20, slnk_dt[0]), ) + + if isinstance(slnk_dt[1], int): + return norder + ((xoffset + d_1[0] + 20, slnk_dt[1]), ) + + return __order_maker(__order_maker(norder, slnk_dt[0]), slnk_dt[1]) + + return __order_maker(neworder, slink_data) diff --git a/tests/unit/computations/test_heatmap.py b/tests/unit/computations/test_heatmap.py index 650cb45..14807bb 100644 --- a/tests/unit/computations/test_heatmap.py +++ b/tests/unit/computations/test_heatmap.py @@ -1,6 +1,9 @@ """Module contains tests for gn3.computations.heatmap""" from unittest import TestCase -from gn3.computations.heatmap import cluster_traits, export_trait_data +from gn3.computations.heatmap import ( + cluster_traits, + export_trait_data, + compute_heatmap_order) strainlist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] trait_data = { @@ -34,6 +37,16 @@ trait_data = { "C57BL/6J": {"strain_name": "C57BL/6J", "value": 7.50606, "variance": None, "ndata": None}, "DBA/2J": {"strain_name": "DBA/2J", "value": 7.72588, "variance": None, "ndata": None}}} +slinked = ( + (((0, 2, 0.16381088984330505), + ((1, 7, 0.06024619831474998), 5, 0.19179284676938602), + 0.20337048635536847), + 9, + 0.23451785425383564), + ((3, (6, 8, 0.2140799896286565), 0.25879514152086425), + 4, 0.8968250491499363), + 0.9313185954797953) + class TestHeatmap(TestCase): """Class for testing heatmap computation functions""" @@ -141,3 +154,13 @@ class TestHeatmap(TestCase): 0.9313185954797953, 1.1683723389247052, 0.23451785425383564, 1.7413442197913358, 0.33370067057028485, 1.3256191648260216, 0.0))) + + def test_compute_heatmap_order(self): + """Test the orders.""" + for xoff, expected in [ + (40, ((60, 9), (60, 4))), + (30, ((50, 9), (50, 4))), + (20, ((40, 9), (40, 4)))]: + with self.subTest(xoffset=xoff): + self.assertEqual( + compute_heatmap_order(slinked, xoffset=xoff), expected) -- cgit v1.2.3 From 8b2c776771d2a70613a1e31d6e6671b612cfbafc Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Fri, 20 Aug 2021 14:10:45 +0300 Subject: Retrieve the strains with valid values Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/computations/heatmap.py: add function to get strains with values * tests/unit/computations/test_heatmap.py: new tests Add function to get the strains whose values are not `None` from the `trait_data` object passed in. This migrates https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L215-221 into a separate function that can handle that and be tested independently of any other code. 
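A quick usage sketch of the behaviour described above, using the same sample values as the unit tests added in this commit (this assumes the two-argument `retrieve_strains_and_values` introduced here; the function is reworked in a later commit):

```python
from gn3.computations.heatmap import retrieve_strains_and_values

# Strains whose trait value is None are dropped, keeping the two tuples aligned:
strains, values = retrieve_strains_and_values(
    ["s1", "s2", "s3", "s4"], [9, None, 5, 4])
assert strains == ("s1", "s3", "s4")
assert values == (9, 5, 4)
```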
--- gn3/computations/heatmap.py | 19 +++++++++++++++++++ tests/unit/computations/test_heatmap.py | 14 +++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py index 1c86261..5a3c619 100644 --- a/gn3/computations/heatmap.py +++ b/gn3/computations/heatmap.py @@ -203,3 +203,22 @@ def compute_heatmap_order( return __order_maker(__order_maker(norder, slnk_dt[0]), slnk_dt[1]) return __order_maker(neworder, slink_data) + +def retrieve_strains_and_values(strainlist, trait_data): + """ + Get the strains and their corresponding values from `strainlist` and + `trait_data`. + + This migrates the code in + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L215-221 + """ + def __strains_and_values(acc, i): + if trait_data[i] is None: + return acc + if len(acc) == 0: + return ((strainlist[i], ), (trait_data[i], )) + _strains = acc[0] + _vals = acc[1] + return (_strains + (strainlist[i], ), _vals + (trait_data[i], )) + return reduce( + __strains_and_values, range(len(strainlist)), (tuple(), tuple())) diff --git a/tests/unit/computations/test_heatmap.py b/tests/unit/computations/test_heatmap.py index 14807bb..686288d 100644 --- a/tests/unit/computations/test_heatmap.py +++ b/tests/unit/computations/test_heatmap.py @@ -3,7 +3,8 @@ from unittest import TestCase from gn3.computations.heatmap import ( cluster_traits, export_trait_data, - compute_heatmap_order) + compute_heatmap_order, + retrieve_strains_and_values) strainlist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] trait_data = { @@ -164,3 +165,14 @@ class TestHeatmap(TestCase): with self.subTest(xoffset=xoff): self.assertEqual( compute_heatmap_order(slinked, xoffset=xoff), expected) + + def test_retrieve_strains_and_values(self): + """Test retrieval of strains and values.""" + for slist, tdata, expected in [ + [["s1", "s2", "s3", "s4"], [9, None, 5, 4], + (("s1", "s3", "s4"), (9, 5, 4))], + [["s1", "s2", "s3", "s4", "s5"], [6, None, None, 4, None], + (("s1", "s4"), (6, 4))]]: + with self.subTest(strainlist=slist, traitdata=tdata): + self.assertEqual( + retrieve_strains_and_values(slist, tdata), expected) -- cgit v1.2.3 From 96af4e9e32ed167a8d70cf7761b709b1a37bb344 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Fri, 20 Aug 2021 14:14:12 +0300 Subject: Fix typing issue(s) caught by mypy Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/computations/heatmap.py: Use `Sequence` type not `Iterator` type --- gn3/computations/heatmap.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py index 5a3c619..c9c2b8a 100644 --- a/gn3/computations/heatmap.py +++ b/gn3/computations/heatmap.py @@ -156,8 +156,8 @@ def heatmap_data(formd, search_result, conn: Any): traits_details = [ __retrieve_traitlist_and_datalist(threshold, fullname) for fullname in search_result] - traits_list = map(lambda x: x[0], traits_details) - traits_data_list = map(lambda x: x[1], traits_details) + traits_list = tuple(x[0] for x in traits_details) + traits_data_list = tuple(x[1] for x in traits_details) return { "target_description_checked": formd.formdata.getvalue( -- cgit v1.2.3 From 7aa5f5422908b4dbfc80f3f73b008507878a34aa Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Thu, 26 Aug 2021 07:32:37 +0300 Subject: Add rust-qtlreaper Issue: 
https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * guix.scm: new dependency (rust-qtlreaper) --- guix.scm | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/guix.scm b/guix.scm index 729d089..868bd74 100644 --- a/guix.scm +++ b/guix.scm @@ -44,6 +44,7 @@ (gnu packages statistics) (gnu packages bioconductor) (gn packages golang) + (gn packages genenetwork) (gnu packages python) (gnu packages python-check) (gnu packages python-crypto) @@ -104,7 +105,8 @@ ("r-qtl" ,r-qtl) ("r-stringi" ,r-stringi) ("python-plotly" ,python-plotly) - ("python-pandas" ,python-pandas))) + ("python-pandas" ,python-pandas) + ("rust-qtlreaper" ,rust-qtlreaper))) (build-system python-build-system) (home-page "https://github.com/genenetwork/genenetwork3") (synopsis "GeneNetwork3 API for data science and machine learning.") -- cgit v1.2.3 From be4445e91f4a752ef7bbb99ed7d813c5fc88f467 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Thu, 26 Aug 2021 07:33:44 +0300 Subject: Update imported module name Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * guix.scm: (gn packages golang) ==> (gnu packages golang) csvdiff has moved to upstream guix and been removed from latest guix-bioinformatics. --- guix.scm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/guix.scm b/guix.scm index 868bd74..8e1cf79 100644 --- a/guix.scm +++ b/guix.scm @@ -43,7 +43,7 @@ (gnu packages databases) (gnu packages statistics) (gnu packages bioconductor) - (gn packages golang) + (gnu packages golang) (gn packages genenetwork) (gnu packages python) (gnu packages python-check) -- cgit v1.2.3 From 557e482c88ba3d44ae7d278b7222f37fa043b4d0 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Fri, 27 Aug 2021 15:47:52 +0300 Subject: Rework strains and trait values retrieval Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Rework the strains and values retrieval function to more closely correspond to the working of the original code in GN1 --- gn3/computations/heatmap.py | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py index c9c2b8a..da13ceb 100644 --- a/gn3/computations/heatmap.py +++ b/gn3/computations/heatmap.py @@ -204,21 +204,28 @@ def compute_heatmap_order( return __order_maker(neworder, slink_data) -def retrieve_strains_and_values(strainlist, trait_data): +def retrieve_strains_and_values(orders, strainlist, traits_data_list): """ Get the strains and their corresponding values from `strainlist` and - `trait_data`. + `traits_data_list`. This migrates the code in https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L215-221 """ - def __strains_and_values(acc, i): - if trait_data[i] is None: - return acc - if len(acc) == 0: - return ((strainlist[i], ), (trait_data[i], )) - _strains = acc[0] - _vals = acc[1] - return (_strains + (strainlist[i], ), _vals + (trait_data[i], )) - return reduce( - __strains_and_values, range(len(strainlist)), (tuple(), tuple())) + # This feels nasty! 
There's a lot of mutation of values here, that might + # indicate something untoward in the design of this function and its + # dependents ==> Review + strains = [] + values = [] + rets = [] + for order in orders: + temp_val = traits_data_list[order[1]] + for i in range(len(strainlist)): + if temp_val[i] != None: + strains.append(strainlist[i]) + values.append(temp_val[i]) + rets.append([order, strains[:], values[:]]) + strains = [] + values = [] + + return rets -- cgit v1.2.3 From 1a3901b174d00af8fa7f5ae78b810de66024b5ab Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Fri, 27 Aug 2021 15:49:53 +0300 Subject: Export trait data to file Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Provide a function to export the given strains and traits data into a traits file for use with `rust-qtlreaper`. --- gn3/computations/heatmap.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py index da13ceb..2f92048 100644 --- a/gn3/computations/heatmap.py +++ b/gn3/computations/heatmap.py @@ -229,3 +229,11 @@ def retrieve_strains_and_values(orders, strainlist, traits_data_list): values = [] return rets + +def generate_traits_file(strains, trait_values, traits_filename): + header = "Traits\t{}\n".format("\t".join(strains)) + data = [header] + [ + "T{}\t{}\n".format(i+1, "\t".join([str(i) for i in t])) + for i,t in enumerate(trait_values)] + with open(traits_filename, "w") as outfile: + outfile.writelines(data) -- cgit v1.2.3 From 28fde00ee2835d404157652548a4265be3accede Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Fri, 27 Aug 2021 15:51:27 +0300 Subject: Provide intermediate data in final results Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Seeing as not every requirement/feature has been migrated over at this time, this commit just provides all the intermediate data representations in the final return of the function for later use down the line. 
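For reference, the `generate_traits_file` helper introduced in the preceding commit ("Export trait data to file") writes a file like the following. This is a usage sketch: the output path is made up, and the sample strains and values mirror the README example added later in this series.

```python
from gn3.computations.heatmap import generate_traits_file

generate_traits_file(
    strains=["BXD27", "BXD32", "DBA/2J"],
    trait_values=[[10.5735, 9.27408, 9.48255], [6.4471, 6.7191, 5.98015]],
    traits_filename="/tmp/example_traits.txt")
# /tmp/example_traits.txt then contains tab-separated rows:
#   Traits  BXD27    BXD32    DBA/2J
#   T1      10.5735  9.27408  9.48255
#   T2      6.4471   6.7191   5.98015
```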
--- gn3/computations/heatmap.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py index 2f92048..3e96ed2 100644 --- a/gn3/computations/heatmap.py +++ b/gn3/computations/heatmap.py @@ -149,22 +149,22 @@ def heatmap_data(formd, search_result, conn: Any): def __retrieve_traitlist_and_datalist(threshold, fullname): trait = retrieve_trait_info(threshold, fullname, conn) - return ( - trait, - export_trait_data(retrieve_trait_data(trait, conn), strainlist)) + return (trait, retrieve_trait_data(trait, conn)) traits_details = [ __retrieve_traitlist_and_datalist(threshold, fullname) for fullname in search_result] traits_list = tuple(x[0] for x in traits_details) - traits_data_list = tuple(x[1] for x in traits_details) + traits_data_list = [x[1] for x in traits_details] + exported_traits_data_list = tuple( + export_trait_data(td, strainlist) for x in traits_data_list) return { "target_description_checked": formd.formdata.getvalue( "targetDescriptionCheck", ""), "cluster_checked": cluster_checked, "slink_data": ( - slink(cluster_traits(traits_data_list)) + slink(cluster_traits(exported_traits_data_list)) if cluster_checked else False), "sessionfile": formd.formdata.getvalue("session"), "genotype": genotype, @@ -173,7 +173,8 @@ def heatmap_data(formd, search_result, conn: Any): "ppolar": formd.ppolar, "mpolar":formd.mpolar, "traits_list": traits_list, - "traits_data_list": traits_data_list + "traits_data_list": traits_data_list, + "exported_traits_data_list": exported_traits_data_list } def compute_heatmap_order( -- cgit v1.2.3 From a9fa309f4017d84cd30f6df90376042f20b1836b Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Fri, 27 Aug 2021 15:55:41 +0300 Subject: Test out generation of traits file Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * As part of the development effort, this commit provides a proof-of-concept as a reference for generating the traits data file. It might be useful for verifying that the functions that are called are working as is expected. 
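Before the script itself, a note on the data shape it indexes into: each entry returned by the reworked `retrieve_strains_and_values` is an `[order, strains, values]` triple, which is why the proof-of-concept below picks `[0][1]` and `t[2]`. The concrete values in this sketch are illustrative only.

```python
# Illustrative shape only; real orders, strains and values come from the
# database query, the clustering and the ordering steps.
strains_and_values = [
    [(60, 9), ["BXD1", "BXD2", "BXD5"], [7.51879, 7.77141, 8.39265]],
    [(60, 4), ["BXD1", "BXD2", "BXD5"], [6.1427, 6.50588, 7.73705]],
]
strains_values = strains_and_values[0][1]           # strain names for the first ordered trait
trait_values = [t[2] for t in strains_and_values]   # one list of values per trait
```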
--- qtlfilesexport.py | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 qtlfilesexport.py diff --git a/qtlfilesexport.py b/qtlfilesexport.py new file mode 100644 index 0000000..2e7c9c2 --- /dev/null +++ b/qtlfilesexport.py @@ -0,0 +1,67 @@ +""" +Test the qtlfiles export of traits files + +Run with: + + env SQL_URI="mysql://:@:/db_webqtl" python3 qtlfilesexport.py + +replacing the variables in the angled brackets with the appropriate values +""" +import random +import string +from gn3.computations.slink import slink +from gn3.db_utils import database_connector +from gn3.computations.heatmap import export_trait_data +from gn3.db.traits import retrieve_trait_data, retrieve_trait_info +from gn3.computations.heatmap import ( + cluster_traits, + compute_heatmap_order, + generate_traits_file, + retrieve_strains_and_values) + +TMPDIR = "tmp/qtltests" + +def trait_fullnames(): + """Return sample names for traits""" + return [ + "UCLA_BXDBXH_CARTILAGE_V2::ILM103710672", + "UCLA_BXDBXH_CARTILAGE_V2::ILM2260338", + "UCLA_BXDBXH_CARTILAGE_V2::ILM3140576", + "UCLA_BXDBXH_CARTILAGE_V2::ILM5670577", + "UCLA_BXDBXH_CARTILAGE_V2::ILM2070121", + "UCLA_BXDBXH_CARTILAGE_V2::ILM103990541", + "UCLA_BXDBXH_CARTILAGE_V2::ILM1190722", + "UCLA_BXDBXH_CARTILAGE_V2::ILM6590722", + "UCLA_BXDBXH_CARTILAGE_V2::ILM4200064", + "UCLA_BXDBXH_CARTILAGE_V2::ILM3140463"] + +def random_string(length): + return "".join( + random.choices( + string.ascii_letters + string.digits, k=length)) + +def main(): + """entrypoint function""" + conn = database_connector()[0] + threshold = 0 + traits = [ + retrieve_trait_info(threshold, fullname, conn) + for fullname in trait_fullnames()] + traits_data_list = [retrieve_trait_data(t, conn) for t in traits] + strains = list(set([k for td in traits_data_list for k in td["data"].keys()])) + exported_traits_data_list = [ + export_trait_data(td, strains) for td in traits_data_list] + slinked = slink(cluster_traits(exported_traits_data_list)) + orders = compute_heatmap_order(slinked) + strains_and_values = retrieve_strains_and_values( + orders, strains, exported_traits_data_list) + strains_values = strains_and_values[0][1] + strains_values2 = strains_and_values[1][1] + trait_values = [t[2] for t in strains_and_values] + traits_filename = "{}/traits_test_file_{}.txt".format( + TMPDIR, random_string(10)) + generate_traits_file(strains_values, trait_values, traits_filename) + print("Generated file: {}".format(traits_filename)) + +if __name__ == "__main__": + main() -- cgit v1.2.3 From c045122908d36bba4ca197f3f67e89d80958f38f Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Mon, 30 Aug 2021 05:23:22 +0300 Subject: Document acquired knowledge on `rust-qtlreaper` Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/heatmaps/heatmaps.py: document format of the traits file To assist future developers, and development of the system, this commit documents some of the hard-won knowledge about the operation of the system to ease future development of the system. The documentation, if good, might also help with future onboarding of new developers to the system. --- README.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/README.md b/README.md index c3a9848..0e0e509 100644 --- a/README.md +++ b/README.md @@ -120,3 +120,24 @@ guix. 
To freeze dependencies: pip freeze --path venv/lib/python3.8/site-packages > requirements.txt ``` + +## QTLReaper (rust-qtlreaper) and Trait Files + +To run QTL computations, this system makes use of the [rust-qtlreaper](https://github.com/chfi/rust-qtlreaper.git) utility. + +To do this, the system needs to export the trait data into a tab-separated file, that can then be passed to the utility using the `--traits` option. For more information about the available options, please [see the rust-qtlreaper](https://github.com/chfi/rust-qtlreaper.git) repository. + +### Traits File Format + +The traits file begins with a header row/line with the column headers. The first column in the file has the header **"Trait"**. Every other column has a header for one of the strains in consideration. + +Under the **"Trait"** column, the traits are numbered from **T1** to **T** where **** is the count of the total number of traits in consideration. + +As an example, you could end up with a trait file like the following: + +```txt +Traits BXD27 BXD32 DBA/2J BXD21 ... +T1 10.5735 9.27408 9.48255 9.18253 ... +T2 6.4471 6.7191 5.98015 6.68051 ... +... +``` -- cgit v1.2.3 From 983acfdfc523677b4d7501287a000b7fd52a2c39 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Mon, 30 Aug 2021 07:00:38 +0300 Subject: Implement module for interfacing with rust-qtlreaper Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/computations/heatmap.py: move `generate_traits_file` function to new module * gn3/computations/qtlreaper.py: new module to interface with the `rust-qtlreaper` utility. * gn3/settings.py: Provide setting for the path to the `rust-qtlreaper` utility * qtlfilesexport.py: Move `random_string` function to new module. Update to use functions in new module. Provide a module with functions to be used to interface with `rust-qtlreaper`. This module essentially contains all the functions that are needed to build the files needed for, and to run the qtlreaper utility. --- gn3/computations/heatmap.py | 8 ---- gn3/computations/qtlreaper.py | 88 +++++++++++++++++++++++++++++++++++++++++++ gn3/settings.py | 3 ++ qtlfilesexport.py | 10 +---- 4 files changed, 92 insertions(+), 17 deletions(-) create mode 100644 gn3/computations/qtlreaper.py diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py index 3e96ed2..dcd64b1 100644 --- a/gn3/computations/heatmap.py +++ b/gn3/computations/heatmap.py @@ -230,11 +230,3 @@ def retrieve_strains_and_values(orders, strainlist, traits_data_list): values = [] return rets - -def generate_traits_file(strains, trait_values, traits_filename): - header = "Traits\t{}\n".format("\t".join(strains)) - data = [header] + [ - "T{}\t{}\n".format(i+1, "\t".join([str(i) for i in t])) - for i,t in enumerate(trait_values)] - with open(traits_filename, "w") as outfile: - outfile.writelines(data) diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py new file mode 100644 index 0000000..49d363b --- /dev/null +++ b/gn3/computations/qtlreaper.py @@ -0,0 +1,88 @@ +""" +This module contains functions to interact with the `qtlreaper` utility for +computation of QTLs. 
+""" +import os +import random +import string +import subprocess +from gn3.settings import TMPDIR, REAPER_COMMAND + +def random_string(length): + """Generate a random string of length `length`.""" + return "".join( + random.choices( + string.ascii_letters + string.digits, k=length)) + +def generate_traits_file(strains, trait_values, traits_filename): + """ + Generate a traits file for use with `qtlreaper`. + + PARAMETERS: + strains: A list of strains to use as the headers for the various columns. + trait_values: A list of lists of values for each trait and strain. + traits_filename: The tab-separated value to put the values in for + computation of QTLs. + """ + header = "Traits\t{}\n".format("\t".join(strains)) + data = [header] + [ + "T{}\t{}\n".format(i+1, "\t".join([str(i) for i in t])) + for i, t in enumerate(trait_values)] + with open(traits_filename, "w") as outfile: + outfile.writelines(data) + +def create_output_directory(path: str): + """Create the output directory at `path` if it does not exist.""" + try: + os.mkdir(path) + except OSError: + pass + +def run_reaper( + genotype_filename: str, traits_filename: str, + other_options: tuple = ("--n_permutations", 1000), + separate_nperm_output: bool = False, + output_dir: str = TMPDIR): + """ + Run the QTLReaper command to compute the QTLs. + + PARAMETERS: + genotype_filename: The complete path to a genotype file to use in the QTL + computation. + traits_filename: A path to a file previously generated with the + `generate_traits_file` function in this module, to be used in the QTL + computation. + other_options: Other options to pass to the `qtlreaper` command to modify + the QTL computations. + separate_nperm_output: A flag indicating whether or not to provide a + separate output for the permutations computation. The default is False, + which means by default, no separate output file is created. + output_dir: A path to the directory where the outputs are put + + RETURNS: + The function returns a tuple of the main output file, and the output file + for the permutation computations. If the `separate_nperm_output` is `False`, + the second value in the tuple returned is `None`. + + RAISES: + The function will raise a `subprocess.CalledProcessError` exception in case + of any errors running the `qtlreaper` command. 
+ """ + create_output_directory(output_dir) + output_filename = "{}/qtlreaper/main_output_{}.txt".format( + output_dir, random_string(10)) + output_list = ["--main_output", output_filename] + if separate_nperm_output: + permu_output_filename = "{}/qtlreaper/permu_output_{}.txt".format( + output_dir, random_string(10)) + output_list = output_list + ["--permu_output", permu_output_filename] + else: + permu_output_filename = None + + command_list = [ + REAPER_COMMAND, "--geno", genotype_filename, + *other_options, # this splices the `other_options` list here + "--traits", traits_filename, "--main_output", output_filename] + + subprocess.run(command_list, check=True) + return (output_filename, permu_output_filename) diff --git a/gn3/settings.py b/gn3/settings.py index f4866d5..d137370 100644 --- a/gn3/settings.py +++ b/gn3/settings.py @@ -24,3 +24,6 @@ GN2_BASE_URL = "http://www.genenetwork.org/" # biweight script BIWEIGHT_RSCRIPT = "~/genenetwork3/scripts/calculate_biweight.R" + +# qtlreaper command +REAPER_COMMAND = "{}/bin/qtlreaper".format(os.environ.get("GUIX_ENVIRONMENT")) diff --git a/qtlfilesexport.py b/qtlfilesexport.py index 2e7c9c2..0543dc9 100644 --- a/qtlfilesexport.py +++ b/qtlfilesexport.py @@ -7,16 +7,14 @@ Run with: replacing the variables in the angled brackets with the appropriate values """ -import random -import string from gn3.computations.slink import slink from gn3.db_utils import database_connector from gn3.computations.heatmap import export_trait_data from gn3.db.traits import retrieve_trait_data, retrieve_trait_info +from gn3.computations.qtlreaper import random_string, generate_traits_file from gn3.computations.heatmap import ( cluster_traits, compute_heatmap_order, - generate_traits_file, retrieve_strains_and_values) TMPDIR = "tmp/qtltests" @@ -35,11 +33,6 @@ def trait_fullnames(): "UCLA_BXDBXH_CARTILAGE_V2::ILM4200064", "UCLA_BXDBXH_CARTILAGE_V2::ILM3140463"] -def random_string(length): - return "".join( - random.choices( - string.ascii_letters + string.digits, k=length)) - def main(): """entrypoint function""" conn = database_connector()[0] @@ -56,7 +49,6 @@ def main(): strains_and_values = retrieve_strains_and_values( orders, strains, exported_traits_data_list) strains_values = strains_and_values[0][1] - strains_values2 = strains_and_values[1][1] trait_values = [t[2] for t in strains_and_values] traits_filename = "{}/traits_test_file_{}.txt".format( TMPDIR, random_string(10)) -- cgit v1.2.3 From b95ad3bd2ce8bc22d1dcadefdf76c43f28309984 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Mon, 30 Aug 2021 07:05:49 +0300 Subject: Fix some linting errors and minor bugs. --- gn3/computations/heatmap.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py index dcd64b1..e0ff05b 100644 --- a/gn3/computations/heatmap.py +++ b/gn3/computations/heatmap.py @@ -157,7 +157,7 @@ def heatmap_data(formd, search_result, conn: Any): traits_list = tuple(x[0] for x in traits_details) traits_data_list = [x[1] for x in traits_details] exported_traits_data_list = tuple( - export_trait_data(td, strainlist) for x in traits_data_list) + export_trait_data(td, strainlist) for td in traits_data_list) return { "target_description_checked": formd.formdata.getvalue( @@ -190,7 +190,6 @@ def compute_heatmap_order( d_1 = (0, 0, 0) # returned from self.draw in lines 391 and 399. 
This is just a placeholder def __order_maker(norder, slnk_dt): - print("norder:{}, slnk_dt:{}".format(norder, slnk_dt)) if isinstance(slnk_dt[0], int) and isinstance(slnk_dt[1], int): return norder + ( (xoffset+20, slnk_dt[0]), (xoffset + 40, slnk_dt[1])) @@ -221,9 +220,9 @@ def retrieve_strains_and_values(orders, strainlist, traits_data_list): rets = [] for order in orders: temp_val = traits_data_list[order[1]] - for i in range(len(strainlist)): - if temp_val[i] != None: - strains.append(strainlist[i]) + for i, strain in enumerate(strainlist): + if temp_val[i] is not None: + strains.append(strain) values.append(temp_val[i]) rets.append([order, strains[:], values[:]]) strains = [] -- cgit v1.2.3 From bb1fd69fa24cec4ff605450d241601b3f0ced8cb Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Mon, 30 Aug 2021 09:50:44 +0300 Subject: Remove empty line Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Remove empty line at the end of the traits file --- gn3/computations/qtlreaper.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index 49d363b..a88659e 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -27,7 +27,9 @@ def generate_traits_file(strains, trait_values, traits_filename): header = "Traits\t{}\n".format("\t".join(strains)) data = [header] + [ "T{}\t{}\n".format(i+1, "\t".join([str(i) for i in t])) - for i, t in enumerate(trait_values)] + for i, t in enumerate(trait_values[:-1])] + [ + "T{}\t{}".format(len(trait_values), "\t".join([str(i) for i in t])) + for t in trait_values[-1:]] with open(traits_filename, "w") as outfile: outfile.writelines(data) -- cgit v1.2.3 From 58f59b8f7df82969b58a604070aec095d17e0501 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Mon, 30 Aug 2021 11:44:37 +0300 Subject: Fix issues with traits file format * README.md: update header: Traits ==> Trait * gn3/computations/qtlreaper.py: update header: Traits ==> Trait * qtlfilesexport.py: Choose only BXD strains Rename the first column header from "Traits" to "Trait" to correspond with what `rust-qtlreaper` expects. Choose only the BXD strains for the proof-of-concept example - this helped bring out the fact that the traits file SHOULD NOT contain a strain column for a strain that does not exist in the genotype file in consideration. If the traits file has a strain column which does not exist in the genotype file, then `rust-qtlreaper` fails with a panic, since, from what I can tell, it tries to get a value from the genotype file for the non-existent strain, which results to a `None` type. Subsequent attempts at running an operation on the `None` type lead to the panic. --- README.md | 4 +++- gn3/computations/qtlreaper.py | 2 +- qtlfilesexport.py | 31 ++++++++++++++++++++++++++++++- 3 files changed, 34 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 0e0e509..b54015f 100644 --- a/README.md +++ b/README.md @@ -136,8 +136,10 @@ Under the **"Trait"** column, the traits are numbered from **T1** to **T** wh As an example, you could end up with a trait file like the following: ```txt -Traits BXD27 BXD32 DBA/2J BXD21 ... +Trait BXD27 BXD32 DBA/2J BXD21 ... T1 10.5735 9.27408 9.48255 9.18253 ... T2 6.4471 6.7191 5.98015 6.68051 ... ... ``` + +It is very important that the column header names for the strains correspond to the genotype file used. 
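Given that constraint, a small guard could fail fast with a clear message instead of letting `rust-qtlreaper` panic. The following is a hypothetical sketch, not part of this patch:

```python
def check_traits_strains(traits_strains, genotype_strains):
    """Hypothetical guard: every strain column in the traits file must also
    appear in the genotype file handed to rust-qtlreaper."""
    genotype_set = set(genotype_strains)
    unknown = [strain for strain in traits_strains if strain not in genotype_set]
    if unknown:
        raise ValueError(
            "Strains missing from the genotype file: {}".format(", ".join(unknown)))

check_traits_strains(["BXD1", "BXD2"], ["BXD1", "BXD2", "BXD5"])  # passes silently
# check_traits_strains(["BXD1", "FOO"], ["BXD1", "BXD2"])         # would raise ValueError
```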
diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index a88659e..9b13a55 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -24,7 +24,7 @@ def generate_traits_file(strains, trait_values, traits_filename): traits_filename: The tab-separated value to put the values in for computation of QTLs. """ - header = "Traits\t{}\n".format("\t".join(strains)) + header = "Trait\t{}\n".format("\t".join(strains)) data = [header] + [ "T{}\t{}\n".format(i+1, "\t".join([str(i) for i in t])) for i, t in enumerate(trait_values[:-1])] + [ diff --git a/qtlfilesexport.py b/qtlfilesexport.py index 0543dc9..adc5e77 100644 --- a/qtlfilesexport.py +++ b/qtlfilesexport.py @@ -41,7 +41,36 @@ def main(): retrieve_trait_info(threshold, fullname, conn) for fullname in trait_fullnames()] traits_data_list = [retrieve_trait_data(t, conn) for t in traits] - strains = list(set([k for td in traits_data_list for k in td["data"].keys()])) + # strains = list(set([k for td in traits_data_list for k in td["data"].keys()])) + strains = [# Use only the strains in the BXD.geno genotype file + "BXD1", "BXD2", "BXD5", "BXD6", "BXD8", "BXD9", "BXD11", "BXD12", + "BXD13", "BXD14", "BXD15", "BXD16", "BXD18", "BXD19", "BXD20", "BXD21", + "BXD22", "BXD23", "BXD24", "BXD24a", "BXD25", "BXD27", "BXD28", "BXD29", + "BXD30", "BXD31", "BXD32", "BXD33", "BXD34", "BXD35", "BXD36", "BXD37", + "BXD38", "BXD39", "BXD40", "BXD41", "BXD42", "BXD43", "BXD44", "BXD45", + "BXD48", "BXD48a", "BXD49", "BXD50", "BXD51", "BXD52", "BXD53", "BXD54", + "BXD55", "BXD56", "BXD59", "BXD60", "BXD61", "BXD62", "BXD63", "BXD64", + "BXD65", "BXD65a", "BXD65b", "BXD66", "BXD67", "BXD68", "BXD69", + "BXD70", "BXD71", "BXD72", "BXD73", "BXD73a", "BXD73b", "BXD74", + "BXD75", "BXD76", "BXD77", "BXD78", "BXD79", "BXD81", "BXD83", "BXD84", + "BXD85", "BXD86", "BXD87", "BXD88", "BXD89", "BXD90", "BXD91", "BXD93", + "BXD94", "BXD95", "BXD98", "BXD99", "BXD100", "BXD101", "BXD102", + "BXD104", "BXD105", "BXD106", "BXD107", "BXD108", "BXD109", "BXD110", + "BXD111", "BXD112", "BXD113", "BXD114", "BXD115", "BXD116", "BXD117", + "BXD119", "BXD120", "BXD121", "BXD122", "BXD123", "BXD124", "BXD125", + "BXD126", "BXD127", "BXD128", "BXD128a", "BXD130", "BXD131", "BXD132", + "BXD133", "BXD134", "BXD135", "BXD136", "BXD137", "BXD138", "BXD139", + "BXD141", "BXD142", "BXD144", "BXD145", "BXD146", "BXD147", "BXD148", + "BXD149", "BXD150", "BXD151", "BXD152", "BXD153", "BXD154", "BXD155", + "BXD156", "BXD157", "BXD160", "BXD161", "BXD162", "BXD165", "BXD168", + "BXD169", "BXD170", "BXD171", "BXD172", "BXD173", "BXD174", "BXD175", + "BXD176", "BXD177", "BXD178", "BXD180", "BXD181", "BXD183", "BXD184", + "BXD186", "BXD187", "BXD188", "BXD189", "BXD190", "BXD191", "BXD192", + "BXD193", "BXD194", "BXD195", "BXD196", "BXD197", "BXD198", "BXD199", + "BXD200", "BXD201", "BXD202", "BXD203", "BXD204", "BXD205", "BXD206", + "BXD207", "BXD208", "BXD209", "BXD210", "BXD211", "BXD212", "BXD213", + "BXD214", "BXD215", "BXD216", "BXD217", "BXD218", "BXD219", "BXD220" + ] exported_traits_data_list = [ export_trait_data(td, strains) for td in traits_data_list] slinked = slink(cluster_traits(exported_traits_data_list)) -- cgit v1.2.3 From b8777bcfee70325263d5389367e3a93ec2842f69 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Mon, 30 Aug 2021 12:01:54 +0300 Subject: Update documentation on genotype files * Provide documentation on downloading and using the genotype files. 
--- README.md | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b54015f..61ca539 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ guix environment --load=guix.scm Also, make sure you have the [guix-bioinformatics](https://git.genenetwork.org/guix-bioinformatics/guix-bioinformatics) channel set up. ```bash -env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ ~/.config/guix/current/bin/guix environment --load=guix.scm +env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ ~/.config/guix/current/bin/guix environment --expose=$HOME/genotype_files/ --load=guix.scm python3 import redis ``` @@ -22,7 +22,7 @@ python3 Better run a proper container ``` -env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ ~/.config/guix/current/bin/guix environment -C --network --load=guix.scm +env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ ~/.config/guix/current/bin/guix environment -C --network --expose=$HOME/genotype_files/ --load=guix.scm ``` If you get a Guix error, such as `ice-9/boot-9.scm:1669:16: In procedure raise-exception: @@ -121,6 +121,19 @@ pip freeze --path venv/lib/python3.8/site-packages > requirements.txt ``` +## Genotype Files + +You can get the genotype files from http://ipfs.genenetwork.org/ipfs/QmXQy3DAUWJuYxubLHLkPMNCEVq1oV7844xWG2d1GSPFPL and save them on your host machine at, say `$HOME/genotype_files` with something like: + +```bash +$ mkdir -p $HOME/genotype_files +$ cd $HOME/genotype_files +$ yes | 7z x genotype_files.tar.7z +$ tar xf genotype_files.tar +``` + +The `genotype_files.tar.7z` file seems to only contain the **BXD.geno** genotype file. + ## QTLReaper (rust-qtlreaper) and Trait Files To run QTL computations, this system makes use of the [rust-qtlreaper](https://github.com/chfi/rust-qtlreaper.git) utility. -- cgit v1.2.3 From 6ab866183aeac8553fdcda9217e4445da2b4836b Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Tue, 31 Aug 2021 06:51:18 +0300 Subject: Provide utilities for genotype files Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/db/genotypes.py: New module * gn3/settings.py: Add new configuration variable * qtlfilesexport.py: Test out new code Add a module containing functions dealing with the genotype files. Add a configuration variable to point to the location of the genotype files. --- gn3/db/genotypes.py | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++ gn3/settings.py | 4 ++++ qtlfilesexport.py | 33 +++---------------------- 3 files changed, 76 insertions(+), 30 deletions(-) create mode 100644 gn3/db/genotypes.py diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py new file mode 100644 index 0000000..610ddde --- /dev/null +++ b/gn3/db/genotypes.py @@ -0,0 +1,69 @@ +"""Genotype utilities""" + +import os +import gzip +from gn3.settings import GENOTYPE_FILES + +def build_genotype_file( + geno_name: str, base_dir: str = GENOTYPE_FILES, + extension: str = "geno"): + """Build the absolute path for the genotype file.""" + return "{}/{}.{}".format(os.path.abspath(base_dir), geno_name, extension) + +def load_genotype_samples(genotype_filename: str, file_type: str = "geno"): + """ + Load sample of strains from genotype files. + + DESCRIPTION: + Traits can contain a varied number of strains, some of which do not exist in + certain genotypes. In order to compute QTLs, GEMMAs, etc, we need to ensure + to pick only those strains that exist in the genotype under consideration + for the traits used in the computation. 
+ + This function loads a list of samples from the genotype files for use in + filtering out unusable strains. + + + PARAMETERS: + genotype_filename: The absolute path to the genotype file to load the + samples from. + file_type: The type of file. Currently supported values are 'geno' and + 'plink'. + """ + file_type_fns = { + "geno": __load_genotype_samples_from_geno, + "plink": __load_genotype_samples_from_plink + } + return file_type_fns[file_type](genotype_filename) + +def __load_genotype_samples_from_geno(genotype_filename: str): + """ + Helper function for `load_genotype_samples` function. + + Loads samples from '.geno' files. + """ + gzipped_filename = "{}.gz".format(genotype_filename) + if os.path.isfile(gzipped_filename): + genofile = gzip.open(gzipped_filename) + else: + genofile = open(genotype_filename) + + for row in genofile: + line = row.strip() + if (not line) or (line.startswith(("#", "@"))): + continue + break + + headers = line.split("\t") + if headers[3] == "Mb": + return headers[4:] + return headers[3:] + +def __load_genotype_samples_from_plink(genotype_filename: str): + """ + Helper function for `load_genotype_samples` function. + + Loads samples from '.plink' files. + """ + genofile = open(genotype_filename) + return [line.split(" ")[1] for line in genofile] diff --git a/gn3/settings.py b/gn3/settings.py index d137370..a08f846 100644 --- a/gn3/settings.py +++ b/gn3/settings.py @@ -27,3 +27,7 @@ BIWEIGHT_RSCRIPT = "~/genenetwork3/scripts/calculate_biweight.R" # qtlreaper command REAPER_COMMAND = "{}/bin/qtlreaper".format(os.environ.get("GUIX_ENVIRONMENT")) + +# genotype files +GENOTYPE_FILES = os.environ.get( + "GENOTYPE_FILES", "{}/genotype_files/genotype".format(os.environ.get("HOME"))) diff --git a/qtlfilesexport.py b/qtlfilesexport.py index adc5e77..1db4ab6 100644 --- a/qtlfilesexport.py +++ b/qtlfilesexport.py @@ -11,6 +11,7 @@ from gn3.computations.slink import slink from gn3.db_utils import database_connector from gn3.computations.heatmap import export_trait_data from gn3.db.traits import retrieve_trait_data, retrieve_trait_info +from gn3.db.genotypes import build_genotype_file, load_genotype_samples from gn3.computations.qtlreaper import random_string, generate_traits_file from gn3.computations.heatmap import ( cluster_traits, @@ -41,36 +42,8 @@ def main(): retrieve_trait_info(threshold, fullname, conn) for fullname in trait_fullnames()] traits_data_list = [retrieve_trait_data(t, conn) for t in traits] - # strains = list(set([k for td in traits_data_list for k in td["data"].keys()])) - strains = [# Use only the strains in the BXD.geno genotype file - "BXD1", "BXD2", "BXD5", "BXD6", "BXD8", "BXD9", "BXD11", "BXD12", - "BXD13", "BXD14", "BXD15", "BXD16", "BXD18", "BXD19", "BXD20", "BXD21", - "BXD22", "BXD23", "BXD24", "BXD24a", "BXD25", "BXD27", "BXD28", "BXD29", - "BXD30", "BXD31", "BXD32", "BXD33", "BXD34", "BXD35", "BXD36", "BXD37", - "BXD38", "BXD39", "BXD40", "BXD41", "BXD42", "BXD43", "BXD44", "BXD45", - "BXD48", "BXD48a", "BXD49", "BXD50", "BXD51", "BXD52", "BXD53", "BXD54", - "BXD55", "BXD56", "BXD59", "BXD60", "BXD61", "BXD62", "BXD63", "BXD64", - "BXD65", "BXD65a", "BXD65b", "BXD66", "BXD67", "BXD68", "BXD69", - "BXD70", "BXD71", "BXD72", "BXD73", "BXD73a", "BXD73b", "BXD74", - "BXD75", "BXD76", "BXD77", "BXD78", "BXD79", "BXD81", "BXD83", "BXD84", - "BXD85", "BXD86", "BXD87", "BXD88", "BXD89", "BXD90", "BXD91", "BXD93", - "BXD94", "BXD95", "BXD98", "BXD99", "BXD100", "BXD101", "BXD102", - "BXD104", "BXD105", "BXD106", "BXD107", "BXD108", "BXD109", 
"BXD110", - "BXD111", "BXD112", "BXD113", "BXD114", "BXD115", "BXD116", "BXD117", - "BXD119", "BXD120", "BXD121", "BXD122", "BXD123", "BXD124", "BXD125", - "BXD126", "BXD127", "BXD128", "BXD128a", "BXD130", "BXD131", "BXD132", - "BXD133", "BXD134", "BXD135", "BXD136", "BXD137", "BXD138", "BXD139", - "BXD141", "BXD142", "BXD144", "BXD145", "BXD146", "BXD147", "BXD148", - "BXD149", "BXD150", "BXD151", "BXD152", "BXD153", "BXD154", "BXD155", - "BXD156", "BXD157", "BXD160", "BXD161", "BXD162", "BXD165", "BXD168", - "BXD169", "BXD170", "BXD171", "BXD172", "BXD173", "BXD174", "BXD175", - "BXD176", "BXD177", "BXD178", "BXD180", "BXD181", "BXD183", "BXD184", - "BXD186", "BXD187", "BXD188", "BXD189", "BXD190", "BXD191", "BXD192", - "BXD193", "BXD194", "BXD195", "BXD196", "BXD197", "BXD198", "BXD199", - "BXD200", "BXD201", "BXD202", "BXD203", "BXD204", "BXD205", "BXD206", - "BXD207", "BXD208", "BXD209", "BXD210", "BXD211", "BXD212", "BXD213", - "BXD214", "BXD215", "BXD216", "BXD217", "BXD218", "BXD219", "BXD220" - ] + genotype_filename = build_genotype_file(traits[0]["riset"]) + strains = load_genotype_samples(genotype_filename) exported_traits_data_list = [ export_trait_data(td, strains) for td in traits_data_list] slinked = slink(cluster_traits(exported_traits_data_list)) -- cgit v1.2.3 From 6c872943597f3664cca77abbdf56f074fc5231e6 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Tue, 31 Aug 2021 06:56:35 +0300 Subject: Fix bugs with `run_reaper` function Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/computations/qtlreaper.py: Fix some bugs * qtlfilesexport.py: Test out running rust-qtlreaper Test out the qtlreaper interface code and fix some bugs caught in the process. --- gn3/computations/qtlreaper.py | 8 +++++--- qtlfilesexport.py | 7 +++++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index 9b13a55..c058e14 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -42,7 +42,7 @@ def create_output_directory(path: str): def run_reaper( genotype_filename: str, traits_filename: str, - other_options: tuple = ("--n_permutations", 1000), + other_options: tuple = ("--n_permutations", "1000"), separate_nperm_output: bool = False, output_dir: str = TMPDIR): """ @@ -70,7 +70,7 @@ def run_reaper( The function will raise a `subprocess.CalledProcessError` exception in case of any errors running the `qtlreaper` command. 
""" - create_output_directory(output_dir) + create_output_directory("{}/qtlreaper".format(output_dir)) output_filename = "{}/qtlreaper/main_output_{}.txt".format( output_dir, random_string(10)) output_list = ["--main_output", output_filename] @@ -84,7 +84,9 @@ def run_reaper( command_list = [ REAPER_COMMAND, "--geno", genotype_filename, *other_options, # this splices the `other_options` list here - "--traits", traits_filename, "--main_output", output_filename] + "--traits", traits_filename, + *output_list # this splices the `output_list` list here + ] subprocess.run(command_list, check=True) return (output_filename, permu_output_filename) diff --git a/qtlfilesexport.py b/qtlfilesexport.py index 1db4ab6..799de31 100644 --- a/qtlfilesexport.py +++ b/qtlfilesexport.py @@ -9,6 +9,7 @@ replacing the variables in the angled brackets with the appropriate values """ from gn3.computations.slink import slink from gn3.db_utils import database_connector +from gn3.computations.qtlreaper import run_reaper from gn3.computations.heatmap import export_trait_data from gn3.db.traits import retrieve_trait_data, retrieve_trait_info from gn3.db.genotypes import build_genotype_file, load_genotype_samples @@ -57,5 +58,11 @@ def main(): generate_traits_file(strains_values, trait_values, traits_filename) print("Generated file: {}".format(traits_filename)) + main_output, permutations_output = run_reaper( + genotype_filename, traits_filename, separate_nperm_output=True) + + print("Main output: {}, Permutation output: {}".format( + main_output, permutations_output)) + if __name__ == "__main__": main() -- cgit v1.2.3 From 64ce38b45839b6305b009f6e28b0f852409e9bda Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Tue, 31 Aug 2021 10:45:11 +0300 Subject: Parse QTLReaper outputs Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/computations/qtlreaper.py: pass output files * tests/unit/computations/data/qtlreaper/main_output_sample.txt: sample test data * tests/unit/computations/data/qtlreaper/permu_output_sample.txt: sample test data * tests/unit/computations/test_qtlreaper.py: add tests Add code to parse the QTLReaper output data files. 
--- gn3/computations/qtlreaper.py | 18 ++++++ .../data/qtlreaper/main_output_sample.txt | 11 ++++ .../data/qtlreaper/permu_output_sample.txt | 27 ++++++++ tests/unit/computations/test_qtlreaper.py | 74 ++++++++++++++++++++++ 4 files changed, 130 insertions(+) create mode 100644 tests/unit/computations/data/qtlreaper/main_output_sample.txt create mode 100644 tests/unit/computations/data/qtlreaper/permu_output_sample.txt create mode 100644 tests/unit/computations/test_qtlreaper.py diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index c058e14..3b8e4db 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -90,3 +90,21 @@ def run_reaper( subprocess.run(command_list, check=True) return (output_filename, permu_output_filename) + + +def parse_reaper_main_results(results_file): + with open(results_file, "r") as infile: + lines = infile.readlines() + + def __parse_line(line): + items = line.strip().split("\t") + return items[0:2] + [float(item) for item in items[2:]] + + header = lines[0].strip().split("\t") + return [dict(zip(header, __parse_line(line))) for line in lines[1:]] + +def parse_reaper_permutation_results(results_file): + with open(results_file, "r") as infile: + lines = infile.readlines() + + return [float(line.strip()) for line in lines] diff --git a/tests/unit/computations/data/qtlreaper/main_output_sample.txt b/tests/unit/computations/data/qtlreaper/main_output_sample.txt new file mode 100644 index 0000000..12b11b4 --- /dev/null +++ b/tests/unit/computations/data/qtlreaper/main_output_sample.txt @@ -0,0 +1,11 @@ +ID Locus Chr cM Mb LRS Additive pValue +T1 rs31443144 1 1.500 3.010 0.500 -0.074 1.000 +T1 rs6269442 1 1.500 3.492 0.500 -0.074 1.000 +T1 rs32285189 1 1.630 3.511 0.500 -0.074 1.000 +T1 rs258367496 1 1.630 3.660 0.500 -0.074 1.000 +T1 rs32430919 1 1.750 3.777 0.500 -0.074 1.000 +T1 rs36251697 1 1.880 3.812 0.500 -0.074 1.000 +T1 rs30658298 1 2.010 4.431 0.500 -0.074 1.000 +T1 rs51852623 1 2.010 4.447 0.500 -0.074 1.000 +T1 rs31879829 1 2.140 4.519 0.500 -0.074 1.000 +T1 rs36742481 1 2.140 4.776 0.500 -0.074 1.000 diff --git a/tests/unit/computations/data/qtlreaper/permu_output_sample.txt b/tests/unit/computations/data/qtlreaper/permu_output_sample.txt new file mode 100644 index 0000000..64cff07 --- /dev/null +++ b/tests/unit/computations/data/qtlreaper/permu_output_sample.txt @@ -0,0 +1,27 @@ +4.44174 +5.03825 +5.08167 +5.18119 +5.18578 +5.24563 +5.24619 +5.24619 +5.27961 +5.28228 +5.43903 +5.50188 +5.51694 +5.56830 +5.63874 +5.71346 +5.71936 +5.74275 +5.76764 +5.79815 +5.81671 +5.82775 +5.89659 +5.92117 +5.93396 +5.93396 +5.94957 diff --git a/tests/unit/computations/test_qtlreaper.py b/tests/unit/computations/test_qtlreaper.py new file mode 100644 index 0000000..ec23664 --- /dev/null +++ b/tests/unit/computations/test_qtlreaper.py @@ -0,0 +1,74 @@ +"""Module contains tests for gn3.computations.qtlreaper""" +import os +from unittest import TestCase +from gn3.computations.qtlreaper import ( + parse_reaper_main_results, parse_reaper_permutation_results) + +class TestQTLReaper(TestCase): + """Class for testing qtlreaper interface functions.""" + + def test_parse_reaper_main_results(self): + self.assertEqual( + parse_reaper_main_results( + "tests/unit/computations/data/qtlreaper/main_output_sample.txt"), + [ + { + "ID": "T1", "Locus": "rs31443144", "Chr": 1, "cM": 1.500, + "Mb": 3.010, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": "rs6269442", "Chr": 1, "cM": 1.500, + "Mb": 3.492, "LRS": 
0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": "rs32285189", "Chr": 1, "cM": 1.630, + "Mb": 3.511, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": "rs258367496", "Chr": 1, "cM": 1.630, + "Mb": 3.660, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": "rs32430919", "Chr": 1, "cM": 1.750, + "Mb": 3.777, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": "rs36251697", "Chr": 1, "cM": 1.880, + "Mb": 3.812, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": "rs30658298", "Chr": 1, "cM": 2.010, + "Mb": 4.431, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": "rs51852623", "Chr": 1, "cM": 2.010, + "Mb": 4.447, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": "rs31879829", "Chr": 1, "cM": 2.140, + "Mb": 4.519, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": "rs36742481", "Chr": 1, "cM": 2.140, + "Mb": 4.776, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + } + ]) + + def test_parse_reaper_permutation_results(self): + self.assertEqual( + parse_reaper_permutation_results( + "tests/unit/computations/data/qtlreaper/permu_output_sample.txt"), + [4.44174, 5.03825, 5.08167, 5.18119, 5.18578, 5.24563, 5.24619, + 5.24619, 5.27961, 5.28228, 5.43903, 5.50188, 5.51694, 5.56830, + 5.63874, 5.71346, 5.71936, 5.74275, 5.76764, 5.79815, 5.81671, + 5.82775, 5.89659, 5.92117, 5.93396, 5.93396, 5.94957]) -- cgit v1.2.3 From c3f8013347e3e8850c90cb787edb2bec1f367f7d Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Tue, 31 Aug 2021 10:48:30 +0300 Subject: Fix test Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * The number of the arguments to the function changed, and so the tests for the function needed to be updated. 
--- tests/unit/computations/test_heatmap.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/tests/unit/computations/test_heatmap.py b/tests/unit/computations/test_heatmap.py index 686288d..87f8e45 100644 --- a/tests/unit/computations/test_heatmap.py +++ b/tests/unit/computations/test_heatmap.py @@ -168,11 +168,25 @@ class TestHeatmap(TestCase): def test_retrieve_strains_and_values(self): """Test retrieval of strains and values.""" - for slist, tdata, expected in [ - [["s1", "s2", "s3", "s4"], [9, None, 5, 4], - (("s1", "s3", "s4"), (9, 5, 4))], - [["s1", "s2", "s3", "s4", "s5"], [6, None, None, 4, None], - (("s1", "s4"), (6, 4))]]: + for orders, slist, tdata, expected in [ + [ + [(60, 2)], + ["s1", "s2", "s3", "s4"], + [[2, 9, 6, None, 4], + [7, 5, None, None, 4], + [9, None, 5, 4, 7], + [6, None, None, 4, None]], + [[(60, 2), ["s1", "s3", "s4"], [9, 5, 4]]] + ], + [ + [(60, 3)], + ["s1", "s2", "s3", "s4", "s5"], + [[2, 9, 6, None, 4], + [7, 5, None, None, 4], + [9, None, 5, 4, 7], + [6, None, None, 4, None]], + [[(60, 3), ["s1", "s4"], [6, 4]]] + ]]: with self.subTest(strainlist=slist, traitdata=tdata): self.assertEqual( - retrieve_strains_and_values(slist, tdata), expected) + retrieve_strains_and_values(orders, slist, tdata), expected) -- cgit v1.2.3 From e441509a59c20a051fd5ab94710513f1968a5e02 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Tue, 31 Aug 2021 10:50:56 +0300 Subject: Update `heatmap_data` function: remove extraneous data Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/computations/heatmap.py: update function * gn3/db/traits.py: new function Remove extraneous data and arguments from the function. - Load the genotype file - Generate traits file - Provide both raw traits data, and exported traits data in return --- gn3/computations/heatmap.py | 42 ++++++++++++++++++++++-------------------- gn3/db/traits.py | 5 +++++ 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py index e0ff05b..92014cf 100644 --- a/gn3/computations/heatmap.py +++ b/gn3/computations/heatmap.py @@ -6,8 +6,12 @@ generate various kinds of heatmaps. from functools import reduce from typing import Any, Dict, Sequence from gn3.computations.slink import slink -from gn3.db.traits import retrieve_trait_data, retrieve_trait_info from gn3.computations.correlations2 import compute_correlation +from gn3.db.genotypes import build_genotype_file, load_genotype_samples +from gn3.db.traits import ( + retrieve_trait_data, + retrieve_trait_info, + generate_traits_filename) def export_trait_data( trait_data: dict, strainlist: Sequence[str], dtype: str = "val", @@ -125,7 +129,7 @@ def cluster_traits(traits_data_list: Sequence[Dict]): return tuple(__cluster(tdata_i) for tdata_i in enumerate(traits_data_list)) -def heatmap_data(formd, search_result, conn: Any): +def heatmap_data(traits_names, conn: Any): """ heatmap function @@ -142,39 +146,37 @@ def heatmap_data(formd, search_result, conn: Any): TODO: Elaborate on the parameters here... 
""" threshold = 0 # webqtlConfig.PUBLICTHRESH - cluster_checked = formd.formdata.getvalue("clusterCheck", "") - strainlist = [ - strain for strain in formd.strainlist if strain not in formd.parlist] - genotype = formd.genotype - def __retrieve_traitlist_and_datalist(threshold, fullname): trait = retrieve_trait_info(threshold, fullname, conn) return (trait, retrieve_trait_data(trait, conn)) traits_details = [ __retrieve_traitlist_and_datalist(threshold, fullname) - for fullname in search_result] + for fullname in traits_names] traits_list = tuple(x[0] for x in traits_details) traits_data_list = [x[1] for x in traits_details] exported_traits_data_list = tuple( export_trait_data(td, strainlist) for td in traits_data_list) + genotype_filename = build_genotype_file(traits_list[0]["riset"]) + strainlist = load_genotype_samples(genotype_filename) + slink_data = slink(cluster_traits(exported_traits_data_list)) + ordering_data = compute_heatmap_order(slink_data) + strains_and_values = retrieve_strains_and_values( + orders, strainlist, exported_traits_data_list) + strains_values = strains_and_values[0][1] + trait_values = [t[2] for t in strains_and_values] + traits_filename = generate_traits_filename() + generate_traits_file(strains_values, trait_values, traits_filename) return { - "target_description_checked": formd.formdata.getvalue( - "targetDescriptionCheck", ""), - "cluster_checked": cluster_checked, - "slink_data": ( - slink(cluster_traits(exported_traits_data_list)) - if cluster_checked else False), - "sessionfile": formd.formdata.getvalue("session"), - "genotype": genotype, - "nLoci": sum(map(len, genotype)), + "slink_data": slink_data, + "ordering_data": ordering_data, "strainlist": strainlist, - "ppolar": formd.ppolar, - "mpolar":formd.mpolar, + "genotype_filename": genotype_filename, "traits_list": traits_list, "traits_data_list": traits_data_list, - "exported_traits_data_list": exported_traits_data_list + "exported_traits_data_list": exported_traits_data_list, + "traits_filename": traits_filename } def compute_heatmap_order( diff --git a/gn3/db/traits.py b/gn3/db/traits.py index 1031e44..ccb101a 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -1,4 +1,5 @@ """This class contains functions relating to trait data manipulation""" +from gn3.settings import TMPDIR from typing import Any, Dict, Union, Sequence from gn3.function_helpers import compose from gn3.db.datasets import retrieve_trait_dataset @@ -666,3 +667,7 @@ def retrieve_trait_data(trait: dict, conn: Any, strainlist: Sequence[str] = tupl {k:v for k, v in x.items() if x != "strain_name"}), data))} return {} + +def generate_traits_filename(base_path: str = TMPDIR): + return "{}/traits_test_file_{}.txt".format( + os.path.abspath(base_path), random_string(10)) -- cgit v1.2.3 From b5e1d1176f1bf4f7c0b68b27beb15e99418f1650 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Tue, 31 Aug 2021 11:16:29 +0300 Subject: Fix linting errors, minor bugs and reorganise code * Fix some linting errors and some minor bugs caught by the linter. Move the `random_string` function to separate module for use in multiple places in the code. 
--- gn3/computations/heatmap.py | 7 ++++--- gn3/computations/qtlreaper.py | 27 ++++++++++++++------------- gn3/db/traits.py | 5 ++++- gn3/heatmaps/heatmaps.py | 25 +++++++++++++++++++------ gn3/random.py | 11 +++++++++++ tests/unit/computations/test_qtlreaper.py | 5 +++-- 6 files changed, 55 insertions(+), 25 deletions(-) create mode 100644 gn3/random.py diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py index 92014cf..1143450 100644 --- a/gn3/computations/heatmap.py +++ b/gn3/computations/heatmap.py @@ -6,6 +6,7 @@ generate various kinds of heatmaps. from functools import reduce from typing import Any, Dict, Sequence from gn3.computations.slink import slink +from gn3.computations.qtlreaper import generate_traits_file from gn3.computations.correlations2 import compute_correlation from gn3.db.genotypes import build_genotype_file, load_genotype_samples from gn3.db.traits import ( @@ -155,14 +156,14 @@ def heatmap_data(traits_names, conn: Any): for fullname in traits_names] traits_list = tuple(x[0] for x in traits_details) traits_data_list = [x[1] for x in traits_details] - exported_traits_data_list = tuple( - export_trait_data(td, strainlist) for td in traits_data_list) genotype_filename = build_genotype_file(traits_list[0]["riset"]) strainlist = load_genotype_samples(genotype_filename) + exported_traits_data_list = tuple( + export_trait_data(td, strainlist) for td in traits_data_list) slink_data = slink(cluster_traits(exported_traits_data_list)) ordering_data = compute_heatmap_order(slink_data) strains_and_values = retrieve_strains_and_values( - orders, strainlist, exported_traits_data_list) + ordering_data, strainlist, exported_traits_data_list) strains_values = strains_and_values[0][1] trait_values = [t[2] for t in strains_and_values] traits_filename = generate_traits_filename() diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index 3b8e4db..30c7051 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -3,17 +3,10 @@ This module contains functions to interact with the `qtlreaper` utility for computation of QTLs. """ import os -import random -import string import subprocess +from gn3.random import random_string from gn3.settings import TMPDIR, REAPER_COMMAND -def random_string(length): - """Generate a random string of length `length`.""" - return "".join( - random.choices( - string.ascii_letters + string.digits, k=length)) - def generate_traits_file(strains, trait_values, traits_filename): """ Generate a traits file for use with `qtlreaper`. @@ -25,11 +18,13 @@ def generate_traits_file(strains, trait_values, traits_filename): computation of QTLs. """ header = "Trait\t{}\n".format("\t".join(strains)) - data = [header] + [ - "T{}\t{}\n".format(i+1, "\t".join([str(i) for i in t])) - for i, t in enumerate(trait_values[:-1])] + [ - "T{}\t{}".format(len(trait_values), "\t".join([str(i) for i in t])) - for t in trait_values[-1:]] + data = ( + [header] + + ["T{}\t{}\n".format(i+1, "\t".join([str(i) for i in t])) + for i, t in enumerate(trait_values[:-1])] + + ["T{}\t{}".format( + len(trait_values), "\t".join([str(i) for i in t])) + for t in trait_values[-1:]]) with open(traits_filename, "w") as outfile: outfile.writelines(data) @@ -93,6 +88,9 @@ def run_reaper( def parse_reaper_main_results(results_file): + """ + Parse the results file of running QTLReaper into a list of dicts. 
+ """ with open(results_file, "r") as infile: lines = infile.readlines() @@ -104,6 +102,9 @@ def parse_reaper_main_results(results_file): return [dict(zip(header, __parse_line(line))) for line in lines[1:]] def parse_reaper_permutation_results(results_file): + """ + Parse the results QTLReaper permutations into a list of values. + """ with open(results_file, "r") as infile: lines = infile.readlines() diff --git a/gn3/db/traits.py b/gn3/db/traits.py index ccb101a..bfe887e 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -1,6 +1,8 @@ """This class contains functions relating to trait data manipulation""" -from gn3.settings import TMPDIR +import os from typing import Any, Dict, Union, Sequence +from gn3.settings import TMPDIR +from gn3.random import random_string from gn3.function_helpers import compose from gn3.db.datasets import retrieve_trait_dataset @@ -669,5 +671,6 @@ def retrieve_trait_data(trait: dict, conn: Any, strainlist: Sequence[str] = tupl return {} def generate_traits_filename(base_path: str = TMPDIR): + """Generate a unique filename for use with generated traits files.""" return "{}/traits_test_file_{}.txt".format( os.path.abspath(base_path), random_string(10)) diff --git a/gn3/heatmaps/heatmaps.py b/gn3/heatmaps/heatmaps.py index 3bf7917..88f546d 100644 --- a/gn3/heatmaps/heatmaps.py +++ b/gn3/heatmaps/heatmaps.py @@ -14,6 +14,19 @@ def generate_random_data(data_stop: float = 2, width: int = 10, height: int = 30 return [[random.uniform(0,data_stop) for i in range(0, width)] for j in range(0, height)] +def generate_random_data2(data_stop: float = 2, width: int = 10, height: int = 30): + """ + This is mostly a utility function to be used to generate random data, useful + for development of the heatmap generation code, without access to the actual + database data. + """ + return [ + [{ + "value": item, + "category": random.choice(["C57BL/6J +", "DBA/2J +"])} + for item in axis] + for axis in generate_random_data(data_stop, width, height)] + def heatmap_x_axis_names(): return [ "UCLA_BXDBXH_CARTILAGE_V2::ILM103710672", @@ -30,13 +43,14 @@ def heatmap_x_axis_names(): # Grey + Blue + Red def generate_heatmap(): - rows = 20 - data = generate_random_data(height=rows) - y = (["%s"%x for x in range(1, rows+1)][:-1] + ["X"]) #replace last item with x for now + cols = 20 + y_axis = (["%s"%x for x in range(1, cols+1)][:-1] + ["X"]) #replace last item with x for now + x_axis = heatmap_x_axis_names() + data = generate_random_data(height=cols, width=len(x_axis)) fig = px.imshow( data, - x=heatmap_x_axis_names(), - y=y, + x=x_axis, + y=y_axis, width=500) fig.update_traces(xtype="array") fig.update_traces(ytype="array") @@ -49,6 +63,5 @@ def generate_heatmap(): coloraxis_colorscale=[ [0.0, '#3B3B3B'], [0.4999999999999999, '#ABABAB'], [0.5, '#F5DE11'], [1.0, '#FF0D00']]) - fig.write_html("%s/%s"%(heatmap_dir, "test_image.html")) return fig diff --git a/gn3/random.py b/gn3/random.py new file mode 100644 index 0000000..f0ba574 --- /dev/null +++ b/gn3/random.py @@ -0,0 +1,11 @@ +""" +Functions to generate complex random data. 
+""" +import random +import string + +def random_string(length): + """Generate a random string of length `length`.""" + return "".join( + random.choices( + string.ascii_letters + string.digits, k=length)) diff --git a/tests/unit/computations/test_qtlreaper.py b/tests/unit/computations/test_qtlreaper.py index ec23664..6c3b64d 100644 --- a/tests/unit/computations/test_qtlreaper.py +++ b/tests/unit/computations/test_qtlreaper.py @@ -1,5 +1,4 @@ """Module contains tests for gn3.computations.qtlreaper""" -import os from unittest import TestCase from gn3.computations.qtlreaper import ( parse_reaper_main_results, parse_reaper_permutation_results) @@ -8,6 +7,7 @@ class TestQTLReaper(TestCase): """Class for testing qtlreaper interface functions.""" def test_parse_reaper_main_results(self): + """Test that the main results file is parsed correctly.""" self.assertEqual( parse_reaper_main_results( "tests/unit/computations/data/qtlreaper/main_output_sample.txt"), @@ -65,9 +65,10 @@ class TestQTLReaper(TestCase): ]) def test_parse_reaper_permutation_results(self): + """Test that the permutations results file is parsed correctly.""" self.assertEqual( parse_reaper_permutation_results( - "tests/unit/computations/data/qtlreaper/permu_output_sample.txt"), + "tests/unit/computations/data/qtlreaper/permu_output_sample.txt"), [4.44174, 5.03825, 5.08167, 5.18119, 5.18578, 5.24563, 5.24619, 5.24619, 5.27961, 5.28228, 5.43903, 5.50188, 5.51694, 5.56830, 5.63874, 5.71346, 5.71936, 5.74275, 5.76764, 5.79815, 5.81671, -- cgit v1.2.3 From 221c773daea839ecf0e50c196484bb91e3a6db33 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Wed, 1 Sep 2021 06:18:20 +0300 Subject: Implement parsing of genotype labels Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/db/genotypes.py: parse genotype labels * tests/unit/db/test_genotypes.py: test that genotype labels are parsed correctly As part of parsing the genotype files into usable python data structures, this commit adds a function to parse the label lines (beginning with "@") into the appropriate values. 
--- gn3/db/genotypes.py | 20 ++++++++++++++++++++ tests/unit/db/test_genotypes.py | 17 +++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 tests/unit/db/test_genotypes.py diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py index 610ddde..2be3e1a 100644 --- a/gn3/db/genotypes.py +++ b/gn3/db/genotypes.py @@ -67,3 +67,23 @@ def __load_genotype_samples_from_plink(genotype_filename: str): """ genofile = open(genotype_filename) return [line.split(" ")[1] for line in genofile] + +def parse_genotype_labels(lines: list): + """ + Parse label lines into usable genotype values + + DESCRIPTION: + Reworks + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/utility/gen_geno_ob.py#L75-L93 + """ + acceptable_labels = ["name", "filler", "type", "mat", "pat", "het", "unk"] + def __parse_label(line): + label, value = [l.strip() for l in line[1:].split(":")] + if label not in acceptable_labels: + return None + if label == "name": + return ("group", value) + return (label, value) + return tuple( + item for item in (__parse_label(line) for line in lines) + if item is not None) diff --git a/tests/unit/db/test_genotypes.py b/tests/unit/db/test_genotypes.py new file mode 100644 index 0000000..0264764 --- /dev/null +++ b/tests/unit/db/test_genotypes.py @@ -0,0 +1,17 @@ +"""Tests gn3.db.genotypes""" +from unittest import TestCase +from gn3.db.genotypes import parse_genotype_labels + +class TestGenotypes(TestCase): + """Tests for functions in `gn3.db.genotypes`.""" + + def test_parse_genotype_labels(self): + self.assertEqual( + parse_genotype_labels([ + "@name: test_group\t", "@filler: test_filler ", + "@type:test_type", "@mat:test_mat \t", "@pat:test_pat ", + "@het: test_het ", "@unk: test_unk", "@other: test_other", + "@brrr: test_brrr "]), + (("group", "test_group"), ("filler", "test_filler"), + ("type", "test_type"), ("mat", "test_mat"), ("pat", "test_pat"), + ("het", "test_het"), ("unk", "test_unk"))) -- cgit v1.2.3 From b975e0cfd1d0adc5f51e66292d29d4651d3f053f Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Wed, 1 Sep 2021 07:35:40 +0300 Subject: Parse the genotype file's data header * gn3/db/genotypes.py: parse data header * tests/unit/db/test_genotypes.py: check that header's parse works correctly. Add tests to check that the parser works as expected. Add code to implement the parsing and pass the tests. 
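A quick sketch of the header bookkeeping implemented below (`header` is a shortened, made-up example line):

```python
# Illustration of the column indexing done by parse_genotype_header.
header = "Chr\tLocus\tcM\tMb\tBXD1\tBXD2\tBXD5"
items = [item.strip() for item in header.split("\t")]
mbmap = "Mb" in items
prgy = tuple(items[4:] if mbmap else items[3:])
print({"Mbmap": mbmap,
       "cm_column": items.index("cM"),
       "mb_column": items.index("Mb") if mbmap else None,
       "prgy": prgy,
       "nprgy": len(prgy)})
# {'Mbmap': True, 'cm_column': 2, 'mb_column': 3,
#  'prgy': ('BXD1', 'BXD2', 'BXD5'), 'nprgy': 3}
```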
--- gn3/db/genotypes.py | 19 +++++++++++++++++++ tests/unit/db/test_genotypes.py | 22 +++++++++++++++++++++- 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py index 2be3e1a..be0dfc2 100644 --- a/gn3/db/genotypes.py +++ b/gn3/db/genotypes.py @@ -87,3 +87,22 @@ def parse_genotype_labels(lines: list): return tuple( item for item in (__parse_label(line) for line in lines) if item is not None) + +def parse_genotype_header(line: str, parlist = tuple()): + """ + Parse the genotype file header line + + DESCRIPTION: + Reworks + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/utility/gen_geno_ob.py#L94-L114 + """ + items = [item.strip() for item in line.split("\t")] + Mbmap = "Mb" in items + prgy = ((parlist + tuple(items[4:])) if Mbmap + else (parlist + tuple(items[3:]))) + return ( + ("Mbmap", Mbmap), + ("cm_column", items.index("cM")), + ("mb_column", None if not Mbmap else items.index("Mb")), + ("prgy", prgy), + ("nprgy", len(prgy))) diff --git a/tests/unit/db/test_genotypes.py b/tests/unit/db/test_genotypes.py index 0264764..4fa8a53 100644 --- a/tests/unit/db/test_genotypes.py +++ b/tests/unit/db/test_genotypes.py @@ -1,6 +1,6 @@ """Tests gn3.db.genotypes""" from unittest import TestCase -from gn3.db.genotypes import parse_genotype_labels +from gn3.db.genotypes import parse_genotype_labels, parse_genotype_header class TestGenotypes(TestCase): """Tests for functions in `gn3.db.genotypes`.""" @@ -15,3 +15,23 @@ class TestGenotypes(TestCase): (("group", "test_group"), ("filler", "test_filler"), ("type", "test_type"), ("mat", "test_mat"), ("pat", "test_pat"), ("het", "test_het"), ("unk", "test_unk"))) + + def test_parse_genotype_header(self): + for header, expected in [ + [("Chr\tLocus\tcM\tMb\tBXD1\tBXD2\tBXD5\tBXD6\tBXD8\tBXD9\t" + "BXD11\tBXD12\tBXD13\tBXD14\tBXD15\tBXD16\tBXD18\tBXD19"), + (("Mbmap", True), ("cm_column", 2), ("mb_column", 3), + ("prgy", + ("BXD1", "BXD2", "BXD5", "BXD6", "BXD8", "BXD9", "BXD11", + "BXD12", "BXD13", "BXD14", "BXD15", "BXD16", "BXD18", + "BXD19")), + ("nprgy", 14))], + [("Chr\tLocus\tcM\tBXD1\tBXD2\tBXD5\tBXD6\tBXD8\tBXD9\tBXD11" + "\tBXD12\tBXD13\tBXD14\tBXD15\tBXD16\tBXD18"), + (("Mbmap", False), ("cm_column", 2), ("mb_column", None), + ("prgy", + ("BXD1", "BXD2", "BXD5", "BXD6", "BXD8", "BXD9", "BXD11", + "BXD12", "BXD13", "BXD14", "BXD15", "BXD16", "BXD18")), + ("nprgy", 13))]]: + with self.subTest(header=header): + self.assertEqual(parse_genotype_header(header), expected) -- cgit v1.2.3 From a1c217cf277feda3815a8435d6c8909f1b5546a1 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Wed, 1 Sep 2021 09:11:17 +0300 Subject: Parse data lines into markers Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/db/genotypes.py: parse data lines in file to genetic markers. * tests/unit/db/test_genotypes.py: test that parsing works. Add some tests to check that the parsing of the markers works as expected, and add the code to actually parse the markers. 
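The core of the marker-line parsing is the allele-to-genotype translation; a tiny standalone sketch (the allele codes B/D/H/U match the labels used in the test data below):

```python
# Illustration only: mat -> -1, pat -> 1, het -> 0, anything unknown -> "U".
geno_table = {"B": -1, "D": 1, "H": 0, "U": "U"}
alleles = ["B", "B", "D", "H", "D", "B", "U", "D"]
print(tuple(geno_table.get(allele, "U") for allele in alleles))
# (-1, -1, 1, 0, 1, -1, 'U', 1)
```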
--- gn3/db/genotypes.py | 37 +++++++++++++++++++++++++++++++++++++ tests/unit/db/test_genotypes.py | 38 +++++++++++++++++++++++++++++++++++++- 2 files changed, 74 insertions(+), 1 deletion(-) diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py index be0dfc2..8710d2e 100644 --- a/gn3/db/genotypes.py +++ b/gn3/db/genotypes.py @@ -106,3 +106,40 @@ def parse_genotype_header(line: str, parlist = tuple()): ("mb_column", None if not Mbmap else items.index("Mb")), ("prgy", prgy), ("nprgy", len(prgy))) + +def parse_genotype_data_line(line: str, geno_obj: dict, parlist: list): + """ + Parse a data line in a genotype file + + DESCRIPTION: + Reworks + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/utility/gen_geno_ob.py#L143-L190 + """ + marker_row = [item.strip() for item in line.split("\t")] + geno_table = { + geno_obj["mat"]: -1, geno_obj["pat"]: 1, geno_obj["het"]: 0, + geno_obj["unk"]: "U" + } + start_pos = 4 if geno_obj["Mbmap"] else 3 + if len(parlist) > 0: + start_pos = start_pos + 2 + + alleles = marker_row[start_pos:] + genotype = tuple( + (geno_table[allele] if allele in geno_table.keys() else "U") + for allele in alleles) + if len(parlist) > 0: + genotype = (-1, 1) + genotype + try: + cM = float(geno_obj["cm_column"]) + except: + if geno_obj["Mbmap"]: + cM = float(geno_obj["mb_column"]) + else: + cM = 0 + return ( + ("chr", marker_row[0]), + ("name", marker_row[1]), + ("cM", cM), + ("Mb", float(geno_obj["mb_column"]) if geno_obj["Mbmap"] else None), + ("genotype", genotype)) diff --git a/tests/unit/db/test_genotypes.py b/tests/unit/db/test_genotypes.py index 4fa8a53..ba90191 100644 --- a/tests/unit/db/test_genotypes.py +++ b/tests/unit/db/test_genotypes.py @@ -1,11 +1,13 @@ """Tests gn3.db.genotypes""" from unittest import TestCase -from gn3.db.genotypes import parse_genotype_labels, parse_genotype_header +from gn3.db.genotypes import ( + parse_genotype_labels, parse_genotype_header, parse_genotype_data_line) class TestGenotypes(TestCase): """Tests for functions in `gn3.db.genotypes`.""" def test_parse_genotype_labels(self): + """Test that the genotype labels are parsed correctly.""" self.assertEqual( parse_genotype_labels([ "@name: test_group\t", "@filler: test_filler ", @@ -17,6 +19,7 @@ class TestGenotypes(TestCase): ("het", "test_het"), ("unk", "test_unk"))) def test_parse_genotype_header(self): + """Test that the genotype header is parsed correctly.""" for header, expected in [ [("Chr\tLocus\tcM\tMb\tBXD1\tBXD2\tBXD5\tBXD6\tBXD8\tBXD9\t" "BXD11\tBXD12\tBXD13\tBXD14\tBXD15\tBXD16\tBXD18\tBXD19"), @@ -35,3 +38,36 @@ class TestGenotypes(TestCase): ("nprgy", 13))]]: with self.subTest(header=header): self.assertEqual(parse_genotype_header(header), expected) + + def test_parse_genotype_data_line(self): + """Test parsing of data lines.""" + for line, geno_obj, parlist, expected in [ + ["1\trs31443144\t1.50\t3.010274\tB\tB\tD\tD\tD\tB\tB\tD\tB\tB", + {"mat": "test_mat", "pat": "test_pat", "het": "test_het", + "unk": "test_unk", "cm_column": 2, "Mbmap": True, + "mb_column": 3}, + tuple(), + (("chr", "1"), ("name", "rs31443144"), ("cM", 2.0), + ("Mb", 3.0), + ("genotype", + ("U", "U", "U", "U", "U", "U", "U", "U", "U", "U")))], + ["1\trs31443144\t1.50\t3.010274\tB\tB\tD\tD\tD\tB\tB\tD\tB\tB", + {"mat": "test_mat", "pat": "test_pat", "het": "test_het", + "unk": "test_unk", "cm_column": 2, "Mbmap": True, + "mb_column": 3}, + ("some", "parlist", "content"), + (("chr", "1"), ("name", "rs31443144"), ("cM", 2.0), + ("Mb", 3.0), + ("genotype", + (-1, 1, "U", "U", "U", "U", "U", 
"U", "U", "U")))], + ["1\trs31443144\t1.50\t3.010274\tB\tB\tD\tH\tD\tB\tU\tD\tB\tB", + {"mat": "B", "pat": "D", "het": "H", "unk": "U", + "cm_column": 2, "Mbmap": True, "mb_column": 3}, + tuple(), + (("chr", "1"), ("name", "rs31443144"), ("cM", 2.0), + ("Mb", 3.0), + ("genotype", (-1, -1, 1, 0, 1, -1, "U", 1, -1, -1)))]]: + with self.subTest(line = line): + self.assertEqual( + parse_genotype_data_line(line, geno_obj, parlist), + expected) -- cgit v1.2.3 From abfc0410a2385d8c3d6ee1915fc99b708e1d0dbc Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Wed, 1 Sep 2021 10:49:52 +0300 Subject: Built top-level genotype file parsing function Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/db/genotypes.py: parse genotype files * tests/unit/db/test_genotypes.py: test parsing is correct Add the overall genotype files parsing function and tests to check that the parsing works as expected. --- gn3/db/genotypes.py | 38 ++++++++++++++- tests/unit/db/test_genotypes.py | 101 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 136 insertions(+), 3 deletions(-) diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py index 8710d2e..b5d14a5 100644 --- a/gn3/db/genotypes.py +++ b/gn3/db/genotypes.py @@ -107,7 +107,7 @@ def parse_genotype_header(line: str, parlist = tuple()): ("prgy", prgy), ("nprgy", len(prgy))) -def parse_genotype_data_line(line: str, geno_obj: dict, parlist: list): +def parse_genotype_marker(line: str, geno_obj: dict, parlist: list): """ Parse a data line in a genotype file @@ -143,3 +143,39 @@ def parse_genotype_data_line(line: str, geno_obj: dict, parlist: list): ("cM", cM), ("Mb", float(geno_obj["mb_column"]) if geno_obj["Mbmap"] else None), ("genotype", genotype)) + +def build_genotype_chromosomes(geno_obj, markers): + """ + Build up the chromosomes from the given markers and partially built geno + object + """ + mrks = [dict(marker) for marker in markers] + chr_names = {marker["chr"] for marker in mrks} + return tuple(( + ("name", chr_name), ("mb_exists", geno_obj["Mbmap"]), ("cm_column", 2), + ("mb_column", geno_obj["mb_column"]), + ("loci", tuple(marker for marker in mrks if marker["chr"] == chr_name))) + for chr_name in sorted(chr_names)) + +def parse_genotype_file(filename: str, parlist = tuple()): + """ + Parse the provided genotype file into a usable pytho3 data structure. 
+ """ + with open(filename, "r") as infile: + contents = infile.readlines() + + lines = tuple(line for line in contents if + ((not line.strip().startswith("#")) and + (not line.strip() == ""))) + labels = parse_genotype_labels( + line for line in lines if line.startswith("@")) + data_lines = tuple(line for line in lines if not line.startswith("@")) + header = parse_genotype_header(data_lines[0], parlist) + geno_obj = dict(labels + header) + markers = tuple( + parse_genotype_marker(line, geno_obj, parlist) + for line in data_lines[1:]) + chromosomes = tuple( + dict(chromosome) for chromosome in + build_genotype_chromosomes(geno_obj, markers)) + return {**geno_obj, "chromosomes": chromosomes} diff --git a/tests/unit/db/test_genotypes.py b/tests/unit/db/test_genotypes.py index ba90191..a05ce48 100644 --- a/tests/unit/db/test_genotypes.py +++ b/tests/unit/db/test_genotypes.py @@ -1,7 +1,11 @@ """Tests gn3.db.genotypes""" from unittest import TestCase from gn3.db.genotypes import ( - parse_genotype_labels, parse_genotype_header, parse_genotype_data_line) + parse_genotype_file, + parse_genotype_labels, + parse_genotype_header, + parse_genotype_marker, + build_genotype_chromosomes) class TestGenotypes(TestCase): """Tests for functions in `gn3.db.genotypes`.""" @@ -69,5 +73,98 @@ class TestGenotypes(TestCase): ("genotype", (-1, -1, 1, 0, 1, -1, "U", 1, -1, -1)))]]: with self.subTest(line = line): self.assertEqual( - parse_genotype_data_line(line, geno_obj, parlist), + parse_genotype_marker(line, geno_obj, parlist), expected) + + def test_build_genotype_chromosomes(self): + """ + Given `markers` and `geno_obj`, test that `build_genotype_chromosomes` + builds a sequence of chromosomes with the given markers ordered + according to the `chr` value.""" + for markers, geno_obj, expected in [ + [[(("chr", "1"), ("name", "rs31443144"), ("cM", 2.0), + ("Mb", 3.0), + ("genotype", (-1, -1, 1, 0, 1, -1, "U", 1, -1, -1))), + (("chr", "2"), ("name", "rs31443144"), ("cM", 2.0), + ("Mb", 3.0), + ("genotype", (-1, -1, 1, 0, 1, -1, "U", 1, -1, -1)))], + {"mat": "B", "pat": "D", "het": "H", "unk": "U", + "cm_column": 2, "Mbmap": True, "mb_column": 3}, + ((("name", "1"), ("mb_exists", True), ("cm_column", 2), + ("mb_column", 3), + ("loci", + ({"chr": "1", "name": "rs31443144", "cM": 2.0, "Mb": 3.0, + "genotype": (-1, -1, 1, 0, 1, -1, "U", 1, -1, -1)},))), + (("name", "2"), ("mb_exists", True), ("cm_column", 2), + ("mb_column", 3), + ("loci", + ({"chr": "2", "name": "rs31443144", "cM": 2.0, "Mb": 3.0, + "genotype": (-1, -1, 1, 0, 1, -1, "U", 1, -1, -1)},))))], + [[(("chr", "1"), ("name", "rs31443144"), ("cM", 2.0), + ("Mb", None), + ("genotype", (-1, 1, 1, 0, 1, -1, "U", 1, -1, -1)))], + {"mat": "B", "pat": "D", "het": "H", "unk": "U", + "cm_column": 2, "Mbmap": False, "mb_column": None}, + ((("name", "1"), ("mb_exists", False), ("cm_column", 2), + ("mb_column", None), + ("loci", + ({"chr": "1", "name": "rs31443144", "cM": 2.0, "Mb": None, + "genotype": (-1, 1, 1, 0, 1, -1, "U", 1, -1, -1)},))),)]]: + with self.subTest(markers = markers): + self.assertEqual( + build_genotype_chromosomes(geno_obj, markers), + expected) + + def test_parse_genotype_file(self): + """Test the parsing of genotype files. 
""" + self.assertEqual( + parse_genotype_file( + "tests/unit/db/data/genotypes/genotype_sample1.geno"), + {"group": "BXD", + "type": "riset", + "mat": "B", + "pat": "D", + "het": "H", + "unk": "U", + "Mbmap": True, + "cm_column": 2, + "mb_column": 3, + "prgy": ("BXD1", "BXD2", "BXD5", "BXD6", "BXD8", "BXD9"), + "nprgy": 6, + "chromosomes": ( + {"name": "1", + "mb_exists": True, + "cm_column": 2, + "mb_column": 3, + "loci": ( + {"chr": "1", + "name": "rs31443144", + "cM": 2.0, + "Mb": 3.0, + "genotype": (-1, -1, 1, 1, 1, -1) + }, + {"chr": "1", + "name": "rs6269442", + "cM": 2.0, + "Mb": 3.0, + "genotype": (-1, -1, 1, 1, 0, "U")}, + {"chr": "1", + "name": "rs32285189", + "cM": 2.0, + "Mb": 3.0, + "genotype": (-1, "U", 1, 1, 1, -1)})}, + {"name": "2", + "mb_exists": True, + "cm_column": 2, + "mb_column": 3, + "loci": ( + {"chr": "2", + "name": "rs31443144", + "cM": 2.0, + "Mb": 3.0, + "genotype": (-1, -1, 1, 1, 1, -1)}, + {"chr": "2", + "name": "rs6269442", + "cM": 2.0, + "Mb": 3.0, + "genotype": (-1, -1, 1, 1, 0, "U")})})}) -- cgit v1.2.3 From 3ded952f40f486d9aa69746eac2afe7f67fef790 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Wed, 1 Sep 2021 11:08:38 +0300 Subject: Fix linting and typing issues Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi --- gn3/db/genotypes.py | 32 ++++++++++++++++---------------- tests/unit/db/test_genotypes.py | 10 +++++----- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py index b5d14a5..b03d55c 100644 --- a/gn3/db/genotypes.py +++ b/gn3/db/genotypes.py @@ -88,7 +88,7 @@ def parse_genotype_labels(lines: list): item for item in (__parse_label(line) for line in lines) if item is not None) -def parse_genotype_header(line: str, parlist = tuple()): +def parse_genotype_header(line: str, parlist: tuple = tuple()): """ Parse the genotype file header line @@ -97,13 +97,13 @@ def parse_genotype_header(line: str, parlist = tuple()): https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/utility/gen_geno_ob.py#L94-L114 """ items = [item.strip() for item in line.split("\t")] - Mbmap = "Mb" in items - prgy = ((parlist + tuple(items[4:])) if Mbmap + mbmap = "Mb" in items + prgy = ((parlist + tuple(items[4:])) if mbmap else (parlist + tuple(items[3:]))) return ( - ("Mbmap", Mbmap), + ("Mbmap", mbmap), ("cm_column", items.index("cM")), - ("mb_column", None if not Mbmap else items.index("Mb")), + ("mb_column", None if not mbmap else items.index("Mb")), ("prgy", prgy), ("nprgy", len(prgy))) @@ -131,16 +131,16 @@ def parse_genotype_marker(line: str, geno_obj: dict, parlist: list): if len(parlist) > 0: genotype = (-1, 1) + genotype try: - cM = float(geno_obj["cm_column"]) + cm_val = float(geno_obj["cm_column"]) except: if geno_obj["Mbmap"]: - cM = float(geno_obj["mb_column"]) + cm_val = float(geno_obj["mb_column"]) else: - cM = 0 + cm_val = 0 return ( ("chr", marker_row[0]), ("name", marker_row[1]), - ("cM", cM), + ("cM", cm_val), ("Mb", float(geno_obj["mb_column"]) if geno_obj["Mbmap"] else None), ("genotype", genotype)) @@ -155,9 +155,9 @@ def build_genotype_chromosomes(geno_obj, markers): ("name", chr_name), ("mb_exists", geno_obj["Mbmap"]), ("cm_column", 2), ("mb_column", geno_obj["mb_column"]), ("loci", tuple(marker for marker in mrks if marker["chr"] == chr_name))) - for chr_name in sorted(chr_names)) + for chr_name in sorted(chr_names)) -def parse_genotype_file(filename: str, parlist = tuple()): +def parse_genotype_file(filename: 
str, parlist: tuple = tuple()): """ Parse the provided genotype file into a usable pytho3 data structure. """ @@ -165,16 +165,16 @@ def parse_genotype_file(filename: str, parlist = tuple()): contents = infile.readlines() lines = tuple(line for line in contents if - ((not line.strip().startswith("#")) and - (not line.strip() == ""))) + ((not line.strip().startswith("#")) and + (not line.strip() == ""))) labels = parse_genotype_labels( - line for line in lines if line.startswith("@")) + [line for line in lines if line.startswith("@")]) data_lines = tuple(line for line in lines if not line.startswith("@")) header = parse_genotype_header(data_lines[0], parlist) geno_obj = dict(labels + header) markers = tuple( - parse_genotype_marker(line, geno_obj, parlist) - for line in data_lines[1:]) + [parse_genotype_marker(line, geno_obj, parlist) + for line in data_lines[1:]]) chromosomes = tuple( dict(chromosome) for chromosome in build_genotype_chromosomes(geno_obj, markers)) diff --git a/tests/unit/db/test_genotypes.py b/tests/unit/db/test_genotypes.py index a05ce48..c125224 100644 --- a/tests/unit/db/test_genotypes.py +++ b/tests/unit/db/test_genotypes.py @@ -18,9 +18,9 @@ class TestGenotypes(TestCase): "@type:test_type", "@mat:test_mat \t", "@pat:test_pat ", "@het: test_het ", "@unk: test_unk", "@other: test_other", "@brrr: test_brrr "]), - (("group", "test_group"), ("filler", "test_filler"), - ("type", "test_type"), ("mat", "test_mat"), ("pat", "test_pat"), - ("het", "test_het"), ("unk", "test_unk"))) + (("group", "test_group"), ("filler", "test_filler"), + ("type", "test_type"), ("mat", "test_mat"), ("pat", "test_pat"), + ("het", "test_het"), ("unk", "test_unk"))) def test_parse_genotype_header(self): """Test that the genotype header is parsed correctly.""" @@ -71,7 +71,7 @@ class TestGenotypes(TestCase): (("chr", "1"), ("name", "rs31443144"), ("cM", 2.0), ("Mb", 3.0), ("genotype", (-1, -1, 1, 0, 1, -1, "U", 1, -1, -1)))]]: - with self.subTest(line = line): + with self.subTest(line=line): self.assertEqual( parse_genotype_marker(line, geno_obj, parlist), expected) @@ -110,7 +110,7 @@ class TestGenotypes(TestCase): ("loci", ({"chr": "1", "name": "rs31443144", "cM": 2.0, "Mb": None, "genotype": (-1, 1, 1, 0, 1, -1, "U", 1, -1, -1)},))),)]]: - with self.subTest(markers = markers): + with self.subTest(markers=markers): self.assertEqual( build_genotype_chromosomes(geno_obj, markers), expected) -- cgit v1.2.3 From 608ff9c6ff668d18f0c42aebf658ef80b517a6de Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Mon, 6 Sep 2021 06:45:18 +0300 Subject: Find nearest marker Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Migrate the `web.webqtl.heatmap.Heatmap.getNearestMarker` function in GN1 to GN3. --- gn3/computations/heatmap.py | 49 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py index 1143450..ccce385 100644 --- a/gn3/computations/heatmap.py +++ b/gn3/computations/heatmap.py @@ -30,7 +30,7 @@ def export_trait_data( The dictionary of key-value pairs representing a trait strainlist: (list) A list of strain names - type: (str) + dtype: (str) ... verify what this is ... 
var_exists: (bool) A flag indicating existence of variance @@ -232,3 +232,50 @@ def retrieve_strains_and_values(orders, strainlist, traits_data_list): values = [] return rets + +def nearest_marker_finder(genotype): + """ + Returns a function to be used with `genotype` to compute the nearest marker + to the trait passed to the returned function. + + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L425-434 + """ + def __compute_distances(chromo, trait): + loci = chromo.get("loci", None) + if not loci: + return None + return tuple( + { + "name": locus["name"], + "distance": abs(locus["Mb"] - trait["mb"]) + } for locus in loci) + + def __finder(trait): + _chrs = tuple( + _chr for _chr in genotype["chromosomes"] + if str(_chr["name"]) == str(trait["chr"])) + if len(_chrs) == 0: + return None + distances = tuple( + distance for dists in + filter( + lambda x: x is not None, + (__compute_distances(_chr, trait) for _chr in _chrs)) + for distance in dists) + nearest = min(distances, key=lambda d: d["distance"]) + return nearest["name"] + return __finder + +def get_nearest_marker(traits_list, genotype): + """ + Retrieves the nearest marker for each of the traits in the list. + + DESCRIPTION: + This migrates the code in + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L419-L438 + """ + if not genotype["Mbmap"]: + return [None] * len(trait_list) + + marker_finder = nearest_marker_finder(genotype) + return [marker_finder(trait) for trait in traits_list] -- cgit v1.2.3 From 4ce5695a35e92a704add8d497266bb2986a593f6 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Mon, 6 Sep 2021 06:47:52 +0300 Subject: Handle type-coercion exceptions * gn3/computations/qtlreaper.py: handle exceptions Sometimes, the values being parsed are plain strings and cannot be cast to the float types. This commit handles that by casting only those values that can be cast to float, and returning the others as strings. --- gn3/computations/qtlreaper.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index 30c7051..eff2a80 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -94,9 +94,15 @@ def parse_reaper_main_results(results_file): with open(results_file, "r") as infile: lines = infile.readlines() + def __parse_column_value(value): + try: + return float(value) + except: + return value + def __parse_line(line): items = line.strip().split("\t") - return items[0:2] + [float(item) for item in items[2:]] + return items[0:2] + [__parse_column_value(item) for item in items[2:]] header = lines[0].strip().split("\t") return [dict(zip(header, __parse_line(line))) for line in lines[1:]] -- cgit v1.2.3 From 679a1af832ad9585c7cf72996043edb08e1b0d10 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Mon, 6 Sep 2021 08:06:14 +0300 Subject: Leave "Chr" value as string when parsing Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * The "Chr" value seems to be mostly a name of some sort, despite it being, seemingly an number. This commit parses the "Chr" value as a string. It also updates the tests to expec a string, rather than a number for "Chr" values. 
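As a rough, standalone sketch of the parsing behaviour this gives (not the
module code itself; the header and data line below are made up), the first
three tab-separated columns are kept as strings while the remaining columns
are cast to float where possible:

    def parse_line(header, line):
        # ID, Locus and Chr stay as strings; the numeric columns become floats.
        def to_float(value):
            try:
                return float(value)
            except ValueError:
                return value
        items = line.strip().split("\t")
        return dict(zip(header, items[0:3] + [to_float(item) for item in items[3:]]))

    header = ["ID", "Locus", "Chr", "cM", "Mb", "LRS", "Additive", "pValue"]
    print(parse_line(header, "T1\trs31443144\t1\t1.500\t3.010\t0.500\t-0.074\t1.000"))
    # {'ID': 'T1', 'Locus': 'rs31443144', 'Chr': '1', 'cM': 1.5, 'Mb': 3.01,
    #  'LRS': 0.5, 'Additive': -0.074, 'pValue': 1.0}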
--- gn3/computations/qtlreaper.py | 5 +++-- tests/unit/computations/test_qtlreaper.py | 20 ++++++++++---------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index eff2a80..9b20309 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -94,7 +94,7 @@ def parse_reaper_main_results(results_file): with open(results_file, "r") as infile: lines = infile.readlines() - def __parse_column_value(value): + def __parse_column_float_value(value): try: return float(value) except: @@ -102,7 +102,8 @@ def parse_reaper_main_results(results_file): def __parse_line(line): items = line.strip().split("\t") - return items[0:2] + [__parse_column_value(item) for item in items[2:]] + return items[0:3] + [ + __parse_column_float_value(item) for item in items[3:]] header = lines[0].strip().split("\t") return [dict(zip(header, __parse_line(line))) for line in lines[1:]] diff --git a/tests/unit/computations/test_qtlreaper.py b/tests/unit/computations/test_qtlreaper.py index 6c3b64d..fd3434a 100644 --- a/tests/unit/computations/test_qtlreaper.py +++ b/tests/unit/computations/test_qtlreaper.py @@ -13,52 +13,52 @@ class TestQTLReaper(TestCase): "tests/unit/computations/data/qtlreaper/main_output_sample.txt"), [ { - "ID": "T1", "Locus": "rs31443144", "Chr": 1, "cM": 1.500, + "ID": "T1", "Locus": "rs31443144", "Chr": "1", "cM": 1.500, "Mb": 3.010, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs6269442", "Chr": 1, "cM": 1.500, + "ID": "T1", "Locus": "rs6269442", "Chr": "1", "cM": 1.500, "Mb": 3.492, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs32285189", "Chr": 1, "cM": 1.630, + "ID": "T1", "Locus": "rs32285189", "Chr": "1", "cM": 1.630, "Mb": 3.511, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs258367496", "Chr": 1, "cM": 1.630, + "ID": "T1", "Locus": "rs258367496", "Chr": "1", "cM": 1.630, "Mb": 3.660, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs32430919", "Chr": 1, "cM": 1.750, + "ID": "T1", "Locus": "rs32430919", "Chr": "1", "cM": 1.750, "Mb": 3.777, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs36251697", "Chr": 1, "cM": 1.880, + "ID": "T1", "Locus": "rs36251697", "Chr": "1", "cM": 1.880, "Mb": 3.812, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs30658298", "Chr": 1, "cM": 2.010, + "ID": "T1", "Locus": "rs30658298", "Chr": "1", "cM": 2.010, "Mb": 4.431, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs51852623", "Chr": 1, "cM": 2.010, + "ID": "T1", "Locus": "rs51852623", "Chr": "1", "cM": 2.010, "Mb": 4.447, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs31879829", "Chr": 1, "cM": 2.140, + "ID": "T1", "Locus": "rs31879829", "Chr": "1", "cM": 2.140, "Mb": 4.519, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs36742481", "Chr": 1, "cM": 2.140, + "ID": "T1", "Locus": "rs36742481", "Chr": "1", "cM": 2.140, "Mb": 4.776, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 } -- cgit v1.2.3 From d4943f1d01d89a3928c905f80914a23144126c8e Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Mon, 6 Sep 2021 08:09:20 +0300 Subject: Provide function to organise parsed QTLReaper results * gn3/computations/qtlreaper.py: Provide a function to organise the results by trait for easier use down the line. 
* tests/unit/computations/test_qtlreaper.py: provide a test to ensure that the organising function works as expected. --- gn3/computations/qtlreaper.py | 25 +++++++ tests/unit/computations/test_qtlreaper.py | 105 +++++++++++++++++++++++++++++- 2 files changed, 129 insertions(+), 1 deletion(-) diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index 9b20309..8c0e6de 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -86,6 +86,31 @@ def run_reaper( subprocess.run(command_list, check=True) return (output_filename, permu_output_filename) +def organise_reaper_main_results(parsed_results): + def __organise_by_chromosome(chr_name, items): + chr_items = [item for item in items if item["Chr"] == chr_name] + return { + "Chr": str(chr_name), + "loci": [{ + "Locus": locus["Locus"], + "cM": locus["cM"], + "Mb": locus["Mb"], + "LRS": locus["LRS"], + "Additive": locus["Additive"], + "pValue": locus["pValue"] + } for locus in chr_items]} + + def __organise_by_id(identifier, items): + id_items = [item for item in items if item["ID"] == identifier] + unique_chromosomes = {item["Chr"] for item in id_items} + return { + "ID": identifier, + "chromosomes": [ + __organise_by_chromosome(chromo, id_items) + for chromo in sorted(unique_chromosomes)]} + + unique_ids = {res["ID"] for res in parsed_results} + return [__organise_by_id(_id, parsed_results) for _id in sorted(unique_ids)] def parse_reaper_main_results(results_file): """ diff --git a/tests/unit/computations/test_qtlreaper.py b/tests/unit/computations/test_qtlreaper.py index fd3434a..1d7347f 100644 --- a/tests/unit/computations/test_qtlreaper.py +++ b/tests/unit/computations/test_qtlreaper.py @@ -1,7 +1,9 @@ """Module contains tests for gn3.computations.qtlreaper""" from unittest import TestCase from gn3.computations.qtlreaper import ( - parse_reaper_main_results, parse_reaper_permutation_results) + parse_reaper_main_results, + organise_reaper_main_results, + parse_reaper_permutation_results) class TestQTLReaper(TestCase): """Class for testing qtlreaper interface functions.""" @@ -73,3 +75,104 @@ class TestQTLReaper(TestCase): 5.24619, 5.27961, 5.28228, 5.43903, 5.50188, 5.51694, 5.56830, 5.63874, 5.71346, 5.71936, 5.74275, 5.76764, 5.79815, 5.81671, 5.82775, 5.89659, 5.92117, 5.93396, 5.93396, 5.94957]) + + def test_organise_reaper_main_results(self): + self.assertEqual( + organise_reaper_main_results([ + { + "ID": "T1", "Locus": "rs31443144", "Chr": 1, "cM": 1.500, + "Mb": 3.010, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": "rs6269442", "Chr": 1, "cM": 1.500, + "Mb": 3.492, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": "rs32285189", "Chr": 1, "cM": 1.630, + "Mb": 3.511, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": "rs258367496", "Chr": 1, "cM": 1.630, + "Mb": 3.660, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": "rs32430919", "Chr": 1, "cM": 1.750, + "Mb": 3.777, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": "rs36251697", "Chr": 1, "cM": 1.880, + "Mb": 3.812, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": "rs30658298", "Chr": 1, "cM": 2.010, + "Mb": 4.431, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": "rs51852623", "Chr": 2, "cM": 2.010, + "Mb": 4.447, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": 
"rs31879829", "Chr": 2, "cM": 2.140, + "Mb": 4.519, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": "rs36742481", "Chr": 2, "cM": 2.140, + "Mb": 4.776, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + } + ]), + [{"ID": "T1", + "chromosomes": [ + {"Chr": "1", + "loci": [ + { + "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs6269442", "cM": 1.500, "Mb": 3.492, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs32285189", "cM": 1.630, "Mb": 3.511, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs258367496", "cM": 1.630, "Mb": 3.660, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs32430919", "cM": 1.750, "Mb": 3.777, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs36251697", "cM": 1.880, "Mb": 3.812, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs30658298", "cM": 2.010, "Mb": 4.431, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }]}, + {"Chr": "2", + "loci": [ + { + "Locus": "rs51852623", "cM": 2.010, "Mb": 4.447, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs31879829", "cM": 2.140, "Mb": 4.519, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs36742481", "cM": 2.140, "Mb": 4.776, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }]}]}]) -- cgit v1.2.3 From 31ca02d1f095c2cc667e5b7d49131d702982f321 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Wed, 8 Sep 2021 06:52:01 +0300 Subject: Fix the traits order computations for clustering Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/computations/heatmap.py: Fix ordering function * tests/unit/computations/test_heatmap.py: update test The order of the traits is important for the clustering algorithm, since the clustering seems to use the distance of one trait from another to determine how to order them. This commit also gets rid of the xoffset argument that is not important to the ordering, and was used in the older GN1 to determine how to draw the clustering lines. --- gn3/computations/heatmap.py | 16 ++++++---------- tests/unit/computations/test_heatmap.py | 11 +++-------- 2 files changed, 9 insertions(+), 18 deletions(-) diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py index ccce385..8727c92 100644 --- a/gn3/computations/heatmap.py +++ b/gn3/computations/heatmap.py @@ -180,28 +180,24 @@ def heatmap_data(traits_names, conn: Any): "traits_filename": traits_filename } -def compute_heatmap_order( - slink_data, xoffset: int = 40, neworder: tuple = tuple()): +def compute_traits_order(slink_data, neworder: tuple = tuple()): """ - Compute the data used for drawing the heatmap proper from `slink_data`. + Compute the order of the traits for clustering from `slink_data`. This function tries to reproduce the creation and update of the `neworder` variable in https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L120 and in the `web.webqtl.heatmap.Heatmap.draw` function in GN1 """ - d_1 = (0, 0, 0) # returned from self.draw in lines 391 and 399. 
This is just a placeholder - def __order_maker(norder, slnk_dt): if isinstance(slnk_dt[0], int) and isinstance(slnk_dt[1], int): - return norder + ( - (xoffset+20, slnk_dt[0]), (xoffset + 40, slnk_dt[1])) + return norder + (slnk_dt[0], slnk_dt[1]) if isinstance(slnk_dt[0], int): - return norder + ((xoffset + 20, slnk_dt[0]), ) + return __order_maker((norder + (slnk_dt[0], )), slnk_dt[1]) if isinstance(slnk_dt[1], int): - return norder + ((xoffset + d_1[0] + 20, slnk_dt[1]), ) + return __order_maker(norder, slnk_dt[0]) + (slnk_dt[1], ) return __order_maker(__order_maker(norder, slnk_dt[0]), slnk_dt[1]) @@ -222,7 +218,7 @@ def retrieve_strains_and_values(orders, strainlist, traits_data_list): values = [] rets = [] for order in orders: - temp_val = traits_data_list[order[1]] + temp_val = traits_data_list[order] for i, strain in enumerate(strainlist): if temp_val[i] is not None: strains.append(strain) diff --git a/tests/unit/computations/test_heatmap.py b/tests/unit/computations/test_heatmap.py index 87f8e45..f1bbefc 100644 --- a/tests/unit/computations/test_heatmap.py +++ b/tests/unit/computations/test_heatmap.py @@ -3,7 +3,7 @@ from unittest import TestCase from gn3.computations.heatmap import ( cluster_traits, export_trait_data, - compute_heatmap_order, + compute_traits_order, retrieve_strains_and_values) strainlist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] @@ -158,13 +158,8 @@ class TestHeatmap(TestCase): def test_compute_heatmap_order(self): """Test the orders.""" - for xoff, expected in [ - (40, ((60, 9), (60, 4))), - (30, ((50, 9), (50, 4))), - (20, ((40, 9), (40, 4)))]: - with self.subTest(xoffset=xoff): - self.assertEqual( - compute_heatmap_order(slinked, xoffset=xoff), expected) + self.assertEqual( + compute_traits_order(slinked), (0, 2, 1, 7, 5, 9, 3, 6, 8, 4)) def test_retrieve_strains_and_values(self): """Test retrieval of strains and values.""" -- cgit v1.2.3 From f360cc62cc156af90d3283ae7b6db9e8250fa43c Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Wed, 8 Sep 2021 10:51:57 +0300 Subject: Remove extraneous text to ease sorting Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Change the id from 'T' to simply '' to ease sorting of the trait results by numerical order rather than string order. --- gn3/computations/qtlreaper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index 8c0e6de..ec215e5 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -20,9 +20,9 @@ def generate_traits_file(strains, trait_values, traits_filename): header = "Trait\t{}\n".format("\t".join(strains)) data = ( [header] + - ["T{}\t{}\n".format(i+1, "\t".join([str(i) for i in t])) + ["{}\t{}\n".format(i+1, "\t".join([str(i) for i in t])) for i, t in enumerate(trait_values[:-1])] + - ["T{}\t{}".format( + ["{}\t{}".format( len(trait_values), "\t".join([str(i) for i in t])) for t in trait_values[-1:]]) with open(traits_filename, "w") as outfile: -- cgit v1.2.3 From 3f323734fcf258d28f3f7d33fdc1518ef9ec24a8 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Wed, 8 Sep 2021 10:54:48 +0300 Subject: Parse Chr value as int where possible * To ease sorting of data by numerical order down the line, sort the "Chr" values by numerical order. 
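For illustration, the sort key added in the diff below behaves roughly as in
this standalone sketch (example chromosome names only): integer chromosome
numbers sort numerically, while single-character names such as "X" fall back
to their character code.

    def chromosome_sorter_key_fn(val):
        # Mirrors the key function introduced in the diff below.
        if isinstance(val, int):
            return val
        return ord(val)

    chromosomes = [2, 10, "X", 1, "Y"]
    print(sorted(chromosomes, key=chromosome_sorter_key_fn))
    # [1, 2, 10, 'X', 'Y']   since ord('X') == 88 and ord('Y') == 89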
--- gn3/computations/qtlreaper.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index ec215e5..02d6572 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -86,11 +86,16 @@ def run_reaper( subprocess.run(command_list, check=True) return (output_filename, permu_output_filename) +def chromosome_sorter_key_fn(val): + if isinstance(val, int): + return val + return ord(val) + def organise_reaper_main_results(parsed_results): def __organise_by_chromosome(chr_name, items): chr_items = [item for item in items if item["Chr"] == chr_name] return { - "Chr": str(chr_name), + "Chr": chr_name, "loci": [{ "Locus": locus["Locus"], "cM": locus["cM"], @@ -125,9 +130,15 @@ def parse_reaper_main_results(results_file): except: return value + def __parse_column_int_value(value): + try: + return int(value) + except: + return value + def __parse_line(line): items = line.strip().split("\t") - return items[0:3] + [ + return items[0:2] + [__parse_column_int_value(items[2])] + [ __parse_column_float_value(item) for item in items[3:]] header = lines[0].strip().split("\t") -- cgit v1.2.3 From a718069c757bea9f7ecbaee25e23bd581750f906 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Wed, 8 Sep 2021 10:56:56 +0300 Subject: Ease search for traits and chromosomes Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Return a dict of values rather than list for the traits and chromosomes to ease searching through the data. --- gn3/computations/qtlreaper.py | 9 ++- tests/unit/computations/test_qtlreaper.py | 92 +++++++++++++++---------------- 2 files changed, 52 insertions(+), 49 deletions(-) diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index 02d6572..5180853 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -110,12 +110,15 @@ def organise_reaper_main_results(parsed_results): unique_chromosomes = {item["Chr"] for item in id_items} return { "ID": identifier, - "chromosomes": [ + "chromosomes": {_chr["Chr"]: _chr for _chr in [ __organise_by_chromosome(chromo, id_items) - for chromo in sorted(unique_chromosomes)]} + for chromo in sorted( + unique_chromosomes, key=chromosome_sorter_key_fn)]}} unique_ids = {res["ID"] for res in parsed_results} - return [__organise_by_id(_id, parsed_results) for _id in sorted(unique_ids)] + return { + trait["ID"]: trait for trait in + [__organise_by_id(_id, parsed_results) for _id in sorted(unique_ids)]} def parse_reaper_main_results(results_file): """ diff --git a/tests/unit/computations/test_qtlreaper.py b/tests/unit/computations/test_qtlreaper.py index 1d7347f..495ed97 100644 --- a/tests/unit/computations/test_qtlreaper.py +++ b/tests/unit/computations/test_qtlreaper.py @@ -130,49 +130,49 @@ class TestQTLReaper(TestCase): "pValue": 1.000 } ]), - [{"ID": "T1", - "chromosomes": [ - {"Chr": "1", - "loci": [ - { - "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs6269442", "cM": 1.500, "Mb": 3.492, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs32285189", "cM": 1.630, "Mb": 3.511, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs258367496", "cM": 1.630, "Mb": 3.660, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs32430919", "cM": 1.750, "Mb": 3.777, - "LRS": 0.500, "Additive": -0.074, 
"pValue": 1.000 - }, - { - "Locus": "rs36251697", "cM": 1.880, "Mb": 3.812, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs30658298", "cM": 2.010, "Mb": 4.431, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }]}, - {"Chr": "2", - "loci": [ - { - "Locus": "rs51852623", "cM": 2.010, "Mb": 4.447, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs31879829", "cM": 2.140, "Mb": 4.519, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs36742481", "cM": 2.140, "Mb": 4.776, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }]}]}]) + {"T1": {"ID": "T1", + "chromosomes": { + 1: {"Chr": 1, + "loci": [ + { + "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs6269442", "cM": 1.500, "Mb": 3.492, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs32285189", "cM": 1.630, "Mb": 3.511, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs258367496", "cM": 1.630, "Mb": 3.660, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs32430919", "cM": 1.750, "Mb": 3.777, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs36251697", "cM": 1.880, "Mb": 3.812, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs30658298", "cM": 2.010, "Mb": 4.431, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }]}, + 2: {"Chr": 2, + "loci": [ + { + "Locus": "rs51852623", "cM": 2.010, "Mb": 4.447, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs31879829", "cM": 2.140, "Mb": 4.519, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs36742481", "cM": 2.140, "Mb": 4.776, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }]}}}}) -- cgit v1.2.3 From 0d4e21d74d54554cf278142d0a31d35bec873ac1 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Thu, 9 Sep 2021 10:59:31 +0300 Subject: Update proof-of-concept code * Add individual heatmaps * Add dendograms * Merge multiple heatmaps to single plot Updated the proof of concept code to provide a sample of what is needed to generate the appropriate heatmaps. 
To generate the sample heatmaps, one can run something like: env SQL_URI="mysql://webqtlout:webqtlout@127.0.0.1:3306/db_webqtl" \ python3 qtlfilesexport.py assuming that the database can be accessed at 127.0.0.1:3306 --- qtlfilesexport.py | 199 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 192 insertions(+), 7 deletions(-) diff --git a/qtlfilesexport.py b/qtlfilesexport.py index 799de31..100fa75 100644 --- a/qtlfilesexport.py +++ b/qtlfilesexport.py @@ -7,19 +7,38 @@ Run with: replacing the variables in the angled brackets with the appropriate values """ +from gn3.random import random_string from gn3.computations.slink import slink from gn3.db_utils import database_connector from gn3.computations.qtlreaper import run_reaper -from gn3.computations.heatmap import export_trait_data from gn3.db.traits import retrieve_trait_data, retrieve_trait_info -from gn3.db.genotypes import build_genotype_file, load_genotype_samples -from gn3.computations.qtlreaper import random_string, generate_traits_file +from gn3.computations.heatmap import export_trait_data, get_nearest_marker +from gn3.db.genotypes import ( + build_genotype_file, + parse_genotype_file, + load_genotype_samples) from gn3.computations.heatmap import ( cluster_traits, - compute_heatmap_order, + compute_traits_order, retrieve_strains_and_values) +from gn3.computations.qtlreaper import ( + generate_traits_file, + chromosome_sorter_key_fn, + parse_reaper_main_results, + organise_reaper_main_results, + parse_reaper_permutation_results) -TMPDIR = "tmp/qtltests" +import plotly.express as px + +## for dendrogram +import numpy as np +import plotly.graph_objects as go +import plotly.figure_factory as ff + +# for single heatmap +from plotly.subplots import make_subplots + +TMPDIR = "tmp/" def trait_fullnames(): """Return sample names for traits""" @@ -35,6 +54,104 @@ def trait_fullnames(): "UCLA_BXDBXH_CARTILAGE_V2::ILM4200064", "UCLA_BXDBXH_CARTILAGE_V2::ILM3140463"] +def get_lrs_from_chr(trait, chr_name): + chromosome = trait["chromosomes"].get(chr_name) + if chromosome: + return [ + locus["LRS"] for locus in + sorted(chromosome["loci"], key=lambda loc: loc["Locus"])] + return [None] + +def process_traits_data_for_heatmap(data, trait_names, chromosome_names): + print("TRAIT_NAMES: {}".format(trait_names)) + print("chromosome names: {}".format(chromosome_names)) + print("data keys: {}".format(data.keys())) + hdata = [ + [get_lrs_from_chr(data[trait], chr_name) for trait in trait_names] + for chr_name in chromosome_names] + # print("hdata: {}".format(hdata)) + return hdata + +def generate_heatmap( + data, image_filename_prefix, x_axis = None, x_label: str = "", + y_axis = None, y_label: str = "", output_dir: str = TMPDIR): + """Generate single heatmap section.""" + print("X-AXIS:({}, {}), Y-AXIS: ({}, {}), ROWS:{}, COLS:{}".format( + x_axis, (len(x_axis) if x_axis else 0), + y_axis, (len(y_axis) if y_axis else 0), + len(data), len(data[0]))) + fig = px.imshow( + data, + x = x_axis, + y = y_axis, + width=1000 + ) + fig.update_yaxes(title=y_label) + fig.update_xaxes(title=x_label) + image_filename = "{}/{}.html".format(output_dir, image_filename_prefix) + fig.write_html(image_filename) + return image_filename, fig + +def generate_dendrogram( + data, image_filename_prefix, x_axis = None, x_label: str = "", + y_axis = None, y_label: str = "", output_dir: str = TMPDIR): + fig = ff.create_dendrogram( + np.array(data), orientation="right", labels=y_axis) + + heatmap = go.Heatmap( + x=fig['layout']['xaxis']['ticktext'], + 
y=fig['layout']['yaxis']['ticktext'], + z=data) + + # print("HEAMAP:{}".format(heatmap)) + fig.add_trace(heatmap) + + fig.update_layout({"width": 1000, "height": 500}) + image_filename = "{}/{}.html".format(output_dir, image_filename_prefix) + fig.write_html(image_filename) + return image_filename, fig + +def generate_single_heatmap( + data, image_filename_prefix, x_axis = None, x_label: str = "", + y_axis = None, y_label: str = "", output_dir: str = TMPDIR): + """Generate single heatmap section.""" + # fig = go.Figure({"type": "heatmap"}) + num_cols = len(x_axis) + fig = make_subplots( + rows=1, + cols=num_cols, + shared_yaxes="rows", + # horizontal_spacing=(1 / (num_cols - 1)), + subplot_titles=x_axis + ) + hms = [go.Heatmap( + name=chromo, + y = y_axis, + z = data_array, + showscale=False) for chromo, data_array in zip(x_axis, data)] + for col, hm in enumerate(hms): + fig.add_trace(hm, row=1, col=(col + 1)) + + fig.update_traces( + showlegend=False, + colorscale=[ + [0.0, '#3B3B3B'], [0.4999999999999999, '#ABABAB'], + [0.5, '#F5DE11'], [1.0, '#FF0D00']], + selector={"type": "heatmap"}) + fig.update_traces( + showlegend=True, + showscale=True, + selector={"name": x_axis[-1]}) + fig.update_layout( + coloraxis_colorscale=[ + [0.0, '#3B3B3B'], [0.4999999999999999, '#ABABAB'], + [0.5, '#F5DE11'], [1.0, '#FF0D00']] + ) + print(fig) + image_filename = "{}/{}.html".format(output_dir, image_filename_prefix) + fig.write_html(image_filename) + return image_filename, fig + def main(): """entrypoint function""" conn = database_connector()[0] @@ -44,13 +161,19 @@ def main(): for fullname in trait_fullnames()] traits_data_list = [retrieve_trait_data(t, conn) for t in traits] genotype_filename = build_genotype_file(traits[0]["riset"]) + genotype = parse_genotype_file(genotype_filename) strains = load_genotype_samples(genotype_filename) exported_traits_data_list = [ export_trait_data(td, strains) for td in traits_data_list] slinked = slink(cluster_traits(exported_traits_data_list)) - orders = compute_heatmap_order(slinked) + print("SLINKED: {}".format(slinked)) + traits_order = compute_traits_order(slinked) + print("KEYS: {}".format(traits[0].keys())) + ordered_traits_names = [ + traits[idx]["trait_fullname"] for idx in traits_order] + print("ORDERS: {}".format(traits_order)) strains_and_values = retrieve_strains_and_values( - orders, strains, exported_traits_data_list) + traits_order, strains, exported_traits_data_list) strains_values = strains_and_values[0][1] trait_values = [t[2] for t in strains_and_values] traits_filename = "{}/traits_test_file_{}.txt".format( @@ -64,5 +187,67 @@ def main(): print("Main output: {}, Permutation output: {}".format( main_output, permutations_output)) + qtlresults = parse_reaper_main_results(main_output) + permudata = parse_reaper_permutation_results(permutations_output) + # print("QTLRESULTS: {}".format(qtlresults)) + # print("PERMUDATA: {}".format(permudata)) + + nearest = get_nearest_marker(traits, genotype) + print("NEAREST: {}".format(nearest)) + + organised = organise_reaper_main_results(qtlresults) + + traits_ids = [# sort numerically, but retain the ids as strings + str(i) for i in sorted({int(row["ID"]) for row in qtlresults})] + chromosome_names = sorted( + {row["Chr"] for row in qtlresults}, key = chromosome_sorter_key_fn) + loci_names = sorted({row["Locus"] for row in qtlresults}) + ordered_traits_names = { + res_id: trait for res_id, trait in + zip(traits_ids, + [traits[idx]["trait_fullname"] for idx in traits_order])} + # print("ordered:{}, original: 
{}".format( + # ordered_traits_names, [t["trait_fullname"] for t in traits])) + # print("chromosome_names:{}".format(chromosome_names)) + # print("trait_ids:{}".format(traits_ids)) + # print("loci names:{}".format(loci_names)) + hdata = process_traits_data_for_heatmap(organised, traits_ids, chromosome_names) + + # print("ZIPPED: {}".format(zip(tuple(ordered_traits_names.keys()), hdata))) + # print("HDATA LENGTH:{}, ORDERED TRAITS LENGTH:{}".format(len(hdata), len(ordered_traits_names.keys()))) + heatmaps_data = [ + generate_heatmap( + data, + "heatmap_chr{}_{}".format(chromo, random_string(10)), + y_axis=tuple( + ordered_traits_names[traits_ids[order]] + for order in traits_order), + x_label=chromo, + output_dir=TMPDIR) + for chromo, data in zip(chromosome_names, hdata)] + print("IMAGES FILENAMES: {}".format([img[0] for img in heatmaps_data])) + + dendograms_data = [ + generate_dendrogram( + data, + "dendo_chr{}_{}".format(chromo, random_string(10)), + y_axis=tuple( + ordered_traits_names[traits_ids[order]] + for order in traits_order), + x_label=chromo, + output_dir=TMPDIR) + for chromo, data in zip(chromosome_names, hdata)] + + res = generate_single_heatmap( + hdata, + "single_heatmap_{}".format(random_string(10)), + y_axis=tuple( + ordered_traits_names[traits_ids[order]] + for order in traits_order), + y_label="Traits", + x_axis=[chromo for chromo in chromosome_names], + x_label="Chromosomes", + output_dir=TMPDIR) + if __name__ == "__main__": main() -- cgit v1.2.3 From 1600992807a0b3edbe10e8c0baf80a41636a7650 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Tue, 14 Sep 2021 21:58:20 +0300 Subject: init commit for wgcna script --- scripts/wgcna_analysis.R | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 scripts/wgcna_analysis.R diff --git a/scripts/wgcna_analysis.R b/scripts/wgcna_analysis.R new file mode 100644 index 0000000..e69de29 -- cgit v1.2.3 From ea0e92d3f63a9f403aacd5ab1590f61f2752158a Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Tue, 14 Sep 2021 22:12:43 +0300 Subject: load the required data for analysis --- scripts/wgcna_analysis.R | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/scripts/wgcna_analysis.R b/scripts/wgcna_analysis.R index e69de29..a8170b6 100644 --- a/scripts/wgcna_analysis.R +++ b/scripts/wgcna_analysis.R @@ -0,0 +1,21 @@ +# initial workspace setup + +library(WGCNA); +stringsAsFactors = FALSE + +# load expression data **assumes csv format row(traits)(columns info+samples) + + +wgcnaRawData <- read.csv(file = "wgcna_data.csv") + +# transform expressionData + +datExpr <- as.data.frame(t(wgcnaRawData)); + + + + + + + + -- cgit v1.2.3 From 49fdd614d4b18f7d28126e4ebbf3adca57f0416f Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Tue, 14 Sep 2021 22:31:18 +0300 Subject: Checking data for excessive missing values --- scripts/wgcna_analysis.R | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/scripts/wgcna_analysis.R b/scripts/wgcna_analysis.R index a8170b6..8e90d7d 100644 --- a/scripts/wgcna_analysis.R +++ b/scripts/wgcna_analysis.R @@ -10,10 +10,28 @@ wgcnaRawData <- read.csv(file = "wgcna_data.csv") # transform expressionData -datExpr <- as.data.frame(t(wgcnaRawData)); +dataExpr <- as.data.frame(t(wgcnaRawData)); +# data cleaning + +# adopted from docs +gsg = goodSamplesGenes(dataExpr, verbose = 3); + + + +if (!gsg$allOK) +{ +# Optionally, print the gene and sample names that were removed: +if (sum(!gsg$goodGenes)>0) +printFlush(paste("Removing genes:", 
paste(names(datExpr0)[!gsg$goodGenes], collapse = ", "))); +if (sum(!gsg$goodSamples)>0) +printFlush(paste("Removing samples:", paste(rownames(datExpr0)[!gsg$goodSamples], collapse = ", "))); +# Remove the offending genes and samples from the data: +dataExpr <- dataExpr[gsg$goodSamples, gsg$goodGenes] +} + -- cgit v1.2.3 From 42c120dff2b3cac8a6b6546ebab9daf021aac11a Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Tue, 14 Sep 2021 22:42:43 +0300 Subject: compute the softthreshhold --- scripts/wgcna_analysis.R | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/scripts/wgcna_analysis.R b/scripts/wgcna_analysis.R index 8e90d7d..cb93492 100644 --- a/scripts/wgcna_analysis.R +++ b/scripts/wgcna_analysis.R @@ -5,15 +5,12 @@ stringsAsFactors = FALSE # load expression data **assumes csv format row(traits)(columns info+samples) - wgcnaRawData <- read.csv(file = "wgcna_data.csv") # transform expressionData dataExpr <- as.data.frame(t(wgcnaRawData)); - - # data cleaning # adopted from docs @@ -32,6 +29,17 @@ printFlush(paste("Removing samples:", paste(rownames(datExpr0)[!gsg$goodSamples] dataExpr <- dataExpr[gsg$goodSamples, gsg$goodGenes] } +# network constructions and modules + +# choose softthreshhold (Calculate soft threshold if the user specified the) + +powers = c(c(1:10), seq(from = 12, to=20, by=2)) +sft = pickSoftThreshold(dataExpr, powerVector = powers, verbose = 5) + + + + + -- cgit v1.2.3 From 1c98a10bf5015a85b856f9e937417d51ec05d781 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Tue, 14 Sep 2021 22:49:43 +0300 Subject: construct gene co-expression network & module detection --- scripts/wgcna_analysis.R | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/scripts/wgcna_analysis.R b/scripts/wgcna_analysis.R index cb93492..29f0259 100644 --- a/scripts/wgcna_analysis.R +++ b/scripts/wgcna_analysis.R @@ -33,9 +33,26 @@ dataExpr <- dataExpr[gsg$goodSamples, gsg$goodGenes] # choose softthreshhold (Calculate soft threshold if the user specified the) -powers = c(c(1:10), seq(from = 12, to=20, by=2)) -sft = pickSoftThreshold(dataExpr, powerVector = powers, verbose = 5) - +powers <- c(c(1:10), seq(from = 12, to=20, by=2)) +sft <- pickSoftThreshold(dataExpr, powerVector = powers, verbose = 5) + +# pass user options +network <- blockwiseModules(dataExpr, + #similarity matrix options + corType = "pearson", + #adjacency matrix options + + power = sft$powerEstimate, + networkType = "unsigned", + #TOM options + TOMtype = "unsigned", + + #module indentification + + minmodulesSize = 30, + deepSplit = 5, + PamRespectsDendro = FALSE + ) -- cgit v1.2.3 From b32f4ee074d55d8ab78863d4c5acc652ec9cd839 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Tue, 14 Sep 2021 23:18:18 +0300 Subject: plot plotDendroAndColors and generate png --- scripts/wgcna_analysis.R | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/scripts/wgcna_analysis.R b/scripts/wgcna_analysis.R index 29f0259..65ff36e 100644 --- a/scripts/wgcna_analysis.R +++ b/scripts/wgcna_analysis.R @@ -54,6 +54,24 @@ network <- blockwiseModules(dataExpr, PamRespectsDendro = FALSE ) +# plot dendro add color + +# Convert labels to colors for plotting +mergedColors = labels2colors(net$colors) +# Plot the dendrogram and the module colors underneath + + +# generate random name for png && save the image location + + + +png("WGCNAoutput.png",width=1000,height=600,type='cairo-png') + +plotDendroAndColors(network$dendrograms[[1]],mergedColors[net$blockGenes[[1]]], 
+"Module colors", +dendroLabels = FALSE, hang = 0.03, +addGuide = TRUE, guideHang = 0.05) + -- cgit v1.2.3 From 49b852baa46c0f06caa2f0621b18d521cf334483 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Tue, 14 Sep 2021 23:40:17 +0300 Subject: function to generate rand str for image --- scripts/wgcna_analysis.R | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/scripts/wgcna_analysis.R b/scripts/wgcna_analysis.R index 65ff36e..efe0336 100644 --- a/scripts/wgcna_analysis.R +++ b/scripts/wgcna_analysis.R @@ -1,7 +1,9 @@ # initial workspace setup library(WGCNA); -stringsAsFactors = FALSE +library(stringi); + +options(stringsAsFactors = FALSE); # load expression data **assumes csv format row(traits)(columns info+samples) @@ -64,8 +66,18 @@ mergedColors = labels2colors(net$colors) # generate random name for png && save the image location +genImageRandStr <- function(prefix){ + + randStr <- paste(prefix,stri_rand_strings(1, 9, pattern = "[A-Za-z0-9]"),sep="_") + + return(paste(randStr,".png",sep="")) +} + + + +png(genImageRandStr,width=1000,height=600,type='cairo-png') + -png("WGCNAoutput.png",width=1000,height=600,type='cairo-png') plotDendroAndColors(network$dendrograms[[1]],mergedColors[net$blockGenes[[1]]], "Module colors", -- cgit v1.2.3 From 56092341abe9579b995ff6105722154183b31d22 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Tue, 14 Sep 2021 23:41:46 +0300 Subject: remove debug statements --- scripts/wgcna_analysis.R | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/scripts/wgcna_analysis.R b/scripts/wgcna_analysis.R index efe0336..16a44fd 100644 --- a/scripts/wgcna_analysis.R +++ b/scripts/wgcna_analysis.R @@ -56,14 +56,6 @@ network <- blockwiseModules(dataExpr, PamRespectsDendro = FALSE ) -# plot dendro add color - -# Convert labels to colors for plotting -mergedColors = labels2colors(net$colors) -# Plot the dendrogram and the module colors underneath - - -# generate random name for png && save the image location genImageRandStr <- function(prefix){ @@ -74,6 +66,7 @@ genImageRandStr <- function(prefix){ } +mergedColors = labels2colors(net$colors) png(genImageRandStr,width=1000,height=600,type='cairo-png') -- cgit v1.2.3 From e0f25dedea08842820424ced51af9af0c7eaab4b Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Wed, 15 Sep 2021 02:13:03 +0300 Subject: Fetch IMAGE_DIR env and add img location --- scripts/wgcna_analysis.R | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/scripts/wgcna_analysis.R b/scripts/wgcna_analysis.R index 16a44fd..54650df 100644 --- a/scripts/wgcna_analysis.R +++ b/scripts/wgcna_analysis.R @@ -1,10 +1,14 @@ # initial workspace setup + +# todo pass required input data here library(WGCNA); library(stringi); options(stringsAsFactors = FALSE); +imgDir = Sys.getenv("GENERATED_IMAGE_DIR") + # load expression data **assumes csv format row(traits)(columns info+samples) wgcnaRawData <- read.csv(file = "wgcna_data.csv") @@ -33,7 +37,7 @@ dataExpr <- dataExpr[gsg$goodSamples, gsg$goodGenes] # network constructions and modules -# choose softthreshhold (Calculate soft threshold if the user specified the) +# choose softthreshhold (Calculate soft threshold) powers <- c(c(1:10), seq(from = 12, to=20, by=2)) sft <- pickSoftThreshold(dataExpr, powerVector = powers, verbose = 5) @@ -65,12 +69,12 @@ genImageRandStr <- function(prefix){ return(paste(randStr,".png",sep="")) } +mergedColors <- labels2colors(net$colors) -mergedColors = labels2colors(net$colors) - 
-png(genImageRandStr,width=1000,height=600,type='cairo-png') +imageLoc <- file.path(imgDir,genImageRandStr("WGCNAoutput")) +png(imageLoc,width=1000,height=600,type='cairo-png') plotDendroAndColors(network$dendrograms[[1]],mergedColors[net$blockGenes[[1]]], "Module colors", @@ -79,9 +83,3 @@ addGuide = TRUE, guideHang = 0.05) - - - - - - -- cgit v1.2.3 From 94e2e79141cbddb48456fb5707eb0e4e36e97d3b Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Wed, 15 Sep 2021 02:19:03 +0300 Subject: rename variables && delete debugs --- scripts/wgcna_analysis.R | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/scripts/wgcna_analysis.R b/scripts/wgcna_analysis.R index 54650df..390bee4 100644 --- a/scripts/wgcna_analysis.R +++ b/scripts/wgcna_analysis.R @@ -1,7 +1,3 @@ -# initial workspace setup - - -# todo pass required input data here library(WGCNA); library(stringi); @@ -11,15 +7,14 @@ imgDir = Sys.getenv("GENERATED_IMAGE_DIR") # load expression data **assumes csv format row(traits)(columns info+samples) -wgcnaRawData <- read.csv(file = "wgcna_data.csv") +inputData <- read.csv(file = "wgcna_data.csv") # transform expressionData -dataExpr <- as.data.frame(t(wgcnaRawData)); +dataExpr <- as.data.frame(t(inputData)); # data cleaning -# adopted from docs gsg = goodSamplesGenes(dataExpr, verbose = 3); -- cgit v1.2.3 From cee10bacd4316eb807c36b8a11e84f9be5945f44 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Wed, 15 Sep 2021 04:34:26 +0300 Subject: minor fixes --- scripts/wgcna_analysis.R | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/scripts/wgcna_analysis.R b/scripts/wgcna_analysis.R index 390bee4..267cd86 100644 --- a/scripts/wgcna_analysis.R +++ b/scripts/wgcna_analysis.R @@ -1,3 +1,5 @@ + + library(WGCNA); library(stringi); @@ -6,6 +8,7 @@ options(stringsAsFactors = FALSE); imgDir = Sys.getenv("GENERATED_IMAGE_DIR") # load expression data **assumes csv format row(traits)(columns info+samples) +# pass the file_path as arg inputData <- read.csv(file = "wgcna_data.csv") @@ -13,26 +16,29 @@ inputData <- read.csv(file = "wgcna_data.csv") dataExpr <- as.data.frame(t(inputData)); -# data cleaning +## data cleaning gsg = goodSamplesGenes(dataExpr, verbose = 3); - +# https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/Rpackages/ if (!gsg$allOK) { -# Optionally, print the gene and sample names that were removed: if (sum(!gsg$goodGenes)>0) -printFlush(paste("Removing genes:", paste(names(datExpr0)[!gsg$goodGenes], collapse = ", "))); +printFlush(paste("Removing genes:", paste(names(dataExpr)[!gsg$goodGenes], collapse = ", "))); if (sum(!gsg$goodSamples)>0) -printFlush(paste("Removing samples:", paste(rownames(datExpr0)[!gsg$goodSamples], collapse = ", "))); +printFlush(paste("Removing samples:", paste(rownames(dataExpr)[!gsg$goodSamples], collapse = ", "))); # Remove the offending genes and samples from the data: dataExpr <- dataExpr[gsg$goodSamples, gsg$goodGenes] } -# network constructions and modules +## network constructions and modules + +# Allow multi-threading within WGCNA +enableWGCNAThreads() # choose softthreshhold (Calculate soft threshold) +# xtodo allow users to pass args powers <- c(c(1:10), seq(from = 12, to=20, by=2)) sft <- pickSoftThreshold(dataExpr, powerVector = powers, verbose = 5) -- cgit v1.2.3 From ed2e4c0f9d68cfb720da95eba559d69359f7b5fc Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 15 Sep 2021 05:35:34 +0300 Subject: Add missing sample file for tests * 
tests/unit/db/data/genotypes/genotype_sample1.geno: new file Add a missing sample data file needed for unit tests. --- tests/unit/db/data/genotypes/genotype_sample1.geno | 23 ++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 tests/unit/db/data/genotypes/genotype_sample1.geno diff --git a/tests/unit/db/data/genotypes/genotype_sample1.geno b/tests/unit/db/data/genotypes/genotype_sample1.geno new file mode 100644 index 0000000..2a55964 --- /dev/null +++ b/tests/unit/db/data/genotypes/genotype_sample1.geno @@ -0,0 +1,23 @@ +# File name: genotype_sample for testing + +# Metadata: Please retain this header information with file. + + +@name: BXD +@type: riset +@mat: B +@pat: D +@het:H +@unk: U + + + + + + +Chr Locus cM Mb BXD1 BXD2 BXD5 BXD6 BXD8 BXD9 +1 rs31443144 1.50 3.010274 B B D D D B +1 rs6269442 1.50 3.492195 B B D D H Y +1 rs32285189 1.63 3.511204 B U D D D B +2 rs31443144 1.50 3.010274 B B D D D B +2 rs6269442 1.50 3.492195 B B D D H Y \ No newline at end of file -- cgit v1.2.3 From f17b489c8eb94050b81b1a59fb43954d036f7c38 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 15 Sep 2021 06:01:44 +0300 Subject: Fix format of arguments and expected values * tests/unit/computations/test_heatmap.py: ordering is not longer provided as a list of tuples; the ordering values are just a list of numbers now. This commit updates the test to take this into consideration. * tests/unit/computations/test_qtlreaper.py: the 'Chr' value if numeric, is represented by an actual number, not a string. This commit updates the code to take this into consideration. --- tests/unit/computations/test_heatmap.py | 8 ++++---- tests/unit/computations/test_qtlreaper.py | 20 ++++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/unit/computations/test_heatmap.py b/tests/unit/computations/test_heatmap.py index f1bbefc..156af45 100644 --- a/tests/unit/computations/test_heatmap.py +++ b/tests/unit/computations/test_heatmap.py @@ -165,22 +165,22 @@ class TestHeatmap(TestCase): """Test retrieval of strains and values.""" for orders, slist, tdata, expected in [ [ - [(60, 2)], + [2], ["s1", "s2", "s3", "s4"], [[2, 9, 6, None, 4], [7, 5, None, None, 4], [9, None, 5, 4, 7], [6, None, None, 4, None]], - [[(60, 2), ["s1", "s3", "s4"], [9, 5, 4]]] + [[2, ["s1", "s3", "s4"], [9, 5, 4]]] ], [ - [(60, 3)], + [3], ["s1", "s2", "s3", "s4", "s5"], [[2, 9, 6, None, 4], [7, 5, None, None, 4], [9, None, 5, 4, 7], [6, None, None, 4, None]], - [[(60, 3), ["s1", "s4"], [6, 4]]] + [[3, ["s1", "s4"], [6, 4]]] ]]: with self.subTest(strainlist=slist, traitdata=tdata): self.assertEqual( diff --git a/tests/unit/computations/test_qtlreaper.py b/tests/unit/computations/test_qtlreaper.py index 495ed97..1d67827 100644 --- a/tests/unit/computations/test_qtlreaper.py +++ b/tests/unit/computations/test_qtlreaper.py @@ -15,52 +15,52 @@ class TestQTLReaper(TestCase): "tests/unit/computations/data/qtlreaper/main_output_sample.txt"), [ { - "ID": "T1", "Locus": "rs31443144", "Chr": "1", "cM": 1.500, + "ID": "T1", "Locus": "rs31443144", "Chr": 1, "cM": 1.500, "Mb": 3.010, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs6269442", "Chr": "1", "cM": 1.500, + "ID": "T1", "Locus": "rs6269442", "Chr": 1, "cM": 1.500, "Mb": 3.492, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs32285189", "Chr": "1", "cM": 1.630, + "ID": "T1", "Locus": "rs32285189", "Chr": 1, "cM": 1.630, "Mb": 3.511, "LRS": 0.500, "Additive": -0.074, "pValue": 
1.000 }, { - "ID": "T1", "Locus": "rs258367496", "Chr": "1", "cM": 1.630, + "ID": "T1", "Locus": "rs258367496", "Chr": 1, "cM": 1.630, "Mb": 3.660, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs32430919", "Chr": "1", "cM": 1.750, + "ID": "T1", "Locus": "rs32430919", "Chr": 1, "cM": 1.750, "Mb": 3.777, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs36251697", "Chr": "1", "cM": 1.880, + "ID": "T1", "Locus": "rs36251697", "Chr": 1, "cM": 1.880, "Mb": 3.812, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs30658298", "Chr": "1", "cM": 2.010, + "ID": "T1", "Locus": "rs30658298", "Chr": 1, "cM": 2.010, "Mb": 4.431, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs51852623", "Chr": "1", "cM": 2.010, + "ID": "T1", "Locus": "rs51852623", "Chr": 1, "cM": 2.010, "Mb": 4.447, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs31879829", "Chr": "1", "cM": 2.140, + "ID": "T1", "Locus": "rs31879829", "Chr": 1, "cM": 2.140, "Mb": 4.519, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs36742481", "Chr": "1", "cM": 2.140, + "ID": "T1", "Locus": "rs36742481", "Chr": 1, "cM": 2.140, "Mb": 4.776, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 } -- cgit v1.2.3 From e3e18950cfcdec918429dcbb5d5ed2e9616b7a20 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 15 Sep 2021 11:19:56 +0300 Subject: Reorganise modules Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * The heatmap generation does not fall cleanly within the computations or db modules. This commit moves it to the higher level gn3 module. --- gn3/computations/heatmap.py | 277 ----------------------------- gn3/heatmaps.py | 302 ++++++++++++++++++++++++++++++++ gn3/heatmaps/heatmaps.py | 67 ------- tests/unit/computations/test_heatmap.py | 187 -------------------- tests/unit/test_heatmaps.py | 187 ++++++++++++++++++++ 5 files changed, 489 insertions(+), 531 deletions(-) delete mode 100644 gn3/computations/heatmap.py create mode 100644 gn3/heatmaps.py delete mode 100644 gn3/heatmaps/heatmaps.py delete mode 100644 tests/unit/computations/test_heatmap.py create mode 100644 tests/unit/test_heatmaps.py diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py deleted file mode 100644 index 8727c92..0000000 --- a/gn3/computations/heatmap.py +++ /dev/null @@ -1,277 +0,0 @@ -""" -This module will contain functions to be used in computation of the data used to -generate various kinds of heatmaps. -""" - -from functools import reduce -from typing import Any, Dict, Sequence -from gn3.computations.slink import slink -from gn3.computations.qtlreaper import generate_traits_file -from gn3.computations.correlations2 import compute_correlation -from gn3.db.genotypes import build_genotype_file, load_genotype_samples -from gn3.db.traits import ( - retrieve_trait_data, - retrieve_trait_info, - generate_traits_filename) - -def export_trait_data( - trait_data: dict, strainlist: Sequence[str], dtype: str = "val", - var_exists: bool = False, n_exists: bool = False): - """ - Export data according to `strainlist`. Mostly used in calculating - correlations. 
- - DESCRIPTION: - Migrated from - https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L166-L211 - - PARAMETERS - trait: (dict) - The dictionary of key-value pairs representing a trait - strainlist: (list) - A list of strain names - dtype: (str) - ... verify what this is ... - var_exists: (bool) - A flag indicating existence of variance - n_exists: (bool) - A flag indicating existence of ndata - """ - def __export_all_types(tdata, strain): - sample_data = [] - if tdata[strain]["value"]: - sample_data.append(tdata[strain]["value"]) - if var_exists: - if tdata[strain]["variance"]: - sample_data.append(tdata[strain]["variance"]) - else: - sample_data.append(None) - if n_exists: - if tdata[strain]["ndata"]: - sample_data.append(tdata[strain]["ndata"]) - else: - sample_data.append(None) - else: - if var_exists and n_exists: - sample_data += [None, None, None] - elif var_exists or n_exists: - sample_data += [None, None] - else: - sample_data.append(None) - - return tuple(sample_data) - - def __exporter(accumulator, strain): - # pylint: disable=[R0911] - if strain in trait_data["data"]: - if dtype == "val": - return accumulator + (trait_data["data"][strain]["value"], ) - if dtype == "var": - return accumulator + (trait_data["data"][strain]["variance"], ) - if dtype == "N": - return accumulator + (trait_data["data"][strain]["ndata"], ) - if dtype == "all": - return accumulator + __export_all_types(trait_data["data"], strain) - raise KeyError("Type `%s` is incorrect" % dtype) - if var_exists and n_exists: - return accumulator + (None, None, None) - if var_exists or n_exists: - return accumulator + (None, None) - return accumulator + (None,) - - return reduce(__exporter, strainlist, tuple()) - -def trait_display_name(trait: Dict): - """ - Given a trait, return a name to use to display the trait on a heatmap. - - DESCRIPTION - Migrated from - https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L141-L157 - """ - if trait.get("db", None) and trait.get("trait_name", None): - if trait["db"]["dataset_type"] == "Temp": - desc = trait["description"] - if desc.find("PCA") >= 0: - return "%s::%s" % ( - trait["db"]["displayname"], - desc[desc.rindex(':')+1:].strip()) - return "%s::%s" % ( - trait["db"]["displayname"], - desc[:desc.index('entered')].strip()) - prefix = "%s::%s" % ( - trait["db"]["dataset_name"], trait["trait_name"]) - if trait["cellid"]: - return "%s::%s" % (prefix, trait["cellid"]) - return prefix - return trait["description"] - -def cluster_traits(traits_data_list: Sequence[Dict]): - """ - Clusters the trait values. 
- - DESCRIPTION - Attempts to replicate the clustering of the traits, as done at - https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L138-L162 - """ - def __compute_corr(tdata_i, tdata_j): - if tdata_i[0] == tdata_j[0]: - return 0.0 - corr_vals = compute_correlation(tdata_i[1], tdata_j[1]) - corr = corr_vals[0] - if (1 - corr) < 0: - return 0.0 - return 1 - corr - - def __cluster(tdata_i): - return tuple( - __compute_corr(tdata_i, tdata_j) - for tdata_j in enumerate(traits_data_list)) - - return tuple(__cluster(tdata_i) for tdata_i in enumerate(traits_data_list)) - -def heatmap_data(traits_names, conn: Any): - """ - heatmap function - - DESCRIPTION - This function is an attempt to reproduce the initialisation at - https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L46-L64 - and also the clustering and slink computations at - https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L138-L165 - with the help of the `gn3.computations.heatmap.cluster_traits` function. - - It does not try to actually draw the heatmap image. - - PARAMETERS: - TODO: Elaborate on the parameters here... - """ - threshold = 0 # webqtlConfig.PUBLICTHRESH - def __retrieve_traitlist_and_datalist(threshold, fullname): - trait = retrieve_trait_info(threshold, fullname, conn) - return (trait, retrieve_trait_data(trait, conn)) - - traits_details = [ - __retrieve_traitlist_and_datalist(threshold, fullname) - for fullname in traits_names] - traits_list = tuple(x[0] for x in traits_details) - traits_data_list = [x[1] for x in traits_details] - genotype_filename = build_genotype_file(traits_list[0]["riset"]) - strainlist = load_genotype_samples(genotype_filename) - exported_traits_data_list = tuple( - export_trait_data(td, strainlist) for td in traits_data_list) - slink_data = slink(cluster_traits(exported_traits_data_list)) - ordering_data = compute_heatmap_order(slink_data) - strains_and_values = retrieve_strains_and_values( - ordering_data, strainlist, exported_traits_data_list) - strains_values = strains_and_values[0][1] - trait_values = [t[2] for t in strains_and_values] - traits_filename = generate_traits_filename() - generate_traits_file(strains_values, trait_values, traits_filename) - - return { - "slink_data": slink_data, - "ordering_data": ordering_data, - "strainlist": strainlist, - "genotype_filename": genotype_filename, - "traits_list": traits_list, - "traits_data_list": traits_data_list, - "exported_traits_data_list": exported_traits_data_list, - "traits_filename": traits_filename - } - -def compute_traits_order(slink_data, neworder: tuple = tuple()): - """ - Compute the order of the traits for clustering from `slink_data`. 
- - This function tries to reproduce the creation and update of the `neworder` - variable in - https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L120 - and in the `web.webqtl.heatmap.Heatmap.draw` function in GN1 - """ - def __order_maker(norder, slnk_dt): - if isinstance(slnk_dt[0], int) and isinstance(slnk_dt[1], int): - return norder + (slnk_dt[0], slnk_dt[1]) - - if isinstance(slnk_dt[0], int): - return __order_maker((norder + (slnk_dt[0], )), slnk_dt[1]) - - if isinstance(slnk_dt[1], int): - return __order_maker(norder, slnk_dt[0]) + (slnk_dt[1], ) - - return __order_maker(__order_maker(norder, slnk_dt[0]), slnk_dt[1]) - - return __order_maker(neworder, slink_data) - -def retrieve_strains_and_values(orders, strainlist, traits_data_list): - """ - Get the strains and their corresponding values from `strainlist` and - `traits_data_list`. - - This migrates the code in - https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L215-221 - """ - # This feels nasty! There's a lot of mutation of values here, that might - # indicate something untoward in the design of this function and its - # dependents ==> Review - strains = [] - values = [] - rets = [] - for order in orders: - temp_val = traits_data_list[order] - for i, strain in enumerate(strainlist): - if temp_val[i] is not None: - strains.append(strain) - values.append(temp_val[i]) - rets.append([order, strains[:], values[:]]) - strains = [] - values = [] - - return rets - -def nearest_marker_finder(genotype): - """ - Returns a function to be used with `genotype` to compute the nearest marker - to the trait passed to the returned function. - - https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L425-434 - """ - def __compute_distances(chromo, trait): - loci = chromo.get("loci", None) - if not loci: - return None - return tuple( - { - "name": locus["name"], - "distance": abs(locus["Mb"] - trait["mb"]) - } for locus in loci) - - def __finder(trait): - _chrs = tuple( - _chr for _chr in genotype["chromosomes"] - if str(_chr["name"]) == str(trait["chr"])) - if len(_chrs) == 0: - return None - distances = tuple( - distance for dists in - filter( - lambda x: x is not None, - (__compute_distances(_chr, trait) for _chr in _chrs)) - for distance in dists) - nearest = min(distances, key=lambda d: d["distance"]) - return nearest["name"] - return __finder - -def get_nearest_marker(traits_list, genotype): - """ - Retrieves the nearest marker for each of the traits in the list. - - DESCRIPTION: - This migrates the code in - https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L419-L438 - """ - if not genotype["Mbmap"]: - return [None] * len(trait_list) - - marker_finder = nearest_marker_finder(genotype) - return [marker_finder(trait) for trait in traits_list] diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py new file mode 100644 index 0000000..198fb45 --- /dev/null +++ b/gn3/heatmaps.py @@ -0,0 +1,302 @@ +""" +This module will contain functions to be used in computation of the data used to +generate various kinds of heatmaps. 
+""" + +from functools import reduce +from typing import Any, Dict, Sequence +from gn3.computations.slink import slink +from gn3.computations.qtlreaper import generate_traits_file +from gn3.computations.correlations2 import compute_correlation +from gn3.db.genotypes import build_genotype_file, load_genotype_samples +from gn3.db.traits import ( + retrieve_trait_data, + retrieve_trait_info, + generate_traits_filename) + +def export_trait_data( + trait_data: dict, strainlist: Sequence[str], dtype: str = "val", + var_exists: bool = False, n_exists: bool = False): + """ + Export data according to `strainlist`. Mostly used in calculating + correlations. + + DESCRIPTION: + Migrated from + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L166-L211 + + PARAMETERS + trait: (dict) + The dictionary of key-value pairs representing a trait + strainlist: (list) + A list of strain names + dtype: (str) + ... verify what this is ... + var_exists: (bool) + A flag indicating existence of variance + n_exists: (bool) + A flag indicating existence of ndata + """ + def __export_all_types(tdata, strain): + sample_data = [] + if tdata[strain]["value"]: + sample_data.append(tdata[strain]["value"]) + if var_exists: + if tdata[strain]["variance"]: + sample_data.append(tdata[strain]["variance"]) + else: + sample_data.append(None) + if n_exists: + if tdata[strain]["ndata"]: + sample_data.append(tdata[strain]["ndata"]) + else: + sample_data.append(None) + else: + if var_exists and n_exists: + sample_data += [None, None, None] + elif var_exists or n_exists: + sample_data += [None, None] + else: + sample_data.append(None) + + return tuple(sample_data) + + def __exporter(accumulator, strain): + # pylint: disable=[R0911] + if strain in trait_data["data"]: + if dtype == "val": + return accumulator + (trait_data["data"][strain]["value"], ) + if dtype == "var": + return accumulator + (trait_data["data"][strain]["variance"], ) + if dtype == "N": + return accumulator + (trait_data["data"][strain]["ndata"], ) + if dtype == "all": + return accumulator + __export_all_types(trait_data["data"], strain) + raise KeyError("Type `%s` is incorrect" % dtype) + if var_exists and n_exists: + return accumulator + (None, None, None) + if var_exists or n_exists: + return accumulator + (None, None) + return accumulator + (None,) + + return reduce(__exporter, strainlist, tuple()) + +def trait_display_name(trait: Dict): + """ + Given a trait, return a name to use to display the trait on a heatmap. + + DESCRIPTION + Migrated from + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L141-L157 + """ + if trait.get("db", None) and trait.get("trait_name", None): + if trait["db"]["dataset_type"] == "Temp": + desc = trait["description"] + if desc.find("PCA") >= 0: + return "%s::%s" % ( + trait["db"]["displayname"], + desc[desc.rindex(':')+1:].strip()) + return "%s::%s" % ( + trait["db"]["displayname"], + desc[:desc.index('entered')].strip()) + prefix = "%s::%s" % ( + trait["db"]["dataset_name"], trait["trait_name"]) + if trait["cellid"]: + return "%s::%s" % (prefix, trait["cellid"]) + return prefix + return trait["description"] + +def cluster_traits(traits_data_list: Sequence[Dict]): + """ + Clusters the trait values. 
+ + DESCRIPTION + Attempts to replicate the clustering of the traits, as done at + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L138-L162 + """ + def __compute_corr(tdata_i, tdata_j): + if tdata_i[0] == tdata_j[0]: + return 0.0 + corr_vals = compute_correlation(tdata_i[1], tdata_j[1]) + corr = corr_vals[0] + if (1 - corr) < 0: + return 0.0 + return 1 - corr + + def __cluster(tdata_i): + return tuple( + __compute_corr(tdata_i, tdata_j) + for tdata_j in enumerate(traits_data_list)) + + return tuple(__cluster(tdata_i) for tdata_i in enumerate(traits_data_list)) + +def heatmap_data(traits_names, conn: Any): + """ + heatmap function + + DESCRIPTION + This function is an attempt to reproduce the initialisation at + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L46-L64 + and also the clustering and slink computations at + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L138-L165 + with the help of the `gn3.computations.heatmap.cluster_traits` function. + + It does not try to actually draw the heatmap image. + + PARAMETERS: + TODO: Elaborate on the parameters here... + """ + threshold = 0 # webqtlConfig.PUBLICTHRESH + def __retrieve_traitlist_and_datalist(threshold, fullname): + trait = retrieve_trait_info(threshold, fullname, conn) + return (trait, retrieve_trait_data(trait, conn)) + + traits_details = [ + __retrieve_traitlist_and_datalist(threshold, fullname) + for fullname in traits_names] + traits_list = tuple(x[0] for x in traits_details) + traits_data_list = [x[1] for x in traits_details] + genotype_filename = build_genotype_file(traits_list[0]["riset"]) + strainlist = load_genotype_samples(genotype_filename) + exported_traits_data_list = tuple( + export_trait_data(td, strainlist) for td in traits_data_list) + slink_data = slink(cluster_traits(exported_traits_data_list)) + ordering_data = compute_heatmap_order(slink_data) + strains_and_values = retrieve_strains_and_values( + ordering_data, strainlist, exported_traits_data_list) + strains_values = strains_and_values[0][1] + trait_values = [t[2] for t in strains_and_values] + traits_filename = generate_traits_filename() + generate_traits_file(strains_values, trait_values, traits_filename) + + return { + "slink_data": slink_data, + "ordering_data": ordering_data, + "strainlist": strainlist, + "genotype_filename": genotype_filename, + "traits_list": traits_list, + "traits_data_list": traits_data_list, + "exported_traits_data_list": exported_traits_data_list, + "traits_filename": traits_filename + } + +def compute_traits_order(slink_data, neworder: tuple = tuple()): + """ + Compute the order of the traits for clustering from `slink_data`. 
+ + This function tries to reproduce the creation and update of the `neworder` + variable in + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L120 + and in the `web.webqtl.heatmap.Heatmap.draw` function in GN1 + """ + def __order_maker(norder, slnk_dt): + if isinstance(slnk_dt[0], int) and isinstance(slnk_dt[1], int): + return norder + (slnk_dt[0], slnk_dt[1]) + + if isinstance(slnk_dt[0], int): + return __order_maker((norder + (slnk_dt[0], )), slnk_dt[1]) + + if isinstance(slnk_dt[1], int): + return __order_maker(norder, slnk_dt[0]) + (slnk_dt[1], ) + + return __order_maker(__order_maker(norder, slnk_dt[0]), slnk_dt[1]) + + return __order_maker(neworder, slink_data) + +def retrieve_strains_and_values(orders, strainlist, traits_data_list): + """ + Get the strains and their corresponding values from `strainlist` and + `traits_data_list`. + + This migrates the code in + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L215-221 + """ + # This feels nasty! There's a lot of mutation of values here, that might + # indicate something untoward in the design of this function and its + # dependents ==> Review + strains = [] + values = [] + rets = [] + for order in orders: + temp_val = traits_data_list[order] + for i, strain in enumerate(strainlist): + if temp_val[i] is not None: + strains.append(strain) + values.append(temp_val[i]) + rets.append([order, strains[:], values[:]]) + strains = [] + values = [] + + return rets + +def nearest_marker_finder(genotype): + """ + Returns a function to be used with `genotype` to compute the nearest marker + to the trait passed to the returned function. + + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L425-434 + """ + def __compute_distances(chromo, trait): + loci = chromo.get("loci", None) + if not loci: + return None + return tuple( + { + "name": locus["name"], + "distance": abs(locus["Mb"] - trait["mb"]) + } for locus in loci) + + def __finder(trait): + _chrs = tuple( + _chr for _chr in genotype["chromosomes"] + if str(_chr["name"]) == str(trait["chr"])) + if len(_chrs) == 0: + return None + distances = tuple( + distance for dists in + filter( + lambda x: x is not None, + (__compute_distances(_chr, trait) for _chr in _chrs)) + for distance in dists) + nearest = min(distances, key=lambda d: d["distance"]) + return nearest["name"] + return __finder + +def get_nearest_marker(traits_list, genotype): + """ + Retrieves the nearest marker for each of the traits in the list. 
+ + DESCRIPTION: + This migrates the code in + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L419-L438 + """ + if not genotype["Mbmap"]: + return [None] * len(trait_list) + + marker_finder = nearest_marker_finder(genotype) + return [marker_finder(trait) for trait in traits_list] + +# # Grey + Blue + Red +# def generate_heatmap(): +# cols = 20 +# y_axis = (["%s"%x for x in range(1, cols+1)][:-1] + ["X"]) #replace last item with x for now +# x_axis = heatmap_x_axis_names() +# data = generate_random_data(height=cols, width=len(x_axis)) +# fig = px.imshow( +# data, +# x=x_axis, +# y=y_axis, +# width=500) +# fig.update_traces(xtype="array") +# fig.update_traces(ytype="array") +# # fig.update_traces(xgap=10) +# fig.update_xaxes( +# visible=True, +# title_text="Traits", +# title_font_size=16) +# fig.update_layout( +# coloraxis_colorscale=[ +# [0.0, '#3B3B3B'], [0.4999999999999999, '#ABABAB'], +# [0.5, '#F5DE11'], [1.0, '#FF0D00']]) +# fig.write_html("%s/%s"%(heatmap_dir, "test_image.html")) +# return fig diff --git a/gn3/heatmaps/heatmaps.py b/gn3/heatmaps/heatmaps.py deleted file mode 100644 index 88f546d..0000000 --- a/gn3/heatmaps/heatmaps.py +++ /dev/null @@ -1,67 +0,0 @@ -import random -import plotly.express as px - -#### Remove these #### - -heatmap_dir = "heatmap_images" - -def generate_random_data(data_stop: float = 2, width: int = 10, height: int = 30): - """ - This is mostly a utility function to be used to generate random data, useful - for development of the heatmap generation code, without access to the actual - database data. - """ - return [[random.uniform(0,data_stop) for i in range(0, width)] - for j in range(0, height)] - -def generate_random_data2(data_stop: float = 2, width: int = 10, height: int = 30): - """ - This is mostly a utility function to be used to generate random data, useful - for development of the heatmap generation code, without access to the actual - database data. 
- """ - return [ - [{ - "value": item, - "category": random.choice(["C57BL/6J +", "DBA/2J +"])} - for item in axis] - for axis in generate_random_data(data_stop, width, height)] - -def heatmap_x_axis_names(): - return [ - "UCLA_BXDBXH_CARTILAGE_V2::ILM103710672", - "UCLA_BXDBXH_CARTILAGE_V2::ILM2260338", - "UCLA_BXDBXH_CARTILAGE_V2::ILM3140576", - "UCLA_BXDBXH_CARTILAGE_V2::ILM5670577", - "UCLA_BXDBXH_CARTILAGE_V2::ILM2070121", - "UCLA_BXDBXH_CARTILAGE_V2::ILM103990541", - "UCLA_BXDBXH_CARTILAGE_V2::ILM1190722", - "UCLA_BXDBXH_CARTILAGE_V2::ILM6590722", - "UCLA_BXDBXH_CARTILAGE_V2::ILM4200064", - "UCLA_BXDBXH_CARTILAGE_V2::ILM3140463"] -#### END: Remove these #### - -# Grey + Blue + Red -def generate_heatmap(): - cols = 20 - y_axis = (["%s"%x for x in range(1, cols+1)][:-1] + ["X"]) #replace last item with x for now - x_axis = heatmap_x_axis_names() - data = generate_random_data(height=cols, width=len(x_axis)) - fig = px.imshow( - data, - x=x_axis, - y=y_axis, - width=500) - fig.update_traces(xtype="array") - fig.update_traces(ytype="array") - # fig.update_traces(xgap=10) - fig.update_xaxes( - visible=True, - title_text="Traits", - title_font_size=16) - fig.update_layout( - coloraxis_colorscale=[ - [0.0, '#3B3B3B'], [0.4999999999999999, '#ABABAB'], - [0.5, '#F5DE11'], [1.0, '#FF0D00']]) - fig.write_html("%s/%s"%(heatmap_dir, "test_image.html")) - return fig diff --git a/tests/unit/computations/test_heatmap.py b/tests/unit/computations/test_heatmap.py deleted file mode 100644 index 156af45..0000000 --- a/tests/unit/computations/test_heatmap.py +++ /dev/null @@ -1,187 +0,0 @@ -"""Module contains tests for gn3.computations.heatmap""" -from unittest import TestCase -from gn3.computations.heatmap import ( - cluster_traits, - export_trait_data, - compute_traits_order, - retrieve_strains_and_values) - -strainlist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] -trait_data = { - "mysqlid": 36688172, - "data": { - "B6cC3-1": {"strain_name": "B6cC3-1", "value": 7.51879, "variance": None, "ndata": None}, - "BXD1": {"strain_name": "BXD1", "value": 7.77141, "variance": None, "ndata": None}, - "BXD12": {"strain_name": "BXD12", "value": 8.39265, "variance": None, "ndata": None}, - "BXD16": {"strain_name": "BXD16", "value": 8.17443, "variance": None, "ndata": None}, - "BXD19": {"strain_name": "BXD19", "value": 8.30401, "variance": None, "ndata": None}, - "BXD2": {"strain_name": "BXD2", "value": 7.80944, "variance": None, "ndata": None}, - "BXD21": {"strain_name": "BXD21", "value": 8.93809, "variance": None, "ndata": None}, - "BXD24": {"strain_name": "BXD24", "value": 7.99415, "variance": None, "ndata": None}, - "BXD27": {"strain_name": "BXD27", "value": 8.12177, "variance": None, "ndata": None}, - "BXD28": {"strain_name": "BXD28", "value": 7.67688, "variance": None, "ndata": None}, - "BXD32": {"strain_name": "BXD32", "value": 7.79062, "variance": None, "ndata": None}, - "BXD39": {"strain_name": "BXD39", "value": 8.27641, "variance": None, "ndata": None}, - "BXD40": {"strain_name": "BXD40", "value": 8.18012, "variance": None, "ndata": None}, - "BXD42": {"strain_name": "BXD42", "value": 7.82433, "variance": None, "ndata": None}, - "BXD6": {"strain_name": "BXD6", "value": 8.09718, "variance": None, "ndata": None}, - "BXH14": {"strain_name": "BXH14", "value": 7.97475, "variance": None, "ndata": None}, - "BXH19": {"strain_name": "BXH19", "value": 7.67223, "variance": None, "ndata": None}, - "BXH2": {"strain_name": "BXH2", "value": 7.93622, "variance": None, "ndata": None}, - "BXH22": {"strain_name": 
"BXH22", "value": 7.43692, "variance": None, "ndata": None}, - "BXH4": {"strain_name": "BXH4", "value": 7.96336, "variance": None, "ndata": None}, - "BXH6": {"strain_name": "BXH6", "value": 7.75132, "variance": None, "ndata": None}, - "BXH7": {"strain_name": "BXH7", "value": 8.12927, "variance": None, "ndata": None}, - "BXH8": {"strain_name": "BXH8", "value": 6.77338, "variance": None, "ndata": None}, - "BXH9": {"strain_name": "BXH9", "value": 8.03836, "variance": None, "ndata": None}, - "C3H/HeJ": {"strain_name": "C3H/HeJ", "value": 7.42795, "variance": None, "ndata": None}, - "C57BL/6J": {"strain_name": "C57BL/6J", "value": 7.50606, "variance": None, "ndata": None}, - "DBA/2J": {"strain_name": "DBA/2J", "value": 7.72588, "variance": None, "ndata": None}}} - -slinked = ( - (((0, 2, 0.16381088984330505), - ((1, 7, 0.06024619831474998), 5, 0.19179284676938602), - 0.20337048635536847), - 9, - 0.23451785425383564), - ((3, (6, 8, 0.2140799896286565), 0.25879514152086425), - 4, 0.8968250491499363), - 0.9313185954797953) - -class TestHeatmap(TestCase): - """Class for testing heatmap computation functions""" - - def test_export_trait_data_dtype(self): - """ - Test `export_trait_data` with different values for the `dtype` keyword - argument - """ - for dtype, expected in [ - ["val", (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], - ["var", (None, None, None, None, None, None)], - ["N", (None, None, None, None, None, None)], - ["all", (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)]]: - with self.subTest(dtype=dtype): - self.assertEqual( - export_trait_data(trait_data, strainlist, dtype=dtype), - expected) - - def test_export_trait_data_dtype_all_flags(self): - """ - Test `export_trait_data` with different values for the `dtype` keyword - argument and the different flags set up - """ - for dtype, vflag, nflag, expected in [ - ["val", False, False, - (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], - ["val", False, True, - (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], - ["val", True, False, - (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], - ["val", True, True, - (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], - ["var", False, False, (None, None, None, None, None, None)], - ["var", False, True, (None, None, None, None, None, None)], - ["var", True, False, (None, None, None, None, None, None)], - ["var", True, True, (None, None, None, None, None, None)], - ["N", False, False, (None, None, None, None, None, None)], - ["N", False, True, (None, None, None, None, None, None)], - ["N", True, False, (None, None, None, None, None, None)], - ["N", True, True, (None, None, None, None, None, None)], - ["all", False, False, - (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], - ["all", False, True, - (7.51879, None, 7.77141, None, 8.39265, None, 8.17443, None, - 8.30401, None, 7.80944, None)], - ["all", True, False, - (7.51879, None, 7.77141, None, 8.39265, None, 8.17443, None, - 8.30401, None, 7.80944, None)], - ["all", True, True, - (7.51879, None, None, 7.77141, None, None, 8.39265, None, None, - 8.17443, None, None, 8.30401, None, None, 7.80944, None, None)] - ]: - with self.subTest(dtype=dtype, vflag=vflag, nflag=nflag): - self.assertEqual( - export_trait_data( - trait_data, strainlist, dtype=dtype, var_exists=vflag, - n_exists=nflag), - expected) - - def test_cluster_traits(self): - """ - Test that the clustering is working as expected. 
- """ - traits_data_list = [ - (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944), - (6.1427, 6.50588, 7.73705, 6.68328, 7.49293, 7.27398), - (8.4211, 8.30581, 9.24076, 8.51173, 9.18455, 8.36077), - (10.0904, 10.6509, 9.36716, 9.91202, 8.57444, 10.5731), - (10.188, 9.76652, 9.54813, 9.05074, 9.52319, 9.10505), - (6.74676, 7.01029, 7.54169, 6.48574, 7.01427, 7.26815), - (6.39359, 6.85321, 5.78337, 7.11141, 6.22101, 6.16544), - (6.84118, 7.08432, 7.59844, 7.08229, 7.26774, 7.24991), - (9.45215, 10.6943, 8.64719, 10.1592, 7.75044, 8.78615), - (7.04737, 6.87185, 7.58586, 6.92456, 6.84243, 7.36913)] - self.assertEqual( - cluster_traits(traits_data_list), - ((0.0, 0.20337048635536847, 0.16381088984330505, 1.7388553629398245, - 1.5025235756329178, 0.6952839500255574, 1.271661230252733, - 0.2100487290977544, 1.4699690641062024, 0.7934461515867415), - (0.20337048635536847, 0.0, 0.2198321044997198, 1.5753041735592204, - 1.4815755944537086, 0.26087293140686374, 1.6939790104301427, - 0.06024619831474998, 1.7430082449189215, 0.4497104244247795), - (0.16381088984330505, 0.2198321044997198, 0.0, 1.9073926868549234, - 1.0396738891139845, 0.5278328671176757, 1.6275069061182947, - 0.2636503792482082, 1.739617877037615, 0.7127042590637039), - (1.7388553629398245, 1.5753041735592204, 1.9073926868549234, 0.0, - 0.9936846292920328, 1.1169999189889366, 0.6007483980555253, - 1.430209221053372, 0.25879514152086425, 0.9313185954797953), - (1.5025235756329178, 1.4815755944537086, 1.0396738891139845, - 0.9936846292920328, 0.0, 1.027827186339337, 1.1441743109173244, - 1.4122477962364253, 0.8968250491499363, 1.1683723389247052), - (0.6952839500255574, 0.26087293140686374, 0.5278328671176757, - 1.1169999189889366, 1.027827186339337, 0.0, 1.8420471110023269, - 0.19179284676938602, 1.4875072385631605, 0.23451785425383564), - (1.271661230252733, 1.6939790104301427, 1.6275069061182947, - 0.6007483980555253, 1.1441743109173244, 1.8420471110023269, 0.0, - 1.6540234785929928, 0.2140799896286565, 1.7413442197913358), - (0.2100487290977544, 0.06024619831474998, 0.2636503792482082, - 1.430209221053372, 1.4122477962364253, 0.19179284676938602, - 1.6540234785929928, 0.0, 1.5225640692832796, 0.33370067057028485), - (1.4699690641062024, 1.7430082449189215, 1.739617877037615, - 0.25879514152086425, 0.8968250491499363, 1.4875072385631605, - 0.2140799896286565, 1.5225640692832796, 0.0, 1.3256191648260216), - (0.7934461515867415, 0.4497104244247795, 0.7127042590637039, - 0.9313185954797953, 1.1683723389247052, 0.23451785425383564, - 1.7413442197913358, 0.33370067057028485, 1.3256191648260216, - 0.0))) - - def test_compute_heatmap_order(self): - """Test the orders.""" - self.assertEqual( - compute_traits_order(slinked), (0, 2, 1, 7, 5, 9, 3, 6, 8, 4)) - - def test_retrieve_strains_and_values(self): - """Test retrieval of strains and values.""" - for orders, slist, tdata, expected in [ - [ - [2], - ["s1", "s2", "s3", "s4"], - [[2, 9, 6, None, 4], - [7, 5, None, None, 4], - [9, None, 5, 4, 7], - [6, None, None, 4, None]], - [[2, ["s1", "s3", "s4"], [9, 5, 4]]] - ], - [ - [3], - ["s1", "s2", "s3", "s4", "s5"], - [[2, 9, 6, None, 4], - [7, 5, None, None, 4], - [9, None, 5, 4, 7], - [6, None, None, 4, None]], - [[3, ["s1", "s4"], [6, 4]]] - ]]: - with self.subTest(strainlist=slist, traitdata=tdata): - self.assertEqual( - retrieve_strains_and_values(orders, slist, tdata), expected) diff --git a/tests/unit/test_heatmaps.py b/tests/unit/test_heatmaps.py new file mode 100644 index 0000000..265d5a8 --- /dev/null +++ 
b/tests/unit/test_heatmaps.py @@ -0,0 +1,187 @@ +"""Module contains tests for gn3.heatmaps.heatmaps""" +from unittest import TestCase +from gn3.heatmaps import ( + cluster_traits, + export_trait_data, + compute_traits_order, + retrieve_strains_and_values) + +strainlist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] +trait_data = { + "mysqlid": 36688172, + "data": { + "B6cC3-1": {"strain_name": "B6cC3-1", "value": 7.51879, "variance": None, "ndata": None}, + "BXD1": {"strain_name": "BXD1", "value": 7.77141, "variance": None, "ndata": None}, + "BXD12": {"strain_name": "BXD12", "value": 8.39265, "variance": None, "ndata": None}, + "BXD16": {"strain_name": "BXD16", "value": 8.17443, "variance": None, "ndata": None}, + "BXD19": {"strain_name": "BXD19", "value": 8.30401, "variance": None, "ndata": None}, + "BXD2": {"strain_name": "BXD2", "value": 7.80944, "variance": None, "ndata": None}, + "BXD21": {"strain_name": "BXD21", "value": 8.93809, "variance": None, "ndata": None}, + "BXD24": {"strain_name": "BXD24", "value": 7.99415, "variance": None, "ndata": None}, + "BXD27": {"strain_name": "BXD27", "value": 8.12177, "variance": None, "ndata": None}, + "BXD28": {"strain_name": "BXD28", "value": 7.67688, "variance": None, "ndata": None}, + "BXD32": {"strain_name": "BXD32", "value": 7.79062, "variance": None, "ndata": None}, + "BXD39": {"strain_name": "BXD39", "value": 8.27641, "variance": None, "ndata": None}, + "BXD40": {"strain_name": "BXD40", "value": 8.18012, "variance": None, "ndata": None}, + "BXD42": {"strain_name": "BXD42", "value": 7.82433, "variance": None, "ndata": None}, + "BXD6": {"strain_name": "BXD6", "value": 8.09718, "variance": None, "ndata": None}, + "BXH14": {"strain_name": "BXH14", "value": 7.97475, "variance": None, "ndata": None}, + "BXH19": {"strain_name": "BXH19", "value": 7.67223, "variance": None, "ndata": None}, + "BXH2": {"strain_name": "BXH2", "value": 7.93622, "variance": None, "ndata": None}, + "BXH22": {"strain_name": "BXH22", "value": 7.43692, "variance": None, "ndata": None}, + "BXH4": {"strain_name": "BXH4", "value": 7.96336, "variance": None, "ndata": None}, + "BXH6": {"strain_name": "BXH6", "value": 7.75132, "variance": None, "ndata": None}, + "BXH7": {"strain_name": "BXH7", "value": 8.12927, "variance": None, "ndata": None}, + "BXH8": {"strain_name": "BXH8", "value": 6.77338, "variance": None, "ndata": None}, + "BXH9": {"strain_name": "BXH9", "value": 8.03836, "variance": None, "ndata": None}, + "C3H/HeJ": {"strain_name": "C3H/HeJ", "value": 7.42795, "variance": None, "ndata": None}, + "C57BL/6J": {"strain_name": "C57BL/6J", "value": 7.50606, "variance": None, "ndata": None}, + "DBA/2J": {"strain_name": "DBA/2J", "value": 7.72588, "variance": None, "ndata": None}}} + +slinked = ( + (((0, 2, 0.16381088984330505), + ((1, 7, 0.06024619831474998), 5, 0.19179284676938602), + 0.20337048635536847), + 9, + 0.23451785425383564), + ((3, (6, 8, 0.2140799896286565), 0.25879514152086425), + 4, 0.8968250491499363), + 0.9313185954797953) + +class TestHeatmap(TestCase): + """Class for testing heatmap computation functions""" + + def test_export_trait_data_dtype(self): + """ + Test `export_trait_data` with different values for the `dtype` keyword + argument + """ + for dtype, expected in [ + ["val", (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["var", (None, None, None, None, None, None)], + ["N", (None, None, None, None, None, None)], + ["all", (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)]]: + with self.subTest(dtype=dtype): + 
self.assertEqual( + export_trait_data(trait_data, strainlist, dtype=dtype), + expected) + + def test_export_trait_data_dtype_all_flags(self): + """ + Test `export_trait_data` with different values for the `dtype` keyword + argument and the different flags set up + """ + for dtype, vflag, nflag, expected in [ + ["val", False, False, + (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["val", False, True, + (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["val", True, False, + (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["val", True, True, + (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["var", False, False, (None, None, None, None, None, None)], + ["var", False, True, (None, None, None, None, None, None)], + ["var", True, False, (None, None, None, None, None, None)], + ["var", True, True, (None, None, None, None, None, None)], + ["N", False, False, (None, None, None, None, None, None)], + ["N", False, True, (None, None, None, None, None, None)], + ["N", True, False, (None, None, None, None, None, None)], + ["N", True, True, (None, None, None, None, None, None)], + ["all", False, False, + (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["all", False, True, + (7.51879, None, 7.77141, None, 8.39265, None, 8.17443, None, + 8.30401, None, 7.80944, None)], + ["all", True, False, + (7.51879, None, 7.77141, None, 8.39265, None, 8.17443, None, + 8.30401, None, 7.80944, None)], + ["all", True, True, + (7.51879, None, None, 7.77141, None, None, 8.39265, None, None, + 8.17443, None, None, 8.30401, None, None, 7.80944, None, None)] + ]: + with self.subTest(dtype=dtype, vflag=vflag, nflag=nflag): + self.assertEqual( + export_trait_data( + trait_data, strainlist, dtype=dtype, var_exists=vflag, + n_exists=nflag), + expected) + + def test_cluster_traits(self): + """ + Test that the clustering is working as expected. 
+ """ + traits_data_list = [ + (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944), + (6.1427, 6.50588, 7.73705, 6.68328, 7.49293, 7.27398), + (8.4211, 8.30581, 9.24076, 8.51173, 9.18455, 8.36077), + (10.0904, 10.6509, 9.36716, 9.91202, 8.57444, 10.5731), + (10.188, 9.76652, 9.54813, 9.05074, 9.52319, 9.10505), + (6.74676, 7.01029, 7.54169, 6.48574, 7.01427, 7.26815), + (6.39359, 6.85321, 5.78337, 7.11141, 6.22101, 6.16544), + (6.84118, 7.08432, 7.59844, 7.08229, 7.26774, 7.24991), + (9.45215, 10.6943, 8.64719, 10.1592, 7.75044, 8.78615), + (7.04737, 6.87185, 7.58586, 6.92456, 6.84243, 7.36913)] + self.assertEqual( + cluster_traits(traits_data_list), + ((0.0, 0.20337048635536847, 0.16381088984330505, 1.7388553629398245, + 1.5025235756329178, 0.6952839500255574, 1.271661230252733, + 0.2100487290977544, 1.4699690641062024, 0.7934461515867415), + (0.20337048635536847, 0.0, 0.2198321044997198, 1.5753041735592204, + 1.4815755944537086, 0.26087293140686374, 1.6939790104301427, + 0.06024619831474998, 1.7430082449189215, 0.4497104244247795), + (0.16381088984330505, 0.2198321044997198, 0.0, 1.9073926868549234, + 1.0396738891139845, 0.5278328671176757, 1.6275069061182947, + 0.2636503792482082, 1.739617877037615, 0.7127042590637039), + (1.7388553629398245, 1.5753041735592204, 1.9073926868549234, 0.0, + 0.9936846292920328, 1.1169999189889366, 0.6007483980555253, + 1.430209221053372, 0.25879514152086425, 0.9313185954797953), + (1.5025235756329178, 1.4815755944537086, 1.0396738891139845, + 0.9936846292920328, 0.0, 1.027827186339337, 1.1441743109173244, + 1.4122477962364253, 0.8968250491499363, 1.1683723389247052), + (0.6952839500255574, 0.26087293140686374, 0.5278328671176757, + 1.1169999189889366, 1.027827186339337, 0.0, 1.8420471110023269, + 0.19179284676938602, 1.4875072385631605, 0.23451785425383564), + (1.271661230252733, 1.6939790104301427, 1.6275069061182947, + 0.6007483980555253, 1.1441743109173244, 1.8420471110023269, 0.0, + 1.6540234785929928, 0.2140799896286565, 1.7413442197913358), + (0.2100487290977544, 0.06024619831474998, 0.2636503792482082, + 1.430209221053372, 1.4122477962364253, 0.19179284676938602, + 1.6540234785929928, 0.0, 1.5225640692832796, 0.33370067057028485), + (1.4699690641062024, 1.7430082449189215, 1.739617877037615, + 0.25879514152086425, 0.8968250491499363, 1.4875072385631605, + 0.2140799896286565, 1.5225640692832796, 0.0, 1.3256191648260216), + (0.7934461515867415, 0.4497104244247795, 0.7127042590637039, + 0.9313185954797953, 1.1683723389247052, 0.23451785425383564, + 1.7413442197913358, 0.33370067057028485, 1.3256191648260216, + 0.0))) + + def test_compute_heatmap_order(self): + """Test the orders.""" + self.assertEqual( + compute_traits_order(slinked), (0, 2, 1, 7, 5, 9, 3, 6, 8, 4)) + + def test_retrieve_strains_and_values(self): + """Test retrieval of strains and values.""" + for orders, slist, tdata, expected in [ + [ + [2], + ["s1", "s2", "s3", "s4"], + [[2, 9, 6, None, 4], + [7, 5, None, None, 4], + [9, None, 5, 4, 7], + [6, None, None, 4, None]], + [[2, ["s1", "s3", "s4"], [9, 5, 4]]] + ], + [ + [3], + ["s1", "s2", "s3", "s4", "s5"], + [[2, 9, 6, None, 4], + [7, 5, None, None, 4], + [9, None, 5, 4, 7], + [6, None, None, 4, None]], + [[3, ["s1", "s4"], [6, 4]]] + ]]: + with self.subTest(strainlist=slist, traitdata=tdata): + self.assertEqual( + retrieve_strains_and_values(orders, slist, tdata), expected) -- cgit v1.2.3 From b1eb0451578c53afabe4f2054ce08665dec4bb82 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 15 Sep 2021 11:41:36 +0300 
Subject: Integrate get_lsr_from_chr function * gn3/heatmaps.py: copy over function * tests/unit/test_heatmaps.py: add tests Copy function over from proof of concept and add some tests to ensure it works as expected. --- gn3/heatmaps.py | 8 ++++++++ tests/unit/test_heatmaps.py | 14 ++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index 198fb45..991ddec 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -276,6 +276,14 @@ def get_nearest_marker(traits_list, genotype): marker_finder = nearest_marker_finder(genotype) return [marker_finder(trait) for trait in traits_list] +def get_lrs_from_chr(trait, chr_name): + chromosome = trait["chromosomes"].get(chr_name) + if chromosome: + return [ + locus["LRS"] for locus in + sorted(chromosome["loci"], key=lambda loc: loc["Locus"])] + return [None] + # # Grey + Blue + Red # def generate_heatmap(): # cols = 20 diff --git a/tests/unit/test_heatmaps.py b/tests/unit/test_heatmaps.py index 265d5a8..cfdde1e 100644 --- a/tests/unit/test_heatmaps.py +++ b/tests/unit/test_heatmaps.py @@ -2,6 +2,7 @@ from unittest import TestCase from gn3.heatmaps import ( cluster_traits, + get_lrs_from_chr, export_trait_data, compute_traits_order, retrieve_strains_and_values) @@ -185,3 +186,16 @@ class TestHeatmap(TestCase): with self.subTest(strainlist=slist, traitdata=tdata): self.assertEqual( retrieve_strains_and_values(orders, slist, tdata), expected) + + def test_get_lrs_from_chr(self): + for trait, chromosome, expected in [ + [{"chromosomes": {}}, 3, [None]], + [{"chromosomes": {3: {"loci": [ + {"Locus": "b", "LRS": 1.9}, + {"Locus": "a", "LRS": 13.2}, + {"Locus": "d", "LRS": 53.21}, + {"Locus": "c", "LRS": 2.22}]}}}, + 3, + [13.2, 1.9, 2.22, 53.21]]]: + with self.subTest(trait=trait, chromosome=chromosome): + self.assertEqual(get_lrs_from_chr(trait, chromosome), expected) -- cgit v1.2.3 From 11632a565a6f901eca852a5a40a6f9fd3170152a Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 15 Sep 2021 12:08:56 +0300 Subject: Process data into format usable by heatmaps * gn3/heatmaps.py: implement `process_traits_data_for_heatmap` function, that will process the data into a form usable by heatmaps. * tests/unit/test_heatmaps.py: check that the function processes the data into the correct form. --- gn3/heatmaps.py | 12 +++++ tests/unit/test_heatmaps.py | 107 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 118 insertions(+), 1 deletion(-) diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index 991ddec..0c00d6c 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -277,6 +277,9 @@ def get_nearest_marker(traits_list, genotype): return [marker_finder(trait) for trait in traits_list] def get_lrs_from_chr(trait, chr_name): + """ + Retrieve the LRS values for a specific chromosome in the given trait. + """ chromosome = trait["chromosomes"].get(chr_name) if chromosome: return [ @@ -284,6 +287,15 @@ def get_lrs_from_chr(trait, chr_name): sorted(chromosome["loci"], key=lambda loc: loc["Locus"])] return [None] +def process_traits_data_for_heatmap(data, trait_names, chromosome_names): + """ + Process the traits data in a format useful for generating heatmap diagrams. 
+ """ + hdata = [ + [get_lrs_from_chr(data[trait], chr_name) for trait in trait_names] + for chr_name in chromosome_names] + return hdata + # # Grey + Blue + Red # def generate_heatmap(): # cols = 20 diff --git a/tests/unit/test_heatmaps.py b/tests/unit/test_heatmaps.py index cfdde1e..f3a81c5 100644 --- a/tests/unit/test_heatmaps.py +++ b/tests/unit/test_heatmaps.py @@ -5,7 +5,8 @@ from gn3.heatmaps import ( get_lrs_from_chr, export_trait_data, compute_traits_order, - retrieve_strains_and_values) + retrieve_strains_and_values, + process_traits_data_for_heatmap) strainlist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] trait_data = { @@ -199,3 +200,107 @@ class TestHeatmap(TestCase): [13.2, 1.9, 2.22, 53.21]]]: with self.subTest(trait=trait, chromosome=chromosome): self.assertEqual(get_lrs_from_chr(trait, chromosome), expected) + + def test_process_traits_data_for_heatmap(self): + self.assertEqual( + process_traits_data_for_heatmap( + {"1": { + "ID": "T1", + "chromosomes": { + 1: {"Chr": 1, + "loci": [ + { + "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs6269442", "cM": 1.500, "Mb": 3.492, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs32285189", "cM": 1.630, "Mb": 3.511, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs258367496", "cM": 1.630, "Mb": 3.660, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs32430919", "cM": 1.750, "Mb": 3.777, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs36251697", "cM": 1.880, "Mb": 3.812, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs30658298", "cM": 2.010, "Mb": 4.431, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }]}, + 2: {"Chr": 2, + "loci": [ + { + "Locus": "rs51852623", "cM": 2.010, "Mb": 4.447, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs31879829", "cM": 2.140, "Mb": 4.519, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs36742481", "cM": 2.140, "Mb": 4.776, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }]}}}, + "2": { + "ID": "T1", + "chromosomes": { + 1: {"Chr": 1, + "loci": [ + { + "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs6269442", "cM": 1.500, "Mb": 3.492, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs32285189", "cM": 1.630, "Mb": 3.511, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs258367496", "cM": 1.630, "Mb": 3.660, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs32430919", "cM": 1.750, "Mb": 3.777, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs36251697", "cM": 1.880, "Mb": 3.812, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs30658298", "cM": 2.010, "Mb": 4.431, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }]}, + 2: {"Chr": 2, + "loci": [ + { + "Locus": "rs51852623", "cM": 2.010, "Mb": 4.447, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs31879829", "cM": 2.140, "Mb": 4.519, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs36742481", "cM": 2.140, "Mb": 4.776, + "LRS": 0.579, "Additive": -0.074, "pValue": 1.000 + }]}}}}, + ["2", "1"], + [1, 2]), + [[[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5], + [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]], + [[0.5, 0.579, 0.5], + [0.5, 0.5, 0.5]]]) 
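The two helpers added in this patch are easier to follow with a tiny input. The sketch below is a usage illustration only, with made-up data (it is not part of the patch): `get_lrs_from_chr` returns the LRS values sorted by locus name rather than by map position, and `process_traits_data_for_heatmap` nests chromosomes outermost, then traits, then loci.

    from gn3.heatmaps import get_lrs_from_chr, process_traits_data_for_heatmap

    # Hypothetical reaper-style results: two traits, data on chromosome 1 only.
    organised = {
        "1": {"ID": "1",
              "chromosomes": {
                  1: {"Chr": 1,
                      "loci": [{"Locus": "rs2", "LRS": 2.5},
                               {"Locus": "rs1", "LRS": 1.5}]}}},
        "2": {"ID": "2",
              "chromosomes": {
                  1: {"Chr": 1,
                      "loci": [{"Locus": "rs2", "LRS": 7.5},
                               {"Locus": "rs1", "LRS": 3.5}]}}}}

    # Values come back ordered by locus name, not by position on the chromosome.
    assert get_lrs_from_chr(organised["1"], 1) == [1.5, 2.5]
    # A chromosome with no results for the trait yields [None].
    assert get_lrs_from_chr(organised["1"], 2) == [None]

    # Outer list runs over chromosomes, inner lists over traits.
    assert process_traits_data_for_heatmap(organised, ["1", "2"], [1]) == [
        [[1.5, 2.5], [3.5, 7.5]]]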
-- cgit v1.2.3 From e9fb4e45cfc52c5d86ef534b0e7f42ba8f4c84d3 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 15 Sep 2021 12:28:56 +0300 Subject: Generate heatmaps in a single plot Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Add a function to generate the heatmaps for each chromosome into a single plot. --- gn3/heatmaps.py | 65 ++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 24 deletions(-) diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index 0c00d6c..f3d7d25 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -4,6 +4,7 @@ generate various kinds of heatmaps. """ from functools import reduce +from gn3.settings import TMPDIR from typing import Any, Dict, Sequence from gn3.computations.slink import slink from gn3.computations.qtlreaper import generate_traits_file @@ -296,27 +297,43 @@ def process_traits_data_for_heatmap(data, trait_names, chromosome_names): for chr_name in chromosome_names] return hdata -# # Grey + Blue + Red -# def generate_heatmap(): -# cols = 20 -# y_axis = (["%s"%x for x in range(1, cols+1)][:-1] + ["X"]) #replace last item with x for now -# x_axis = heatmap_x_axis_names() -# data = generate_random_data(height=cols, width=len(x_axis)) -# fig = px.imshow( -# data, -# x=x_axis, -# y=y_axis, -# width=500) -# fig.update_traces(xtype="array") -# fig.update_traces(ytype="array") -# # fig.update_traces(xgap=10) -# fig.update_xaxes( -# visible=True, -# title_text="Traits", -# title_font_size=16) -# fig.update_layout( -# coloraxis_colorscale=[ -# [0.0, '#3B3B3B'], [0.4999999999999999, '#ABABAB'], -# [0.5, '#F5DE11'], [1.0, '#FF0D00']]) -# fig.write_html("%s/%s"%(heatmap_dir, "test_image.html")) -# return fig +def generate_clustered_heatmap( + data, image_filename_prefix, x_axis = None, x_label: str = "", + y_axis = None, y_label: str = "", output_dir: str = TMPDIR, + colorscale = [ + [0.0, '#3B3B3B'], [0.4999999999999999, '#ABABAB'], + [0.5, '#F5DE11']], [1.0, '#FF0D00']): + """ + Generate a dendrogram, and heatmaps for each chromosome, and put them all + into one plot. 
+ """ + num_cols = len(x_axis) + fig = make_subplots( + rows=1, + cols=num_cols, + shared_yaxes="rows", + # horizontal_spacing=(1 / (num_cols - 1)), + subplot_titles=x_axis + ) + hms = [go.Heatmap( + name=chromo, + y = y_axis, + z = data_array, + showscale=False) for chromo, data_array in zip(x_axis, data)] + for col, hm in enumerate(hms): + fig.add_trace(hm, row=1, col=(col + 1)) + + fig.update_traces( + showlegend=False, + colorscale=colorscale, + selector={"type": "heatmap"}) + fig.update_traces( + showlegend=True, + showscale=True, + selector={"name": x_axis[-1]}) + fig.update_layout( + coloraxis_colorscale=colorscale + ) + image_filename = "{}/{}.html".format(output_dir, image_filename_prefix) + fig.write_html(image_filename) + return image_filename, fig -- cgit v1.2.3 From 347301c1bc60ba5036625364ee48a5d72eeb1186 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 15 Sep 2021 12:42:38 +0300 Subject: Update entry-point function for heatmap generation Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Copy over code from the proof-of-concept implementation and clean it up a little for the entry-point function for heatmap generation via the API --- gn3/heatmaps.py | 70 ++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 49 insertions(+), 21 deletions(-) diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index f3d7d25..4349ee0 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -131,7 +131,7 @@ def cluster_traits(traits_data_list: Sequence[Dict]): return tuple(__cluster(tdata_i) for tdata_i in enumerate(traits_data_list)) -def heatmap_data(traits_names, conn: Any): +def build_heatmap(traits_names, conn: Any): """ heatmap function @@ -148,27 +148,55 @@ def heatmap_data(traits_names, conn: Any): TODO: Elaborate on the parameters here... 
""" threshold = 0 # webqtlConfig.PUBLICTHRESH - def __retrieve_traitlist_and_datalist(threshold, fullname): - trait = retrieve_trait_info(threshold, fullname, conn) - return (trait, retrieve_trait_data(trait, conn)) - - traits_details = [ - __retrieve_traitlist_and_datalist(threshold, fullname) - for fullname in traits_names] - traits_list = tuple(x[0] for x in traits_details) - traits_data_list = [x[1] for x in traits_details] - genotype_filename = build_genotype_file(traits_list[0]["riset"]) - strainlist = load_genotype_samples(genotype_filename) - exported_traits_data_list = tuple( - export_trait_data(td, strainlist) for td in traits_data_list) - slink_data = slink(cluster_traits(exported_traits_data_list)) - ordering_data = compute_heatmap_order(slink_data) + traits = [ + retrieve_trait_info(threshold, fullname, conn) + for fullname in trait_fullnames()] + traits_data_list = [retrieve_trait_data(t, conn) for t in traits] + genotype_filename = build_genotype_file(traits[0]["riset"]) + genotype = parse_genotype_file(genotype_filename) + strains = load_genotype_samples(genotype_filename) + exported_traits_data_list = [ + export_trait_data(td, strains) for td in traits_data_list] + slinked = slink(cluster_traits(exported_traits_data_list)) + traits_order = compute_traits_order(slinked) + ordered_traits_names = [ + traits[idx]["trait_fullname"] for idx in traits_order] strains_and_values = retrieve_strains_and_values( - ordering_data, strainlist, exported_traits_data_list) - strains_values = strains_and_values[0][1] - trait_values = [t[2] for t in strains_and_values] - traits_filename = generate_traits_filename() - generate_traits_file(strains_values, trait_values, traits_filename) + traits_order, strains, exported_traits_data_list) + traits_filename = "{}/traits_test_file_{}.txt".format( + TMPDIR, random_string(10)) + generate_traits_file( + strains_and_values[0][1], + [t[2] for t in strains_and_values], + traits_filename) + + main_output, permutations_output = run_reaper( + genotype_filename, traits_filename, separate_nperm_output=True) + + qtlresults = parse_reaper_main_results(main_output) + permudata = parse_reaper_permutation_results(permutations_output) + organised = organise_reaper_main_results(qtlresults) + + traits_ids = [# sort numerically, but retain the ids as strings + str(i) for i in sorted({int(row["ID"]) for row in qtlresults})] + chromosome_names = sorted( + {row["Chr"] for row in qtlresults}, key = chromosome_sorter_key_fn) + loci_names = sorted({row["Locus"] for row in qtlresults}) + ordered_traits_names = { + res_id: trait for res_id, trait in + zip(traits_ids, + [traits[idx]["trait_fullname"] for idx in traits_order])} + + return generate_clustered_heatmap( + process_traits_data_for_heatmap( + organised, traits_ids, chromosome_names), + "single_heatmap_{}".format(random_string(10)), + y_axis=tuple( + ordered_traits_names[traits_ids[order]] + for order in traits_order), + y_label="Traits", + x_axis=[chromo for chromo in chromosome_names], + x_label="Chromosomes") return { "slink_data": slink_data, -- cgit v1.2.3 From 488ad475ca7112d64f9f1f0d4934c3293388bf1a Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Wed, 15 Sep 2021 16:06:52 +0300 Subject: remove golang package causing build failure --- guix.scm | 2 -- 1 file changed, 2 deletions(-) diff --git a/guix.scm b/guix.scm index 729d089..75ed4a9 100644 --- a/guix.scm +++ b/guix.scm @@ -43,7 +43,6 @@ (gnu packages databases) (gnu packages statistics) (gnu packages bioconductor) - (gn packages golang) (gnu packages 
python) (gnu packages python-check) (gnu packages python-crypto) @@ -83,7 +82,6 @@ #:recursive? #t #:select? git-file?)) (propagated-inputs `(("coreutils" ,coreutils) - ("csvdiff" ,go-github-com-aswinkarthik-csvdiff) ("gemma-wrapper" ,gemma-wrapper) ("gunicorn" ,gunicorn) ("python" ,python-wrapper) -- cgit v1.2.3 From 6494a093e6d96015f65416a8c3fde88a9a3c8c71 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Wed, 15 Sep 2021 16:17:02 +0300 Subject: init wgcna file to run r script and preprocess data --- gn3/computations/wgcna.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 gn3/computations/wgcna.py diff --git a/gn3/computations/wgcna.py b/gn3/computations/wgcna.py new file mode 100644 index 0000000..e69de29 -- cgit v1.2.3 From 129ecfe26a857c2c66a2402c9c7364b309e9f1c0 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Wed, 15 Sep 2021 16:17:30 +0300 Subject: initial test file for wgcna --- tests/unit/computations/test_wgcna.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 tests/unit/computations/test_wgcna.py diff --git a/tests/unit/computations/test_wgcna.py b/tests/unit/computations/test_wgcna.py new file mode 100644 index 0000000..1ce69a1 --- /dev/null +++ b/tests/unit/computations/test_wgcna.py @@ -0,0 +1,15 @@ +"""module contains python code for wgcna""" +from unittest import TestCase + + +def compute_sum(rhs_val, lhs_val): + """function to compute sum of two numbers""" + return rhs_val+lhs_val + + +class TestWgcna(TestCase): + """test class for wgcna""" + + def test_compute_sum(self): + """test for compute sum function""" + self.assertEqual(compute_sum(1, 2), 3) -- cgit v1.2.3 From a3ef298678b32ee83cac7bd7462d3d92f8eaec26 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 16 Sep 2021 08:23:35 +0300 Subject: function to parse form data and write to json file --- gn3/computations/wgcna.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/gn3/computations/wgcna.py b/gn3/computations/wgcna.py index e69de29..3819b62 100644 --- a/gn3/computations/wgcna.py +++ b/gn3/computations/wgcna.py @@ -0,0 +1,18 @@ +"""module contains code to preprocess and call wgcna script""" + +import os +import json +import uuid +from gn3.settings import TMPDIR + + +def dump_wgcna_data(request_data): + """function to dump request data to json file""" + filename = f"{str(uuid.uuid4())}.json" + + temp_file_path = os.path.join(TMPDIR, filename) + + with open(temp_file_path, "w") as output_file: + json.dump(request_data, output_file) + + return temp_file_path -- cgit v1.2.3 From 38cbc95f216548d4a2f46b8e25cd328ff8c52d62 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 16 Sep 2021 08:54:01 +0300 Subject: add function to compose and run wgcna script --- gn3/computations/wgcna.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/gn3/computations/wgcna.py b/gn3/computations/wgcna.py index 3819b62..bcd3a0a 100644 --- a/gn3/computations/wgcna.py +++ b/gn3/computations/wgcna.py @@ -5,8 +5,10 @@ import json import uuid from gn3.settings import TMPDIR +from gn3.commands import run_cmd -def dump_wgcna_data(request_data): + +def dump_wgcna_data(request_data: dict): """function to dump request data to json file""" filename = f"{str(uuid.uuid4())}.json" @@ -16,3 +18,16 @@ def dump_wgcna_data(request_data): json.dump(request_data, output_file) return temp_file_path + + +def compose_wgcna_cmd(rscript_path: str, temp_file_path: str): + """function to componse wgcna cmd""" + cmd = f"Rscript {rscript_path} 
{temp_file_path}" + return cmd + + +def call_wgcna_script(rscript_path: str, request_data: dict): + """function to call wgcna script""" + generated_file = dump_wgcna_data(request_data) + cmd = compose_gemma_cmd(rscript_path, generated_file) + run_cmd(cmd=cmd) -- cgit v1.2.3 From 8854e3070f32bed95cb489eb36e7f258c02ec46e Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 16 Sep 2021 09:32:42 +0300 Subject: add initial endpoint for wgcna --- gn3/api/wgcna.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 gn3/api/wgcna.py diff --git a/gn3/api/wgcna.py b/gn3/api/wgcna.py new file mode 100644 index 0000000..a3bacdd --- /dev/null +++ b/gn3/api/wgcna.py @@ -0,0 +1,13 @@ +"""endpoint to run wgcna analysis""" +from flask import Blueprint +from flask import request + +wgcna = Blueprint("wgcna", __name__) + + +@wgcna.route("/run_wgcna", methods=["POST"]) +def run_wgcna(): + + _wgcna_data = request.json + + return "success", 200 -- cgit v1.2.3 From 6f9fd1a1866e292ec4b84c214efd61ccb9f5bb58 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 16 Sep 2021 09:45:16 +0300 Subject: register wgcna blueprint --- gn3/app.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gn3/app.py b/gn3/app.py index 046b5de..b2f77f9 100644 --- a/gn3/app.py +++ b/gn3/app.py @@ -9,6 +9,7 @@ from gn3.api.rqtl import rqtl from gn3.api.general import general from gn3.api.correlation import correlation from gn3.api.data_entry import data_entry +from gn3.api.wgcna import wgcna def create_app(config: Union[Dict, str, None] = None) -> Flask: @@ -32,4 +33,5 @@ def create_app(config: Union[Dict, str, None] = None) -> Flask: app.register_blueprint(rqtl, url_prefix="/api/rqtl") app.register_blueprint(correlation, url_prefix="/api/correlation") app.register_blueprint(data_entry, url_prefix="/api/dataentry") + app.register_blueprint(wgcna, url_prefix="/api/wgcna") return app -- cgit v1.2.3 From 3d8f7f069d76cd6ee35b5e9d72d37e38721188d2 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 16 Sep 2021 10:13:10 +0300 Subject: run cmd and add exception handler --- gn3/computations/wgcna.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/gn3/computations/wgcna.py b/gn3/computations/wgcna.py index bcd3a0a..f0f0fa2 100644 --- a/gn3/computations/wgcna.py +++ b/gn3/computations/wgcna.py @@ -29,5 +29,9 @@ def compose_wgcna_cmd(rscript_path: str, temp_file_path: str): def call_wgcna_script(rscript_path: str, request_data: dict): """function to call wgcna script""" generated_file = dump_wgcna_data(request_data) - cmd = compose_gemma_cmd(rscript_path, generated_file) - run_cmd(cmd=cmd) + cmd = compose_wgcna_cmd(rscript_path, generated_file) + + try: + return run_cmd(cmd) + except Exception as error: + raise error -- cgit v1.2.3 From 78e37440844a0397d29135a8ba215e54fd92c86d Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Thu, 16 Sep 2021 11:35:23 +0300 Subject: Add missing imports Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Add missing imports that are needed in the code. --- gn3/heatmaps.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index 4349ee0..c48a2d3 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -5,15 +5,25 @@ generate various kinds of heatmaps. 
from functools import reduce from gn3.settings import TMPDIR +import plotly.graph_objects as go +from gn3.random import random_string from typing import Any, Dict, Sequence from gn3.computations.slink import slink -from gn3.computations.qtlreaper import generate_traits_file +from plotly.subplots import make_subplots from gn3.computations.correlations2 import compute_correlation -from gn3.db.genotypes import build_genotype_file, load_genotype_samples +from gn3.db.genotypes import ( + build_genotype_file, load_genotype_samples, parse_genotype_file) from gn3.db.traits import ( retrieve_trait_data, retrieve_trait_info, generate_traits_filename) +from gn3.computations.qtlreaper import ( + run_reaper, + generate_traits_file, + chromosome_sorter_key_fn, + parse_reaper_main_results, + organise_reaper_main_results, + parse_reaper_permutation_results) def export_trait_data( trait_data: dict, strainlist: Sequence[str], dtype: str = "val", -- cgit v1.2.3 From 2cc9f382e199dbdbaab98c7e06deabd72e244adb Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Thu, 16 Sep 2021 11:36:42 +0300 Subject: Fix minor bugs Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Fix a few minor bugs left over from the integration of code from the proof-of-concept code. --- gn3/heatmaps.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index c48a2d3..170b0cd 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -160,7 +160,7 @@ def build_heatmap(traits_names, conn: Any): threshold = 0 # webqtlConfig.PUBLICTHRESH traits = [ retrieve_trait_info(threshold, fullname, conn) - for fullname in trait_fullnames()] + for fullname in traits_names] traits_data_list = [retrieve_trait_data(t, conn) for t in traits] genotype_filename = build_genotype_file(traits[0]["riset"]) genotype = parse_genotype_file(genotype_filename) @@ -338,9 +338,9 @@ def process_traits_data_for_heatmap(data, trait_names, chromosome_names): def generate_clustered_heatmap( data, image_filename_prefix, x_axis = None, x_label: str = "", y_axis = None, y_label: str = "", output_dir: str = TMPDIR, - colorscale = [ - [0.0, '#3B3B3B'], [0.4999999999999999, '#ABABAB'], - [0.5, '#F5DE11']], [1.0, '#FF0D00']): + colorscale = ( + (0.0, '#3B3B3B'), (0.4999999999999999, '#ABABAB'), + (0.5, '#F5DE11'), (1.0, '#FF0D00'))): """ Generate a dendrogram, and heatmaps for each chromosome, and put them all into one plot. -- cgit v1.2.3 From 056171a0a2f127e90ab803b74635495fb0c079a2 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Thu, 16 Sep 2021 13:06:04 +0300 Subject: Intergrate the heatmap generation with the API Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Intergrate the heatmap generation code on the /api/heatmaps/clustered endpoint. The endpoint should take a json query of the form: {"traits_names": [ ... ] } where the "traits_name" value is a list of the full names of traits. 
A sample query to the endpoint could be something like the following: curl -i -X POST "http://localhost:8080/api/heatmaps/clustered" \ -H "Accept: application/json" \ -H "Content-Type: application/json" \ -d '{ "traits_names": [ "UCLA_BXDBXH_CARTILAGE_V2::ILM103710672", "UCLA_BXDBXH_CARTILAGE_V2::ILM2260338", "UCLA_BXDBXH_CARTILAGE_V2::ILM3140576", "UCLA_BXDBXH_CARTILAGE_V2::ILM5670577", "UCLA_BXDBXH_CARTILAGE_V2::ILM2070121", "UCLA_BXDBXH_CARTILAGE_V2::ILM103990541", "UCLA_BXDBXH_CARTILAGE_V2::ILM1190722", "UCLA_BXDBXH_CARTILAGE_V2::ILM6590722", "UCLA_BXDBXH_CARTILAGE_V2::ILM4200064", "UCLA_BXDBXH_CARTILAGE_V2::ILM3140463" ] }' which should respond with a json response containing the raw binary string for the png format and possibly another for the svg format. --- gn3/api/heatmaps.py | 28 ++++++++++++++++++++++++++++ gn3/app.py | 2 ++ 2 files changed, 30 insertions(+) create mode 100644 gn3/api/heatmaps.py diff --git a/gn3/api/heatmaps.py b/gn3/api/heatmaps.py new file mode 100644 index 0000000..cac9c71 --- /dev/null +++ b/gn3/api/heatmaps.py @@ -0,0 +1,28 @@ +from flask import jsonify +from flask import request +from flask import Blueprint +from gn3.heatmaps import build_heatmap +from gn3.db_utils import database_connector + +heatmaps = Blueprint("heatmaps", __name__) + +@heatmaps.route("/clustered", methods=("POST",)) +def clustered_heatmaps(): + heatmap_request = request.get_json() + traits_names = heatmap_request.get("traits_names", tuple()) + if len(traits_names) < 1: + return jsonify({ + "message": "You need to provide at least one trait name." + }), 400 + conn, _cursor = database_connector() + _heatmap_file, heatmap_fig = build_heatmap(traits_names, conn) + + # stream the heatmap data somehow here. + # Can plotly actually stream the figure data in a way that can be used on + # remote end to display the image without necessarily being html? 
+ return jsonify( + { + "query": heatmap_request, + "output_png": heatmap_fig.to_image(format="png"), + "output_svg": heatmap_fig.to_image(format="svg") + }), 200 diff --git a/gn3/app.py b/gn3/app.py index 046b5de..b4b08d0 100644 --- a/gn3/app.py +++ b/gn3/app.py @@ -7,6 +7,7 @@ from flask import Flask from gn3.api.gemma import gemma from gn3.api.rqtl import rqtl from gn3.api.general import general +from gn3.api.heatmaps import heatmaps from gn3.api.correlation import correlation from gn3.api.data_entry import data_entry @@ -30,6 +31,7 @@ def create_app(config: Union[Dict, str, None] = None) -> Flask: app.register_blueprint(general, url_prefix="/api/") app.register_blueprint(gemma, url_prefix="/api/gemma") app.register_blueprint(rqtl, url_prefix="/api/rqtl") + app.register_blueprint(heatmaps, url_prefix="/api/heatmaps") app.register_blueprint(correlation, url_prefix="/api/correlation") app.register_blueprint(data_entry, url_prefix="/api/dataentry") return app -- cgit v1.2.3 From f53a8a98206b4e8aedbf4b86e49f41ea140c9c6a Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 16 Sep 2021 14:02:13 +0300 Subject: pass user input to call script --- gn3/api/wgcna.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/gn3/api/wgcna.py b/gn3/api/wgcna.py index a3bacdd..5d49493 100644 --- a/gn3/api/wgcna.py +++ b/gn3/api/wgcna.py @@ -1,13 +1,21 @@ """endpoint to run wgcna analysis""" from flask import Blueprint from flask import request +from flask import current_app + +from gn3.computations.wgcna import call_wgcna_script wgcna = Blueprint("wgcna", __name__) @wgcna.route("/run_wgcna", methods=["POST"]) def run_wgcna(): + """run wgcna:output should be a json with a the data""" + + wgcna_data = request.json + + wgcna_script = current_app.config["WGCNA_RSCRIPT"] - _wgcna_data = request.json + results = call_wgcna_script(wgcna_script, wgcna_data) - return "success", 200 + return results, 200 -- cgit v1.2.3 From ce045ec96bf965eaba077fdba99847a483281aba Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 16 Sep 2021 14:03:25 +0300 Subject: init tests for wgcna --- tests/unit/computations/test_wgcna.py | 36 +++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/tests/unit/computations/test_wgcna.py b/tests/unit/computations/test_wgcna.py index 1ce69a1..8a68985 100644 --- a/tests/unit/computations/test_wgcna.py +++ b/tests/unit/computations/test_wgcna.py @@ -1,15 +1,35 @@ """module contains python code for wgcna""" from unittest import TestCase - - -def compute_sum(rhs_val, lhs_val): - """function to compute sum of two numbers""" - return rhs_val+lhs_val +from gn3.computations.wgcna import dump_wgcna_data +from gn3.computations.wgcna import compose_wgcna_cmd class TestWgcna(TestCase): """test class for wgcna""" - def test_compute_sum(self): - """test for compute sum function""" - self.assertEqual(compute_sum(1, 2), 3) + def test_compose_wgcna_cmd(self): + """test for composing wgcna cmd""" + wgcna_cmd = compose_wgcna_cmd("/wgcna.r", "/tmp/wgcna.json") + self.assertEqual(wgcna_cmd, f"Rscript /wgcna.r /tmp/wgcna.json") + + def test_create_json_data(self): + """test for writing the data to a csv file""" + # # All the traits we have data for (should not contain duplicates) + # All the strains we have data for (contains duplicates) + + trait_sample_data = {"1425642_at": {"129S1/SvImJ": 7.142, "A/J": 7.31, "AKR/J": 7.49, + "B6D2F1": 6.899, "BALB/cByJ": 7.172, "BALB/cJ": 7.396}, + "1457784_at": {"129S1/SvImJ": 7.071, "A/J": 7.05, 
"AKR/J": 7.313, + "B6D2F1": 6.999, "BALB/cByJ": 7.293, "BALB/cJ": 7.117}, + "1444351_at": {"129S1/SvImJ": 7.221, "A/J": 7.246, "AKR/J": 7.754, + "B6D2F1": 6.866, "BALB/cByJ": 6.752, "BALB/cJ": 7.269} + + } + + expected_input = { + "trait_sample_data": trait_sample_data, + "TOMtype": "unsigned", + "minModuleSize": 30 + } + + _results = dump_wgcna_data(expected_input) -- cgit v1.2.3 From e8f3a7131e8f1fcaf803dedab3177374f46e3709 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 16 Sep 2021 14:04:23 +0300 Subject: Add WGCNA_SCRIT env to settings --- gn3/settings.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gn3/settings.py b/gn3/settings.py index f4866d5..d3d7ea0 100644 --- a/gn3/settings.py +++ b/gn3/settings.py @@ -24,3 +24,6 @@ GN2_BASE_URL = "http://www.genenetwork.org/" # biweight script BIWEIGHT_RSCRIPT = "~/genenetwork3/scripts/calculate_biweight.R" + +# wgcna script +WGCNA_RSCRIPT = "wgcna_analysis.R" -- cgit v1.2.3 From ab8b53a50a601cb95dc0b54279246431c791dd70 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 16 Sep 2021 22:56:21 +0300 Subject: pylint fixes --- tests/unit/computations/test_wgcna.py | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/tests/unit/computations/test_wgcna.py b/tests/unit/computations/test_wgcna.py index 8a68985..9a88515 100644 --- a/tests/unit/computations/test_wgcna.py +++ b/tests/unit/computations/test_wgcna.py @@ -9,20 +9,28 @@ class TestWgcna(TestCase): def test_compose_wgcna_cmd(self): """test for composing wgcna cmd""" - wgcna_cmd = compose_wgcna_cmd("/wgcna.r", "/tmp/wgcna.json") - self.assertEqual(wgcna_cmd, f"Rscript /wgcna.r /tmp/wgcna.json") + wgcna_cmd = compose_wgcna_cmd( + "/wgcna.r", "/tmp/wgcna.json") + self.assertEqual( + wgcna_cmd, "Rscript /wgcna.r /tmp/wgcna.json") - def test_create_json_data(self): + def test_create_json_file(self): """test for writing the data to a csv file""" # # All the traits we have data for (should not contain duplicates) # All the strains we have data for (contains duplicates) - trait_sample_data = {"1425642_at": {"129S1/SvImJ": 7.142, "A/J": 7.31, "AKR/J": 7.49, - "B6D2F1": 6.899, "BALB/cByJ": 7.172, "BALB/cJ": 7.396}, - "1457784_at": {"129S1/SvImJ": 7.071, "A/J": 7.05, "AKR/J": 7.313, - "B6D2F1": 6.999, "BALB/cByJ": 7.293, "BALB/cJ": 7.117}, - "1444351_at": {"129S1/SvImJ": 7.221, "A/J": 7.246, "AKR/J": 7.754, - "B6D2F1": 6.866, "BALB/cByJ": 6.752, "BALB/cJ": 7.269} + trait_sample_data = {"1425642_at": {"129S1/SvImJ": 7.142, + "A/J": 7.31, "AKR/J": 7.49, + "B6D2F1": 6.899, "BALB/cByJ": 7.172, + "BALB/cJ": 7.396}, + "1457784_at": {"129S1/SvImJ": 7.071, "A/J": 7.05, + "AKR/J": 7.313, + "B6D2F1": 6.999, "BALB/cByJ": 7.293, + "BALB/cJ": 7.117}, + "1444351_at": {"129S1/SvImJ": 7.221, "A/J": 7.246, + "AKR/J": 7.754, + "B6D2F1": 6.866, "BALB/cByJ": 6.752, + "BALB/cJ": 7.269} } @@ -32,4 +40,7 @@ class TestWgcna(TestCase): "minModuleSize": 30 } - _results = dump_wgcna_data(expected_input) + results = dump_wgcna_data( + expected_input) + + self.assertEqual(results, {}) -- cgit v1.2.3 From 78871ef396f394c54072960e985476e418220fe3 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Fri, 17 Sep 2021 10:30:16 +0300 Subject: Create dendrogram to show clustering tree Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Provide the clustering data to be used for the creation of the clustering dendrogram in the final clustered heatmap plot. 
--- gn3/heatmaps.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index 170b0cd..bf69d9b 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -3,9 +3,11 @@ This module will contain functions to be used in computation of the data used to generate various kinds of heatmaps. """ +import numpy as np from functools import reduce from gn3.settings import TMPDIR import plotly.graph_objects as go +import plotly.figure_factory as ff from gn3.random import random_string from typing import Any, Dict, Sequence from gn3.computations.slink import slink @@ -167,7 +169,8 @@ def build_heatmap(traits_names, conn: Any): strains = load_genotype_samples(genotype_filename) exported_traits_data_list = [ export_trait_data(td, strains) for td in traits_data_list] - slinked = slink(cluster_traits(exported_traits_data_list)) + clustered = cluster_traits(exported_traits_data_list) + slinked = slink(clustered) traits_order = compute_traits_order(slinked) ordered_traits_names = [ traits[idx]["trait_fullname"] for idx in traits_order] @@ -200,6 +203,7 @@ def build_heatmap(traits_names, conn: Any): return generate_clustered_heatmap( process_traits_data_for_heatmap( organised, traits_ids, chromosome_names), + clustered, "single_heatmap_{}".format(random_string(10)), y_axis=tuple( ordered_traits_names[traits_ids[order]] @@ -336,8 +340,9 @@ def process_traits_data_for_heatmap(data, trait_names, chromosome_names): return hdata def generate_clustered_heatmap( - data, image_filename_prefix, x_axis = None, x_label: str = "", - y_axis = None, y_label: str = "", output_dir: str = TMPDIR, + data, clustering_data, image_filename_prefix, x_axis = None, + x_label: str = "", y_axis = None, y_label: str = "", + output_dir: str = TMPDIR, colorscale = ( (0.0, '#3B3B3B'), (0.4999999999999999, '#ABABAB'), (0.5, '#F5DE11'), (1.0, '#FF0D00'))): @@ -345,21 +350,22 @@ def generate_clustered_heatmap( Generate a dendrogram, and heatmaps for each chromosome, and put them all into one plot. 
""" - num_cols = len(x_axis) + num_cols = 1 + len(x_axis) fig = make_subplots( rows=1, cols=num_cols, shared_yaxes="rows", - # horizontal_spacing=(1 / (num_cols - 1)), - subplot_titles=x_axis - ) + horizontal_spacing=0.001, + subplot_titles=["distance"] + x_axis, + figure = ff.create_dendrogram( + np.array(clustering_data), orientation="right", labels=y_axis)) hms = [go.Heatmap( name=chromo, y = y_axis, z = data_array, showscale=False) for chromo, data_array in zip(x_axis, data)] - for col, hm in enumerate(hms): - fig.add_trace(hm, row=1, col=(col + 1)) + for i, hm in enumerate(hms): + fig.add_trace(hm, row=1, col=(i + 2)) fig.update_traces( showlegend=False, -- cgit v1.2.3 From e17540bf8a57837fcf3241ea0b694250b53294fc Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Fri, 17 Sep 2021 10:32:23 +0300 Subject: Fix some layout issues and update colorscale Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Update the plot layouts and size to display the dendrogram and individual chromosome heatmaps side by side * Update the colour scale to begin with the grays rather than absolute black --- gn3/heatmaps.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index bf69d9b..2859dde 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -344,7 +344,7 @@ def generate_clustered_heatmap( x_label: str = "", y_axis = None, y_label: str = "", output_dir: str = TMPDIR, colorscale = ( - (0.0, '#3B3B3B'), (0.4999999999999999, '#ABABAB'), + (0.0, '#5D5D5D'), (0.4999999999999999, '#ABABAB'), (0.5, '#F5DE11'), (1.0, '#FF0D00'))): """ Generate a dendrogram, and heatmaps for each chromosome, and put them all @@ -367,6 +367,33 @@ def generate_clustered_heatmap( for i, hm in enumerate(hms): fig.add_trace(hm, row=1, col=(i + 2)) + fig.update_layout( + { + "width": 1500, + "height": 800, + "xaxis": { + "mirror": False, + "showgrid": True + } + }) + + x_axes_layouts = { + "xaxis{}".format(i+1 if i > 0 else ""): { + "mirror": False, + "showticklabels": True if i==0 else False, + "ticks": "outside" if i==0 else "" + } + for i in range(num_cols)} + + fig.update_layout( + { + "width": 4000, + "height": 800, + "yaxis": { + "mirror": False, + "ticks": "" + }, + **x_axes_layouts}) fig.update_traces( showlegend=False, colorscale=colorscale, @@ -375,9 +402,6 @@ def generate_clustered_heatmap( showlegend=True, showscale=True, selector={"name": x_axis[-1]}) - fig.update_layout( - coloraxis_colorscale=colorscale - ) image_filename = "{}/{}.html".format(output_dir, image_filename_prefix) fig.write_html(image_filename) return image_filename, fig -- cgit v1.2.3 From 8ac3194f06084dfe5d0cfb141f178d83d937fcc3 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Fri, 17 Sep 2021 10:35:15 +0300 Subject: Return path to generated filename for now Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * To help with demonstrating that the code is producing the expected output, for now, we return the path to the generated html file that displays the interactive heatmap. At this point, it is mostly useful in the development environment. Moving forward, we might have to actually stream the raw html, or if we can get the Kaleido library packaged for GNU Guix, stream the images binary data instead. 
--- gn3/api/heatmaps.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/gn3/api/heatmaps.py b/gn3/api/heatmaps.py index cac9c71..f053241 100644 --- a/gn3/api/heatmaps.py +++ b/gn3/api/heatmaps.py @@ -20,9 +20,10 @@ def clustered_heatmaps(): # stream the heatmap data somehow here. # Can plotly actually stream the figure data in a way that can be used on # remote end to display the image without necessarily being html? - return jsonify( - { - "query": heatmap_request, - "output_png": heatmap_fig.to_image(format="png"), - "output_svg": heatmap_fig.to_image(format="svg") - }), 200 + # return jsonify( + # { + # "query": heatmap_request, + # "output_png": heatmap_fig.to_image(format="png"), + # "output_svg": heatmap_fig.to_image(format="svg") + # }), 200 + return jsonify({"output_filename": _heatmap_file}), 200 -- cgit v1.2.3 From 1e2357049adc72808fbf8eaac3da9411d3c78c66 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Fri, 17 Sep 2021 11:20:16 +0300 Subject: Fix a number of linting issues Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi --- gn3/computations/qtlreaper.py | 7 ++-- gn3/db/genotypes.py | 2 +- gn3/heatmaps.py | 54 ++++++++++++------------------- tests/unit/computations/test_qtlreaper.py | 3 +- tests/unit/test_heatmaps.py | 6 ++-- 5 files changed, 32 insertions(+), 40 deletions(-) diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index 5180853..377db9b 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -110,9 +110,10 @@ def organise_reaper_main_results(parsed_results): unique_chromosomes = {item["Chr"] for item in id_items} return { "ID": identifier, - "chromosomes": {_chr["Chr"]: _chr for _chr in [ - __organise_by_chromosome(chromo, id_items) - for chromo in sorted( + "chromosomes": { + _chr["Chr"]: _chr for _chr in [ + __organise_by_chromosome(chromo, id_items) + for chromo in sorted( unique_chromosomes, key=chromosome_sorter_key_fn)]}} unique_ids = {res["ID"] for res in parsed_results} diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py index b03d55c..9d052d9 100644 --- a/gn3/db/genotypes.py +++ b/gn3/db/genotypes.py @@ -174,7 +174,7 @@ def parse_genotype_file(filename: str, parlist: tuple = tuple()): geno_obj = dict(labels + header) markers = tuple( [parse_genotype_marker(line, geno_obj, parlist) - for line in data_lines[1:]]) + for line in data_lines[1:]]) chromosomes = tuple( dict(chromosome) for chromosome in build_genotype_chromosomes(geno_obj, markers)) diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index 2859dde..c4fc67d 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -3,13 +3,13 @@ This module will contain functions to be used in computation of the data used to generate various kinds of heatmaps. 
""" +from typing import Any, Dict, Sequence import numpy as np from functools import reduce from gn3.settings import TMPDIR import plotly.graph_objects as go import plotly.figure_factory as ff from gn3.random import random_string -from typing import Any, Dict, Sequence from gn3.computations.slink import slink from plotly.subplots import make_subplots from gn3.computations.correlations2 import compute_correlation @@ -165,7 +165,7 @@ def build_heatmap(traits_names, conn: Any): for fullname in traits_names] traits_data_list = [retrieve_trait_data(t, conn) for t in traits] genotype_filename = build_genotype_file(traits[0]["riset"]) - genotype = parse_genotype_file(genotype_filename) + # genotype = parse_genotype_file(genotype_filename) strains = load_genotype_samples(genotype_filename) exported_traits_data_list = [ export_trait_data(td, strains) for td in traits_data_list] @@ -183,22 +183,21 @@ def build_heatmap(traits_names, conn: Any): [t[2] for t in strains_and_values], traits_filename) - main_output, permutations_output = run_reaper( + main_output, _permutations_output = run_reaper( genotype_filename, traits_filename, separate_nperm_output=True) qtlresults = parse_reaper_main_results(main_output) - permudata = parse_reaper_permutation_results(permutations_output) + # permudata = parse_reaper_permutation_results(permutations_output) organised = organise_reaper_main_results(qtlresults) traits_ids = [# sort numerically, but retain the ids as strings str(i) for i in sorted({int(row["ID"]) for row in qtlresults})] chromosome_names = sorted( - {row["Chr"] for row in qtlresults}, key = chromosome_sorter_key_fn) - loci_names = sorted({row["Locus"] for row in qtlresults}) - ordered_traits_names = { - res_id: trait for res_id, trait in + {row["Chr"] for row in qtlresults}, key=chromosome_sorter_key_fn) + # loci_names = sorted({row["Locus"] for row in qtlresults}) + ordered_traits_names = dict( zip(traits_ids, - [traits[idx]["trait_fullname"] for idx in traits_order])} + [traits[idx]["trait_fullname"] for idx in traits_order])) return generate_clustered_heatmap( process_traits_data_for_heatmap( @@ -207,22 +206,11 @@ def build_heatmap(traits_names, conn: Any): "single_heatmap_{}".format(random_string(10)), y_axis=tuple( ordered_traits_names[traits_ids[order]] - for order in traits_order), + for order in traits_order), y_label="Traits", - x_axis=[chromo for chromo in chromosome_names], + x_axis=chromosome_names, x_label="Chromosomes") - return { - "slink_data": slink_data, - "ordering_data": ordering_data, - "strainlist": strainlist, - "genotype_filename": genotype_filename, - "traits_list": traits_list, - "traits_data_list": traits_data_list, - "exported_traits_data_list": exported_traits_data_list, - "traits_filename": traits_filename - } - def compute_traits_order(slink_data, neworder: tuple = tuple()): """ Compute the order of the traits for clustering from `slink_data`. 
@@ -314,7 +302,7 @@ def get_nearest_marker(traits_list, genotype): https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L419-L438 """ if not genotype["Mbmap"]: - return [None] * len(trait_list) + return [None] * len(traits_list) marker_finder = nearest_marker_finder(genotype) return [marker_finder(trait) for trait in traits_list] @@ -340,10 +328,10 @@ def process_traits_data_for_heatmap(data, trait_names, chromosome_names): return hdata def generate_clustered_heatmap( - data, clustering_data, image_filename_prefix, x_axis = None, - x_label: str = "", y_axis = None, y_label: str = "", + data, clustering_data, image_filename_prefix, x_axis=None, + x_label: str = "", y_axis=None, y_label: str = "", output_dir: str = TMPDIR, - colorscale = ( + colorscale=( (0.0, '#5D5D5D'), (0.4999999999999999, '#ABABAB'), (0.5, '#F5DE11'), (1.0, '#FF0D00'))): """ @@ -357,15 +345,15 @@ def generate_clustered_heatmap( shared_yaxes="rows", horizontal_spacing=0.001, subplot_titles=["distance"] + x_axis, - figure = ff.create_dendrogram( + figure=ff.create_dendrogram( np.array(clustering_data), orientation="right", labels=y_axis)) hms = [go.Heatmap( name=chromo, - y = y_axis, - z = data_array, + y=y_axis, + z=data_array, showscale=False) for chromo, data_array in zip(x_axis, data)] - for i, hm in enumerate(hms): - fig.add_trace(hm, row=1, col=(i + 2)) + for i, heatmap in enumerate(hms): + fig.add_trace(heatmap, row=1, col=(i + 2)) fig.update_layout( { @@ -380,8 +368,8 @@ def generate_clustered_heatmap( x_axes_layouts = { "xaxis{}".format(i+1 if i > 0 else ""): { "mirror": False, - "showticklabels": True if i==0 else False, - "ticks": "outside" if i==0 else "" + "showticklabels": True if i == 0 else False, + "ticks": "outside" if i == 0 else "" } for i in range(num_cols)} diff --git a/tests/unit/computations/test_qtlreaper.py b/tests/unit/computations/test_qtlreaper.py index 1d67827..d420470 100644 --- a/tests/unit/computations/test_qtlreaper.py +++ b/tests/unit/computations/test_qtlreaper.py @@ -77,6 +77,7 @@ class TestQTLReaper(TestCase): 5.82775, 5.89659, 5.92117, 5.93396, 5.93396, 5.94957]) def test_organise_reaper_main_results(self): + """Check that results are organised correctly.""" self.assertEqual( organise_reaper_main_results([ { @@ -135,7 +136,7 @@ class TestQTLReaper(TestCase): 1: {"Chr": 1, "loci": [ { - "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, + "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { diff --git a/tests/unit/test_heatmaps.py b/tests/unit/test_heatmaps.py index f3a81c5..c0a496b 100644 --- a/tests/unit/test_heatmaps.py +++ b/tests/unit/test_heatmaps.py @@ -189,6 +189,7 @@ class TestHeatmap(TestCase): retrieve_strains_and_values(orders, slist, tdata), expected) def test_get_lrs_from_chr(self): + """Check that function gets correct LRS values""" for trait, chromosome, expected in [ [{"chromosomes": {}}, 3, [None]], [{"chromosomes": {3: {"loci": [ @@ -202,6 +203,7 @@ class TestHeatmap(TestCase): self.assertEqual(get_lrs_from_chr(trait, chromosome), expected) def test_process_traits_data_for_heatmap(self): + """Check for correct processing of data for heatmap generation.""" self.assertEqual( process_traits_data_for_heatmap( {"1": { @@ -210,7 +212,7 @@ class TestHeatmap(TestCase): 1: {"Chr": 1, "loci": [ { - "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, + "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { @@ -257,7 +259,7 @@ class 
TestHeatmap(TestCase): 1: {"Chr": 1, "loci": [ { - "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, + "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { -- cgit v1.2.3 From b7fb10586b956a8b0389e7925e4c0cff28cde82f Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 20 Sep 2021 06:36:00 +0300 Subject: Return only the data Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/api/heatmaps.py: Parse incoming data to build up correct trait names and respond with only the computed heatmap data. * gn3/heatmaps.py: Return only the computed data for heatmaps and clustering. Since GN3 is supposed to handle only the data, and db-access, this commit ensures that GN3 responds to the client with only the computed heatmap data, and does not try to generate the heatmaps themselves. The generation of the heatmaps will be delegated to the UI clients, such as GeneNetwork2. --- gn3/api/heatmaps.py | 18 ++++++------------ gn3/heatmaps.py | 25 +++++++++++++++++-------- 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/gn3/api/heatmaps.py b/gn3/api/heatmaps.py index f053241..eea3ebe 100644 --- a/gn3/api/heatmaps.py +++ b/gn3/api/heatmaps.py @@ -15,15 +15,9 @@ def clustered_heatmaps(): "message": "You need to provide at least one trait name." }), 400 conn, _cursor = database_connector() - _heatmap_file, heatmap_fig = build_heatmap(traits_names, conn) - - # stream the heatmap data somehow here. - # Can plotly actually stream the figure data in a way that can be used on - # remote end to display the image without necessarily being html? - # return jsonify( - # { - # "query": heatmap_request, - # "output_png": heatmap_fig.to_image(format="png"), - # "output_svg": heatmap_fig.to_image(format="svg") - # }), 200 - return jsonify({"output_filename": _heatmap_file}), 200 + def setup_trait_fullname(trait): + name_parts = trait.split(":") + return "{dataset_name}::{trait_name}".format( + dataset_name=trait[1], trait_name=trait[0]) + traits_fullnames = [parse_trait_fullname(trait) for trait in traits_names] + return jsonify(build_heatmap(traits_fullnames, conn)), 200 diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index c4fc67d..205a3b3 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -199,17 +199,26 @@ def build_heatmap(traits_names, conn: Any): zip(traits_ids, [traits[idx]["trait_fullname"] for idx in traits_order])) - return generate_clustered_heatmap( - process_traits_data_for_heatmap( + # return generate_clustered_heatmap( + # process_traits_data_for_heatmap( + # organised, traits_ids, chromosome_names), + # clustered, + # "single_heatmap_{}".format(random_string(10)), + # y_axis=tuple( + # ordered_traits_names[traits_ids[order]] + # for order in traits_order), + # y_label="Traits", + # x_axis=chromosome_names, + # x_label="Chromosomes") + return { + "clustering_data": clustered, + "heatmap_data": process_traits_data_for_heatmap( organised, traits_ids, chromosome_names), - clustered, - "single_heatmap_{}".format(random_string(10)), - y_axis=tuple( + "traits": tuple( ordered_traits_names[traits_ids[order]] for order in traits_order), - y_label="Traits", - x_axis=chromosome_names, - x_label="Chromosomes") + "chromosomes": chromosome_names + } def compute_traits_order(slink_data, neworder: tuple = tuple()): """ -- cgit v1.2.3 From f5415f41d7f682771555e73f36ac4ee7ef51a1d3 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 20 Sep 2021 06:42:05 +0300 Subject: 
Remove proof-of-concept test code Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Remove the proof-of-concept CLI-only module that was used to learn how the heatmaps work and identify the appropriate data for use with them. --- qtlfilesexport.py | 253 ------------------------------------------------------ 1 file changed, 253 deletions(-) delete mode 100644 qtlfilesexport.py diff --git a/qtlfilesexport.py b/qtlfilesexport.py deleted file mode 100644 index 100fa75..0000000 --- a/qtlfilesexport.py +++ /dev/null @@ -1,253 +0,0 @@ -""" -Test the qtlfiles export of traits files - -Run with: - - env SQL_URI="mysql://:@:/db_webqtl" python3 qtlfilesexport.py - -replacing the variables in the angled brackets with the appropriate values -""" -from gn3.random import random_string -from gn3.computations.slink import slink -from gn3.db_utils import database_connector -from gn3.computations.qtlreaper import run_reaper -from gn3.db.traits import retrieve_trait_data, retrieve_trait_info -from gn3.computations.heatmap import export_trait_data, get_nearest_marker -from gn3.db.genotypes import ( - build_genotype_file, - parse_genotype_file, - load_genotype_samples) -from gn3.computations.heatmap import ( - cluster_traits, - compute_traits_order, - retrieve_strains_and_values) -from gn3.computations.qtlreaper import ( - generate_traits_file, - chromosome_sorter_key_fn, - parse_reaper_main_results, - organise_reaper_main_results, - parse_reaper_permutation_results) - -import plotly.express as px - -## for dendrogram -import numpy as np -import plotly.graph_objects as go -import plotly.figure_factory as ff - -# for single heatmap -from plotly.subplots import make_subplots - -TMPDIR = "tmp/" - -def trait_fullnames(): - """Return sample names for traits""" - return [ - "UCLA_BXDBXH_CARTILAGE_V2::ILM103710672", - "UCLA_BXDBXH_CARTILAGE_V2::ILM2260338", - "UCLA_BXDBXH_CARTILAGE_V2::ILM3140576", - "UCLA_BXDBXH_CARTILAGE_V2::ILM5670577", - "UCLA_BXDBXH_CARTILAGE_V2::ILM2070121", - "UCLA_BXDBXH_CARTILAGE_V2::ILM103990541", - "UCLA_BXDBXH_CARTILAGE_V2::ILM1190722", - "UCLA_BXDBXH_CARTILAGE_V2::ILM6590722", - "UCLA_BXDBXH_CARTILAGE_V2::ILM4200064", - "UCLA_BXDBXH_CARTILAGE_V2::ILM3140463"] - -def get_lrs_from_chr(trait, chr_name): - chromosome = trait["chromosomes"].get(chr_name) - if chromosome: - return [ - locus["LRS"] for locus in - sorted(chromosome["loci"], key=lambda loc: loc["Locus"])] - return [None] - -def process_traits_data_for_heatmap(data, trait_names, chromosome_names): - print("TRAIT_NAMES: {}".format(trait_names)) - print("chromosome names: {}".format(chromosome_names)) - print("data keys: {}".format(data.keys())) - hdata = [ - [get_lrs_from_chr(data[trait], chr_name) for trait in trait_names] - for chr_name in chromosome_names] - # print("hdata: {}".format(hdata)) - return hdata - -def generate_heatmap( - data, image_filename_prefix, x_axis = None, x_label: str = "", - y_axis = None, y_label: str = "", output_dir: str = TMPDIR): - """Generate single heatmap section.""" - print("X-AXIS:({}, {}), Y-AXIS: ({}, {}), ROWS:{}, COLS:{}".format( - x_axis, (len(x_axis) if x_axis else 0), - y_axis, (len(y_axis) if y_axis else 0), - len(data), len(data[0]))) - fig = px.imshow( - data, - x = x_axis, - y = y_axis, - width=1000 - ) - fig.update_yaxes(title=y_label) - fig.update_xaxes(title=x_label) - image_filename = "{}/{}.html".format(output_dir, image_filename_prefix) - fig.write_html(image_filename) - return image_filename, fig - -def 
generate_dendrogram( - data, image_filename_prefix, x_axis = None, x_label: str = "", - y_axis = None, y_label: str = "", output_dir: str = TMPDIR): - fig = ff.create_dendrogram( - np.array(data), orientation="right", labels=y_axis) - - heatmap = go.Heatmap( - x=fig['layout']['xaxis']['ticktext'], - y=fig['layout']['yaxis']['ticktext'], - z=data) - - # print("HEAMAP:{}".format(heatmap)) - fig.add_trace(heatmap) - - fig.update_layout({"width": 1000, "height": 500}) - image_filename = "{}/{}.html".format(output_dir, image_filename_prefix) - fig.write_html(image_filename) - return image_filename, fig - -def generate_single_heatmap( - data, image_filename_prefix, x_axis = None, x_label: str = "", - y_axis = None, y_label: str = "", output_dir: str = TMPDIR): - """Generate single heatmap section.""" - # fig = go.Figure({"type": "heatmap"}) - num_cols = len(x_axis) - fig = make_subplots( - rows=1, - cols=num_cols, - shared_yaxes="rows", - # horizontal_spacing=(1 / (num_cols - 1)), - subplot_titles=x_axis - ) - hms = [go.Heatmap( - name=chromo, - y = y_axis, - z = data_array, - showscale=False) for chromo, data_array in zip(x_axis, data)] - for col, hm in enumerate(hms): - fig.add_trace(hm, row=1, col=(col + 1)) - - fig.update_traces( - showlegend=False, - colorscale=[ - [0.0, '#3B3B3B'], [0.4999999999999999, '#ABABAB'], - [0.5, '#F5DE11'], [1.0, '#FF0D00']], - selector={"type": "heatmap"}) - fig.update_traces( - showlegend=True, - showscale=True, - selector={"name": x_axis[-1]}) - fig.update_layout( - coloraxis_colorscale=[ - [0.0, '#3B3B3B'], [0.4999999999999999, '#ABABAB'], - [0.5, '#F5DE11'], [1.0, '#FF0D00']] - ) - print(fig) - image_filename = "{}/{}.html".format(output_dir, image_filename_prefix) - fig.write_html(image_filename) - return image_filename, fig - -def main(): - """entrypoint function""" - conn = database_connector()[0] - threshold = 0 - traits = [ - retrieve_trait_info(threshold, fullname, conn) - for fullname in trait_fullnames()] - traits_data_list = [retrieve_trait_data(t, conn) for t in traits] - genotype_filename = build_genotype_file(traits[0]["riset"]) - genotype = parse_genotype_file(genotype_filename) - strains = load_genotype_samples(genotype_filename) - exported_traits_data_list = [ - export_trait_data(td, strains) for td in traits_data_list] - slinked = slink(cluster_traits(exported_traits_data_list)) - print("SLINKED: {}".format(slinked)) - traits_order = compute_traits_order(slinked) - print("KEYS: {}".format(traits[0].keys())) - ordered_traits_names = [ - traits[idx]["trait_fullname"] for idx in traits_order] - print("ORDERS: {}".format(traits_order)) - strains_and_values = retrieve_strains_and_values( - traits_order, strains, exported_traits_data_list) - strains_values = strains_and_values[0][1] - trait_values = [t[2] for t in strains_and_values] - traits_filename = "{}/traits_test_file_{}.txt".format( - TMPDIR, random_string(10)) - generate_traits_file(strains_values, trait_values, traits_filename) - print("Generated file: {}".format(traits_filename)) - - main_output, permutations_output = run_reaper( - genotype_filename, traits_filename, separate_nperm_output=True) - - print("Main output: {}, Permutation output: {}".format( - main_output, permutations_output)) - - qtlresults = parse_reaper_main_results(main_output) - permudata = parse_reaper_permutation_results(permutations_output) - # print("QTLRESULTS: {}".format(qtlresults)) - # print("PERMUDATA: {}".format(permudata)) - - nearest = get_nearest_marker(traits, genotype) - print("NEAREST: 
{}".format(nearest)) - - organised = organise_reaper_main_results(qtlresults) - - traits_ids = [# sort numerically, but retain the ids as strings - str(i) for i in sorted({int(row["ID"]) for row in qtlresults})] - chromosome_names = sorted( - {row["Chr"] for row in qtlresults}, key = chromosome_sorter_key_fn) - loci_names = sorted({row["Locus"] for row in qtlresults}) - ordered_traits_names = { - res_id: trait for res_id, trait in - zip(traits_ids, - [traits[idx]["trait_fullname"] for idx in traits_order])} - # print("ordered:{}, original: {}".format( - # ordered_traits_names, [t["trait_fullname"] for t in traits])) - # print("chromosome_names:{}".format(chromosome_names)) - # print("trait_ids:{}".format(traits_ids)) - # print("loci names:{}".format(loci_names)) - hdata = process_traits_data_for_heatmap(organised, traits_ids, chromosome_names) - - # print("ZIPPED: {}".format(zip(tuple(ordered_traits_names.keys()), hdata))) - # print("HDATA LENGTH:{}, ORDERED TRAITS LENGTH:{}".format(len(hdata), len(ordered_traits_names.keys()))) - heatmaps_data = [ - generate_heatmap( - data, - "heatmap_chr{}_{}".format(chromo, random_string(10)), - y_axis=tuple( - ordered_traits_names[traits_ids[order]] - for order in traits_order), - x_label=chromo, - output_dir=TMPDIR) - for chromo, data in zip(chromosome_names, hdata)] - print("IMAGES FILENAMES: {}".format([img[0] for img in heatmaps_data])) - - dendograms_data = [ - generate_dendrogram( - data, - "dendo_chr{}_{}".format(chromo, random_string(10)), - y_axis=tuple( - ordered_traits_names[traits_ids[order]] - for order in traits_order), - x_label=chromo, - output_dir=TMPDIR) - for chromo, data in zip(chromosome_names, hdata)] - - res = generate_single_heatmap( - hdata, - "single_heatmap_{}".format(random_string(10)), - y_axis=tuple( - ordered_traits_names[traits_ids[order]] - for order in traits_order), - y_label="Traits", - x_axis=[chromo for chromo in chromosome_names], - x_label="Chromosomes", - output_dir=TMPDIR) - -if __name__ == "__main__": - main() -- cgit v1.2.3 From 8442204492a28153e995f3147e06c9758cd3bd28 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 20 Sep 2021 08:43:38 +0300 Subject: Enable Cross-Origin Resource Sharing Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/api/heatmaps.py: Fix bugs in data parsing * gn3/app.py: enable CORS * gn3/settings.py: add flask-cors configurations * guix.scm: Add flask-cors dependency For easier testing of the heatmaps generation feature, this commit activates the cross-origin resource sharing for all "localhost" origins. --- gn3/api/heatmaps.py | 4 ++-- gn3/app.py | 7 +++++++ gn3/settings.py | 12 ++++++++++++ guix.scm | 3 ++- 4 files changed, 23 insertions(+), 3 deletions(-) diff --git a/gn3/api/heatmaps.py b/gn3/api/heatmaps.py index eea3ebe..43ac580 100644 --- a/gn3/api/heatmaps.py +++ b/gn3/api/heatmaps.py @@ -15,9 +15,9 @@ def clustered_heatmaps(): "message": "You need to provide at least one trait name." 
}), 400 conn, _cursor = database_connector() - def setup_trait_fullname(trait): + def parse_trait_fullname(trait): name_parts = trait.split(":") return "{dataset_name}::{trait_name}".format( - dataset_name=trait[1], trait_name=trait[0]) + dataset_name=name_parts[1], trait_name=name_parts[0]) traits_fullnames = [parse_trait_fullname(trait) for trait in traits_names] return jsonify(build_heatmap(traits_fullnames, conn)), 200 diff --git a/gn3/app.py b/gn3/app.py index b4b08d0..6b4c57e 100644 --- a/gn3/app.py +++ b/gn3/app.py @@ -11,6 +11,7 @@ from gn3.api.heatmaps import heatmaps from gn3.api.correlation import correlation from gn3.api.data_entry import data_entry +from flask_cors import CORS def create_app(config: Union[Dict, str, None] = None) -> Flask: """Create a new flask object""" @@ -18,6 +19,12 @@ def create_app(config: Union[Dict, str, None] = None) -> Flask: # Load default configuration app.config.from_object("gn3.settings") + CORS( + app, + origins=app.config["CORS_ORIGINS"], + allow_headers=app.config["CORS_HEADERS"], + supports_credentials=True, intercept_exceptions=False) + # Load environment configuration if "GN3_CONF" in os.environ: app.config.from_envvar('GN3_CONF') diff --git a/gn3/settings.py b/gn3/settings.py index a08f846..9d7bba3 100644 --- a/gn3/settings.py +++ b/gn3/settings.py @@ -31,3 +31,15 @@ REAPER_COMMAND = "{}/bin/qtlreaper".format(os.environ.get("GUIX_ENVIRONMENT")) # genotype files GENOTYPE_FILES = os.environ.get( "GENOTYPE_FILES", "{}/genotype_files/genotype".format(os.environ.get("HOME"))) + +# CROSS-ORIGIN SETUP +CORS_ORIGINS = [ + "http://localhost:*", + "http://127.0.0.1:*" +] + +CORS_HEADERS = [ + "Content-Type", + "Authorization", + "Access-Control-Allow-Credentials" +] diff --git a/guix.scm b/guix.scm index 8e1cf79..fb97142 100644 --- a/guix.scm +++ b/guix.scm @@ -106,7 +106,8 @@ ("r-stringi" ,r-stringi) ("python-plotly" ,python-plotly) ("python-pandas" ,python-pandas) - ("rust-qtlreaper" ,rust-qtlreaper))) + ("rust-qtlreaper" ,rust-qtlreaper) + ("python-flask-cors" ,python-flask-cors))) (build-system python-build-system) (home-page "https://github.com/genenetwork/genenetwork3") (synopsis "GeneNetwork3 API for data science and machine learning.") -- cgit v1.2.3 From 580778386f054706086d7ab3d49f2c4f91f110e0 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Wed, 22 Sep 2021 00:48:37 +0300 Subject: init endpoint tests for wgcna --- tests/integration/test_wgcna.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 tests/integration/test_wgcna.py diff --git a/tests/integration/test_wgcna.py b/tests/integration/test_wgcna.py new file mode 100644 index 0000000..65763c1 --- /dev/null +++ b/tests/integration/test_wgcna.py @@ -0,0 +1,37 @@ +"""integration tests for wgcna""" + +from unittest import TestCase +from unittest import mock + +from gn3.app import create_app + + +class WgcnaIntegrationTest(TestCase): + """class contains wgcna integration tests""" + + def setUp(self): + self.app = create_app().test_client() + + @mock.patch("gn3.api.wgcna.call_wgcna_script") + def test_wgcna_endpoint(self, mock_wgcna_api): + """test /api/wgcna/run_wgcna endpoint""" + + wgcna_api_data = { + "eigengenes": ["1224_at", "121412_at", "32342342-at"], + "dendrogram_file_location": "/tmp/dend1.png" + + } + mock_wgcna_api.return_value = wgcna_api_data + + request_data = { + + "trait_sample_data": [], + + + } + + response = self.app.post("/api/wgcna/run_wgcna", + json=request_data, follow_redirects=True) + + 
self.assertEqual(response.status_code, 200) + self.assertEqual(response.get_json(), wgcna_api_data) -- cgit v1.2.3 From 6f7c60e6c438ada9a3b9032c8e4509351384e04f Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Wed, 22 Sep 2021 00:49:22 +0300 Subject: jsonify results --- gn3/api/wgcna.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gn3/api/wgcna.py b/gn3/api/wgcna.py index 5d49493..89784c4 100644 --- a/gn3/api/wgcna.py +++ b/gn3/api/wgcna.py @@ -2,6 +2,7 @@ from flask import Blueprint from flask import request from flask import current_app +from flask import jsonify from gn3.computations.wgcna import call_wgcna_script @@ -18,4 +19,4 @@ def run_wgcna(): results = call_wgcna_script(wgcna_script, wgcna_data) - return results, 200 + return jsonify(results), 200 -- cgit v1.2.3 From f4242f9743c7f236a00917fd187fbd603831dda1 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Wed, 22 Sep 2021 01:51:44 +0300 Subject: add required wgcna dependencies to guix.scm --- guix.scm | 1 + 1 file changed, 1 insertion(+) diff --git a/guix.scm b/guix.scm index 75ed4a9..509c9ff 100644 --- a/guix.scm +++ b/guix.scm @@ -101,6 +101,7 @@ ("r-optparse" ,r-optparse) ("r-qtl" ,r-qtl) ("r-stringi" ,r-stringi) + ("r-wgcna" ,r-wgcna) ("python-plotly" ,python-plotly) ("python-pandas" ,python-pandas))) (build-system python-build-system) -- cgit v1.2.3 From 0f871f49e749eb625f58326adf8f80b3d3b5b932 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Wed, 22 Sep 2021 01:53:45 +0300 Subject: add init tests for call to script --- tests/unit/computations/test_wgcna.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/unit/computations/test_wgcna.py b/tests/unit/computations/test_wgcna.py index 9a88515..fd90732 100644 --- a/tests/unit/computations/test_wgcna.py +++ b/tests/unit/computations/test_wgcna.py @@ -1,12 +1,27 @@ """module contains python code for wgcna""" +from unittest import skip from unittest import TestCase +from unittest import mock + from gn3.computations.wgcna import dump_wgcna_data from gn3.computations.wgcna import compose_wgcna_cmd +from gn3.computations.wgcna import call_wgcna_script class TestWgcna(TestCase): """test class for wgcna""" + @mock.patch("gn3.computations.wgcna.dump_wgcna_data") + def test_call_wgcna_script(self, mock_dump): + """call wgcna script""" + + mock_dump.return_value = "/tmp/QmQPeNsJPyVWPFDVHb77w8G42Fvo15z4bG2X8D2GhfbSXc-test.json" + + results = call_wgcna_script( + "/home/kabui/project/genenetwork3/scripts/wgcna_analysis.R", {}) + + self.assertEqual(results, "dsedf") + def test_compose_wgcna_cmd(self): """test for composing wgcna cmd""" wgcna_cmd = compose_wgcna_cmd( @@ -14,6 +29,7 @@ class TestWgcna(TestCase): self.assertEqual( wgcna_cmd, "Rscript /wgcna.r /tmp/wgcna.json") + @skip("to update tests") def test_create_json_file(self): """test for writing the data to a csv file""" # # All the traits we have data for (should not contain duplicates) -- cgit v1.2.3 From 920be820e9cefe1dcde86d9a252f098c67a2bb8b Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 22 Sep 2021 07:03:53 +0300 Subject: Return serialized plotly figure Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/api/heatmaps.py: Serialize the figure to JSON * gn3/heatmaps.py: Return the figure object Serialize the Plotly figure into JSON, and return that, so that it can be used on the client to display the image. 
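For illustration only (not part of the patch series): a client such as GN2
could rebuild the figure from the serialized JSON with plotly's own
deserialiser. The sketch below assumes a local GN3 instance on port 8080 and
the third-party `requests` library; the `trait:dataset` name format follows
the `parse_trait_fullname` helper introduced earlier in this series.

    # Hypothetical consumer of the JSON-serialized figure returned by
    # the /api/heatmaps/clustered endpoint.
    import requests
    import plotly.io as pio

    response = requests.post(
        "http://localhost:8080/api/heatmaps/clustered",
        json={"traits_names": [
            "ILM103710672:UCLA_BXDBXH_CARTILAGE_V2",
            "ILM2260338:UCLA_BXDBXH_CARTILAGE_V2"]})
    figure = pio.from_json(response.text)  # rebuild the plotly Figure object
    figure.show()  # or figure.write_html(...) to save an interactive page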
--- gn3/api/heatmaps.py | 8 +++++++- gn3/heatmaps.py | 27 ++++++++------------------- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/gn3/api/heatmaps.py b/gn3/api/heatmaps.py index 43ac580..0493f8a 100644 --- a/gn3/api/heatmaps.py +++ b/gn3/api/heatmaps.py @@ -1,3 +1,4 @@ +import io from flask import jsonify from flask import request from flask import Blueprint @@ -20,4 +21,9 @@ def clustered_heatmaps(): return "{dataset_name}::{trait_name}".format( dataset_name=name_parts[1], trait_name=name_parts[0]) traits_fullnames = [parse_trait_fullname(trait) for trait in traits_names] - return jsonify(build_heatmap(traits_fullnames, conn)), 200 + + with io.StringIO() as io_str: + _filename, figure = build_heatmap(traits_fullnames, conn) + figure.write_json(io_str) + fig_json = io_str.getvalue() + return fig_json, 200 diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index 205a3b3..cd93b3f 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -187,38 +187,27 @@ def build_heatmap(traits_names, conn: Any): genotype_filename, traits_filename, separate_nperm_output=True) qtlresults = parse_reaper_main_results(main_output) - # permudata = parse_reaper_permutation_results(permutations_output) organised = organise_reaper_main_results(qtlresults) traits_ids = [# sort numerically, but retain the ids as strings str(i) for i in sorted({int(row["ID"]) for row in qtlresults})] chromosome_names = sorted( {row["Chr"] for row in qtlresults}, key=chromosome_sorter_key_fn) - # loci_names = sorted({row["Locus"] for row in qtlresults}) ordered_traits_names = dict( zip(traits_ids, [traits[idx]["trait_fullname"] for idx in traits_order])) - # return generate_clustered_heatmap( - # process_traits_data_for_heatmap( - # organised, traits_ids, chromosome_names), - # clustered, - # "single_heatmap_{}".format(random_string(10)), - # y_axis=tuple( - # ordered_traits_names[traits_ids[order]] - # for order in traits_order), - # y_label="Traits", - # x_axis=chromosome_names, - # x_label="Chromosomes") - return { - "clustering_data": clustered, - "heatmap_data": process_traits_data_for_heatmap( + return generate_clustered_heatmap( + process_traits_data_for_heatmap( organised, traits_ids, chromosome_names), - "traits": tuple( + clustered, + "single_heatmap_{}".format(random_string(10)), + y_axis=tuple( ordered_traits_names[traits_ids[order]] for order in traits_order), - "chromosomes": chromosome_names - } + y_label="Traits", + x_axis=chromosome_names, + x_label="Chromosomes") def compute_traits_order(slink_data, neworder: tuple = tuple()): """ -- cgit v1.2.3 From 5892ffc7488b0c9cbb4ea08fd5c5f8648e0baea8 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 22 Sep 2021 07:06:14 +0300 Subject: Update check: Heatmaps need at least 2 items Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Update the check to look for at least 2 traits before trying to generate the heatmap. --- gn3/api/heatmaps.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gn3/api/heatmaps.py b/gn3/api/heatmaps.py index 0493f8a..1022a35 100644 --- a/gn3/api/heatmaps.py +++ b/gn3/api/heatmaps.py @@ -11,9 +11,9 @@ heatmaps = Blueprint("heatmaps", __name__) def clustered_heatmaps(): heatmap_request = request.get_json() traits_names = heatmap_request.get("traits_names", tuple()) - if len(traits_names) < 1: + if len(traits_names) < 2: return jsonify({ - "message": "You need to provide at least one trait name." 
+ "message": "You need to provide at least two trait names." }), 400 conn, _cursor = database_connector() def parse_trait_fullname(trait): -- cgit v1.2.3 From cd7f301688fd9780df1f842f8bd2b7602775ba1f Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 22 Sep 2021 07:53:53 +0300 Subject: Fix pylint errors * Add missing function and module docstrings * Remove unused imports * Fix import order * Rework some code sections to fix issues * Disable some pylint errors. --- gn3/api/heatmaps.py | 8 ++++++++ gn3/app.py | 5 +++-- gn3/computations/qtlreaper.py | 8 ++++++++ gn3/db/genotypes.py | 1 + gn3/db/traits.py | 2 +- gn3/heatmaps.py | 28 ++++++++++++++++------------ 6 files changed, 37 insertions(+), 15 deletions(-) diff --git a/gn3/api/heatmaps.py b/gn3/api/heatmaps.py index 1022a35..fe47aee 100644 --- a/gn3/api/heatmaps.py +++ b/gn3/api/heatmaps.py @@ -1,3 +1,7 @@ +""" +Module to hold the entrypoint functions that generate heatmaps +""" + import io from flask import jsonify from flask import request @@ -9,6 +13,10 @@ heatmaps = Blueprint("heatmaps", __name__) @heatmaps.route("/clustered", methods=("POST",)) def clustered_heatmaps(): + """ + Parses the incoming data and responds with the JSON-serialized plotly figure + representing the clustered heatmap. + """ heatmap_request = request.get_json() traits_names = heatmap_request.get("traits_names", tuple()) if len(traits_names) < 2: diff --git a/gn3/app.py b/gn3/app.py index 6b4c57e..8badb65 100644 --- a/gn3/app.py +++ b/gn3/app.py @@ -3,7 +3,10 @@ import os from typing import Dict from typing import Union + from flask import Flask +from flask_cors import CORS + from gn3.api.gemma import gemma from gn3.api.rqtl import rqtl from gn3.api.general import general @@ -11,8 +14,6 @@ from gn3.api.heatmaps import heatmaps from gn3.api.correlation import correlation from gn3.api.data_entry import data_entry -from flask_cors import CORS - def create_app(config: Union[Dict, str, None] = None) -> Flask: """Create a new flask object""" app = Flask(__name__) diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index 377db9b..5d17fed 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -87,11 +87,17 @@ def run_reaper( return (output_filename, permu_output_filename) def chromosome_sorter_key_fn(val): + """ + Useful for sorting the chromosomes + """ if isinstance(val, int): return val return ord(val) def organise_reaper_main_results(parsed_results): + """ + Provide the results of running reaper in a format that is easier to use. 
+ """ def __organise_by_chromosome(chr_name, items): chr_items = [item for item in items if item["Chr"] == chr_name] return { @@ -129,12 +135,14 @@ def parse_reaper_main_results(results_file): lines = infile.readlines() def __parse_column_float_value(value): + # pylint: disable=W0702 try: return float(value) except: return value def __parse_column_int_value(value): + # pylint: disable=W0702 try: return int(value) except: diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py index 9d052d9..919c539 100644 --- a/gn3/db/genotypes.py +++ b/gn3/db/genotypes.py @@ -115,6 +115,7 @@ def parse_genotype_marker(line: str, geno_obj: dict, parlist: list): Reworks https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/utility/gen_geno_ob.py#L143-L190 """ + # pylint: disable=W0702 marker_row = [item.strip() for item in line.split("\t")] geno_table = { geno_obj["mat"]: -1, geno_obj["pat"]: 1, geno_obj["het"]: 0, diff --git a/gn3/db/traits.py b/gn3/db/traits.py index bfe887e..747ed27 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -46,7 +46,7 @@ def update_sample_data(conn: Any, count: Union[int, str]): """Given the right parameters, update sample-data from the relevant table.""" - # pylint: disable=[R0913, R0914] + # pylint: disable=[R0913, R0914, C0103] STRAIN_ID_SQL: str = "UPDATE Strain SET Name = %s WHERE Id = %s" PUBLISH_DATA_SQL: str = ("UPDATE PublishData SET value = %s " "WHERE StrainId = %s AND Id = %s") diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index cd93b3f..9d82fb2 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -3,29 +3,28 @@ This module will contain functions to be used in computation of the data used to generate various kinds of heatmaps. """ +from functools import reduce from typing import Any, Dict, Sequence + import numpy as np -from functools import reduce -from gn3.settings import TMPDIR import plotly.graph_objects as go import plotly.figure_factory as ff +from plotly.subplots import make_subplots + +from gn3.settings import TMPDIR from gn3.random import random_string from gn3.computations.slink import slink -from plotly.subplots import make_subplots from gn3.computations.correlations2 import compute_correlation from gn3.db.genotypes import ( - build_genotype_file, load_genotype_samples, parse_genotype_file) + build_genotype_file, load_genotype_samples) from gn3.db.traits import ( - retrieve_trait_data, - retrieve_trait_info, - generate_traits_filename) + retrieve_trait_data, retrieve_trait_info) from gn3.computations.qtlreaper import ( run_reaper, generate_traits_file, chromosome_sorter_key_fn, parse_reaper_main_results, - organise_reaper_main_results, - parse_reaper_permutation_results) + organise_reaper_main_results) def export_trait_data( trait_data: dict, strainlist: Sequence[str], dtype: str = "val", @@ -159,13 +158,13 @@ def build_heatmap(traits_names, conn: Any): PARAMETERS: TODO: Elaborate on the parameters here... """ + # pylint: disable=[R0914] threshold = 0 # webqtlConfig.PUBLICTHRESH traits = [ retrieve_trait_info(threshold, fullname, conn) for fullname in traits_names] traits_data_list = [retrieve_trait_data(t, conn) for t in traits] genotype_filename = build_genotype_file(traits[0]["riset"]) - # genotype = parse_genotype_file(genotype_filename) strains = load_genotype_samples(genotype_filename) exported_traits_data_list = [ export_trait_data(td, strains) for td in traits_data_list] @@ -336,6 +335,7 @@ def generate_clustered_heatmap( Generate a dendrogram, and heatmaps for each chromosome, and put them all into one plot. 
""" + # pylint: disable=[R0913, R0914] num_cols = 1 + len(x_axis) fig = make_subplots( rows=1, @@ -359,14 +359,18 @@ def generate_clustered_heatmap( "height": 800, "xaxis": { "mirror": False, - "showgrid": True + "showgrid": True, + "title": x_label + }, + "yaxis": { + "title": y_label } }) x_axes_layouts = { "xaxis{}".format(i+1 if i > 0 else ""): { "mirror": False, - "showticklabels": True if i == 0 else False, + "showticklabels": i == 0, "ticks": "outside" if i == 0 else "" } for i in range(num_cols)} -- cgit v1.2.3 From 71cc35e5178904b512b9007e33be17a36f6656f2 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 22 Sep 2021 08:36:11 +0300 Subject: Fix typing issues * Ignore some errors * Update typing definitions for some portions of code * Add missing imports --- gn3/app.py | 2 +- gn3/computations/qtlreaper.py | 6 ++++-- gn3/db/genotypes.py | 10 ++++++---- gn3/db/traits.py | 8 ++++---- gn3/heatmaps.py | 8 +++----- 5 files changed, 18 insertions(+), 16 deletions(-) diff --git a/gn3/app.py b/gn3/app.py index 8badb65..5e852e1 100644 --- a/gn3/app.py +++ b/gn3/app.py @@ -5,7 +5,7 @@ from typing import Dict from typing import Union from flask import Flask -from flask_cors import CORS +from flask_cors import CORS # type: ignore from gn3.api.gemma import gemma from gn3.api.rqtl import rqtl diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index 5d17fed..5ddea76 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -4,6 +4,8 @@ computation of QTLs. """ import os import subprocess +from typing import Union + from gn3.random import random_string from gn3.settings import TMPDIR, REAPER_COMMAND @@ -70,9 +72,9 @@ def run_reaper( output_dir, random_string(10)) output_list = ["--main_output", output_filename] if separate_nperm_output: - permu_output_filename = "{}/qtlreaper/permu_output_{}.txt".format( + permu_output_filename: Union[None, str] = "{}/qtlreaper/permu_output_{}.txt".format( output_dir, random_string(10)) - output_list = output_list + ["--permu_output", permu_output_filename] + output_list = output_list + ["--permu_output", permu_output_filename] # type: ignore[list-item] else: permu_output_filename = None diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py index 919c539..9ea9f20 100644 --- a/gn3/db/genotypes.py +++ b/gn3/db/genotypes.py @@ -2,6 +2,8 @@ import os import gzip +from typing import Union, TextIO + from gn3.settings import GENOTYPE_FILES def build_genotype_file( @@ -44,17 +46,17 @@ def __load_genotype_samples_from_geno(genotype_filename: str): """ gzipped_filename = "{}.gz".format(genotype_filename) if os.path.isfile(gzipped_filename): - genofile = gzip.open(gzipped_filename) + genofile: Union[TextIO, gzip.GzipFile] = gzip.open(gzipped_filename) else: genofile = open(genotype_filename) for row in genofile: line = row.strip() - if (not line) or (line.startswith(("#", "@"))): + if (not line) or (line.startswith(("#", "@"))): # type: ignore[arg-type] continue break - headers = line.split("\t") + headers = line.split("\t" ) # type: ignore[arg-type] if headers[3] == "Mb": return headers[4:] return headers[3:] @@ -107,7 +109,7 @@ def parse_genotype_header(line: str, parlist: tuple = tuple()): ("prgy", prgy), ("nprgy", len(prgy))) -def parse_genotype_marker(line: str, geno_obj: dict, parlist: list): +def parse_genotype_marker(line: str, geno_obj: dict, parlist: tuple): """ Parse a data line in a genotype file diff --git a/gn3/db/traits.py b/gn3/db/traits.py index 747ed27..4fc47c3 100644 --- a/gn3/db/traits.py 
+++ b/gn3/db/traits.py @@ -63,22 +63,22 @@ def update_sample_data(conn: Any, with conn.cursor() as cursor: # Update the Strains table cursor.execute(STRAIN_ID_SQL, (strain_name, strain_id)) - updated_strains: int = cursor.rowcount + updated_strains = cursor.rowcount # Update the PublishData table cursor.execute(PUBLISH_DATA_SQL, (None if value == "x" else value, strain_id, publish_data_id)) - updated_published_data: int = cursor.rowcount + updated_published_data = cursor.rowcount # Update the PublishSE table cursor.execute(PUBLISH_SE_SQL, (None if error == "x" else error, strain_id, publish_data_id)) - updated_se_data: int = cursor.rowcount + updated_se_data = cursor.rowcount # Update the NStrain table cursor.execute(N_STRAIN_SQL, (None if count == "x" else count, strain_id, publish_data_id)) - updated_n_strains: int = cursor.rowcount + updated_n_strains = cursor.rowcount return (updated_strains, updated_published_data, updated_se_data, updated_n_strains) diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index 9d82fb2..45d0c22 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -7,9 +7,9 @@ from functools import reduce from typing import Any, Dict, Sequence import numpy as np -import plotly.graph_objects as go -import plotly.figure_factory as ff -from plotly.subplots import make_subplots +import plotly.graph_objects as go # type: ignore +import plotly.figure_factory as ff # type: ignore +from plotly.subplots import make_subplots # type: ignore from gn3.settings import TMPDIR from gn3.random import random_string @@ -171,8 +171,6 @@ def build_heatmap(traits_names, conn: Any): clustered = cluster_traits(exported_traits_data_list) slinked = slink(clustered) traits_order = compute_traits_order(slinked) - ordered_traits_names = [ - traits[idx]["trait_fullname"] for idx in traits_order] strains_and_values = retrieve_strains_and_values( traits_order, strains, exported_traits_data_list) traits_filename = "{}/traits_test_file_{}.txt".format( -- cgit v1.2.3 From 56c73324c285d896567268370f3955bbd15754b0 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 22 Sep 2021 09:02:46 +0300 Subject: Fix more pylint errors --- gn3/computations/qtlreaper.py | 3 ++- gn3/db/genotypes.py | 2 +- tests/unit/db/test_traits.py | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index 5ddea76..8b2893e 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -74,7 +74,8 @@ def run_reaper( if separate_nperm_output: permu_output_filename: Union[None, str] = "{}/qtlreaper/permu_output_{}.txt".format( output_dir, random_string(10)) - output_list = output_list + ["--permu_output", permu_output_filename] # type: ignore[list-item] + output_list = output_list + [ + "--permu_output", permu_output_filename] # type: ignore[list-item] else: permu_output_filename = None diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py index 9ea9f20..9987320 100644 --- a/gn3/db/genotypes.py +++ b/gn3/db/genotypes.py @@ -56,7 +56,7 @@ def __load_genotype_samples_from_geno(genotype_filename: str): continue break - headers = line.split("\t" ) # type: ignore[arg-type] + headers = line.split("\t") # type: ignore[arg-type] if headers[3] == "Mb": return headers[4:] return headers[3:] diff --git a/tests/unit/db/test_traits.py b/tests/unit/db/test_traits.py index ee98893..baa2af3 100644 --- a/tests/unit/db/test_traits.py +++ b/tests/unit/db/test_traits.py @@ -166,6 +166,7 @@ class TestTraitsDBFunctions(TestCase): the right calls. 
""" + # pylint: disable=C0103 db_mock = mock.MagicMock() STRAIN_ID_SQL: str = "UPDATE Strain SET Name = %s WHERE Id = %s" -- cgit v1.2.3 From 95c5c0e73bffbf0287a17309e703063ee54d25ba Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Thu, 23 Sep 2021 03:45:19 +0300 Subject: Refactor: Move common sample data to separate file Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Move common sample test data into a separate file where it can be imported from, to prevent pylint error R0801 which proved tricky to silence in any other way. --- tests/unit/computations/test_qtlreaper.py | 68 ++++-------------- tests/unit/db/test_traits.py | 15 ++-- tests/unit/sample_test_data.py | 111 ++++++++++++++++++++++++++++++ tests/unit/test_heatmaps.py | 96 +------------------------- 4 files changed, 134 insertions(+), 156 deletions(-) create mode 100644 tests/unit/sample_test_data.py diff --git a/tests/unit/computations/test_qtlreaper.py b/tests/unit/computations/test_qtlreaper.py index d420470..742d106 100644 --- a/tests/unit/computations/test_qtlreaper.py +++ b/tests/unit/computations/test_qtlreaper.py @@ -4,6 +4,7 @@ from gn3.computations.qtlreaper import ( parse_reaper_main_results, organise_reaper_main_results, parse_reaper_permutation_results) +from tests.unit.sample_test_data import organised_trait_1 class TestQTLReaper(TestCase): """Class for testing qtlreaper interface functions.""" @@ -81,99 +82,54 @@ class TestQTLReaper(TestCase): self.assertEqual( organise_reaper_main_results([ { - "ID": "T1", "Locus": "rs31443144", "Chr": 1, "cM": 1.500, + "ID": "1", "Locus": "rs31443144", "Chr": 1, "cM": 1.500, "Mb": 3.010, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs6269442", "Chr": 1, "cM": 1.500, + "ID": "1", "Locus": "rs6269442", "Chr": 1, "cM": 1.500, "Mb": 3.492, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs32285189", "Chr": 1, "cM": 1.630, + "ID": "1", "Locus": "rs32285189", "Chr": 1, "cM": 1.630, "Mb": 3.511, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs258367496", "Chr": 1, "cM": 1.630, + "ID": "1", "Locus": "rs258367496", "Chr": 1, "cM": 1.630, "Mb": 3.660, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs32430919", "Chr": 1, "cM": 1.750, + "ID": "1", "Locus": "rs32430919", "Chr": 1, "cM": 1.750, "Mb": 3.777, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs36251697", "Chr": 1, "cM": 1.880, + "ID": "1", "Locus": "rs36251697", "Chr": 1, "cM": 1.880, "Mb": 3.812, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs30658298", "Chr": 1, "cM": 2.010, + "ID": "1", "Locus": "rs30658298", "Chr": 1, "cM": 2.010, "Mb": 4.431, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs51852623", "Chr": 2, "cM": 2.010, + "ID": "1", "Locus": "rs51852623", "Chr": 2, "cM": 2.010, "Mb": 4.447, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs31879829", "Chr": 2, "cM": 2.140, + "ID": "1", "Locus": "rs31879829", "Chr": 2, "cM": 2.140, "Mb": 4.519, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs36742481", "Chr": 2, "cM": 2.140, + "ID": "1", "Locus": "rs36742481", "Chr": 2, "cM": 2.140, "Mb": 4.776, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 } ]), - {"T1": {"ID": "T1", - "chromosomes": { - 1: {"Chr": 1, - "loci": [ - { - "Locus": "rs31443144", "cM": 1.500, 
"Mb": 3.010, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs6269442", "cM": 1.500, "Mb": 3.492, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs32285189", "cM": 1.630, "Mb": 3.511, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs258367496", "cM": 1.630, "Mb": 3.660, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs32430919", "cM": 1.750, "Mb": 3.777, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs36251697", "cM": 1.880, "Mb": 3.812, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs30658298", "cM": 2.010, "Mb": 4.431, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }]}, - 2: {"Chr": 2, - "loci": [ - { - "Locus": "rs51852623", "cM": 2.010, "Mb": 4.447, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs31879829", "cM": 2.140, "Mb": 4.519, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs36742481", "cM": 2.140, "Mb": 4.776, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }]}}}}) + organised_trait_1) diff --git a/tests/unit/db/test_traits.py b/tests/unit/db/test_traits.py index baa2af3..8af8e82 100644 --- a/tests/unit/db/test_traits.py +++ b/tests/unit/db/test_traits.py @@ -170,12 +170,15 @@ class TestTraitsDBFunctions(TestCase): db_mock = mock.MagicMock() STRAIN_ID_SQL: str = "UPDATE Strain SET Name = %s WHERE Id = %s" - PUBLISH_DATA_SQL: str = ("UPDATE PublishData SET value = %s " - "WHERE StrainId = %s AND Id = %s") - PUBLISH_SE_SQL: str = ("UPDATE PublishSE SET error = %s " - "WHERE StrainId = %s AND DataId = %s") - N_STRAIN_SQL: str = ("UPDATE NStrain SET count = %s " - "WHERE StrainId = %s AND DataId = %s") + PUBLISH_DATA_SQL: str = ( + "UPDATE PublishData SET value = %s " + "WHERE StrainId = %s AND Id = %s") + PUBLISH_SE_SQL: str = ( + "UPDATE PublishSE SET error = %s " + "WHERE StrainId = %s AND DataId = %s") + N_STRAIN_SQL: str = ( + "UPDATE NStrain SET count = %s " + "WHERE StrainId = %s AND DataId = %s") with db_mock.cursor() as cursor: type(cursor).rowcount = 1 diff --git a/tests/unit/sample_test_data.py b/tests/unit/sample_test_data.py new file mode 100644 index 0000000..407d074 --- /dev/null +++ b/tests/unit/sample_test_data.py @@ -0,0 +1,111 @@ +""" +This module holds a collection of sample data variables, used in more than one + test. + +This is mostly to avoid the `duplicate-code` pylint error that gets raised if +the same data is defined in more than one file. It has been found that adding +the `# pylint: disable=R0801` or `# pylint: disable=duplicate-code` to the top +of the file seems to not work as expected. + +Adding these same declarations to .pylintrc is not an option, since that, +seemingly, would deactivate the warnings for all code in the project: We do not +want that. 
+""" + +organised_trait_1 = { + "1": { + "ID": "1", + "chromosomes": { + 1: {"Chr": 1, + "loci": [ + { + "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs6269442", "cM": 1.500, "Mb": 3.492, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs32285189", "cM": 1.630, "Mb": 3.511, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs258367496", "cM": 1.630, "Mb": 3.660, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs32430919", "cM": 1.750, "Mb": 3.777, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs36251697", "cM": 1.880, "Mb": 3.812, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs30658298", "cM": 2.010, "Mb": 4.431, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }]}, + 2: {"Chr": 2, + "loci": [ + { + "Locus": "rs51852623", "cM": 2.010, "Mb": 4.447, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs31879829", "cM": 2.140, "Mb": 4.519, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs36742481", "cM": 2.140, "Mb": 4.776, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }]}}}} + +organised_trait_2 = { + "2": { + "ID": "2", + "chromosomes": { + 1: {"Chr": 1, + "loci": [ + { + "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs6269442", "cM": 1.500, "Mb": 3.492, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs32285189", "cM": 1.630, "Mb": 3.511, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs258367496", "cM": 1.630, "Mb": 3.660, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs32430919", "cM": 1.750, "Mb": 3.777, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs36251697", "cM": 1.880, "Mb": 3.812, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs30658298", "cM": 2.010, "Mb": 4.431, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }]}, + 2: {"Chr": 2, + "loci": [ + { + "Locus": "rs51852623", "cM": 2.010, "Mb": 4.447, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs31879829", "cM": 2.140, "Mb": 4.519, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs36742481", "cM": 2.140, "Mb": 4.776, + "LRS": 0.579, "Additive": -0.074, "pValue": 1.000 + }]}}}} diff --git a/tests/unit/test_heatmaps.py b/tests/unit/test_heatmaps.py index c0a496b..fd91cf9 100644 --- a/tests/unit/test_heatmaps.py +++ b/tests/unit/test_heatmaps.py @@ -7,6 +7,7 @@ from gn3.heatmaps import ( compute_traits_order, retrieve_strains_and_values, process_traits_data_for_heatmap) +from tests.unit.sample_test_data import organised_trait_1, organised_trait_2 strainlist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] trait_data = { @@ -206,100 +207,7 @@ class TestHeatmap(TestCase): """Check for correct processing of data for heatmap generation.""" self.assertEqual( process_traits_data_for_heatmap( - {"1": { - "ID": "T1", - "chromosomes": { - 1: {"Chr": 1, - "loci": [ - { - "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs6269442", "cM": 1.500, "Mb": 3.492, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs32285189", "cM": 1.630, "Mb": 3.511, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": 
"rs258367496", "cM": 1.630, "Mb": 3.660, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs32430919", "cM": 1.750, "Mb": 3.777, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs36251697", "cM": 1.880, "Mb": 3.812, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs30658298", "cM": 2.010, "Mb": 4.431, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }]}, - 2: {"Chr": 2, - "loci": [ - { - "Locus": "rs51852623", "cM": 2.010, "Mb": 4.447, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs31879829", "cM": 2.140, "Mb": 4.519, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs36742481", "cM": 2.140, "Mb": 4.776, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }]}}}, - "2": { - "ID": "T1", - "chromosomes": { - 1: {"Chr": 1, - "loci": [ - { - "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs6269442", "cM": 1.500, "Mb": 3.492, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs32285189", "cM": 1.630, "Mb": 3.511, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs258367496", "cM": 1.630, "Mb": 3.660, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs32430919", "cM": 1.750, "Mb": 3.777, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs36251697", "cM": 1.880, "Mb": 3.812, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs30658298", "cM": 2.010, "Mb": 4.431, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }]}, - 2: {"Chr": 2, - "loci": [ - { - "Locus": "rs51852623", "cM": 2.010, "Mb": 4.447, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs31879829", "cM": 2.140, "Mb": 4.519, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs36742481", "cM": 2.140, "Mb": 4.776, - "LRS": 0.579, "Additive": -0.074, "pValue": 1.000 - }]}}}}, + {**organised_trait_1, **organised_trait_2}, ["2", "1"], [1, 2]), [[[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5], -- cgit v1.2.3 From 8d9bc0f29ce9208306915b079818e6f0c31785e2 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Thu, 23 Sep 2021 03:55:44 +0300 Subject: Add missing dependencies causing pylint to fail * Add some dependencies used by the system that were missing in the test environment, leading to the pylint step failing. --- requirements.txt | 2 ++ setup.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/requirements.txt b/requirements.txt index f94c86f..d332a96 100644 --- a/requirements.txt +++ b/requirements.txt @@ -33,3 +33,5 @@ urllib3==1.26.4 varint==1.0.2 Werkzeug==1.0.1 wrapt==1.12.1 +plotly==4.14.3 +flask-cors==3.0.9 diff --git a/setup.py b/setup.py index 3f0922b..98a076f 100644 --- a/setup.py +++ b/setup.py @@ -20,6 +20,8 @@ setup(author='Bonface M. 
K.', "requests==2.25.1" "scipy==1.6.0" "sqlalchemy-stubs==0.4" + "plotly==4.14.3" + "flask-cors==3.0.9" ], license='GPLV3', long_description=open('README.md').read(), -- cgit v1.2.3 From 3d36fe96c94cebb6e7ea93b735148b25c4b95f6d Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 23 Sep 2021 12:22:10 +0300 Subject: load data from json file and and convert to dt --- scripts/wgcna_analysis.R | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/scripts/wgcna_analysis.R b/scripts/wgcna_analysis.R index 267cd86..d0ba91a 100644 --- a/scripts/wgcna_analysis.R +++ b/scripts/wgcna_analysis.R @@ -2,26 +2,29 @@ library(WGCNA); library(stringi); +library(rjson) options(stringsAsFactors = FALSE); imgDir = Sys.getenv("GENERATED_IMAGE_DIR") -# load expression data **assumes csv format row(traits)(columns info+samples) +# load expression data **assumes from json files row(traits)(columns info+samples) # pass the file_path as arg -inputData <- read.csv(file = "wgcna_data.csv") +results <- fromJSON(file = "file_path.json") -# transform expressionData +# trait_sample_data <- results$trait_sample_data +trait_sample_data <- do.call(rbind, results$trait_sample_data) -dataExpr <- as.data.frame(t(inputData)); -## data cleaning +dataExpr <- data.frame(apply(trait_sample_data, 2, function(x) as.numeric(as.character(x)))) +# trait_sample_data <- as.data.frame(t(results$trait_sample_data)) +# transform expressionData +dataExpr <- data.frame(t(dataExpr)) gsg = goodSamplesGenes(dataExpr, verbose = 3); # https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/Rpackages/ - if (!gsg$allOK) { if (sum(!gsg$goodGenes)>0) @@ -49,7 +52,7 @@ network <- blockwiseModules(dataExpr, corType = "pearson", #adjacency matrix options - power = sft$powerEstimate, + power = 5, networkType = "unsigned", #TOM options TOMtype = "unsigned", @@ -70,14 +73,13 @@ genImageRandStr <- function(prefix){ return(paste(randStr,".png",sep="")) } -mergedColors <- labels2colors(net$colors) +mergedColors <- labels2colors(network$colors) imageLoc <- file.path(imgDir,genImageRandStr("WGCNAoutput")) - png(imageLoc,width=1000,height=600,type='cairo-png') -plotDendroAndColors(network$dendrograms[[1]],mergedColors[net$blockGenes[[1]]], +plotDendroAndColors(network$dendrograms[[1]],mergedColors[network$blockGenes[[1]]], "Module colors", dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05) -- cgit v1.2.3 From e5a50e6becabb9ebe4884714f7a182fad4490490 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 23 Sep 2021 12:22:51 +0300 Subject: add rjson dependency --- guix.scm | 1 + 1 file changed, 1 insertion(+) diff --git a/guix.scm b/guix.scm index 509c9ff..3e7dcea 100644 --- a/guix.scm +++ b/guix.scm @@ -102,6 +102,7 @@ ("r-qtl" ,r-qtl) ("r-stringi" ,r-stringi) ("r-wgcna" ,r-wgcna) + ("r-rjson" ,r-rjson) ("python-plotly" ,python-plotly) ("python-pandas" ,python-pandas))) (build-system python-build-system) -- cgit v1.2.3 From 95620e1aef5a9c56875845769d58d2aab20dacca Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 23 Sep 2021 12:37:51 +0300 Subject: pass other variables from user input for network constr --- scripts/wgcna_analysis.R | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/scripts/wgcna_analysis.R b/scripts/wgcna_analysis.R index d0ba91a..73d0e3f 100644 --- a/scripts/wgcna_analysis.R +++ b/scripts/wgcna_analysis.R @@ -13,12 +13,19 @@ imgDir = Sys.getenv("GENERATED_IMAGE_DIR") results <- fromJSON(file = "file_path.json") -# trait_sample_data <- 
results$trait_sample_data -trait_sample_data <- do.call(rbind, results$trait_sample_data) +# parse the json data input + +minModuleSize <-results$minModuleSize + +TOMtype <-results$TOMtype + +corType <-results$corType +# + +trait_sample_data <- do.call(rbind, results$trait_sample_data) dataExpr <- data.frame(apply(trait_sample_data, 2, function(x) as.numeric(as.character(x)))) -# trait_sample_data <- as.data.frame(t(results$trait_sample_data)) # transform expressionData dataExpr <- data.frame(t(dataExpr)) @@ -49,18 +56,18 @@ sft <- pickSoftThreshold(dataExpr, powerVector = powers, verbose = 5) # pass user options network <- blockwiseModules(dataExpr, #similarity matrix options - corType = "pearson", + corType = corType, #adjacency matrix options power = 5, networkType = "unsigned", #TOM options - TOMtype = "unsigned", + TOMtype = TOMtype, #module indentification - minmodulesSize = 30, - deepSplit = 5, + minmodulesSize = minModuleSize, + deepSplit = 3, PamRespectsDendro = FALSE ) @@ -76,7 +83,7 @@ genImageRandStr <- function(prefix){ mergedColors <- labels2colors(network$colors) imageLoc <- file.path(imgDir,genImageRandStr("WGCNAoutput")) - +imageLoc png(imageLoc,width=1000,height=600,type='cairo-png') plotDendroAndColors(network$dendrograms[[1]],mergedColors[network$blockGenes[[1]]], -- cgit v1.2.3 From 4a8e40f7331f61ba8b7038e8ad48f86018ac7dc6 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 23 Sep 2021 12:47:27 +0300 Subject: pass json file path as an arg --- scripts/wgcna_analysis.R | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/scripts/wgcna_analysis.R b/scripts/wgcna_analysis.R index 73d0e3f..53b59d5 100644 --- a/scripts/wgcna_analysis.R +++ b/scripts/wgcna_analysis.R @@ -7,11 +7,20 @@ library(rjson) options(stringsAsFactors = FALSE); imgDir = Sys.getenv("GENERATED_IMAGE_DIR") - # load expression data **assumes from json files row(traits)(columns info+samples) # pass the file_path as arg +# pass the file path to read json data + +args = commandArgs(trailingOnly=TRUE) + +if (length(args)==0) { + stop("Argument for the file location is requires", call.=FALSE) +} else { + # default output file + json_file_path = args[1] +} -results <- fromJSON(file = "file_path.json") +results <- fromJSON(file = json_file_path) # parse the json data input -- cgit v1.2.3 From 655343aded3ece533f267bd9fd16aadf0cefff02 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 23 Sep 2021 12:49:40 +0300 Subject: add mock test data for script --- scripts/wgcna_test_data.csv | 9 +++++++ scripts/wgcna_test_data.json | 64 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 scripts/wgcna_test_data.csv create mode 100644 scripts/wgcna_test_data.json diff --git a/scripts/wgcna_test_data.csv b/scripts/wgcna_test_data.csv new file mode 100644 index 0000000..8db9598 --- /dev/null +++ b/scripts/wgcna_test_data.csv @@ -0,0 +1,9 @@ +129S1/SvImJ,A/J,AKR/J,B6D2F1,BALB/cByJ,BALB/cJ +7.142,7.31,7.49,6.899,7.172,7.396 +7.071,7.05,7.313,6.999,7.293,7.117 +7.221,7.246,7.754,6.866,6.752,7.269 +9.221,9.246,9.954,6.866,6.952,9.269 +7.221,7.246,7.754,6.866,6.752,7.269 +7.221,7.246,7.754,6.866,6.752,7.269 +11.221,11.246,11.1154,6.866,6.1152,11.269 + diff --git a/scripts/wgcna_test_data.json b/scripts/wgcna_test_data.json new file mode 100644 index 0000000..7348b4e --- /dev/null +++ b/scripts/wgcna_test_data.json @@ -0,0 +1,64 @@ +{ + "trait_sample_data":[ + { + "129S1/SvImJ": 7.142, + "A/J": 7.31, + "AKR/J": 7.49, + "B6D2F1": 6.899, + "BALB/cByJ": 
7.172, + "BALB/cJ": 7.396 + }, + { + "129S1/SvImJ": 7.071, + "A/J": 7.05, + "AKR/J": 7.313, + "B6D2F1": 6.999, + "BALB/cByJ": 7.293, + "BALB/cJ": 7.117 + }, + { + "129S1/SvImJ": 7.221, + "A/J": 7.246, + "AKR/J": 7.754, + "B6D2F1": 6.866, + "BALB/cByJ": 6.752, + "BALB/cJ": 7.269 + }, + { + "129S1/SvImJ": 9.221, + "A/J": 9.246, + "AKR/J": 9.954, + "B6D2F1": 6.866, + "BALB/cByJ": 6.952, + "BALB/cJ": 9.269 + }, + { + "129S1/SvImJ": 7.221, + "A/J": 7.246, + "AKR/J": 7.754, + "B6D2F1": 6.866, + "BALB/cByJ": 6.752, + "BALB/cJ": 7.269 + }, + { + "129S1/SvImJ": 7.221, + "A/J": 7.246, + "AKR/J": 7.754, + "B6D2F1": 6.866, + "BALB/cByJ": 6.752, + "BALB/cJ": 7.269 + }, + { + "129S1/SvImJ": 11.221, + "A/J": 11.246, + "AKR/J": 11.1154, + "B6D2F1": 6.866, + "BALB/cByJ": 6.1152, + "BALB/cJ": 11.269 + } + ], + "TOMtype": "unsigned", + "minModuleSize": 30, + "corType": "pearson" + +} \ No newline at end of file -- cgit v1.2.3 From 9216b6ae956b5c78a9b6d21dbd40b6df1e111264 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 23 Sep 2021 14:22:14 +0300 Subject: validate required output --- scripts/wgcna_analysis.R | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/scripts/wgcna_analysis.R b/scripts/wgcna_analysis.R index 53b59d5..86ddffb 100644 --- a/scripts/wgcna_analysis.R +++ b/scripts/wgcna_analysis.R @@ -14,7 +14,7 @@ imgDir = Sys.getenv("GENERATED_IMAGE_DIR") args = commandArgs(trailingOnly=TRUE) if (length(args)==0) { - stop("Argument for the file location is requires", call.=FALSE) + stop("Argument for the file location is required", call.=FALSE) } else { # default output file json_file_path = args[1] @@ -92,7 +92,6 @@ genImageRandStr <- function(prefix){ mergedColors <- labels2colors(network$colors) imageLoc <- file.path(imgDir,genImageRandStr("WGCNAoutput")) -imageLoc png(imageLoc,width=1000,height=600,type='cairo-png') plotDendroAndColors(network$dendrograms[[1]],mergedColors[network$blockGenes[[1]]], @@ -102,3 +101,13 @@ addGuide = TRUE, guideHang = 0.05) + +json_data <- list(ModEigens=network$MEs,soft_threshold=sft$fitIndices, + blockGenes =network$blockGenes[[1]], + net_colors =network$colors, + net_unmerged=network$unmergedColors, + imageLoc=imageLoc) + +json_data <- toJSON(json_data) + +write(json_data,"./output.json") \ No newline at end of file -- cgit v1.2.3 From eb2277a5fc96795bf6432e392e62601a3cf94058 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 23 Sep 2021 14:22:31 +0300 Subject: sample output data --- scripts/output.json | 161 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 161 insertions(+) create mode 100644 scripts/output.json diff --git a/scripts/output.json b/scripts/output.json new file mode 100644 index 0000000..9caf837 --- /dev/null +++ b/scripts/output.json @@ -0,0 +1,161 @@ +{ + "ModEigens":{ + "MEturquoise":[ + 0.0646677768085351, + 0.137200224277058, + 0.63451113720732, + -0.544002665501479, + -0.489487590361863, + 0.197111117570427 + ] + }, + "soft_threshold":{ + "Power":[ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 12, + 14, + 16, + 18, + 20 + ], + "SFT.R.sq":[ + 0.0181093215847335, + 0.00011984142485113, + 0.000171967046945159, + 0.0105462010616537, + 0.0341444584348834, + 0.0687163726151286, + 0.0306423506274298, + 0.0492567394226327, + 0.0789539269997996, + 0.0944880158122527, + 0.122195040270446, + 0.0340768567186139, + 0.0625860126119281, + 0.100082257137014, + 0.128277841930818 + ], + "slope":[ + 3.81011386139005, + -0.170826531149538, + 0.159161787390082, + -1.01047981107833, + 
-1.55943531024794, + -1.93420125756514, + -1.16799247295814, + -1.33414063070555, + -1.54984944650438, + -1.54715757057087, + -1.49931213589121, + -1.80460140377151, + -2.19055343583319, + -2.52135805259606, + -2.58599487577447 + ], + "truncated.R.sq":[ + 0.768989700952805, + 0.522025793450566, + 0.329341226670409, + 0.110265719555879, + 0.0195649645183151, + -0.0503253079741786, + 0.0507498358330625, + -0.0129255450167765, + -0.035717519210676, + -0.0807094793662766, + -0.0967803564932559, + 0.00172686282662393, + -0.0340811003657508, + -0.0390284600100592, + -0.0489269837827069 + ], + "mean.k.":[ + 4.20178789454309, + 3.44556816856968, + 2.98532344074325, + 2.65297323828966, + 2.39517682414009, + 2.18846935370095, + 2.01963258852289, + 1.87999059876872, + 1.76335304166057, + 1.66510080962817, + 1.51038968360321, + 1.39583176924843, + 1.30882729664563, + 1.24120316437299, + 1.18753154238216 + ], + "median.k.":[ + 4.65733789933094, + 4.13585224131512, + 3.75980713552836, + 3.43775457512904, + 3.15305040369031, + 2.89933881967523, + 2.67225531456956, + 2.46825611568646, + 2.28437024155601, + 2.118086531192, + 1.83011067501282, + 1.59073325345641, + 1.38991168639473, + 1.2201000051609, + 1.07553194658444 + ], + "max.k.":[ + 4.81522245318686, + 4.21987143583501, + 3.83876962723542, + 3.55526976885104, + 3.32904938849614, + 3.14312441404036, + 2.98828051379132, + 2.85837136671219, + 2.74879840851137, + 2.65594228759455, + 2.50929962297015, + 2.40113981571731, + 2.31993775805391, + 2.25792900175825, + 2.2098218130451 + ] + }, + "blockGenes":[ + 1, + 2, + 3, + 4, + 5, + 6, + 7 + ], + "net_colors":{ + "X1":"turquoise", + "X2":"turquoise", + "X3":"turquoise", + "X4":"turquoise", + "X5":"turquoise", + "X6":"turquoise", + "X7":"turquoise" + }, + "net_unmerged":{ + "X1":"turquoise", + "X2":"turquoise", + "X3":"turquoise", + "X4":"turquoise", + "X5":"turquoise", + "X6":"turquoise", + "X7":"turquoise" + }, + "imageLoc":"/WGCNAoutput_1uujpTIpC.png" +} \ No newline at end of file -- cgit v1.2.3 From fc335fa1394a39d1f9f397cb69c755040ecfc5e1 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 23 Sep 2021 14:37:48 +0300 Subject: append input to output --- scripts/wgcna_analysis.R | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/wgcna_analysis.R b/scripts/wgcna_analysis.R index 86ddffb..ee749e9 100644 --- a/scripts/wgcna_analysis.R +++ b/scripts/wgcna_analysis.R @@ -20,19 +20,19 @@ if (length(args)==0) { json_file_path = args[1] } -results <- fromJSON(file = json_file_path) +inputData <- fromJSON(file = json_file_path) # parse the json data input -minModuleSize <-results$minModuleSize +minModuleSize <-inputData$minModuleSize -TOMtype <-results$TOMtype +TOMtype <-inputData$TOMtype -corType <-results$corType +corType <-inputData$corType # -trait_sample_data <- do.call(rbind, results$trait_sample_data) +trait_sample_data <- do.call(rbind, inputData$trait_sample_data) dataExpr <- data.frame(apply(trait_sample_data, 2, function(x) as.numeric(as.character(x)))) # transform expressionData @@ -102,12 +102,12 @@ addGuide = TRUE, guideHang = 0.05) -json_data <- list(ModEigens=network$MEs,soft_threshold=sft$fitIndices, +json_data <- list(input = inputData,output = list(ModEigens=network$MEs,soft_threshold=sft$fitIndices, blockGenes =network$blockGenes[[1]], net_colors =network$colors, net_unmerged=network$unmergedColors, - imageLoc=imageLoc) + imageLoc=imageLoc)) json_data <- toJSON(json_data) -write(json_data,"./output.json") \ No newline at end of file 
+write(json_data,file= json_file_path) \ No newline at end of file -- cgit v1.2.3 From 9c221e0d89603acd5412be95650a469824e2ab99 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 23 Sep 2021 15:10:16 +0300 Subject: check for na powerEst and use a default value --- scripts/wgcna_analysis.R | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/scripts/wgcna_analysis.R b/scripts/wgcna_analysis.R index ee749e9..c3b1ac8 100644 --- a/scripts/wgcna_analysis.R +++ b/scripts/wgcna_analysis.R @@ -62,13 +62,21 @@ enableWGCNAThreads() powers <- c(c(1:10), seq(from = 12, to=20, by=2)) sft <- pickSoftThreshold(dataExpr, powerVector = powers, verbose = 5) +# check the power estimate + +if (is.na(sft$powerEstimate)){ + powerEst = 1 +}else{ + powerEst = sft$powerEstimate +} + # pass user options network <- blockwiseModules(dataExpr, #similarity matrix options corType = corType, #adjacency matrix options - power = 5, + power = powerEst, networkType = "unsigned", #TOM options TOMtype = TOMtype, -- cgit v1.2.3 From 1c392b4b2786d196af6b882e80270b8cb839f554 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 23 Sep 2021 15:38:50 +0300 Subject: read generated files from r and return output --- gn3/computations/wgcna.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/gn3/computations/wgcna.py b/gn3/computations/wgcna.py index f0f0fa2..689bc2d 100644 --- a/gn3/computations/wgcna.py +++ b/gn3/computations/wgcna.py @@ -22,7 +22,7 @@ def dump_wgcna_data(request_data: dict): def compose_wgcna_cmd(rscript_path: str, temp_file_path: str): """function to componse wgcna cmd""" - cmd = f"Rscript {rscript_path} {temp_file_path}" + cmd = f"Rscript ./scripts/{rscript_path} {temp_file_path}" return cmd @@ -32,6 +32,9 @@ def call_wgcna_script(rscript_path: str, request_data: dict): cmd = compose_wgcna_cmd(rscript_path, generated_file) try: - return run_cmd(cmd) + + run_cmd(cmd) + with open(generated_file, "r") as outputfile: + return results except Exception as error: raise error -- cgit v1.2.3 From 0ca5f01422883d97e42dd37e59ffacdcf9a65af9 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 23 Sep 2021 15:58:25 +0300 Subject: Return r error if returncode!=0 --- gn3/computations/wgcna.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/gn3/computations/wgcna.py b/gn3/computations/wgcna.py index 689bc2d..e9b76e8 100644 --- a/gn3/computations/wgcna.py +++ b/gn3/computations/wgcna.py @@ -22,6 +22,7 @@ def dump_wgcna_data(request_data: dict): def compose_wgcna_cmd(rscript_path: str, temp_file_path: str): """function to componse wgcna cmd""" + # (todo):issue relative paths to abs paths cmd = f"Rscript ./scripts/{rscript_path} {temp_file_path}" return cmd @@ -33,8 +34,16 @@ def call_wgcna_script(rscript_path: str, request_data: dict): try: - run_cmd(cmd) + run_cmd_results = run_cmd(cmd) + with open(generated_file, "r") as outputfile: - return results + + if run_cmd_results["code"] != 0: + return run_cmd_results + return { + "data": json.load(outputfile), + **run_cmd_results + } + # return json.load(outputfile) except Exception as error: raise error -- cgit v1.2.3 From 62a5047be6cee5f692d44f97410cab11a01e3396 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 23 Sep 2021 16:05:17 +0300 Subject: minor fixes for endpoint --- gn3/api/wgcna.py | 3 +++ gn3/computations/wgcna.py | 1 - 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/gn3/api/wgcna.py b/gn3/api/wgcna.py index 89784c4..fa044cf 100644 --- a/gn3/api/wgcna.py +++ 
b/gn3/api/wgcna.py @@ -19,4 +19,7 @@ def run_wgcna(): results = call_wgcna_script(wgcna_script, wgcna_data) + if results.get("data") is None: + return jsonify(results), 401 + return jsonify(results), 200 diff --git a/gn3/computations/wgcna.py b/gn3/computations/wgcna.py index e9b76e8..436a888 100644 --- a/gn3/computations/wgcna.py +++ b/gn3/computations/wgcna.py @@ -44,6 +44,5 @@ def call_wgcna_script(rscript_path: str, request_data: dict): "data": json.load(outputfile), **run_cmd_results } - # return json.load(outputfile) except Exception as error: raise error -- cgit v1.2.3 From 3062aee581560ae1928d8e6077366fc072646677 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 23 Sep 2021 21:27:42 +0300 Subject: add traits as columns names and pass as json input --- scripts/wgcna_analysis.R | 18 +++++++++++------- scripts/wgcna_test_data.json | 3 ++- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/scripts/wgcna_analysis.R b/scripts/wgcna_analysis.R index c3b1ac8..e641652 100644 --- a/scripts/wgcna_analysis.R +++ b/scripts/wgcna_analysis.R @@ -53,6 +53,8 @@ dataExpr <- dataExpr[gsg$goodSamples, gsg$goodGenes] ## network constructions and modules +names(dataExpr) = inputData$trait_names + # Allow multi-threading within WGCNA enableWGCNAThreads() @@ -82,6 +84,7 @@ network <- blockwiseModules(dataExpr, TOMtype = TOMtype, #module indentification + verbose = 3, minmodulesSize = minModuleSize, deepSplit = 3, @@ -108,13 +111,14 @@ dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05) - - -json_data <- list(input = inputData,output = list(ModEigens=network$MEs,soft_threshold=sft$fitIndices, - blockGenes =network$blockGenes[[1]], - net_colors =network$colors, - net_unmerged=network$unmergedColors, - imageLoc=imageLoc)) +json_data <- list(input = inputData, + output = list(ModEigens=network$MEs, + soft_threshold=sft$fitIndices, + blockGenes =network$blockGenes[[1]], + net_colors =network$colors, + power_used_for_analy=powerEst, + net_unmerged=network$unmergedColors, + imageLoc=imageLoc)) json_data <- toJSON(json_data) diff --git a/scripts/wgcna_test_data.json b/scripts/wgcna_test_data.json index 7348b4e..1d469f4 100644 --- a/scripts/wgcna_test_data.json +++ b/scripts/wgcna_test_data.json @@ -1,4 +1,5 @@ { + "trait_names":["1455537_at","1425637_at","1449593_at","1421945_a_at","1450423_s_at","1423841_at","1451144_at"], "trait_sample_data":[ { "129S1/SvImJ": 7.142, @@ -61,4 +62,4 @@ "minModuleSize": 30, "corType": "pearson" -} \ No newline at end of file +} -- cgit v1.2.3 From 7dbb0e6d27cdb0923e94685cf44d244dd8a2e105 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Sat, 25 Sep 2021 18:33:12 +0300 Subject: minor fixes for unittests --- tests/integration/test_wgcna.py | 2 +- tests/unit/computations/test_wgcna.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_wgcna.py b/tests/integration/test_wgcna.py index 65763c1..39dabb2 100644 --- a/tests/integration/test_wgcna.py +++ b/tests/integration/test_wgcna.py @@ -33,5 +33,5 @@ class WgcnaIntegrationTest(TestCase): response = self.app.post("/api/wgcna/run_wgcna", json=request_data, follow_redirects=True) - self.assertEqual(response.status_code, 200) + self.assertEqual(response.status_code, 401) self.assertEqual(response.get_json(), wgcna_api_data) diff --git a/tests/unit/computations/test_wgcna.py b/tests/unit/computations/test_wgcna.py index fd90732..64f6c14 100644 --- a/tests/unit/computations/test_wgcna.py +++ b/tests/unit/computations/test_wgcna.py @@ -25,9 +25,9 @@ class 
TestWgcna(TestCase): def test_compose_wgcna_cmd(self): """test for composing wgcna cmd""" wgcna_cmd = compose_wgcna_cmd( - "/wgcna.r", "/tmp/wgcna.json") + "wgcna.r", "/tmp/wgcna.json") self.assertEqual( - wgcna_cmd, "Rscript /wgcna.r /tmp/wgcna.json") + wgcna_cmd, "Rscript ./scripts/wgcna.r /tmp/wgcna.json") @skip("to update tests") def test_create_json_file(self): -- cgit v1.2.3 From 19783a18c2bc7941fc5980e593f19fb1d18c3623 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 27 Sep 2021 04:48:53 +0300 Subject: Update terminology: `strain` to `sample` Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Update the terminology used: use `sample` in place of `strain` according to Zachary's direction at https://github.com/genenetwork/genenetwork3/pull/37#issuecomment-926043306 --- gn3/computations/parsers.py | 10 ++--- gn3/computations/qtlreaper.py | 8 ++-- gn3/db/genotypes.py | 8 ++-- gn3/db/traits.py | 44 ++++++++++----------- gn3/heatmaps.py | 62 ++++++++++++++--------------- tests/unit/computations/test_parsers.py | 4 +- tests/unit/test_heatmaps.py | 70 ++++++++++++++++----------------- 7 files changed, 103 insertions(+), 103 deletions(-) diff --git a/gn3/computations/parsers.py b/gn3/computations/parsers.py index 94387ff..1af35d6 100644 --- a/gn3/computations/parsers.py +++ b/gn3/computations/parsers.py @@ -14,7 +14,7 @@ def parse_genofile(file_path: str) -> Tuple[List[str], 'h': 0, 'u': None, } - genotypes, strains = [], [] + genotypes, samples = [], [] with open(file_path, "r") as _genofile: for line in _genofile: line = line.strip() @@ -22,8 +22,8 @@ def parse_genofile(file_path: str) -> Tuple[List[str], continue cells = line.split() if line.startswith("Chr"): - strains = cells[4:] - strains = [strain.lower() for strain in strains] + samples = cells[4:] + samples = [sample.lower() for sample in samples] continue values = [__map.get(value.lower(), None) for value in cells[4:]] genotype = { @@ -32,7 +32,7 @@ def parse_genofile(file_path: str) -> Tuple[List[str], "cm": cells[2], "mb": cells[3], "values": values, - "dicvalues": dict(zip(strains, values)), + "dicvalues": dict(zip(samples, values)), } genotypes.append(genotype) - return strains, genotypes + return samples, genotypes diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index 8b2893e..166d2dd 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -9,17 +9,17 @@ from typing import Union from gn3.random import random_string from gn3.settings import TMPDIR, REAPER_COMMAND -def generate_traits_file(strains, trait_values, traits_filename): +def generate_traits_file(samples, trait_values, traits_filename): """ Generate a traits file for use with `qtlreaper`. PARAMETERS: - strains: A list of strains to use as the headers for the various columns. - trait_values: A list of lists of values for each trait and strain. + samples: A list of samples to use as the headers for the various columns. + trait_values: A list of lists of values for each trait and sample. traits_filename: The tab-separated value to put the values in for computation of QTLs. 
""" - header = "Trait\t{}\n".format("\t".join(strains)) + header = "Trait\t{}\n".format("\t".join(samples)) data = ( [header] + ["{}\t{}\n".format(i+1, "\t".join([str(i) for i in t])) diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py index 9987320..8f18cac 100644 --- a/gn3/db/genotypes.py +++ b/gn3/db/genotypes.py @@ -14,16 +14,16 @@ def build_genotype_file( def load_genotype_samples(genotype_filename: str, file_type: str = "geno"): """ - Load sample of strains from genotype files. + Load sample of samples from genotype files. DESCRIPTION: - Traits can contain a varied number of strains, some of which do not exist in + Traits can contain a varied number of samples, some of which do not exist in certain genotypes. In order to compute QTLs, GEMMAs, etc, we need to ensure - to pick only those strains that exist in the genotype under consideration + to pick only those samples that exist in the genotype under consideration for the traits used in the computation. This function loads a list of samples from the genotype files for use in - filtering out unusable strains. + filtering out unusable samples. PARAMETERS: diff --git a/gn3/db/traits.py b/gn3/db/traits.py index 4fc47c3..c9d05d7 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -445,7 +445,7 @@ def retrieve_temp_trait_data(trait_info: dict, conn: Any): query, {"trait_name": trait_info["trait_name"]}) return [dict(zip( - ["strain_name", "value", "se_error", "nstrain", "id"], row)) + ["sample_name", "value", "se_error", "nstrain", "id"], row)) for row in cursor.fetchall()] return [] @@ -484,7 +484,7 @@ def retrieve_geno_trait_data(trait_info: Dict, conn: Any): "species_id": retrieve_species_id( trait_info["db"]["riset"], conn)}) return [dict(zip( - ["strain_name", "value", "se_error", "id"], row)) + ["sample_name", "value", "se_error", "id"], row)) for row in cursor.fetchall()] return [] @@ -515,7 +515,7 @@ def retrieve_publish_trait_data(trait_info: Dict, conn: Any): {"trait_name": trait_info["trait_name"], "dataset_id": trait_info["db"]["dataset_id"]}) return [dict(zip( - ["strain_name", "value", "se_error", "nstrain", "id"], row)) + ["sample_name", "value", "se_error", "nstrain", "id"], row)) for row in cursor.fetchall()] return [] @@ -548,7 +548,7 @@ def retrieve_cellid_trait_data(trait_info: Dict, conn: Any): "trait_name": trait_info["trait_name"], "dataset_id": trait_info["db"]["dataset_id"]}) return [dict(zip( - ["strain_name", "value", "se_error", "id"], row)) + ["sample_name", "value", "se_error", "id"], row)) for row in cursor.fetchall()] return [] @@ -577,29 +577,29 @@ def retrieve_probeset_trait_data(trait_info: Dict, conn: Any): {"trait_name": trait_info["trait_name"], "dataset_name": trait_info["db"]["dataset_name"]}) return [dict(zip( - ["strain_name", "value", "se_error", "id"], row)) + ["sample_name", "value", "se_error", "id"], row)) for row in cursor.fetchall()] return [] -def with_strainlist_data_setup(strainlist: Sequence[str]): +def with_samplelist_data_setup(samplelist: Sequence[str]): """ - Build function that computes the trait data from provided list of strains. + Build function that computes the trait data from provided list of samples. PARAMETERS - strainlist: (list) - A list of strain names + samplelist: (list) + A list of sample names RETURNS: Returns a function that given some data from the database, computes the - strain's value, variance and ndata values, only if the strain is present - in the provided `strainlist` variable. 
+ sample's value, variance and ndata values, only if the sample is present + in the provided `samplelist` variable. """ def setup_fn(tdata): - if tdata["strain_name"] in strainlist: + if tdata["sample_name"] in samplelist: val = tdata["value"] if val is not None: return { - "strain_name": tdata["strain_name"], + "sample_name": tdata["sample_name"], "value": val, "variance": tdata["se_error"], "ndata": tdata.get("nstrain", None) @@ -607,19 +607,19 @@ def with_strainlist_data_setup(strainlist: Sequence[str]): return None return setup_fn -def without_strainlist_data_setup(): +def without_samplelist_data_setup(): """ Build function that computes the trait data. RETURNS: Returns a function that given some data from the database, computes the - strain's value, variance and ndata values. + sample's value, variance and ndata values. """ def setup_fn(tdata): val = tdata["value"] if val is not None: return { - "strain_name": tdata["strain_name"], + "sample_name": tdata["sample_name"], "value": val, "variance": tdata["se_error"], "ndata": tdata.get("nstrain", None) @@ -627,7 +627,7 @@ def without_strainlist_data_setup(): return None return setup_fn -def retrieve_trait_data(trait: dict, conn: Any, strainlist: Sequence[str] = tuple()): +def retrieve_trait_data(trait: dict, conn: Any, samplelist: Sequence[str] = tuple()): """ Retrieve trait data @@ -650,23 +650,23 @@ def retrieve_trait_data(trait: dict, conn: Any, strainlist: Sequence[str] = tupl if results: # do something with mysqlid mysqlid = results[0]["id"] - if strainlist: + if samplelist: data = [ item for item in - map(with_strainlist_data_setup(strainlist), results) + map(with_samplelist_data_setup(samplelist), results) if item is not None] else: data = [ item for item in - map(without_strainlist_data_setup(), results) + map(without_samplelist_data_setup(), results) if item is not None] return { "mysqlid": mysqlid, "data": dict(map( lambda x: ( - x["strain_name"], - {k:v for k, v in x.items() if x != "strain_name"}), + x["sample_name"], + {k:v for k, v in x.items() if x != "sample_name"}), data))} return {} diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index 45d0c22..b6fc6d3 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -27,10 +27,10 @@ from gn3.computations.qtlreaper import ( organise_reaper_main_results) def export_trait_data( - trait_data: dict, strainlist: Sequence[str], dtype: str = "val", + trait_data: dict, samplelist: Sequence[str], dtype: str = "val", var_exists: bool = False, n_exists: bool = False): """ - Export data according to `strainlist`. Mostly used in calculating + Export data according to `samplelist`. Mostly used in calculating correlations. DESCRIPTION: @@ -40,8 +40,8 @@ def export_trait_data( PARAMETERS trait: (dict) The dictionary of key-value pairs representing a trait - strainlist: (list) - A list of strain names + samplelist: (list) + A list of sample names dtype: (str) ... verify what this is ... 
var_exists: (bool) @@ -49,18 +49,18 @@ def export_trait_data( n_exists: (bool) A flag indicating existence of ndata """ - def __export_all_types(tdata, strain): + def __export_all_types(tdata, sample): sample_data = [] - if tdata[strain]["value"]: - sample_data.append(tdata[strain]["value"]) + if tdata[sample]["value"]: + sample_data.append(tdata[sample]["value"]) if var_exists: - if tdata[strain]["variance"]: - sample_data.append(tdata[strain]["variance"]) + if tdata[sample]["variance"]: + sample_data.append(tdata[sample]["variance"]) else: sample_data.append(None) if n_exists: - if tdata[strain]["ndata"]: - sample_data.append(tdata[strain]["ndata"]) + if tdata[sample]["ndata"]: + sample_data.append(tdata[sample]["ndata"]) else: sample_data.append(None) else: @@ -73,17 +73,17 @@ def export_trait_data( return tuple(sample_data) - def __exporter(accumulator, strain): + def __exporter(accumulator, sample): # pylint: disable=[R0911] - if strain in trait_data["data"]: + if sample in trait_data["data"]: if dtype == "val": - return accumulator + (trait_data["data"][strain]["value"], ) + return accumulator + (trait_data["data"][sample]["value"], ) if dtype == "var": - return accumulator + (trait_data["data"][strain]["variance"], ) + return accumulator + (trait_data["data"][sample]["variance"], ) if dtype == "N": - return accumulator + (trait_data["data"][strain]["ndata"], ) + return accumulator + (trait_data["data"][sample]["ndata"], ) if dtype == "all": - return accumulator + __export_all_types(trait_data["data"], strain) + return accumulator + __export_all_types(trait_data["data"], sample) raise KeyError("Type `%s` is incorrect" % dtype) if var_exists and n_exists: return accumulator + (None, None, None) @@ -91,7 +91,7 @@ def export_trait_data( return accumulator + (None, None) return accumulator + (None,) - return reduce(__exporter, strainlist, tuple()) + return reduce(__exporter, samplelist, tuple()) def trait_display_name(trait: Dict): """ @@ -165,19 +165,19 @@ def build_heatmap(traits_names, conn: Any): for fullname in traits_names] traits_data_list = [retrieve_trait_data(t, conn) for t in traits] genotype_filename = build_genotype_file(traits[0]["riset"]) - strains = load_genotype_samples(genotype_filename) + samples = load_genotype_samples(genotype_filename) exported_traits_data_list = [ - export_trait_data(td, strains) for td in traits_data_list] + export_trait_data(td, samples) for td in traits_data_list] clustered = cluster_traits(exported_traits_data_list) slinked = slink(clustered) traits_order = compute_traits_order(slinked) - strains_and_values = retrieve_strains_and_values( - traits_order, strains, exported_traits_data_list) + samples_and_values = retrieve_samples_and_values( + traits_order, samples, exported_traits_data_list) traits_filename = "{}/traits_test_file_{}.txt".format( TMPDIR, random_string(10)) generate_traits_file( - strains_and_values[0][1], - [t[2] for t in strains_and_values], + samples_and_values[0][1], + [t[2] for t in samples_and_values], traits_filename) main_output, _permutations_output = run_reaper( @@ -229,9 +229,9 @@ def compute_traits_order(slink_data, neworder: tuple = tuple()): return __order_maker(neworder, slink_data) -def retrieve_strains_and_values(orders, strainlist, traits_data_list): +def retrieve_samples_and_values(orders, samplelist, traits_data_list): """ - Get the strains and their corresponding values from `strainlist` and + Get the samples and their corresponding values from `samplelist` and `traits_data_list`. 
This migrates the code in @@ -240,17 +240,17 @@ def retrieve_strains_and_values(orders, strainlist, traits_data_list): # This feels nasty! There's a lot of mutation of values here, that might # indicate something untoward in the design of this function and its # dependents ==> Review - strains = [] + samples = [] values = [] rets = [] for order in orders: temp_val = traits_data_list[order] - for i, strain in enumerate(strainlist): + for i, sample in enumerate(samplelist): if temp_val[i] is not None: - strains.append(strain) + samples.append(sample) values.append(temp_val[i]) - rets.append([order, strains[:], values[:]]) - strains = [] + rets.append([order, samples[:], values[:]]) + samples = [] values = [] return rets diff --git a/tests/unit/computations/test_parsers.py b/tests/unit/computations/test_parsers.py index 19c3067..b51b0bf 100644 --- a/tests/unit/computations/test_parsers.py +++ b/tests/unit/computations/test_parsers.py @@ -15,7 +15,7 @@ class TestParsers(unittest.TestCase): def test_parse_genofile_with_existing_file(self): """Test that a genotype file is parsed correctly""" - strains = ["bxd1", "bxd2"] + samples = ["bxd1", "bxd2"] genotypes = [ {"chr": "1", "locus": "rs31443144", "cm": "1.50", "mb": "3.010274", @@ -51,4 +51,4 @@ class TestParsers(unittest.TestCase): "../test_data/genotype.txt" )) self.assertEqual(parse_genofile( - test_genotype_file), (strains, genotypes)) + test_genotype_file), (samples, genotypes)) diff --git a/tests/unit/test_heatmaps.py b/tests/unit/test_heatmaps.py index fd91cf9..b54e2f3 100644 --- a/tests/unit/test_heatmaps.py +++ b/tests/unit/test_heatmaps.py @@ -5,41 +5,41 @@ from gn3.heatmaps import ( get_lrs_from_chr, export_trait_data, compute_traits_order, - retrieve_strains_and_values, + retrieve_samples_and_values, process_traits_data_for_heatmap) from tests.unit.sample_test_data import organised_trait_1, organised_trait_2 -strainlist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] +samplelist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] trait_data = { "mysqlid": 36688172, "data": { - "B6cC3-1": {"strain_name": "B6cC3-1", "value": 7.51879, "variance": None, "ndata": None}, - "BXD1": {"strain_name": "BXD1", "value": 7.77141, "variance": None, "ndata": None}, - "BXD12": {"strain_name": "BXD12", "value": 8.39265, "variance": None, "ndata": None}, - "BXD16": {"strain_name": "BXD16", "value": 8.17443, "variance": None, "ndata": None}, - "BXD19": {"strain_name": "BXD19", "value": 8.30401, "variance": None, "ndata": None}, - "BXD2": {"strain_name": "BXD2", "value": 7.80944, "variance": None, "ndata": None}, - "BXD21": {"strain_name": "BXD21", "value": 8.93809, "variance": None, "ndata": None}, - "BXD24": {"strain_name": "BXD24", "value": 7.99415, "variance": None, "ndata": None}, - "BXD27": {"strain_name": "BXD27", "value": 8.12177, "variance": None, "ndata": None}, - "BXD28": {"strain_name": "BXD28", "value": 7.67688, "variance": None, "ndata": None}, - "BXD32": {"strain_name": "BXD32", "value": 7.79062, "variance": None, "ndata": None}, - "BXD39": {"strain_name": "BXD39", "value": 8.27641, "variance": None, "ndata": None}, - "BXD40": {"strain_name": "BXD40", "value": 8.18012, "variance": None, "ndata": None}, - "BXD42": {"strain_name": "BXD42", "value": 7.82433, "variance": None, "ndata": None}, - "BXD6": {"strain_name": "BXD6", "value": 8.09718, "variance": None, "ndata": None}, - "BXH14": {"strain_name": "BXH14", "value": 7.97475, "variance": None, "ndata": None}, - "BXH19": {"strain_name": "BXH19", "value": 7.67223, "variance": 
None, "ndata": None}, - "BXH2": {"strain_name": "BXH2", "value": 7.93622, "variance": None, "ndata": None}, - "BXH22": {"strain_name": "BXH22", "value": 7.43692, "variance": None, "ndata": None}, - "BXH4": {"strain_name": "BXH4", "value": 7.96336, "variance": None, "ndata": None}, - "BXH6": {"strain_name": "BXH6", "value": 7.75132, "variance": None, "ndata": None}, - "BXH7": {"strain_name": "BXH7", "value": 8.12927, "variance": None, "ndata": None}, - "BXH8": {"strain_name": "BXH8", "value": 6.77338, "variance": None, "ndata": None}, - "BXH9": {"strain_name": "BXH9", "value": 8.03836, "variance": None, "ndata": None}, - "C3H/HeJ": {"strain_name": "C3H/HeJ", "value": 7.42795, "variance": None, "ndata": None}, - "C57BL/6J": {"strain_name": "C57BL/6J", "value": 7.50606, "variance": None, "ndata": None}, - "DBA/2J": {"strain_name": "DBA/2J", "value": 7.72588, "variance": None, "ndata": None}}} + "B6cC3-1": {"sample_name": "B6cC3-1", "value": 7.51879, "variance": None, "ndata": None}, + "BXD1": {"sample_name": "BXD1", "value": 7.77141, "variance": None, "ndata": None}, + "BXD12": {"sample_name": "BXD12", "value": 8.39265, "variance": None, "ndata": None}, + "BXD16": {"sample_name": "BXD16", "value": 8.17443, "variance": None, "ndata": None}, + "BXD19": {"sample_name": "BXD19", "value": 8.30401, "variance": None, "ndata": None}, + "BXD2": {"sample_name": "BXD2", "value": 7.80944, "variance": None, "ndata": None}, + "BXD21": {"sample_name": "BXD21", "value": 8.93809, "variance": None, "ndata": None}, + "BXD24": {"sample_name": "BXD24", "value": 7.99415, "variance": None, "ndata": None}, + "BXD27": {"sample_name": "BXD27", "value": 8.12177, "variance": None, "ndata": None}, + "BXD28": {"sample_name": "BXD28", "value": 7.67688, "variance": None, "ndata": None}, + "BXD32": {"sample_name": "BXD32", "value": 7.79062, "variance": None, "ndata": None}, + "BXD39": {"sample_name": "BXD39", "value": 8.27641, "variance": None, "ndata": None}, + "BXD40": {"sample_name": "BXD40", "value": 8.18012, "variance": None, "ndata": None}, + "BXD42": {"sample_name": "BXD42", "value": 7.82433, "variance": None, "ndata": None}, + "BXD6": {"sample_name": "BXD6", "value": 8.09718, "variance": None, "ndata": None}, + "BXH14": {"sample_name": "BXH14", "value": 7.97475, "variance": None, "ndata": None}, + "BXH19": {"sample_name": "BXH19", "value": 7.67223, "variance": None, "ndata": None}, + "BXH2": {"sample_name": "BXH2", "value": 7.93622, "variance": None, "ndata": None}, + "BXH22": {"sample_name": "BXH22", "value": 7.43692, "variance": None, "ndata": None}, + "BXH4": {"sample_name": "BXH4", "value": 7.96336, "variance": None, "ndata": None}, + "BXH6": {"sample_name": "BXH6", "value": 7.75132, "variance": None, "ndata": None}, + "BXH7": {"sample_name": "BXH7", "value": 8.12927, "variance": None, "ndata": None}, + "BXH8": {"sample_name": "BXH8", "value": 6.77338, "variance": None, "ndata": None}, + "BXH9": {"sample_name": "BXH9", "value": 8.03836, "variance": None, "ndata": None}, + "C3H/HeJ": {"sample_name": "C3H/HeJ", "value": 7.42795, "variance": None, "ndata": None}, + "C57BL/6J": {"sample_name": "C57BL/6J", "value": 7.50606, "variance": None, "ndata": None}, + "DBA/2J": {"sample_name": "DBA/2J", "value": 7.72588, "variance": None, "ndata": None}}} slinked = ( (((0, 2, 0.16381088984330505), @@ -66,7 +66,7 @@ class TestHeatmap(TestCase): ["all", (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)]]: with self.subTest(dtype=dtype): self.assertEqual( - export_trait_data(trait_data, strainlist, dtype=dtype), + 
export_trait_data(trait_data, samplelist, dtype=dtype), expected) def test_export_trait_data_dtype_all_flags(self): @@ -106,7 +106,7 @@ class TestHeatmap(TestCase): with self.subTest(dtype=dtype, vflag=vflag, nflag=nflag): self.assertEqual( export_trait_data( - trait_data, strainlist, dtype=dtype, var_exists=vflag, + trait_data, samplelist, dtype=dtype, var_exists=vflag, n_exists=nflag), expected) @@ -164,8 +164,8 @@ class TestHeatmap(TestCase): self.assertEqual( compute_traits_order(slinked), (0, 2, 1, 7, 5, 9, 3, 6, 8, 4)) - def test_retrieve_strains_and_values(self): - """Test retrieval of strains and values.""" + def test_retrieve_samples_and_values(self): + """Test retrieval of samples and values.""" for orders, slist, tdata, expected in [ [ [2], @@ -185,9 +185,9 @@ class TestHeatmap(TestCase): [6, None, None, 4, None]], [[3, ["s1", "s4"], [6, 4]]] ]]: - with self.subTest(strainlist=slist, traitdata=tdata): + with self.subTest(samplelist=slist, traitdata=tdata): self.assertEqual( - retrieve_strains_and_values(orders, slist, tdata), expected) + retrieve_samples_and_values(orders, slist, tdata), expected) def test_get_lrs_from_chr(self): """Check that function gets correct LRS values""" -- cgit v1.2.3 From 1d09a9222f8c661da3abd6d61c09ae19eeb5d793 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 27 Sep 2021 05:02:09 +0300 Subject: Update terminology: `riset` to `group` Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Update terminology to use the appropriate domain terminology according to Zachary's direction at https://github.com/genenetwork/genenetwork3/pull/37#issuecomment-926041744 --- gn3/db/datasets.py | 52 +++++++++++++++++++++--------------------- gn3/db/traits.py | 16 ++++++------- gn3/heatmaps.py | 2 +- tests/unit/db/test_datasets.py | 42 +++++++++++++++++----------------- 4 files changed, 56 insertions(+), 56 deletions(-) diff --git a/gn3/db/datasets.py b/gn3/db/datasets.py index 4a05499..6c328f5 100644 --- a/gn3/db/datasets.py +++ b/gn3/db/datasets.py @@ -119,9 +119,9 @@ def retrieve_dataset_name( return fn_map[trait_type](threshold, dataset_name, conn) -def retrieve_geno_riset_fields(name, conn): +def retrieve_geno_group_fields(name, conn): """ - Retrieve the RISet, and RISetID values for various Geno trait types. + Retrieve the Group, and GroupID values for various Geno trait types. """ query = ( "SELECT InbredSet.Name, InbredSet.Id " @@ -130,12 +130,12 @@ def retrieve_geno_riset_fields(name, conn): "AND GenoFreeze.Name = %(name)s") with conn.cursor() as cursor: cursor.execute(query, {"name": name}) - return dict(zip(["riset", "risetid"], cursor.fetchone())) + return dict(zip(["group", "groupid"], cursor.fetchone())) return {} -def retrieve_publish_riset_fields(name, conn): +def retrieve_publish_group_fields(name, conn): """ - Retrieve the RISet, and RISetID values for various Publish trait types. + Retrieve the Group, and GroupID values for various Publish trait types. 
""" query = ( "SELECT InbredSet.Name, InbredSet.Id " @@ -144,12 +144,12 @@ def retrieve_publish_riset_fields(name, conn): "AND PublishFreeze.Name = %(name)s") with conn.cursor() as cursor: cursor.execute(query, {"name": name}) - return dict(zip(["riset", "risetid"], cursor.fetchone())) + return dict(zip(["group", "groupid"], cursor.fetchone())) return {} -def retrieve_probeset_riset_fields(name, conn): +def retrieve_probeset_group_fields(name, conn): """ - Retrieve the RISet, and RISetID values for various ProbeSet trait types. + Retrieve the Group, and GroupID values for various ProbeSet trait types. """ query = ( "SELECT InbredSet.Name, InbredSet.Id " @@ -159,12 +159,12 @@ def retrieve_probeset_riset_fields(name, conn): "AND ProbeSetFreeze.Name = %(name)s") with conn.cursor() as cursor: cursor.execute(query, {"name": name}) - return dict(zip(["riset", "risetid"], cursor.fetchone())) + return dict(zip(["group", "groupid"], cursor.fetchone())) return {} -def retrieve_temp_riset_fields(name, conn): +def retrieve_temp_group_fields(name, conn): """ - Retrieve the RISet, and RISetID values for `Temp` trait types. + Retrieve the Group, and GroupID values for `Temp` trait types. """ query = ( "SELECT InbredSet.Name, InbredSet.Id " @@ -173,30 +173,30 @@ def retrieve_temp_riset_fields(name, conn): "AND Temp.Name = %(name)s") with conn.cursor() as cursor: cursor.execute(query, {"name": name}) - return dict(zip(["riset", "risetid"], cursor.fetchone())) + return dict(zip(["group", "groupid"], cursor.fetchone())) return {} -def retrieve_riset_fields(trait_type, trait_name, dataset_info, conn): +def retrieve_group_fields(trait_type, trait_name, dataset_info, conn): """ - Retrieve the RISet, and RISetID values for various trait types. + Retrieve the Group, and GroupID values for various trait types. 
""" - riset_fns_map = { - "Geno": retrieve_geno_riset_fields, - "Publish": retrieve_publish_riset_fields, - "ProbeSet": retrieve_probeset_riset_fields + group_fns_map = { + "Geno": retrieve_geno_group_fields, + "Publish": retrieve_publish_group_fields, + "ProbeSet": retrieve_probeset_group_fields } if trait_type == "Temp": - riset_info = retrieve_temp_riset_fields(trait_name, conn) + group_info = retrieve_temp_group_fields(trait_name, conn) else: - riset_info = riset_fns_map[trait_type](dataset_info["dataset_name"], conn) + group_info = group_fns_map[trait_type](dataset_info["dataset_name"], conn) return { **dataset_info, - **riset_info, - "riset": ( - "BXD" if riset_info.get("riset") == "BXD300" - else riset_info.get("riset", "")) + **group_info, + "group": ( + "BXD" if group_info.get("group") == "BXD300" + else group_info.get("group", "")) } def retrieve_temp_trait_dataset(): @@ -281,11 +281,11 @@ def retrieve_trait_dataset(trait_type, trait, threshold, conn): trait_type, threshold, trait["trait_name"], trait["db"]["dataset_name"], conn) } - riset = retrieve_riset_fields( + group = retrieve_group_fields( trait_type, trait["trait_name"], dataset_name_info, conn) return { "display_name": dataset_name_info["dataset_name"], **dataset_name_info, **dataset_fns[trait_type](), - **riset + **group } diff --git a/gn3/db/traits.py b/gn3/db/traits.py index c9d05d7..f2673c8 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -226,7 +226,7 @@ def set_homologene_id_field_probeset(trait_info, conn): """ query = ( "SELECT HomologeneId FROM Homologene, Species, InbredSet" - " WHERE Homologene.GeneId = %(geneid)s AND InbredSet.Name = %(riset)s" + " WHERE Homologene.GeneId = %(geneid)s AND InbredSet.Name = %(group)s" " AND InbredSet.SpeciesId = Species.Id AND" " Species.TaxonomyId = Homologene.TaxonomyId") with conn.cursor() as cursor: @@ -234,7 +234,7 @@ def set_homologene_id_field_probeset(trait_info, conn): query, { k:v for k, v in trait_info.items() - if k in ["geneid", "riset"] + if k in ["geneid", "group"] }) res = cursor.fetchone() if res: @@ -422,7 +422,7 @@ def retrieve_trait_info( if trait_info["haveinfo"]: return { **trait_post_processing_functions_table[trait_dataset_type]( - {**trait_info, "riset": trait_dataset["riset"]}), + {**trait_info, "group": trait_dataset["group"]}), "db": {**trait["db"], **trait_dataset} } return trait_info @@ -449,14 +449,14 @@ def retrieve_temp_trait_data(trait_info: dict, conn: Any): for row in cursor.fetchall()] return [] -def retrieve_species_id(riset, conn: Any): +def retrieve_species_id(group, conn: Any): """ - Retrieve a species id given the RISet value + Retrieve a species id given the Group value """ with conn.cursor as cursor: cursor.execute( - "SELECT SpeciesId from InbredSet WHERE Name = %(riset)s", - {"riset": riset}) + "SELECT SpeciesId from InbredSet WHERE Name = %(group)s", + {"group": group}) return cursor.fetchone()[0] return None @@ -482,7 +482,7 @@ def retrieve_geno_trait_data(trait_info: Dict, conn: Any): {"trait_name": trait_info["trait_name"], "dataset_name": trait_info["db"]["dataset_name"], "species_id": retrieve_species_id( - trait_info["db"]["riset"], conn)}) + trait_info["db"]["group"], conn)}) return [dict(zip( ["sample_name", "value", "se_error", "id"], row)) for row in cursor.fetchall()] diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index b6fc6d3..a36940d 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -164,7 +164,7 @@ def build_heatmap(traits_names, conn: Any): retrieve_trait_info(threshold, fullname, conn) for fullname in 
traits_names] traits_data_list = [retrieve_trait_data(t, conn) for t in traits] - genotype_filename = build_genotype_file(traits[0]["riset"]) + genotype_filename = build_genotype_file(traits[0]["group"]) samples = load_genotype_samples(genotype_filename) exported_traits_data_list = [ export_trait_data(td, samples) for td in traits_data_list] diff --git a/tests/unit/db/test_datasets.py b/tests/unit/db/test_datasets.py index 38de0e2..39f4af9 100644 --- a/tests/unit/db/test_datasets.py +++ b/tests/unit/db/test_datasets.py @@ -3,10 +3,10 @@ from unittest import mock, TestCase from gn3.db.datasets import ( retrieve_dataset_name, - retrieve_riset_fields, - retrieve_geno_riset_fields, - retrieve_publish_riset_fields, - retrieve_probeset_riset_fields) + retrieve_group_fields, + retrieve_geno_group_fields, + retrieve_publish_group_fields, + retrieve_probeset_group_fields) class TestDatasetsDBFunctions(TestCase): """Test cases for datasets functions.""" @@ -40,9 +40,9 @@ class TestDatasetsDBFunctions(TestCase): table=table, cols=columns), {"threshold": thresh, "name": dataset_name}) - def test_retrieve_probeset_riset_fields(self): + def test_retrieve_probeset_group_fields(self): """ - Test that the `riset` and `riset_id` fields are retrieved appropriately + Test that the `group` and `group_id` fields are retrieved appropriately for the 'ProbeSet' trait type. """ for trait_name, expected in [ @@ -52,7 +52,7 @@ class TestDatasetsDBFunctions(TestCase): with db_mock.cursor() as cursor: cursor.execute.return_value = () self.assertEqual( - retrieve_probeset_riset_fields(trait_name, db_mock), + retrieve_probeset_group_fields(trait_name, db_mock), expected) cursor.execute.assert_called_once_with( ( @@ -63,34 +63,34 @@ class TestDatasetsDBFunctions(TestCase): " AND ProbeSetFreeze.Name = %(name)s"), {"name": trait_name}) - def test_retrieve_riset_fields(self): + def test_retrieve_group_fields(self): """ - Test that the riset fields are set up correctly for the different trait + Test that the group fields are set up correctly for the different trait types. """ for trait_type, trait_name, dataset_info, expected in [ ["Publish", "pubTraitName01", {"dataset_name": "pubDBName01"}, - {"dataset_name": "pubDBName01", "riset": ""}], + {"dataset_name": "pubDBName01", "group": ""}], ["ProbeSet", "prbTraitName01", {"dataset_name": "prbDBName01"}, - {"dataset_name": "prbDBName01", "riset": ""}], + {"dataset_name": "prbDBName01", "group": ""}], ["Geno", "genoTraitName01", {"dataset_name": "genoDBName01"}, - {"dataset_name": "genoDBName01", "riset": ""}], - ["Temp", "tempTraitName01", {}, {"riset": ""}], + {"dataset_name": "genoDBName01", "group": ""}], + ["Temp", "tempTraitName01", {}, {"group": ""}], ]: db_mock = mock.MagicMock() with self.subTest( trait_type=trait_type, trait_name=trait_name, dataset_info=dataset_info): with db_mock.cursor() as cursor: - cursor.execute.return_value = ("riset_name", 0) + cursor.execute.return_value = ("group_name", 0) self.assertEqual( - retrieve_riset_fields( + retrieve_group_fields( trait_type, trait_name, dataset_info, db_mock), expected) - def test_retrieve_publish_riset_fields(self): + def test_retrieve_publish_group_fields(self): """ - Test that the `riset` and `riset_id` fields are retrieved appropriately + Test that the `group` and `group_id` fields are retrieved appropriately for the 'Publish' trait type. 
""" for trait_name, expected in [ @@ -100,7 +100,7 @@ class TestDatasetsDBFunctions(TestCase): with db_mock.cursor() as cursor: cursor.execute.return_value = () self.assertEqual( - retrieve_publish_riset_fields(trait_name, db_mock), + retrieve_publish_group_fields(trait_name, db_mock), expected) cursor.execute.assert_called_once_with( ( @@ -110,9 +110,9 @@ class TestDatasetsDBFunctions(TestCase): " AND PublishFreeze.Name = %(name)s"), {"name": trait_name}) - def test_retrieve_geno_riset_fields(self): + def test_retrieve_geno_group_fields(self): """ - Test that the `riset` and `riset_id` fields are retrieved appropriately + Test that the `group` and `group_id` fields are retrieved appropriately for the 'Geno' trait type. """ for trait_name, expected in [ @@ -122,7 +122,7 @@ class TestDatasetsDBFunctions(TestCase): with db_mock.cursor() as cursor: cursor.execute.return_value = () self.assertEqual( - retrieve_geno_riset_fields(trait_name, db_mock), + retrieve_geno_group_fields(trait_name, db_mock), expected) cursor.execute.assert_called_once_with( ( -- cgit v1.2.3 From 60d54d8de466c179a93b6d46ad05ec1b9ba5f4a1 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 27 Sep 2021 05:13:19 +0300 Subject: Narrow the exception and add comments Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Only catch the `FileExistsError` allowing any other exception to pass through. This tries to conform a little to the review at https://github.com/genenetwork/genenetwork3/pull/37#discussion_r714552696 --- gn3/computations/qtlreaper.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index 166d2dd..d1ff4ac 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -34,7 +34,8 @@ def create_output_directory(path: str): """Create the output directory at `path` if it does not exist.""" try: os.mkdir(path) - except OSError: + except FileExistsError: + # If the directory already exists, do nothing. pass def run_reaper( -- cgit v1.2.3 From a9fc9814760d205674904f8feb700eadae480fb1 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 27 Sep 2021 05:25:58 +0300 Subject: Remove unnecessary variable. Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Fix issue according to review https://github.com/genenetwork/genenetwork3/pull/37#discussion_r714549781 --- gn3/api/heatmaps.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gn3/api/heatmaps.py b/gn3/api/heatmaps.py index fe47aee..62ca2ad 100644 --- a/gn3/api/heatmaps.py +++ b/gn3/api/heatmaps.py @@ -17,8 +17,7 @@ def clustered_heatmaps(): Parses the incoming data and responds with the JSON-serialized plotly figure representing the clustered heatmap. """ - heatmap_request = request.get_json() - traits_names = heatmap_request.get("traits_names", tuple()) + traits_names = request.get_json().get("traits_names", tuple()) if len(traits_names) < 2: return jsonify({ "message": "You need to provide at least two trait names." 
-- cgit v1.2.3 From 6f25b8e2b1d1a34c054d325b1c37b303529b8827 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Mon, 27 Sep 2021 15:15:28 +0300 Subject: remove unnecessary comments and variables --- scripts/wgcna_analysis.R | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/scripts/wgcna_analysis.R b/scripts/wgcna_analysis.R index e641652..17b3537 100644 --- a/scripts/wgcna_analysis.R +++ b/scripts/wgcna_analysis.R @@ -23,24 +23,14 @@ if (length(args)==0) { inputData <- fromJSON(file = json_file_path) -# parse the json data input - -minModuleSize <-inputData$minModuleSize - -TOMtype <-inputData$TOMtype - -corType <-inputData$corType -# - trait_sample_data <- do.call(rbind, inputData$trait_sample_data) dataExpr <- data.frame(apply(trait_sample_data, 2, function(x) as.numeric(as.character(x)))) # transform expressionData dataExpr <- data.frame(t(dataExpr)) -gsg = goodSamplesGenes(dataExpr, verbose = 3); +gsg = goodSamplesGenes(dataExpr, verbose = 3) -# https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/Rpackages/ if (!gsg$allOK) { if (sum(!gsg$goodGenes)>0) @@ -75,18 +65,18 @@ if (is.na(sft$powerEstimate)){ # pass user options network <- blockwiseModules(dataExpr, #similarity matrix options - corType = corType, + corType = inputData$corType, #adjacency matrix options power = powerEst, networkType = "unsigned", #TOM options - TOMtype = TOMtype, + TOMtype = inputData$TOMtype, #module indentification verbose = 3, - minmodulesSize = minModuleSize, + minmodulesSize = inputData$minModuleSize, deepSplit = 3, PamRespectsDendro = FALSE ) -- cgit v1.2.3 From f51a09e2c5f594425531fe26e8237b09fb6909ae Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Mon, 27 Sep 2021 17:29:27 +0300 Subject: return str error for exception --- gn3/computations/wgcna.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/gn3/computations/wgcna.py b/gn3/computations/wgcna.py index 436a888..1ee55f5 100644 --- a/gn3/computations/wgcna.py +++ b/gn3/computations/wgcna.py @@ -45,4 +45,7 @@ def call_wgcna_script(rscript_path: str, request_data: dict): **run_cmd_results } except Exception as error: - raise error + # relook at handling errors gn3 + return { + "output": str(error) + } -- cgit v1.2.3 From 0d60b01476e49b094944c1ba4136e4cc8c28aaba Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Mon, 27 Sep 2021 17:30:32 +0300 Subject: add tests for calling wgcna_script --- tests/unit/computations/test_wgcna.py | 96 ++++++++++++++++++++++++++++++++--- 1 file changed, 89 insertions(+), 7 deletions(-) diff --git a/tests/unit/computations/test_wgcna.py b/tests/unit/computations/test_wgcna.py index 64f6c14..57224b3 100644 --- a/tests/unit/computations/test_wgcna.py +++ b/tests/unit/computations/test_wgcna.py @@ -11,16 +11,98 @@ from gn3.computations.wgcna import call_wgcna_script class TestWgcna(TestCase): """test class for wgcna""" + @mock.patch("gn3.computations.wgcna.run_cmd") + @mock.patch("gn3.computations.wgcna.compose_wgcna_cmd") @mock.patch("gn3.computations.wgcna.dump_wgcna_data") - def test_call_wgcna_script(self, mock_dump): - """call wgcna script""" + def test_call_wgcna_script(self, mock_dumping_data, mock_compose_wgcna, mock_run_cmd): + """test for calling wgcna script""" + mock_dumping_data.return_value = "/tmp/QmQPeNsJPyVWPFDVHb77w8G42Fvo15z4bG2X8D2GhfbSXc-test.json" - mock_dump.return_value = "/tmp/QmQPeNsJPyVWPFDVHb77w8G42Fvo15z4bG2X8D2GhfbSXc-test.json" + mock_compose_wgcna.return_value = "Rscript/GUIX_PATH/scripts/r_file.R 
/tmp/QmQPeNsJPyVWPFDVHb77w8G42Fvo15z4bG2X8D2GhfbSXc-test.json" - results = call_wgcna_script( - "/home/kabui/project/genenetwork3/scripts/wgcna_analysis.R", {}) + request_data = { + "trait_names": ["1455537_at", "1425637_at", "1449593_at", "1421945_a_at", "1450423_s_at", "1423841_at", "1451144_at"], + "trait_sample_data": [ + { + "129S1/SvImJ": 7.142, + "A/J": 7.31, + "AKR/J": 7.49, + "B6D2F1": 6.899, + "BALB/cByJ": 7.172, + "BALB/cJ": 7.396 + }, + { + "129S1/SvImJ": 7.071, + "A/J": 7.05, + "AKR/J": 7.313, + "B6D2F1": 6.999, + "BALB/cByJ": 7.293, + "BALB/cJ": 7.117 + }]} - self.assertEqual(results, "dsedf") + mock_run_cmd_results = { + + "code": 0, + "output": "Flagging genes and samples with too many missing values...\n ..step 1\nAllowing parallel execution with up to 3 working processes.\npickSoftThreshold: will use block size 7.\n pickSoftThreshold: calculating connectivity for given powers...\n ..working on genes 1 through 7 of 7\n Flagging genes and samples with too many missing values...\n ..step 1\n ..Working on block 1 .\n TOM calculation: adjacency..\n ..will not use multithreading.\nclustering..\n ....detecting modules..\n ....calculating module eigengenes..\n ....checking kME in modules..\n ..merging modules that are too close..\n mergeCloseModules: Merging modules whose distance is less than 0.15\n mergeCloseModules: less than two proper modules.\n ..color levels are turquoise\n ..there is nothing to merge.\n Calculating new MEs...\n" + } + + json_output = "{\"inputdata\":{\"trait_sample_data \":{},\"minModuleSize\":30,\"TOMtype\":\"unsigned\"},\"outputdata\":{\"eigengenes\":[],\"colors\":[]}}" + + expected_output = { + + "data": { + "inputdata": { + "trait_sample_data ": {}, + "minModuleSize": 30, + "TOMtype": "unsigned" + }, + + "outputdata": { + "eigengenes": [], + "colors": [] + } + }, + + **mock_run_cmd_results + + } + + with mock.patch("builtins.open", mock.mock_open(read_data=json_output)) as mock_file: + + mock_run_cmd.return_value = mock_run_cmd_results + + results = call_wgcna_script( + "Rscript/GUIX_PATH/scripts/r_file.R", request_data) + + mock_dumping_data.assert_called_once_with(request_data) + + mock_compose_wgcna.assert_called_once_with( + "Rscript/GUIX_PATH/scripts/r_file.R", "/tmp/QmQPeNsJPyVWPFDVHb77w8G42Fvo15z4bG2X8D2GhfbSXc-test.json") + + mock_run_cmd.assert_called_once_with( + "Rscript/GUIX_PATH/scripts/r_file.R /tmp/QmQPeNsJPyVWPFDVHb77w8G42Fvo15z4bG2X8D2GhfbSXc-test.json") + + self.assertEqual(results, expected_output) + + @mock.patch("gn3.computations.wgcna.run_cmd") + @mock.patch("gn3.computations.wgcna.compose_wgcna_cmd") + @mock.patch("gn3.computations.wgcna.dump_wgcna_data") + def test_call_wgcna_script_fails(self, mock_dumping_data, mock_compose_wgcna, mock_run_cmd): + """test for calling wgcna script fails and generates the expected error""" + mock_dumping_data.return_value = "/tmp/QmQPeNsJPyVWPFDVHb77w8G42Fvo15z4bG2X8D2GhfbSXc-test.json" + + mock_compose_wgcna.return_value = "Rscript/GUIX_PATH/scripts/r_file.R /tmp/QmQPeNsJPyVWPFDVHb77w8G42Fvo15z4bG2X8D2GhfbSXc-test.json" + + expected_error = { + "code": 127, + "output": "could not read the json file" + } + + with mock.patch("builtins.open", mock.mock_open(read_data="")) as mock_file: + + mock_run_cmd.return_value = expected_error + self.assertEqual(call_wgcna_script( + "input_file.R", ""), expected_error) def test_compose_wgcna_cmd(self): """test for composing wgcna cmd""" @@ -29,7 +111,7 @@ class TestWgcna(TestCase): self.assertEqual( wgcna_cmd, "Rscript ./scripts/wgcna.r /tmp/wgcna.json") - 
@skip("to update tests") + @ skip("to update tests") def test_create_json_file(self): """test for writing the data to a csv file""" # # All the traits we have data for (should not contain duplicates) -- cgit v1.2.3 From b6097cff6431e50c132f46dc0d3e1b841897da0f Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Mon, 27 Sep 2021 19:14:58 +0300 Subject: add file not found exception --- gn3/computations/wgcna.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gn3/computations/wgcna.py b/gn3/computations/wgcna.py index 1ee55f5..fd508fa 100644 --- a/gn3/computations/wgcna.py +++ b/gn3/computations/wgcna.py @@ -44,8 +44,8 @@ def call_wgcna_script(rscript_path: str, request_data: dict): "data": json.load(outputfile), **run_cmd_results } - except Exception as error: + except FileNotFoundError: # relook at handling errors gn3 return { - "output": str(error) + "output": "output file not found" } -- cgit v1.2.3 From a2da1f5dbc49b0137ef6b8ee9e234178521935f3 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Mon, 27 Sep 2021 19:15:23 +0300 Subject: modify integration tests --- tests/integration/test_wgcna.py | 60 ++++++++++++++++++++++++++++++++--------- 1 file changed, 48 insertions(+), 12 deletions(-) diff --git a/tests/integration/test_wgcna.py b/tests/integration/test_wgcna.py index 39dabb2..078449d 100644 --- a/tests/integration/test_wgcna.py +++ b/tests/integration/test_wgcna.py @@ -13,25 +13,61 @@ class WgcnaIntegrationTest(TestCase): self.app = create_app().test_client() @mock.patch("gn3.api.wgcna.call_wgcna_script") - def test_wgcna_endpoint(self, mock_wgcna_api): + def test_wgcna_endpoint(self, mock_wgcna_script): """test /api/wgcna/run_wgcna endpoint""" - wgcna_api_data = { - "eigengenes": ["1224_at", "121412_at", "32342342-at"], - "dendrogram_file_location": "/tmp/dend1.png" - + wgcna_output_data = { + "code": 0, + "output": "run script successfully", + "data": { + "ModEigens": { + "MEturquoise": [ + 0.0646677768085351, + 0.137200224277058, + 0.63451113720732, + -0.544002665501479, + -0.489487590361863, + 0.197111117570427 + ] + }, + "net_colors": { + "X1": "turquoise", + "X2": "turquoise", + "X3": "turquoise", + "X4": "turquoise" + }, + "imageLoc": "/WGCNAoutput_1uujpTIpC.png" + } } - mock_wgcna_api.return_value = wgcna_api_data request_data = { - - "trait_sample_data": [], - - + "trait_names": [ + "1455537_at", + "1425637_at" + ], + "trait_sample_data": [ + { + "129S1/SvImJ": 6.142, + "A/J": 5.31, + "AKR/J": 3.49, + "B6D2F1": 2.899, + "BALB/cByJ": 1.172, + "BALB/cJ": 7.396 + }, + { + "129S1/SvImJ": 1.42, + "A/J": 2.31, + "AKR/J": 5.49, + "B6D2F1": 3.899, + "BALB/cByJ": 1.172, + "BALB/cJ": 7.396 + } + ] } + mock_wgcna_script.return_value = wgcna_output_data response = self.app.post("/api/wgcna/run_wgcna", json=request_data, follow_redirects=True) - self.assertEqual(response.status_code, 401) - self.assertEqual(response.get_json(), wgcna_api_data) + self.assertEqual(response.status_code, 200) + self.assertEqual(response.get_json(), wgcna_output_data) -- cgit v1.2.3 From 16235188d4ee2ad21a667832baf6cbbea6d8856a Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Mon, 27 Sep 2021 19:15:46 +0300 Subject: modify unittests --- tests/unit/computations/test_wgcna.py | 40 ++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/tests/unit/computations/test_wgcna.py b/tests/unit/computations/test_wgcna.py index 57224b3..ec81d94 100644 --- a/tests/unit/computations/test_wgcna.py +++ b/tests/unit/computations/test_wgcna.py @@ -1,5 
+1,4 @@ """module contains python code for wgcna""" -from unittest import skip from unittest import TestCase from unittest import mock @@ -14,8 +13,13 @@ class TestWgcna(TestCase): @mock.patch("gn3.computations.wgcna.run_cmd") @mock.patch("gn3.computations.wgcna.compose_wgcna_cmd") @mock.patch("gn3.computations.wgcna.dump_wgcna_data") - def test_call_wgcna_script(self, mock_dumping_data, mock_compose_wgcna, mock_run_cmd): + def test_call_wgcna_script(self, + mock_dumping_data, + mock_compose_wgcna, + mock_run_cmd): """test for calling wgcna script""" + + # pylint: disable = line-too-long mock_dumping_data.return_value = "/tmp/QmQPeNsJPyVWPFDVHb77w8G42Fvo15z4bG2X8D2GhfbSXc-test.json" mock_compose_wgcna.return_value = "Rscript/GUIX_PATH/scripts/r_file.R /tmp/QmQPeNsJPyVWPFDVHb77w8G42Fvo15z4bG2X8D2GhfbSXc-test.json" @@ -67,7 +71,7 @@ class TestWgcna(TestCase): } - with mock.patch("builtins.open", mock.mock_open(read_data=json_output)) as mock_file: + with mock.patch("builtins.open", mock.mock_open(read_data=json_output)): mock_run_cmd.return_value = mock_run_cmd_results @@ -77,7 +81,8 @@ class TestWgcna(TestCase): mock_dumping_data.assert_called_once_with(request_data) mock_compose_wgcna.assert_called_once_with( - "Rscript/GUIX_PATH/scripts/r_file.R", "/tmp/QmQPeNsJPyVWPFDVHb77w8G42Fvo15z4bG2X8D2GhfbSXc-test.json") + "Rscript/GUIX_PATH/scripts/r_file.R", + "/tmp/QmQPeNsJPyVWPFDVHb77w8G42Fvo15z4bG2X8D2GhfbSXc-test.json") mock_run_cmd.assert_called_once_with( "Rscript/GUIX_PATH/scripts/r_file.R /tmp/QmQPeNsJPyVWPFDVHb77w8G42Fvo15z4bG2X8D2GhfbSXc-test.json") @@ -88,17 +93,19 @@ class TestWgcna(TestCase): @mock.patch("gn3.computations.wgcna.compose_wgcna_cmd") @mock.patch("gn3.computations.wgcna.dump_wgcna_data") def test_call_wgcna_script_fails(self, mock_dumping_data, mock_compose_wgcna, mock_run_cmd): - """test for calling wgcna script fails and generates the expected error""" + """test for calling wgcna script\ + fails and generates the expected error""" + # pylint: disable = line-too-long, mock_dumping_data.return_value = "/tmp/QmQPeNsJPyVWPFDVHb77w8G42Fvo15z4bG2X8D2GhfbSXc-test.json" mock_compose_wgcna.return_value = "Rscript/GUIX_PATH/scripts/r_file.R /tmp/QmQPeNsJPyVWPFDVHb77w8G42Fvo15z4bG2X8D2GhfbSXc-test.json" expected_error = { - "code": 127, + "code": 2, "output": "could not read the json file" } - with mock.patch("builtins.open", mock.mock_open(read_data="")) as mock_file: + with mock.patch("builtins.open", mock.mock_open(read_data="")): mock_run_cmd.return_value = expected_error self.assertEqual(call_wgcna_script( @@ -111,8 +118,9 @@ class TestWgcna(TestCase): self.assertEqual( wgcna_cmd, "Rscript ./scripts/wgcna.r /tmp/wgcna.json") - @ skip("to update tests") - def test_create_json_file(self): + @mock.patch("gn3.computations.wgcna.TMPDIR", "/tmp") + @mock.patch("gn3.computations.wgcna.uuid.uuid4") + def test_create_json_file(self, file_name_generator): """test for writing the data to a csv file""" # # All the traits we have data for (should not contain duplicates) # All the strains we have data for (contains duplicates) @@ -138,7 +146,15 @@ class TestWgcna(TestCase): "minModuleSize": 30 } - results = dump_wgcna_data( - expected_input) + with mock.patch("builtins.open", mock.mock_open()) as file_handler: + + file_name_generator.return_value = "facb73ff-7eef-4053-b6ea-e91d3a22a00c" + + results = dump_wgcna_data( + expected_input) + + file_handler.assert_called_once_with( + "/tmp/facb73ff-7eef-4053-b6ea-e91d3a22a00c.json", 'w') - self.assertEqual(results, {}) + self.assertEqual( + 
results, "/tmp/facb73ff-7eef-4053-b6ea-e91d3a22a00c.json") -- cgit v1.2.3 From 767eb96db12476f741bb5197bda7555c29e79b55 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Tue, 28 Sep 2021 08:28:42 +0300 Subject: Approximate single-spectrum colour scale in GN1 Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * To provide a somewhat similar experience to GeneNetwork1, this commit approximates the single-spectrum colour scale in GeneNetwork1 for the heatmaps in GeneNetwork3. Work to get the multiple-spectrum colour sc(ales/hemes) will be done in other commits, since that might require digging even deeper into Plotly's guts to figure out. --- gn3/heatmaps.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index a36940d..2ef2d16 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -326,9 +326,7 @@ def generate_clustered_heatmap( data, clustering_data, image_filename_prefix, x_axis=None, x_label: str = "", y_axis=None, y_label: str = "", output_dir: str = TMPDIR, - colorscale=( - (0.0, '#5D5D5D'), (0.4999999999999999, '#ABABAB'), - (0.5, '#F5DE11'), (1.0, '#FF0D00'))): + colorscale=((0.0, '#0000FF'), (0.5, '#00FF00'), (1.0, '#FF0000'))): """ Generate a dendrogram, and heatmaps for each chromosome, and put them all into one plot. -- cgit v1.2.3 From 4a55971a9be54b399c45a53e211df3348df1c52b Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Tue, 28 Sep 2021 10:15:43 +0300 Subject: Retrieve loci names ordered by chromosomes Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/heatmaps.py: implement function * tests/unit/test_heatmaps.py: add test Add a function to retrieve the loci names from the traits, ordered by chromosomes, in alphabetical order. This is useful to provide the user with more information on hovering over the heatmap cells: each cell will now display the locus name, trait name and value associated with it. --- gn3/heatmaps.py | 28 +++++++++++++++++++++++++++- tests/unit/test_heatmaps.py | 15 +++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index 2ef2d16..9c10ba3 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -4,7 +4,7 @@ generate various kinds of heatmaps. """ from functools import reduce -from typing import Any, Dict, Sequence +from typing import Any, Dict, Union, Sequence import numpy as np import plotly.graph_objects as go # type: ignore @@ -142,6 +142,32 @@ def cluster_traits(traits_data_list: Sequence[Dict]): return tuple(__cluster(tdata_i) for tdata_i in enumerate(traits_data_list)) +def get_loci_names( + organised: dict, + chromosome_names: Sequence[str]) -> Sequence[Sequence[str]]: + """ + Get the loci names organised by the same order as the `chromosome_names`. 
+ """ + def __get_trait_loci(accumulator, trait): + chrs = tuple(trait["chromosomes"].keys()) + trait_loci = { + _chr: tuple( + locus["Locus"] + for locus in trait["chromosomes"][_chr]["loci"] + ) for _chr in chrs + } + return { + **accumulator, + **{ + _chr: tuple(sorted(set( + trait_loci[_chr] + accumulator.get(_chr, tuple())))) + for _chr in trait_loci.keys() + } + } + loci_dict: Dict[Union[str, int], Sequence[str]] = reduce( + __get_trait_loci, [v[1] for v in organised.items()], {}) + return tuple(loci_dict[_chr] for _chr in chromosome_names) + def build_heatmap(traits_names, conn: Any): """ heatmap function diff --git a/tests/unit/test_heatmaps.py b/tests/unit/test_heatmaps.py index b54e2f3..7b66688 100644 --- a/tests/unit/test_heatmaps.py +++ b/tests/unit/test_heatmaps.py @@ -2,6 +2,7 @@ from unittest import TestCase from gn3.heatmaps import ( cluster_traits, + get_loci_names, get_lrs_from_chr, export_trait_data, compute_traits_order, @@ -214,3 +215,17 @@ class TestHeatmap(TestCase): [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]], [[0.5, 0.579, 0.5], [0.5, 0.5, 0.5]]]) + + def test_get_loci_names(self): + """Check that loci names are retrieved correctly.""" + for organised, expected in ( + (organised_trait_1, + (("rs258367496", "rs30658298", "rs31443144", "rs32285189", + "rs32430919", "rs36251697", "rs6269442"), + ("rs31879829", "rs36742481", "rs51852623"))), + ({**organised_trait_1, **organised_trait_2}, + (("rs258367496", "rs30658298", "rs31443144", "rs32285189", + "rs32430919", "rs36251697", "rs6269442"), + ("rs31879829", "rs36742481", "rs51852623")))): + with self.subTest(organised=organised): + self.assertEqual(get_loci_names(organised, (1, 2)), expected) -- cgit v1.2.3 From e76dea5005501868c9721cd631b716d7e799306e Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Tue, 28 Sep 2021 10:20:52 +0300 Subject: Provide loci names to heatmap Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Provide the loci names to the heatmaps so that hovering over the heatmap cells displays the associated locus name. 
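The hover text comes straight from Plotly: each per-chromosome `go.Heatmap`
trace receives that chromosome's locus names as its `x` values, so hovering
over a cell shows its locus (x), trait (y) and value (z). A minimal,
standalone sketch of that behaviour, reusing a few locus names and values from
the `get_loci_names` tests in the previous patch rather than real database
output:

    import plotly.graph_objects as go

    loci = ("rs31879829", "rs36742481", "rs51852623")  # one chromosome's loci, sorted
    traits = ("trait_1", "trait_2")                    # placeholder trait labels
    values = [[0.5, 0.579, 0.5],                       # one row of values per trait
              [0.5, 0.5, 0.5]]

    fig = go.Figure(go.Heatmap(
        x=loci,    # locus names become the per-cell x hover label
        y=traits,  # trait names become the y hover label
        z=values,
        showscale=False))
    fig.write_html("/tmp/loci_hover_sketch.html")  # hypothetical output path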
--- gn3/heatmaps.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index 9c10ba3..adbfbc6 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -230,7 +230,8 @@ def build_heatmap(traits_names, conn: Any): for order in traits_order), y_label="Traits", x_axis=chromosome_names, - x_label="Chromosomes") + x_label="Chromosomes", + loci_names=get_loci_names(organised, chromosome_names)) def compute_traits_order(slink_data, neworder: tuple = tuple()): """ @@ -351,6 +352,7 @@ def process_traits_data_for_heatmap(data, trait_names, chromosome_names): def generate_clustered_heatmap( data, clustering_data, image_filename_prefix, x_axis=None, x_label: str = "", y_axis=None, y_label: str = "", + loci_names: Sequence[Sequence[str]] = tuple(), output_dir: str = TMPDIR, colorscale=((0.0, '#0000FF'), (0.5, '#00FF00'), (1.0, '#FF0000'))): """ @@ -369,9 +371,12 @@ def generate_clustered_heatmap( np.array(clustering_data), orientation="right", labels=y_axis)) hms = [go.Heatmap( name=chromo, + x=loci, y=y_axis, z=data_array, - showscale=False) for chromo, data_array in zip(x_axis, data)] + showscale=False) + for chromo, data_array, loci + in zip(x_axis, data, loci_names)] for i, heatmap in enumerate(hms): fig.add_trace(heatmap, row=1, col=(i + 2)) -- cgit v1.2.3 From 77c274b79c3ec01de60e90db3299763cb58f715b Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Wed, 6 Oct 2021 14:28:33 +0300 Subject: guix.scm: Add python-flask-cors and python-plotly dependencies --- guix.scm | 2 ++ 1 file changed, 2 insertions(+) diff --git a/guix.scm b/guix.scm index 02c67b2..9b8f399 100644 --- a/guix.scm +++ b/guix.scm @@ -89,11 +89,13 @@ ("python" ,python-wrapper) ("python-bcrypt" ,python-bcrypt) ("python-flask" ,python-flask) + ("python-flask-cors" ,python-flask-cors) ("python-ipfshttpclient" ,python-ipfshttpclient) ("python-mypy" ,python-mypy) ("python-mypy-extensions" ,python-mypy-extensions) ("python-mysqlclient" ,python-mysqlclient) ("python-numpy" ,python-numpy) + ("python-plotly" ,python-plotly) ("python-pylint" ,python-pylint) ("python-redis" ,python-redis) ("python-requests" ,python-requests) -- cgit v1.2.3
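The new `python-flask-cors` input suggests the API is being opened up to
cross-origin requests, for example from a separately hosted front-end. These
patches do not show how genenetwork3 actually wires it in, so the snippet
below is only the generic Flask-CORS pattern, with a made-up route for
illustration:

    from flask import Flask
    from flask_cors import CORS  # packaged by the python-flask-cors input above

    app = Flask(__name__)
    CORS(app)  # allow cross-origin requests; any origin restrictions are not shown here

    @app.route("/api/ping")  # hypothetical endpoint, purely for illustration
    def ping():
        return {"status": "ok"}  # Flask serialises the dict to JSON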