From 36044483d365a907a9da6ad8a7b3f0dfb0a918e2 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Thu, 12 Aug 2021 17:21:44 +0300 Subject: Initialise heatmap generation module Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/heatmaps/heatmaps.py: Initialise the module with some code to be used to test out plotly features on the command-line. * guix.scm: Add `python-plotly` and `python-pandas` as dependencies. --- gn3/heatmaps/heatmaps.py | 54 ++++++++++++++++++++++++++++++++++++++++++++++++ guix.scm | 5 ++++- 2 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 gn3/heatmaps/heatmaps.py diff --git a/gn3/heatmaps/heatmaps.py b/gn3/heatmaps/heatmaps.py new file mode 100644 index 0000000..3bf7917 --- /dev/null +++ b/gn3/heatmaps/heatmaps.py @@ -0,0 +1,54 @@ +import random +import plotly.express as px + +#### Remove these #### + +heatmap_dir = "heatmap_images" + +def generate_random_data(data_stop: float = 2, width: int = 10, height: int = 30): + """ + This is mostly a utility function to be used to generate random data, useful + for development of the heatmap generation code, without access to the actual + database data. + """ + return [[random.uniform(0,data_stop) for i in range(0, width)] + for j in range(0, height)] + +def heatmap_x_axis_names(): + return [ + "UCLA_BXDBXH_CARTILAGE_V2::ILM103710672", + "UCLA_BXDBXH_CARTILAGE_V2::ILM2260338", + "UCLA_BXDBXH_CARTILAGE_V2::ILM3140576", + "UCLA_BXDBXH_CARTILAGE_V2::ILM5670577", + "UCLA_BXDBXH_CARTILAGE_V2::ILM2070121", + "UCLA_BXDBXH_CARTILAGE_V2::ILM103990541", + "UCLA_BXDBXH_CARTILAGE_V2::ILM1190722", + "UCLA_BXDBXH_CARTILAGE_V2::ILM6590722", + "UCLA_BXDBXH_CARTILAGE_V2::ILM4200064", + "UCLA_BXDBXH_CARTILAGE_V2::ILM3140463"] +#### END: Remove these #### + +# Grey + Blue + Red +def generate_heatmap(): + rows = 20 + data = generate_random_data(height=rows) + y = (["%s"%x for x in range(1, rows+1)][:-1] + ["X"]) #replace last item with x for now + fig = px.imshow( + data, + x=heatmap_x_axis_names(), + y=y, + width=500) + fig.update_traces(xtype="array") + fig.update_traces(ytype="array") + # fig.update_traces(xgap=10) + fig.update_xaxes( + visible=True, + title_text="Traits", + title_font_size=16) + fig.update_layout( + coloraxis_colorscale=[ + [0.0, '#3B3B3B'], [0.4999999999999999, '#ABABAB'], + [0.5, '#F5DE11'], [1.0, '#FF0D00']]) + + fig.write_html("%s/%s"%(heatmap_dir, "test_image.html")) + return fig diff --git a/guix.scm b/guix.scm index f94fe1a..729d089 100644 --- a/guix.scm +++ b/guix.scm @@ -38,6 +38,7 @@ (gn packages python) (gnu packages base) (gnu packages check) + (gnu packages graph) (gnu packages cran) (gnu packages databases) (gnu packages statistics) @@ -101,7 +102,9 @@ ,python-sqlalchemy-stubs) ("r-optparse" ,r-optparse) ("r-qtl" ,r-qtl) - ("r-stringi" ,r-stringi))) + ("r-stringi" ,r-stringi) + ("python-plotly" ,python-plotly) + ("python-pandas" ,python-pandas))) (build-system python-build-system) (home-page "https://github.com/genenetwork/genenetwork3") (synopsis "GeneNetwork3 API for data science and machine learning.") -- cgit v1.2.3 From 5f56ac39c60b345e1a135c75f4bf35f8e881f4d6 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Tue, 17 Aug 2021 08:44:20 +0300 Subject: Fix errors: add in missing parenthesis Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Call the `cursor.fetchone()` function to get results. Without the parenthesis, the code was trying to use the function itself as the results, which was a bug, and would lead to failure. --- gn3/db/datasets.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gn3/db/datasets.py b/gn3/db/datasets.py index 53d6811..4a05499 100644 --- a/gn3/db/datasets.py +++ b/gn3/db/datasets.py @@ -25,7 +25,7 @@ def retrieve_probeset_trait_dataset_name( return dict(zip( ["dataset_id", "dataset_name", "dataset_fullname", "dataset_shortname", "dataset_datascale"], - cursor.fetchone)) + cursor.fetchone())) def retrieve_publish_trait_dataset_name( threshold: int, name: str, connection: Any): @@ -49,7 +49,7 @@ def retrieve_publish_trait_dataset_name( return dict(zip( ["dataset_id", "dataset_name", "dataset_fullname", "dataset_shortname"], - cursor.fetchone)) + cursor.fetchone())) def retrieve_geno_trait_dataset_name( threshold: int, name: str, connection: Any): @@ -73,7 +73,7 @@ def retrieve_geno_trait_dataset_name( return dict(zip( ["dataset_id", "dataset_name", "dataset_fullname", "dataset_shortname"], - cursor.fetchone)) + cursor.fetchone())) def retrieve_temp_trait_dataset_name( threshold: int, name: str, connection: Any): @@ -97,7 +97,7 @@ def retrieve_temp_trait_dataset_name( return dict(zip( ["dataset_id", "dataset_name", "dataset_fullname", "dataset_shortname"], - cursor.fetchone)) + cursor.fetchone())) def retrieve_dataset_name( trait_type: str, threshold: int, trait_name: str, dataset_name: str, -- cgit v1.2.3 From 1a9d28e6db2140cc7b3491c6dbcf4fc8cd8c09b6 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Tue, 17 Aug 2021 08:47:11 +0300 Subject: Add tests and fix errors caught with tests Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/computations/heatmap.py: fix errors * tests/unit/computations/test_heatmap.py: new tests Add new tests with the expected source data format, and expected results. Fix all errors that were caught by running the tests --- gn3/computations/heatmap.py | 18 +++++------ tests/unit/computations/test_heatmap.py | 54 +++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 9 deletions(-) create mode 100644 tests/unit/computations/test_heatmap.py diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py index a0e778a..8a86fe8 100644 --- a/gn3/computations/heatmap.py +++ b/gn3/computations/heatmap.py @@ -34,11 +34,11 @@ def export_trait_data( """ def __export_all_types(tdata, strain): sample_data = [] - if tdata[strain]["val"]: - sample_data.append(tdata[strain]["val"]) + if tdata[strain]["value"]: + sample_data.append(tdata[strain]["value"]) if var_exists: - if tdata[strain].var: - sample_data.append(tdata[strain]["var"]) + if tdata[strain]["variance"]: + sample_data.append(tdata[strain]["variance"]) else: sample_data.append(None) if n_exists: @@ -58,15 +58,15 @@ def export_trait_data( def __exporter(accumulator, strain): # pylint: disable=[R0911] - if trait_data.has_key(strain): + if strain in trait_data["data"]: if dtype == "val": - return accumulator + (trait_data[strain]["val"], ) + return accumulator + (trait_data["data"][strain]["value"], ) if dtype == "var": - return accumulator + (trait_data[strain]["var"], ) + return accumulator + (trait_data["data"][strain]["variance"], ) if dtype == "N": - return trait_data[strain]["ndata"] + return accumulator + (trait_data["data"][strain]["ndata"], ) if dtype == "all": - return accumulator + __export_all_types(trait_data, strain) + return accumulator + __export_all_types(trait_data["data"], strain) raise KeyError("Type `%s` is incorrect" % dtype) if var_exists and n_exists: return accumulator + (None, None, None) diff --git a/tests/unit/computations/test_heatmap.py b/tests/unit/computations/test_heatmap.py new file mode 100644 index 0000000..78303ae --- /dev/null +++ b/tests/unit/computations/test_heatmap.py @@ -0,0 +1,54 @@ +"""Module contains tests for gn3.computations.heatmap""" +from unittest import TestCase +from gn3.computations.heatmap import export_trait_data + +strainlist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] +trait_data = {"mysqlid": 36688172, "data": {"B6cC3-1": {"strain_name": "B6cC3-1", "value": 7.51879, "variance": None, "ndata": None}, "BXD1": {"strain_name": "BXD1", "value": 7.77141, "variance": None, "ndata": None}, "BXD12": {"strain_name": "BXD12", "value": 8.39265, "variance": None, "ndata": None}, "BXD16": {"strain_name": "BXD16", "value": 8.17443, "variance": None, "ndata": None}, "BXD19": {"strain_name": "BXD19", "value": 8.30401, "variance": None, "ndata": None}, "BXD2": {"strain_name": "BXD2", "value": 7.80944, "variance": None, "ndata": None}, "BXD21": {"strain_name": "BXD21", "value": 8.93809, "variance": None, "ndata": None}, "BXD24": {"strain_name": "BXD24", "value": 7.99415, "variance": None, "ndata": None}, "BXD27": {"strain_name": "BXD27", "value": 8.12177, "variance": None, "ndata": None}, "BXD28": {"strain_name": "BXD28", "value": 7.67688, "variance": None, "ndata": None}, "BXD32": {"strain_name": "BXD32", "value": 7.79062, "variance": None, "ndata": None}, "BXD39": {"strain_name": "BXD39", "value": 8.27641, "variance": None, "ndata": None}, "BXD40": {"strain_name": "BXD40", "value": 8.18012, "variance": None, "ndata": None}, "BXD42": {"strain_name": "BXD42", "value": 7.82433, "variance": None, "ndata": None}, "BXD6": {"strain_name": "BXD6", "value": 8.09718, "variance": None, "ndata": None}, "BXH14": {"strain_name": "BXH14", "value": 7.97475, "variance": None, "ndata": None}, "BXH19": {"strain_name": "BXH19", "value": 7.67223, "variance": None, "ndata": None}, "BXH2": {"strain_name": "BXH2", "value": 7.93622, "variance": None, "ndata": None}, "BXH22": {"strain_name": "BXH22", "value": 7.43692, "variance": None, "ndata": None}, "BXH4": {"strain_name": "BXH4", "value": 7.96336, "variance": None, "ndata": None}, "BXH6": {"strain_name": "BXH6", "value": 7.75132, "variance": None, "ndata": None}, "BXH7": {"strain_name": "BXH7", "value": 8.12927, "variance": None, "ndata": None}, "BXH8": {"strain_name": "BXH8", "value": 6.77338, "variance": None, "ndata": None}, "BXH9": {"strain_name": "BXH9", "value": 8.03836, "variance": None, "ndata": None}, "C3H/HeJ": {"strain_name": "C3H/HeJ", "value": 7.42795, "variance": None, "ndata": None}, "C57BL/6J": {"strain_name": "C57BL/6J", "value": 7.50606, "variance": None, "ndata": None}, "DBA/2J": {"strain_name": "DBA/2J", "value": 7.72588, "variance": None, "ndata": None}}} + +class TestHeatmap(TestCase): + """Class for testing heatmap computation functions""" + + def test_export_trait_data_dtype(self): + """ + Test `export_trait_data` with different values for the `dtype` keyword + argument + """ + for dtype, expected in [ + ["val", (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["var", (None, None, None, None, None, None)], + ["N", (None, None, None, None, None, None)], + ["all", (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)]]: + with self.subTest(dtype=dtype): + self.assertEqual( + export_trait_data(trait_data, strainlist, dtype=dtype), + expected) + + def test_export_trait_data_dtype_all_flags(self): + """ + Test `export_trait_data` with different values for the `dtype` keyword + argument and the different flags set up + """ + for dtype, vflag, nflag, expected in [ + ["val", False, False, (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["val", False, True, (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["val", True, False, (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["val", True, True, (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["var", False, False, (None, None, None, None, None, None)], + ["var", False, True, (None, None, None, None, None, None)], + ["var", True, False, (None, None, None, None, None, None)], + ["var", True, True, (None, None, None, None, None, None)], + ["N", False, False, (None, None, None, None, None, None)], + ["N", False, True, (None, None, None, None, None, None)], + ["N", True, False, (None, None, None, None, None, None)], + ["N", True, True, (None, None, None, None, None, None)], + ["all", False, False, (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["all", False, True, (7.51879, None, 7.77141, None, 8.39265, None, 8.17443, None, 8.30401, None, 7.80944, None)], + ["all", True, False, (7.51879, None, 7.77141, None, 8.39265, None, 8.17443, None, 8.30401, None, 7.80944, None)], + ["all", True, True, (7.51879, None, None, 7.77141, None, None, 8.39265, None, None, 8.17443, None, None, 8.30401, None, None, 7.80944, None, None)] + ]: + with self.subTest(dtype=dtype, vflag=vflag, nflag=nflag): + self.assertEqual( + export_trait_data( + trait_data, strainlist, dtype=dtype, var_exists=vflag, + n_exists=nflag), + expected) -- cgit v1.2.3 From a2f6406909951a80dc4ead809a09e8de2c15200d Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Tue, 17 Aug 2021 08:49:14 +0300 Subject: Provide top-level `riset` key-value pair Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Provide the expected, top-level `riset` key-value pair and eliminate the redundant key-value pair. --- gn3/db/traits.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gn3/db/traits.py b/gn3/db/traits.py index 6ea24be..1031e44 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -418,9 +418,9 @@ def retrieve_trait_info( conn) if trait_info["haveinfo"]: return { - **trait_post_processing_functions_table[trait_dataset_type](trait_info), - "db": {**trait["db"], **trait_dataset}, - "riset": trait_dataset["riset"] + **trait_post_processing_functions_table[trait_dataset_type]( + {**trait_info, "riset": trait_dataset["riset"]}), + "db": {**trait["db"], **trait_dataset} } return trait_info -- cgit v1.2.3 From 99bfda81abe76b3bb3f7034cf6cdac21c8d50726 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Tue, 17 Aug 2021 11:05:03 +0300 Subject: Make child sequence a list Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Since the `slink` function assigns values to the `listcopy` variable and its children, this commit ensures that the sequence is a list to allow for the assignment. If the child-sequence is a tuple, that would lead to an exception. --- gn3/computations/slink.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gn3/computations/slink.py b/gn3/computations/slink.py index 5953e6b..3d7a576 100644 --- a/gn3/computations/slink.py +++ b/gn3/computations/slink.py @@ -161,7 +161,7 @@ def slink(lists): try: size = len(lists) listindexcopy = list(range(size)) - listscopy = [child[:] for child in lists] + listscopy = [list(child[:]) for child in lists] init_size = size candidate = [] while init_size > 2: -- cgit v1.2.3 From d491be2057843921cc67bd1c4b1ae612d9f15d34 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Tue, 17 Aug 2021 11:42:20 +0300 Subject: Fix obvious linting errors * Fix linting errors that do not change the function of the code. --- gn3/api/correlation.py | 4 ++-- gn3/api/general.py | 3 ++- gn3/computations/correlations.py | 4 ++-- wsgi.py | 6 ++++-- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/gn3/api/correlation.py b/gn3/api/correlation.py index a3e366e..46121f8 100644 --- a/gn3/api/correlation.py +++ b/gn3/api/correlation.py @@ -79,7 +79,7 @@ def compute_tissue_corr(corr_method="pearson"): target_tissues_dict = tissue_input_data["target_tissues_dict"] results = compute_tissue_correlation(primary_tissue_dict=primary_tissue_dict, - target_tissues_data=target_tissues_dict, - corr_method=corr_method) + target_tissues_data=target_tissues_dict, + corr_method=corr_method) return jsonify(results) diff --git a/gn3/api/general.py b/gn3/api/general.py index 86fb7b7..69ec343 100644 --- a/gn3/api/general.py +++ b/gn3/api/general.py @@ -13,7 +13,8 @@ general = Blueprint("general", __name__) @general.route("/version") def version(): - return jsonify("1.0") + """Get API version.""" + return jsonify("1.0") @general.route("/metadata/upload/", methods=["POST"], strict_slashes=False) diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py index 1fd3213..8d76c09 100644 --- a/gn3/computations/correlations.py +++ b/gn3/computations/correlations.py @@ -341,8 +341,8 @@ def compute_all_lit_correlation(conn, trait_lists: List, def compute_tissue_correlation(primary_tissue_dict: dict, - target_tissues_data: dict, - corr_method: str): + target_tissues_data: dict, + corr_method: str): """Function acts as an abstraction for tissue_correlation_for_trait\ required input are target tissue object and primary tissue trait\ target tissues data contains the trait_symbol_dict and symbol_tissue_vals diff --git a/wsgi.py b/wsgi.py index d30bc49..0fcb573 100644 --- a/wsgi.py +++ b/wsgi.py @@ -1,9 +1,11 @@ +""" +WSGI application entry-point. +""" # import main +from gn3.app import create_app print("STARTING WSGI APP") -from gn3.app import create_app - app = create_app() if __name__ == "__main__": -- cgit v1.2.3 From 41fc5136914548710529cbed7ef370dfb5b4a5c8 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Tue, 17 Aug 2021 11:43:32 +0300 Subject: Test the clustering Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/computations/heatmap.py: Fix clustering bugs * tests/unit/computations/test_heatmap.py: Add new tests. Fix linting issues. Test and fix the clustering function. --- gn3/computations/heatmap.py | 14 ++-- tests/unit/computations/test_heatmap.py | 109 +++++++++++++++++++++++++++++--- 2 files changed, 106 insertions(+), 17 deletions(-) diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py index 8a86fe8..3c35029 100644 --- a/gn3/computations/heatmap.py +++ b/gn3/computations/heatmap.py @@ -110,13 +110,13 @@ def cluster_traits(traits_data_list: Sequence[Dict]): https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L138-L162 """ def __compute_corr(tdata_i, tdata_j): - if tdata_j[0] < tdata_i[0]: - corr_vals = compute_correlation(tdata_i, tdata_j) - corr = corr_vals[0] - if (1 - corr) < 0: - return 0.0 - return 1 - corr - return 0.0 + if tdata_i[0] == tdata_j[0]: + return 0.0 + corr_vals = compute_correlation(tdata_i[1], tdata_j[1]) + corr = corr_vals[0] + if (1 - corr) < 0: + return 0.0 + return 1 - corr def __cluster(tdata_i): return tuple( diff --git a/tests/unit/computations/test_heatmap.py b/tests/unit/computations/test_heatmap.py index 78303ae..650cb45 100644 --- a/tests/unit/computations/test_heatmap.py +++ b/tests/unit/computations/test_heatmap.py @@ -1,9 +1,38 @@ """Module contains tests for gn3.computations.heatmap""" from unittest import TestCase -from gn3.computations.heatmap import export_trait_data +from gn3.computations.heatmap import cluster_traits, export_trait_data strainlist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] -trait_data = {"mysqlid": 36688172, "data": {"B6cC3-1": {"strain_name": "B6cC3-1", "value": 7.51879, "variance": None, "ndata": None}, "BXD1": {"strain_name": "BXD1", "value": 7.77141, "variance": None, "ndata": None}, "BXD12": {"strain_name": "BXD12", "value": 8.39265, "variance": None, "ndata": None}, "BXD16": {"strain_name": "BXD16", "value": 8.17443, "variance": None, "ndata": None}, "BXD19": {"strain_name": "BXD19", "value": 8.30401, "variance": None, "ndata": None}, "BXD2": {"strain_name": "BXD2", "value": 7.80944, "variance": None, "ndata": None}, "BXD21": {"strain_name": "BXD21", "value": 8.93809, "variance": None, "ndata": None}, "BXD24": {"strain_name": "BXD24", "value": 7.99415, "variance": None, "ndata": None}, "BXD27": {"strain_name": "BXD27", "value": 8.12177, "variance": None, "ndata": None}, "BXD28": {"strain_name": "BXD28", "value": 7.67688, "variance": None, "ndata": None}, "BXD32": {"strain_name": "BXD32", "value": 7.79062, "variance": None, "ndata": None}, "BXD39": {"strain_name": "BXD39", "value": 8.27641, "variance": None, "ndata": None}, "BXD40": {"strain_name": "BXD40", "value": 8.18012, "variance": None, "ndata": None}, "BXD42": {"strain_name": "BXD42", "value": 7.82433, "variance": None, "ndata": None}, "BXD6": {"strain_name": "BXD6", "value": 8.09718, "variance": None, "ndata": None}, "BXH14": {"strain_name": "BXH14", "value": 7.97475, "variance": None, "ndata": None}, "BXH19": {"strain_name": "BXH19", "value": 7.67223, "variance": None, "ndata": None}, "BXH2": {"strain_name": "BXH2", "value": 7.93622, "variance": None, "ndata": None}, "BXH22": {"strain_name": "BXH22", "value": 7.43692, "variance": None, "ndata": None}, "BXH4": {"strain_name": "BXH4", "value": 7.96336, "variance": None, "ndata": None}, "BXH6": {"strain_name": "BXH6", "value": 7.75132, "variance": None, "ndata": None}, "BXH7": {"strain_name": "BXH7", "value": 8.12927, "variance": None, "ndata": None}, "BXH8": {"strain_name": "BXH8", "value": 6.77338, "variance": None, "ndata": None}, "BXH9": {"strain_name": "BXH9", "value": 8.03836, "variance": None, "ndata": None}, "C3H/HeJ": {"strain_name": "C3H/HeJ", "value": 7.42795, "variance": None, "ndata": None}, "C57BL/6J": {"strain_name": "C57BL/6J", "value": 7.50606, "variance": None, "ndata": None}, "DBA/2J": {"strain_name": "DBA/2J", "value": 7.72588, "variance": None, "ndata": None}}} +trait_data = { + "mysqlid": 36688172, + "data": { + "B6cC3-1": {"strain_name": "B6cC3-1", "value": 7.51879, "variance": None, "ndata": None}, + "BXD1": {"strain_name": "BXD1", "value": 7.77141, "variance": None, "ndata": None}, + "BXD12": {"strain_name": "BXD12", "value": 8.39265, "variance": None, "ndata": None}, + "BXD16": {"strain_name": "BXD16", "value": 8.17443, "variance": None, "ndata": None}, + "BXD19": {"strain_name": "BXD19", "value": 8.30401, "variance": None, "ndata": None}, + "BXD2": {"strain_name": "BXD2", "value": 7.80944, "variance": None, "ndata": None}, + "BXD21": {"strain_name": "BXD21", "value": 8.93809, "variance": None, "ndata": None}, + "BXD24": {"strain_name": "BXD24", "value": 7.99415, "variance": None, "ndata": None}, + "BXD27": {"strain_name": "BXD27", "value": 8.12177, "variance": None, "ndata": None}, + "BXD28": {"strain_name": "BXD28", "value": 7.67688, "variance": None, "ndata": None}, + "BXD32": {"strain_name": "BXD32", "value": 7.79062, "variance": None, "ndata": None}, + "BXD39": {"strain_name": "BXD39", "value": 8.27641, "variance": None, "ndata": None}, + "BXD40": {"strain_name": "BXD40", "value": 8.18012, "variance": None, "ndata": None}, + "BXD42": {"strain_name": "BXD42", "value": 7.82433, "variance": None, "ndata": None}, + "BXD6": {"strain_name": "BXD6", "value": 8.09718, "variance": None, "ndata": None}, + "BXH14": {"strain_name": "BXH14", "value": 7.97475, "variance": None, "ndata": None}, + "BXH19": {"strain_name": "BXH19", "value": 7.67223, "variance": None, "ndata": None}, + "BXH2": {"strain_name": "BXH2", "value": 7.93622, "variance": None, "ndata": None}, + "BXH22": {"strain_name": "BXH22", "value": 7.43692, "variance": None, "ndata": None}, + "BXH4": {"strain_name": "BXH4", "value": 7.96336, "variance": None, "ndata": None}, + "BXH6": {"strain_name": "BXH6", "value": 7.75132, "variance": None, "ndata": None}, + "BXH7": {"strain_name": "BXH7", "value": 8.12927, "variance": None, "ndata": None}, + "BXH8": {"strain_name": "BXH8", "value": 6.77338, "variance": None, "ndata": None}, + "BXH9": {"strain_name": "BXH9", "value": 8.03836, "variance": None, "ndata": None}, + "C3H/HeJ": {"strain_name": "C3H/HeJ", "value": 7.42795, "variance": None, "ndata": None}, + "C57BL/6J": {"strain_name": "C57BL/6J", "value": 7.50606, "variance": None, "ndata": None}, + "DBA/2J": {"strain_name": "DBA/2J", "value": 7.72588, "variance": None, "ndata": None}}} class TestHeatmap(TestCase): """Class for testing heatmap computation functions""" @@ -29,10 +58,14 @@ class TestHeatmap(TestCase): argument and the different flags set up """ for dtype, vflag, nflag, expected in [ - ["val", False, False, (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], - ["val", False, True, (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], - ["val", True, False, (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], - ["val", True, True, (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["val", False, False, + (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["val", False, True, + (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["val", True, False, + (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["val", True, True, + (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], ["var", False, False, (None, None, None, None, None, None)], ["var", False, True, (None, None, None, None, None, None)], ["var", True, False, (None, None, None, None, None, None)], @@ -41,10 +74,17 @@ class TestHeatmap(TestCase): ["N", False, True, (None, None, None, None, None, None)], ["N", True, False, (None, None, None, None, None, None)], ["N", True, True, (None, None, None, None, None, None)], - ["all", False, False, (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], - ["all", False, True, (7.51879, None, 7.77141, None, 8.39265, None, 8.17443, None, 8.30401, None, 7.80944, None)], - ["all", True, False, (7.51879, None, 7.77141, None, 8.39265, None, 8.17443, None, 8.30401, None, 7.80944, None)], - ["all", True, True, (7.51879, None, None, 7.77141, None, None, 8.39265, None, None, 8.17443, None, None, 8.30401, None, None, 7.80944, None, None)] + ["all", False, False, + (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["all", False, True, + (7.51879, None, 7.77141, None, 8.39265, None, 8.17443, None, + 8.30401, None, 7.80944, None)], + ["all", True, False, + (7.51879, None, 7.77141, None, 8.39265, None, 8.17443, None, + 8.30401, None, 7.80944, None)], + ["all", True, True, + (7.51879, None, None, 7.77141, None, None, 8.39265, None, None, + 8.17443, None, None, 8.30401, None, None, 7.80944, None, None)] ]: with self.subTest(dtype=dtype, vflag=vflag, nflag=nflag): self.assertEqual( @@ -52,3 +92,52 @@ class TestHeatmap(TestCase): trait_data, strainlist, dtype=dtype, var_exists=vflag, n_exists=nflag), expected) + + def test_cluster_traits(self): + """ + Test that the clustering is working as expected. + """ + traits_data_list = [ + (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944), + (6.1427, 6.50588, 7.73705, 6.68328, 7.49293, 7.27398), + (8.4211, 8.30581, 9.24076, 8.51173, 9.18455, 8.36077), + (10.0904, 10.6509, 9.36716, 9.91202, 8.57444, 10.5731), + (10.188, 9.76652, 9.54813, 9.05074, 9.52319, 9.10505), + (6.74676, 7.01029, 7.54169, 6.48574, 7.01427, 7.26815), + (6.39359, 6.85321, 5.78337, 7.11141, 6.22101, 6.16544), + (6.84118, 7.08432, 7.59844, 7.08229, 7.26774, 7.24991), + (9.45215, 10.6943, 8.64719, 10.1592, 7.75044, 8.78615), + (7.04737, 6.87185, 7.58586, 6.92456, 6.84243, 7.36913)] + self.assertEqual( + cluster_traits(traits_data_list), + ((0.0, 0.20337048635536847, 0.16381088984330505, 1.7388553629398245, + 1.5025235756329178, 0.6952839500255574, 1.271661230252733, + 0.2100487290977544, 1.4699690641062024, 0.7934461515867415), + (0.20337048635536847, 0.0, 0.2198321044997198, 1.5753041735592204, + 1.4815755944537086, 0.26087293140686374, 1.6939790104301427, + 0.06024619831474998, 1.7430082449189215, 0.4497104244247795), + (0.16381088984330505, 0.2198321044997198, 0.0, 1.9073926868549234, + 1.0396738891139845, 0.5278328671176757, 1.6275069061182947, + 0.2636503792482082, 1.739617877037615, 0.7127042590637039), + (1.7388553629398245, 1.5753041735592204, 1.9073926868549234, 0.0, + 0.9936846292920328, 1.1169999189889366, 0.6007483980555253, + 1.430209221053372, 0.25879514152086425, 0.9313185954797953), + (1.5025235756329178, 1.4815755944537086, 1.0396738891139845, + 0.9936846292920328, 0.0, 1.027827186339337, 1.1441743109173244, + 1.4122477962364253, 0.8968250491499363, 1.1683723389247052), + (0.6952839500255574, 0.26087293140686374, 0.5278328671176757, + 1.1169999189889366, 1.027827186339337, 0.0, 1.8420471110023269, + 0.19179284676938602, 1.4875072385631605, 0.23451785425383564), + (1.271661230252733, 1.6939790104301427, 1.6275069061182947, + 0.6007483980555253, 1.1441743109173244, 1.8420471110023269, 0.0, + 1.6540234785929928, 0.2140799896286565, 1.7413442197913358), + (0.2100487290977544, 0.06024619831474998, 0.2636503792482082, + 1.430209221053372, 1.4122477962364253, 0.19179284676938602, + 1.6540234785929928, 0.0, 1.5225640692832796, 0.33370067057028485), + (1.4699690641062024, 1.7430082449189215, 1.739617877037615, + 0.25879514152086425, 0.8968250491499363, 1.4875072385631605, + 0.2140799896286565, 1.5225640692832796, 0.0, 1.3256191648260216), + (0.7934461515867415, 0.4497104244247795, 0.7127042590637039, + 0.9313185954797953, 1.1683723389247052, 0.23451785425383564, + 1.7413442197913358, 0.33370067057028485, 1.3256191648260216, + 0.0))) -- cgit v1.2.3 From ded960e3d32e4d7ebe590deda27fc47175be73d9 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Fri, 20 Aug 2021 13:21:31 +0300 Subject: Add tests for ordering and implement function Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/computations/heatmap.py: implement new ordering function * tests/unit/computations/test_heatmap.py: add new tests Implement the ordering function to migrate the setup of the `neworder` variable from GN1 to GN3. This migration is incomplete, since there is dependence on the return from the `web.webqtl.heatmap.Heatmap.draw` function in form of the `d_1` variable in some of the paths. The thing is, this `d_1` variable, and the `xoffset` variable seem to be used for laying out things on the drawn heatmap, and might actually end up not being needed for the new system using plotly, which has other ways of laying out things on the drawing. For now though, this commit "shims" the presence of these values until when the use of these variables is confirmed as present or absent in the new GN3 system. --- gn3/computations/heatmap.py | 28 ++++++++++++++++++++++++++++ tests/unit/computations/test_heatmap.py | 25 ++++++++++++++++++++++++- 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py index 3c35029..1c86261 100644 --- a/gn3/computations/heatmap.py +++ b/gn3/computations/heatmap.py @@ -175,3 +175,31 @@ def heatmap_data(formd, search_result, conn: Any): "traits_list": traits_list, "traits_data_list": traits_data_list } + +def compute_heatmap_order( + slink_data, xoffset: int = 40, neworder: tuple = tuple()): + """ + Compute the data used for drawing the heatmap proper from `slink_data`. + + This function tries to reproduce the creation and update of the `neworder` + variable in + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L120 + and in the `web.webqtl.heatmap.Heatmap.draw` function in GN1 + """ + d_1 = (0, 0, 0) # returned from self.draw in lines 391 and 399. This is just a placeholder + + def __order_maker(norder, slnk_dt): + print("norder:{}, slnk_dt:{}".format(norder, slnk_dt)) + if isinstance(slnk_dt[0], int) and isinstance(slnk_dt[1], int): + return norder + ( + (xoffset+20, slnk_dt[0]), (xoffset + 40, slnk_dt[1])) + + if isinstance(slnk_dt[0], int): + return norder + ((xoffset + 20, slnk_dt[0]), ) + + if isinstance(slnk_dt[1], int): + return norder + ((xoffset + d_1[0] + 20, slnk_dt[1]), ) + + return __order_maker(__order_maker(norder, slnk_dt[0]), slnk_dt[1]) + + return __order_maker(neworder, slink_data) diff --git a/tests/unit/computations/test_heatmap.py b/tests/unit/computations/test_heatmap.py index 650cb45..14807bb 100644 --- a/tests/unit/computations/test_heatmap.py +++ b/tests/unit/computations/test_heatmap.py @@ -1,6 +1,9 @@ """Module contains tests for gn3.computations.heatmap""" from unittest import TestCase -from gn3.computations.heatmap import cluster_traits, export_trait_data +from gn3.computations.heatmap import ( + cluster_traits, + export_trait_data, + compute_heatmap_order) strainlist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] trait_data = { @@ -34,6 +37,16 @@ trait_data = { "C57BL/6J": {"strain_name": "C57BL/6J", "value": 7.50606, "variance": None, "ndata": None}, "DBA/2J": {"strain_name": "DBA/2J", "value": 7.72588, "variance": None, "ndata": None}}} +slinked = ( + (((0, 2, 0.16381088984330505), + ((1, 7, 0.06024619831474998), 5, 0.19179284676938602), + 0.20337048635536847), + 9, + 0.23451785425383564), + ((3, (6, 8, 0.2140799896286565), 0.25879514152086425), + 4, 0.8968250491499363), + 0.9313185954797953) + class TestHeatmap(TestCase): """Class for testing heatmap computation functions""" @@ -141,3 +154,13 @@ class TestHeatmap(TestCase): 0.9313185954797953, 1.1683723389247052, 0.23451785425383564, 1.7413442197913358, 0.33370067057028485, 1.3256191648260216, 0.0))) + + def test_compute_heatmap_order(self): + """Test the orders.""" + for xoff, expected in [ + (40, ((60, 9), (60, 4))), + (30, ((50, 9), (50, 4))), + (20, ((40, 9), (40, 4)))]: + with self.subTest(xoffset=xoff): + self.assertEqual( + compute_heatmap_order(slinked, xoffset=xoff), expected) -- cgit v1.2.3 From 8b2c776771d2a70613a1e31d6e6671b612cfbafc Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Fri, 20 Aug 2021 14:10:45 +0300 Subject: Retrieve the strains with valid values Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/computations/heatmap.py: add function to get strains with values * tests/unit/computations/test_heatmap.py: new tests Add function to get the strains whose values are not `None` from the `trait_data` object passed in. This migrates https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L215-221 into a separate function that can handle that and be tested independently of any other code. --- gn3/computations/heatmap.py | 19 +++++++++++++++++++ tests/unit/computations/test_heatmap.py | 14 +++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py index 1c86261..5a3c619 100644 --- a/gn3/computations/heatmap.py +++ b/gn3/computations/heatmap.py @@ -203,3 +203,22 @@ def compute_heatmap_order( return __order_maker(__order_maker(norder, slnk_dt[0]), slnk_dt[1]) return __order_maker(neworder, slink_data) + +def retrieve_strains_and_values(strainlist, trait_data): + """ + Get the strains and their corresponding values from `strainlist` and + `trait_data`. + + This migrates the code in + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L215-221 + """ + def __strains_and_values(acc, i): + if trait_data[i] is None: + return acc + if len(acc) == 0: + return ((strainlist[i], ), (trait_data[i], )) + _strains = acc[0] + _vals = acc[1] + return (_strains + (strainlist[i], ), _vals + (trait_data[i], )) + return reduce( + __strains_and_values, range(len(strainlist)), (tuple(), tuple())) diff --git a/tests/unit/computations/test_heatmap.py b/tests/unit/computations/test_heatmap.py index 14807bb..686288d 100644 --- a/tests/unit/computations/test_heatmap.py +++ b/tests/unit/computations/test_heatmap.py @@ -3,7 +3,8 @@ from unittest import TestCase from gn3.computations.heatmap import ( cluster_traits, export_trait_data, - compute_heatmap_order) + compute_heatmap_order, + retrieve_strains_and_values) strainlist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] trait_data = { @@ -164,3 +165,14 @@ class TestHeatmap(TestCase): with self.subTest(xoffset=xoff): self.assertEqual( compute_heatmap_order(slinked, xoffset=xoff), expected) + + def test_retrieve_strains_and_values(self): + """Test retrieval of strains and values.""" + for slist, tdata, expected in [ + [["s1", "s2", "s3", "s4"], [9, None, 5, 4], + (("s1", "s3", "s4"), (9, 5, 4))], + [["s1", "s2", "s3", "s4", "s5"], [6, None, None, 4, None], + (("s1", "s4"), (6, 4))]]: + with self.subTest(strainlist=slist, traitdata=tdata): + self.assertEqual( + retrieve_strains_and_values(slist, tdata), expected) -- cgit v1.2.3 From 96af4e9e32ed167a8d70cf7761b709b1a37bb344 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Fri, 20 Aug 2021 14:14:12 +0300 Subject: Fix typing issue(s) caught by mypy Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/computations/heatmap.py: Use `Sequence` type not `Iterator` type --- gn3/computations/heatmap.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py index 5a3c619..c9c2b8a 100644 --- a/gn3/computations/heatmap.py +++ b/gn3/computations/heatmap.py @@ -156,8 +156,8 @@ def heatmap_data(formd, search_result, conn: Any): traits_details = [ __retrieve_traitlist_and_datalist(threshold, fullname) for fullname in search_result] - traits_list = map(lambda x: x[0], traits_details) - traits_data_list = map(lambda x: x[1], traits_details) + traits_list = tuple(x[0] for x in traits_details) + traits_data_list = tuple(x[1] for x in traits_details) return { "target_description_checked": formd.formdata.getvalue( -- cgit v1.2.3 From 7aa5f5422908b4dbfc80f3f73b008507878a34aa Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Thu, 26 Aug 2021 07:32:37 +0300 Subject: Add rust-qtlreaper Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * guix.scm: new dependency (rust-qtlreaper) --- guix.scm | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/guix.scm b/guix.scm index 729d089..868bd74 100644 --- a/guix.scm +++ b/guix.scm @@ -44,6 +44,7 @@ (gnu packages statistics) (gnu packages bioconductor) (gn packages golang) + (gn packages genenetwork) (gnu packages python) (gnu packages python-check) (gnu packages python-crypto) @@ -104,7 +105,8 @@ ("r-qtl" ,r-qtl) ("r-stringi" ,r-stringi) ("python-plotly" ,python-plotly) - ("python-pandas" ,python-pandas))) + ("python-pandas" ,python-pandas) + ("rust-qtlreaper" ,rust-qtlreaper))) (build-system python-build-system) (home-page "https://github.com/genenetwork/genenetwork3") (synopsis "GeneNetwork3 API for data science and machine learning.") -- cgit v1.2.3 From be4445e91f4a752ef7bbb99ed7d813c5fc88f467 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Thu, 26 Aug 2021 07:33:44 +0300 Subject: Update imported module name Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * guix.scm: (gn packages golang) ==> (gnu packages golang) csvdiff has moved to upstream guix and been removed from latest guix-bioinformatics. --- guix.scm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/guix.scm b/guix.scm index 868bd74..8e1cf79 100644 --- a/guix.scm +++ b/guix.scm @@ -43,7 +43,7 @@ (gnu packages databases) (gnu packages statistics) (gnu packages bioconductor) - (gn packages golang) + (gnu packages golang) (gn packages genenetwork) (gnu packages python) (gnu packages python-check) -- cgit v1.2.3 From 557e482c88ba3d44ae7d278b7222f37fa043b4d0 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Fri, 27 Aug 2021 15:47:52 +0300 Subject: Rework strains and trait values retrieval Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Rework the strains and values retrieval function to more closely correspond to the working of the original code in GN1 --- gn3/computations/heatmap.py | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py index c9c2b8a..da13ceb 100644 --- a/gn3/computations/heatmap.py +++ b/gn3/computations/heatmap.py @@ -204,21 +204,28 @@ def compute_heatmap_order( return __order_maker(neworder, slink_data) -def retrieve_strains_and_values(strainlist, trait_data): +def retrieve_strains_and_values(orders, strainlist, traits_data_list): """ Get the strains and their corresponding values from `strainlist` and - `trait_data`. + `traits_data_list`. This migrates the code in https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L215-221 """ - def __strains_and_values(acc, i): - if trait_data[i] is None: - return acc - if len(acc) == 0: - return ((strainlist[i], ), (trait_data[i], )) - _strains = acc[0] - _vals = acc[1] - return (_strains + (strainlist[i], ), _vals + (trait_data[i], )) - return reduce( - __strains_and_values, range(len(strainlist)), (tuple(), tuple())) + # This feels nasty! There's a lot of mutation of values here, that might + # indicate something untoward in the design of this function and its + # dependents ==> Review + strains = [] + values = [] + rets = [] + for order in orders: + temp_val = traits_data_list[order[1]] + for i in range(len(strainlist)): + if temp_val[i] != None: + strains.append(strainlist[i]) + values.append(temp_val[i]) + rets.append([order, strains[:], values[:]]) + strains = [] + values = [] + + return rets -- cgit v1.2.3 From 1a3901b174d00af8fa7f5ae78b810de66024b5ab Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Fri, 27 Aug 2021 15:49:53 +0300 Subject: Export trait data to file Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Provide a function to export the given strains and traits data into a traits file for use with `rust-qtlreaper`. --- gn3/computations/heatmap.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py index da13ceb..2f92048 100644 --- a/gn3/computations/heatmap.py +++ b/gn3/computations/heatmap.py @@ -229,3 +229,11 @@ def retrieve_strains_and_values(orders, strainlist, traits_data_list): values = [] return rets + +def generate_traits_file(strains, trait_values, traits_filename): + header = "Traits\t{}\n".format("\t".join(strains)) + data = [header] + [ + "T{}\t{}\n".format(i+1, "\t".join([str(i) for i in t])) + for i,t in enumerate(trait_values)] + with open(traits_filename, "w") as outfile: + outfile.writelines(data) -- cgit v1.2.3 From 28fde00ee2835d404157652548a4265be3accede Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Fri, 27 Aug 2021 15:51:27 +0300 Subject: Provide intermediate data in final results Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Seeing as not every requirement/feature has been migrated over at this time, this commit just provides all the intermediate data representations in the final return of the function for later use down the line. --- gn3/computations/heatmap.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py index 2f92048..3e96ed2 100644 --- a/gn3/computations/heatmap.py +++ b/gn3/computations/heatmap.py @@ -149,22 +149,22 @@ def heatmap_data(formd, search_result, conn: Any): def __retrieve_traitlist_and_datalist(threshold, fullname): trait = retrieve_trait_info(threshold, fullname, conn) - return ( - trait, - export_trait_data(retrieve_trait_data(trait, conn), strainlist)) + return (trait, retrieve_trait_data(trait, conn)) traits_details = [ __retrieve_traitlist_and_datalist(threshold, fullname) for fullname in search_result] traits_list = tuple(x[0] for x in traits_details) - traits_data_list = tuple(x[1] for x in traits_details) + traits_data_list = [x[1] for x in traits_details] + exported_traits_data_list = tuple( + export_trait_data(td, strainlist) for x in traits_data_list) return { "target_description_checked": formd.formdata.getvalue( "targetDescriptionCheck", ""), "cluster_checked": cluster_checked, "slink_data": ( - slink(cluster_traits(traits_data_list)) + slink(cluster_traits(exported_traits_data_list)) if cluster_checked else False), "sessionfile": formd.formdata.getvalue("session"), "genotype": genotype, @@ -173,7 +173,8 @@ def heatmap_data(formd, search_result, conn: Any): "ppolar": formd.ppolar, "mpolar":formd.mpolar, "traits_list": traits_list, - "traits_data_list": traits_data_list + "traits_data_list": traits_data_list, + "exported_traits_data_list": exported_traits_data_list } def compute_heatmap_order( -- cgit v1.2.3 From a9fa309f4017d84cd30f6df90376042f20b1836b Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Fri, 27 Aug 2021 15:55:41 +0300 Subject: Test out generation of traits file Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * As part of the development effort, this commit provides a proof-of-concept as a reference for generating the traits data file. It might be useful for verifying that the functions that are called are working as is expected. --- qtlfilesexport.py | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 qtlfilesexport.py diff --git a/qtlfilesexport.py b/qtlfilesexport.py new file mode 100644 index 0000000..2e7c9c2 --- /dev/null +++ b/qtlfilesexport.py @@ -0,0 +1,67 @@ +""" +Test the qtlfiles export of traits files + +Run with: + + env SQL_URI="mysql://:@:/db_webqtl" python3 qtlfilesexport.py + +replacing the variables in the angled brackets with the appropriate values +""" +import random +import string +from gn3.computations.slink import slink +from gn3.db_utils import database_connector +from gn3.computations.heatmap import export_trait_data +from gn3.db.traits import retrieve_trait_data, retrieve_trait_info +from gn3.computations.heatmap import ( + cluster_traits, + compute_heatmap_order, + generate_traits_file, + retrieve_strains_and_values) + +TMPDIR = "tmp/qtltests" + +def trait_fullnames(): + """Return sample names for traits""" + return [ + "UCLA_BXDBXH_CARTILAGE_V2::ILM103710672", + "UCLA_BXDBXH_CARTILAGE_V2::ILM2260338", + "UCLA_BXDBXH_CARTILAGE_V2::ILM3140576", + "UCLA_BXDBXH_CARTILAGE_V2::ILM5670577", + "UCLA_BXDBXH_CARTILAGE_V2::ILM2070121", + "UCLA_BXDBXH_CARTILAGE_V2::ILM103990541", + "UCLA_BXDBXH_CARTILAGE_V2::ILM1190722", + "UCLA_BXDBXH_CARTILAGE_V2::ILM6590722", + "UCLA_BXDBXH_CARTILAGE_V2::ILM4200064", + "UCLA_BXDBXH_CARTILAGE_V2::ILM3140463"] + +def random_string(length): + return "".join( + random.choices( + string.ascii_letters + string.digits, k=length)) + +def main(): + """entrypoint function""" + conn = database_connector()[0] + threshold = 0 + traits = [ + retrieve_trait_info(threshold, fullname, conn) + for fullname in trait_fullnames()] + traits_data_list = [retrieve_trait_data(t, conn) for t in traits] + strains = list(set([k for td in traits_data_list for k in td["data"].keys()])) + exported_traits_data_list = [ + export_trait_data(td, strains) for td in traits_data_list] + slinked = slink(cluster_traits(exported_traits_data_list)) + orders = compute_heatmap_order(slinked) + strains_and_values = retrieve_strains_and_values( + orders, strains, exported_traits_data_list) + strains_values = strains_and_values[0][1] + strains_values2 = strains_and_values[1][1] + trait_values = [t[2] for t in strains_and_values] + traits_filename = "{}/traits_test_file_{}.txt".format( + TMPDIR, random_string(10)) + generate_traits_file(strains_values, trait_values, traits_filename) + print("Generated file: {}".format(traits_filename)) + +if __name__ == "__main__": + main() -- cgit v1.2.3 From c045122908d36bba4ca197f3f67e89d80958f38f Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Mon, 30 Aug 2021 05:23:22 +0300 Subject: Document acquired knowledge on `rust-qtlreaper` Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/heatmaps/heatmaps.py: document format of the traits file To assist future developers, and development of the system, this commit documents some of the hard-won knowledge about the operation of the system to ease future development of the system. The documentation, if good, might also help with future onboarding of new developers to the system. --- README.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/README.md b/README.md index c3a9848..0e0e509 100644 --- a/README.md +++ b/README.md @@ -120,3 +120,24 @@ guix. To freeze dependencies: pip freeze --path venv/lib/python3.8/site-packages > requirements.txt ``` + +## QTLReaper (rust-qtlreaper) and Trait Files + +To run QTL computations, this system makes use of the [rust-qtlreaper](https://github.com/chfi/rust-qtlreaper.git) utility. + +To do this, the system needs to export the trait data into a tab-separated file, that can then be passed to the utility using the `--traits` option. For more information about the available options, please [see the rust-qtlreaper](https://github.com/chfi/rust-qtlreaper.git) repository. + +### Traits File Format + +The traits file begins with a header row/line with the column headers. The first column in the file has the header **"Trait"**. Every other column has a header for one of the strains in consideration. + +Under the **"Trait"** column, the traits are numbered from **T1** to **T** where **** is the count of the total number of traits in consideration. + +As an example, you could end up with a trait file like the following: + +```txt +Traits BXD27 BXD32 DBA/2J BXD21 ... +T1 10.5735 9.27408 9.48255 9.18253 ... +T2 6.4471 6.7191 5.98015 6.68051 ... +... +``` -- cgit v1.2.3 From 983acfdfc523677b4d7501287a000b7fd52a2c39 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Mon, 30 Aug 2021 07:00:38 +0300 Subject: Implement module for interfacing with rust-qtlreaper Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/computations/heatmap.py: move `generate_traits_file` function to new module * gn3/computations/qtlreaper.py: new module to interface with the `rust-qtlreaper` utility. * gn3/settings.py: Provide setting for the path to the `rust-qtlreaper` utility * qtlfilesexport.py: Move `random_string` function to new module. Update to use functions in new module. Provide a module with functions to be used to interface with `rust-qtlreaper`. This module essentially contains all the functions that are needed to build the files needed for, and to run the qtlreaper utility. --- gn3/computations/heatmap.py | 8 ---- gn3/computations/qtlreaper.py | 88 +++++++++++++++++++++++++++++++++++++++++++ gn3/settings.py | 3 ++ qtlfilesexport.py | 10 +---- 4 files changed, 92 insertions(+), 17 deletions(-) create mode 100644 gn3/computations/qtlreaper.py diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py index 3e96ed2..dcd64b1 100644 --- a/gn3/computations/heatmap.py +++ b/gn3/computations/heatmap.py @@ -230,11 +230,3 @@ def retrieve_strains_and_values(orders, strainlist, traits_data_list): values = [] return rets - -def generate_traits_file(strains, trait_values, traits_filename): - header = "Traits\t{}\n".format("\t".join(strains)) - data = [header] + [ - "T{}\t{}\n".format(i+1, "\t".join([str(i) for i in t])) - for i,t in enumerate(trait_values)] - with open(traits_filename, "w") as outfile: - outfile.writelines(data) diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py new file mode 100644 index 0000000..49d363b --- /dev/null +++ b/gn3/computations/qtlreaper.py @@ -0,0 +1,88 @@ +""" +This module contains functions to interact with the `qtlreaper` utility for +computation of QTLs. +""" +import os +import random +import string +import subprocess +from gn3.settings import TMPDIR, REAPER_COMMAND + +def random_string(length): + """Generate a random string of length `length`.""" + return "".join( + random.choices( + string.ascii_letters + string.digits, k=length)) + +def generate_traits_file(strains, trait_values, traits_filename): + """ + Generate a traits file for use with `qtlreaper`. + + PARAMETERS: + strains: A list of strains to use as the headers for the various columns. + trait_values: A list of lists of values for each trait and strain. + traits_filename: The tab-separated value to put the values in for + computation of QTLs. + """ + header = "Traits\t{}\n".format("\t".join(strains)) + data = [header] + [ + "T{}\t{}\n".format(i+1, "\t".join([str(i) for i in t])) + for i, t in enumerate(trait_values)] + with open(traits_filename, "w") as outfile: + outfile.writelines(data) + +def create_output_directory(path: str): + """Create the output directory at `path` if it does not exist.""" + try: + os.mkdir(path) + except OSError: + pass + +def run_reaper( + genotype_filename: str, traits_filename: str, + other_options: tuple = ("--n_permutations", 1000), + separate_nperm_output: bool = False, + output_dir: str = TMPDIR): + """ + Run the QTLReaper command to compute the QTLs. + + PARAMETERS: + genotype_filename: The complete path to a genotype file to use in the QTL + computation. + traits_filename: A path to a file previously generated with the + `generate_traits_file` function in this module, to be used in the QTL + computation. + other_options: Other options to pass to the `qtlreaper` command to modify + the QTL computations. + separate_nperm_output: A flag indicating whether or not to provide a + separate output for the permutations computation. The default is False, + which means by default, no separate output file is created. + output_dir: A path to the directory where the outputs are put + + RETURNS: + The function returns a tuple of the main output file, and the output file + for the permutation computations. If the `separate_nperm_output` is `False`, + the second value in the tuple returned is `None`. + + RAISES: + The function will raise a `subprocess.CalledProcessError` exception in case + of any errors running the `qtlreaper` command. + """ + create_output_directory(output_dir) + output_filename = "{}/qtlreaper/main_output_{}.txt".format( + output_dir, random_string(10)) + output_list = ["--main_output", output_filename] + if separate_nperm_output: + permu_output_filename = "{}/qtlreaper/permu_output_{}.txt".format( + output_dir, random_string(10)) + output_list = output_list + ["--permu_output", permu_output_filename] + else: + permu_output_filename = None + + command_list = [ + REAPER_COMMAND, "--geno", genotype_filename, + *other_options, # this splices the `other_options` list here + "--traits", traits_filename, "--main_output", output_filename] + + subprocess.run(command_list, check=True) + return (output_filename, permu_output_filename) diff --git a/gn3/settings.py b/gn3/settings.py index f4866d5..d137370 100644 --- a/gn3/settings.py +++ b/gn3/settings.py @@ -24,3 +24,6 @@ GN2_BASE_URL = "http://www.genenetwork.org/" # biweight script BIWEIGHT_RSCRIPT = "~/genenetwork3/scripts/calculate_biweight.R" + +# qtlreaper command +REAPER_COMMAND = "{}/bin/qtlreaper".format(os.environ.get("GUIX_ENVIRONMENT")) diff --git a/qtlfilesexport.py b/qtlfilesexport.py index 2e7c9c2..0543dc9 100644 --- a/qtlfilesexport.py +++ b/qtlfilesexport.py @@ -7,16 +7,14 @@ Run with: replacing the variables in the angled brackets with the appropriate values """ -import random -import string from gn3.computations.slink import slink from gn3.db_utils import database_connector from gn3.computations.heatmap import export_trait_data from gn3.db.traits import retrieve_trait_data, retrieve_trait_info +from gn3.computations.qtlreaper import random_string, generate_traits_file from gn3.computations.heatmap import ( cluster_traits, compute_heatmap_order, - generate_traits_file, retrieve_strains_and_values) TMPDIR = "tmp/qtltests" @@ -35,11 +33,6 @@ def trait_fullnames(): "UCLA_BXDBXH_CARTILAGE_V2::ILM4200064", "UCLA_BXDBXH_CARTILAGE_V2::ILM3140463"] -def random_string(length): - return "".join( - random.choices( - string.ascii_letters + string.digits, k=length)) - def main(): """entrypoint function""" conn = database_connector()[0] @@ -56,7 +49,6 @@ def main(): strains_and_values = retrieve_strains_and_values( orders, strains, exported_traits_data_list) strains_values = strains_and_values[0][1] - strains_values2 = strains_and_values[1][1] trait_values = [t[2] for t in strains_and_values] traits_filename = "{}/traits_test_file_{}.txt".format( TMPDIR, random_string(10)) -- cgit v1.2.3 From b95ad3bd2ce8bc22d1dcadefdf76c43f28309984 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Mon, 30 Aug 2021 07:05:49 +0300 Subject: Fix some linting errors and minor bugs. --- gn3/computations/heatmap.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py index dcd64b1..e0ff05b 100644 --- a/gn3/computations/heatmap.py +++ b/gn3/computations/heatmap.py @@ -157,7 +157,7 @@ def heatmap_data(formd, search_result, conn: Any): traits_list = tuple(x[0] for x in traits_details) traits_data_list = [x[1] for x in traits_details] exported_traits_data_list = tuple( - export_trait_data(td, strainlist) for x in traits_data_list) + export_trait_data(td, strainlist) for td in traits_data_list) return { "target_description_checked": formd.formdata.getvalue( @@ -190,7 +190,6 @@ def compute_heatmap_order( d_1 = (0, 0, 0) # returned from self.draw in lines 391 and 399. This is just a placeholder def __order_maker(norder, slnk_dt): - print("norder:{}, slnk_dt:{}".format(norder, slnk_dt)) if isinstance(slnk_dt[0], int) and isinstance(slnk_dt[1], int): return norder + ( (xoffset+20, slnk_dt[0]), (xoffset + 40, slnk_dt[1])) @@ -221,9 +220,9 @@ def retrieve_strains_and_values(orders, strainlist, traits_data_list): rets = [] for order in orders: temp_val = traits_data_list[order[1]] - for i in range(len(strainlist)): - if temp_val[i] != None: - strains.append(strainlist[i]) + for i, strain in enumerate(strainlist): + if temp_val[i] is not None: + strains.append(strain) values.append(temp_val[i]) rets.append([order, strains[:], values[:]]) strains = [] -- cgit v1.2.3 From bb1fd69fa24cec4ff605450d241601b3f0ced8cb Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Mon, 30 Aug 2021 09:50:44 +0300 Subject: Remove empty line Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Remove empty line at the end of the traits file --- gn3/computations/qtlreaper.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index 49d363b..a88659e 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -27,7 +27,9 @@ def generate_traits_file(strains, trait_values, traits_filename): header = "Traits\t{}\n".format("\t".join(strains)) data = [header] + [ "T{}\t{}\n".format(i+1, "\t".join([str(i) for i in t])) - for i, t in enumerate(trait_values)] + for i, t in enumerate(trait_values[:-1])] + [ + "T{}\t{}".format(len(trait_values), "\t".join([str(i) for i in t])) + for t in trait_values[-1:]] with open(traits_filename, "w") as outfile: outfile.writelines(data) -- cgit v1.2.3 From 58f59b8f7df82969b58a604070aec095d17e0501 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Mon, 30 Aug 2021 11:44:37 +0300 Subject: Fix issues with traits file format * README.md: update header: Traits ==> Trait * gn3/computations/qtlreaper.py: update header: Traits ==> Trait * qtlfilesexport.py: Choose only BXD strains Rename the first column header from "Traits" to "Trait" to correspond with what `rust-qtlreaper` expects. Choose only the BXD strains for the proof-of-concept example - this helped bring out the fact that the traits file SHOULD NOT contain a strain column for a strain that does not exist in the genotype file in consideration. If the traits file has a strain column which does not exist in the genotype file, then `rust-qtlreaper` fails with a panic, since, from what I can tell, it tries to get a value from the genotype file for the non-existent strain, which results to a `None` type. Subsequent attempts at running an operation on the `None` type lead to the panic. --- README.md | 4 +++- gn3/computations/qtlreaper.py | 2 +- qtlfilesexport.py | 31 ++++++++++++++++++++++++++++++- 3 files changed, 34 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 0e0e509..b54015f 100644 --- a/README.md +++ b/README.md @@ -136,8 +136,10 @@ Under the **"Trait"** column, the traits are numbered from **T1** to **T** wh As an example, you could end up with a trait file like the following: ```txt -Traits BXD27 BXD32 DBA/2J BXD21 ... +Trait BXD27 BXD32 DBA/2J BXD21 ... T1 10.5735 9.27408 9.48255 9.18253 ... T2 6.4471 6.7191 5.98015 6.68051 ... ... ``` + +It is very important that the column header names for the strains correspond to the genotype file used. diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index a88659e..9b13a55 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -24,7 +24,7 @@ def generate_traits_file(strains, trait_values, traits_filename): traits_filename: The tab-separated value to put the values in for computation of QTLs. """ - header = "Traits\t{}\n".format("\t".join(strains)) + header = "Trait\t{}\n".format("\t".join(strains)) data = [header] + [ "T{}\t{}\n".format(i+1, "\t".join([str(i) for i in t])) for i, t in enumerate(trait_values[:-1])] + [ diff --git a/qtlfilesexport.py b/qtlfilesexport.py index 0543dc9..adc5e77 100644 --- a/qtlfilesexport.py +++ b/qtlfilesexport.py @@ -41,7 +41,36 @@ def main(): retrieve_trait_info(threshold, fullname, conn) for fullname in trait_fullnames()] traits_data_list = [retrieve_trait_data(t, conn) for t in traits] - strains = list(set([k for td in traits_data_list for k in td["data"].keys()])) + # strains = list(set([k for td in traits_data_list for k in td["data"].keys()])) + strains = [# Use only the strains in the BXD.geno genotype file + "BXD1", "BXD2", "BXD5", "BXD6", "BXD8", "BXD9", "BXD11", "BXD12", + "BXD13", "BXD14", "BXD15", "BXD16", "BXD18", "BXD19", "BXD20", "BXD21", + "BXD22", "BXD23", "BXD24", "BXD24a", "BXD25", "BXD27", "BXD28", "BXD29", + "BXD30", "BXD31", "BXD32", "BXD33", "BXD34", "BXD35", "BXD36", "BXD37", + "BXD38", "BXD39", "BXD40", "BXD41", "BXD42", "BXD43", "BXD44", "BXD45", + "BXD48", "BXD48a", "BXD49", "BXD50", "BXD51", "BXD52", "BXD53", "BXD54", + "BXD55", "BXD56", "BXD59", "BXD60", "BXD61", "BXD62", "BXD63", "BXD64", + "BXD65", "BXD65a", "BXD65b", "BXD66", "BXD67", "BXD68", "BXD69", + "BXD70", "BXD71", "BXD72", "BXD73", "BXD73a", "BXD73b", "BXD74", + "BXD75", "BXD76", "BXD77", "BXD78", "BXD79", "BXD81", "BXD83", "BXD84", + "BXD85", "BXD86", "BXD87", "BXD88", "BXD89", "BXD90", "BXD91", "BXD93", + "BXD94", "BXD95", "BXD98", "BXD99", "BXD100", "BXD101", "BXD102", + "BXD104", "BXD105", "BXD106", "BXD107", "BXD108", "BXD109", "BXD110", + "BXD111", "BXD112", "BXD113", "BXD114", "BXD115", "BXD116", "BXD117", + "BXD119", "BXD120", "BXD121", "BXD122", "BXD123", "BXD124", "BXD125", + "BXD126", "BXD127", "BXD128", "BXD128a", "BXD130", "BXD131", "BXD132", + "BXD133", "BXD134", "BXD135", "BXD136", "BXD137", "BXD138", "BXD139", + "BXD141", "BXD142", "BXD144", "BXD145", "BXD146", "BXD147", "BXD148", + "BXD149", "BXD150", "BXD151", "BXD152", "BXD153", "BXD154", "BXD155", + "BXD156", "BXD157", "BXD160", "BXD161", "BXD162", "BXD165", "BXD168", + "BXD169", "BXD170", "BXD171", "BXD172", "BXD173", "BXD174", "BXD175", + "BXD176", "BXD177", "BXD178", "BXD180", "BXD181", "BXD183", "BXD184", + "BXD186", "BXD187", "BXD188", "BXD189", "BXD190", "BXD191", "BXD192", + "BXD193", "BXD194", "BXD195", "BXD196", "BXD197", "BXD198", "BXD199", + "BXD200", "BXD201", "BXD202", "BXD203", "BXD204", "BXD205", "BXD206", + "BXD207", "BXD208", "BXD209", "BXD210", "BXD211", "BXD212", "BXD213", + "BXD214", "BXD215", "BXD216", "BXD217", "BXD218", "BXD219", "BXD220" + ] exported_traits_data_list = [ export_trait_data(td, strains) for td in traits_data_list] slinked = slink(cluster_traits(exported_traits_data_list)) -- cgit v1.2.3 From b8777bcfee70325263d5389367e3a93ec2842f69 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Mon, 30 Aug 2021 12:01:54 +0300 Subject: Update documentation on genotype files * Provide documentation on downloading and using the genotype files. --- README.md | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b54015f..61ca539 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ guix environment --load=guix.scm Also, make sure you have the [guix-bioinformatics](https://git.genenetwork.org/guix-bioinformatics/guix-bioinformatics) channel set up. ```bash -env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ ~/.config/guix/current/bin/guix environment --load=guix.scm +env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ ~/.config/guix/current/bin/guix environment --expose=$HOME/genotype_files/ --load=guix.scm python3 import redis ``` @@ -22,7 +22,7 @@ python3 Better run a proper container ``` -env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ ~/.config/guix/current/bin/guix environment -C --network --load=guix.scm +env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ ~/.config/guix/current/bin/guix environment -C --network --expose=$HOME/genotype_files/ --load=guix.scm ``` If you get a Guix error, such as `ice-9/boot-9.scm:1669:16: In procedure raise-exception: @@ -121,6 +121,19 @@ pip freeze --path venv/lib/python3.8/site-packages > requirements.txt ``` +## Genotype Files + +You can get the genotype files from http://ipfs.genenetwork.org/ipfs/QmXQy3DAUWJuYxubLHLkPMNCEVq1oV7844xWG2d1GSPFPL and save them on your host machine at, say `$HOME/genotype_files` with something like: + +```bash +$ mkdir -p $HOME/genotype_files +$ cd $HOME/genotype_files +$ yes | 7z x genotype_files.tar.7z +$ tar xf genotype_files.tar +``` + +The `genotype_files.tar.7z` file seems to only contain the **BXD.geno** genotype file. + ## QTLReaper (rust-qtlreaper) and Trait Files To run QTL computations, this system makes use of the [rust-qtlreaper](https://github.com/chfi/rust-qtlreaper.git) utility. -- cgit v1.2.3 From 6ab866183aeac8553fdcda9217e4445da2b4836b Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Tue, 31 Aug 2021 06:51:18 +0300 Subject: Provide utilities for genotype files Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/db/genotypes.py: New module * gn3/settings.py: Add new configuration variable * qtlfilesexport.py: Test out new code Add a module containing functions dealing with the genotype files. Add a configuration variable to point to the location of the genotype files. --- gn3/db/genotypes.py | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++ gn3/settings.py | 4 ++++ qtlfilesexport.py | 33 +++---------------------- 3 files changed, 76 insertions(+), 30 deletions(-) create mode 100644 gn3/db/genotypes.py diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py new file mode 100644 index 0000000..610ddde --- /dev/null +++ b/gn3/db/genotypes.py @@ -0,0 +1,69 @@ +"""Genotype utilities""" + +import os +import gzip +from gn3.settings import GENOTYPE_FILES + +def build_genotype_file( + geno_name: str, base_dir: str = GENOTYPE_FILES, + extension: str = "geno"): + """Build the absolute path for the genotype file.""" + return "{}/{}.{}".format(os.path.abspath(base_dir), geno_name, extension) + +def load_genotype_samples(genotype_filename: str, file_type: str = "geno"): + """ + Load sample of strains from genotype files. + + DESCRIPTION: + Traits can contain a varied number of strains, some of which do not exist in + certain genotypes. In order to compute QTLs, GEMMAs, etc, we need to ensure + to pick only those strains that exist in the genotype under consideration + for the traits used in the computation. + + This function loads a list of samples from the genotype files for use in + filtering out unusable strains. + + + PARAMETERS: + genotype_filename: The absolute path to the genotype file to load the + samples from. + file_type: The type of file. Currently supported values are 'geno' and + 'plink'. + """ + file_type_fns = { + "geno": __load_genotype_samples_from_geno, + "plink": __load_genotype_samples_from_plink + } + return file_type_fns[file_type](genotype_filename) + +def __load_genotype_samples_from_geno(genotype_filename: str): + """ + Helper function for `load_genotype_samples` function. + + Loads samples from '.geno' files. + """ + gzipped_filename = "{}.gz".format(genotype_filename) + if os.path.isfile(gzipped_filename): + genofile = gzip.open(gzipped_filename) + else: + genofile = open(genotype_filename) + + for row in genofile: + line = row.strip() + if (not line) or (line.startswith(("#", "@"))): + continue + break + + headers = line.split("\t") + if headers[3] == "Mb": + return headers[4:] + return headers[3:] + +def __load_genotype_samples_from_plink(genotype_filename: str): + """ + Helper function for `load_genotype_samples` function. + + Loads samples from '.plink' files. + """ + genofile = open(genotype_filename) + return [line.split(" ")[1] for line in genofile] diff --git a/gn3/settings.py b/gn3/settings.py index d137370..a08f846 100644 --- a/gn3/settings.py +++ b/gn3/settings.py @@ -27,3 +27,7 @@ BIWEIGHT_RSCRIPT = "~/genenetwork3/scripts/calculate_biweight.R" # qtlreaper command REAPER_COMMAND = "{}/bin/qtlreaper".format(os.environ.get("GUIX_ENVIRONMENT")) + +# genotype files +GENOTYPE_FILES = os.environ.get( + "GENOTYPE_FILES", "{}/genotype_files/genotype".format(os.environ.get("HOME"))) diff --git a/qtlfilesexport.py b/qtlfilesexport.py index adc5e77..1db4ab6 100644 --- a/qtlfilesexport.py +++ b/qtlfilesexport.py @@ -11,6 +11,7 @@ from gn3.computations.slink import slink from gn3.db_utils import database_connector from gn3.computations.heatmap import export_trait_data from gn3.db.traits import retrieve_trait_data, retrieve_trait_info +from gn3.db.genotypes import build_genotype_file, load_genotype_samples from gn3.computations.qtlreaper import random_string, generate_traits_file from gn3.computations.heatmap import ( cluster_traits, @@ -41,36 +42,8 @@ def main(): retrieve_trait_info(threshold, fullname, conn) for fullname in trait_fullnames()] traits_data_list = [retrieve_trait_data(t, conn) for t in traits] - # strains = list(set([k for td in traits_data_list for k in td["data"].keys()])) - strains = [# Use only the strains in the BXD.geno genotype file - "BXD1", "BXD2", "BXD5", "BXD6", "BXD8", "BXD9", "BXD11", "BXD12", - "BXD13", "BXD14", "BXD15", "BXD16", "BXD18", "BXD19", "BXD20", "BXD21", - "BXD22", "BXD23", "BXD24", "BXD24a", "BXD25", "BXD27", "BXD28", "BXD29", - "BXD30", "BXD31", "BXD32", "BXD33", "BXD34", "BXD35", "BXD36", "BXD37", - "BXD38", "BXD39", "BXD40", "BXD41", "BXD42", "BXD43", "BXD44", "BXD45", - "BXD48", "BXD48a", "BXD49", "BXD50", "BXD51", "BXD52", "BXD53", "BXD54", - "BXD55", "BXD56", "BXD59", "BXD60", "BXD61", "BXD62", "BXD63", "BXD64", - "BXD65", "BXD65a", "BXD65b", "BXD66", "BXD67", "BXD68", "BXD69", - "BXD70", "BXD71", "BXD72", "BXD73", "BXD73a", "BXD73b", "BXD74", - "BXD75", "BXD76", "BXD77", "BXD78", "BXD79", "BXD81", "BXD83", "BXD84", - "BXD85", "BXD86", "BXD87", "BXD88", "BXD89", "BXD90", "BXD91", "BXD93", - "BXD94", "BXD95", "BXD98", "BXD99", "BXD100", "BXD101", "BXD102", - "BXD104", "BXD105", "BXD106", "BXD107", "BXD108", "BXD109", "BXD110", - "BXD111", "BXD112", "BXD113", "BXD114", "BXD115", "BXD116", "BXD117", - "BXD119", "BXD120", "BXD121", "BXD122", "BXD123", "BXD124", "BXD125", - "BXD126", "BXD127", "BXD128", "BXD128a", "BXD130", "BXD131", "BXD132", - "BXD133", "BXD134", "BXD135", "BXD136", "BXD137", "BXD138", "BXD139", - "BXD141", "BXD142", "BXD144", "BXD145", "BXD146", "BXD147", "BXD148", - "BXD149", "BXD150", "BXD151", "BXD152", "BXD153", "BXD154", "BXD155", - "BXD156", "BXD157", "BXD160", "BXD161", "BXD162", "BXD165", "BXD168", - "BXD169", "BXD170", "BXD171", "BXD172", "BXD173", "BXD174", "BXD175", - "BXD176", "BXD177", "BXD178", "BXD180", "BXD181", "BXD183", "BXD184", - "BXD186", "BXD187", "BXD188", "BXD189", "BXD190", "BXD191", "BXD192", - "BXD193", "BXD194", "BXD195", "BXD196", "BXD197", "BXD198", "BXD199", - "BXD200", "BXD201", "BXD202", "BXD203", "BXD204", "BXD205", "BXD206", - "BXD207", "BXD208", "BXD209", "BXD210", "BXD211", "BXD212", "BXD213", - "BXD214", "BXD215", "BXD216", "BXD217", "BXD218", "BXD219", "BXD220" - ] + genotype_filename = build_genotype_file(traits[0]["riset"]) + strains = load_genotype_samples(genotype_filename) exported_traits_data_list = [ export_trait_data(td, strains) for td in traits_data_list] slinked = slink(cluster_traits(exported_traits_data_list)) -- cgit v1.2.3 From 6c872943597f3664cca77abbdf56f074fc5231e6 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Tue, 31 Aug 2021 06:56:35 +0300 Subject: Fix bugs with `run_reaper` function Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/computations/qtlreaper.py: Fix some bugs * qtlfilesexport.py: Test out running rust-qtlreaper Test out the qtlreaper interface code and fix some bugs caught in the process. --- gn3/computations/qtlreaper.py | 8 +++++--- qtlfilesexport.py | 7 +++++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index 9b13a55..c058e14 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -42,7 +42,7 @@ def create_output_directory(path: str): def run_reaper( genotype_filename: str, traits_filename: str, - other_options: tuple = ("--n_permutations", 1000), + other_options: tuple = ("--n_permutations", "1000"), separate_nperm_output: bool = False, output_dir: str = TMPDIR): """ @@ -70,7 +70,7 @@ def run_reaper( The function will raise a `subprocess.CalledProcessError` exception in case of any errors running the `qtlreaper` command. """ - create_output_directory(output_dir) + create_output_directory("{}/qtlreaper".format(output_dir)) output_filename = "{}/qtlreaper/main_output_{}.txt".format( output_dir, random_string(10)) output_list = ["--main_output", output_filename] @@ -84,7 +84,9 @@ def run_reaper( command_list = [ REAPER_COMMAND, "--geno", genotype_filename, *other_options, # this splices the `other_options` list here - "--traits", traits_filename, "--main_output", output_filename] + "--traits", traits_filename, + *output_list # this splices the `output_list` list here + ] subprocess.run(command_list, check=True) return (output_filename, permu_output_filename) diff --git a/qtlfilesexport.py b/qtlfilesexport.py index 1db4ab6..799de31 100644 --- a/qtlfilesexport.py +++ b/qtlfilesexport.py @@ -9,6 +9,7 @@ replacing the variables in the angled brackets with the appropriate values """ from gn3.computations.slink import slink from gn3.db_utils import database_connector +from gn3.computations.qtlreaper import run_reaper from gn3.computations.heatmap import export_trait_data from gn3.db.traits import retrieve_trait_data, retrieve_trait_info from gn3.db.genotypes import build_genotype_file, load_genotype_samples @@ -57,5 +58,11 @@ def main(): generate_traits_file(strains_values, trait_values, traits_filename) print("Generated file: {}".format(traits_filename)) + main_output, permutations_output = run_reaper( + genotype_filename, traits_filename, separate_nperm_output=True) + + print("Main output: {}, Permutation output: {}".format( + main_output, permutations_output)) + if __name__ == "__main__": main() -- cgit v1.2.3 From 64ce38b45839b6305b009f6e28b0f852409e9bda Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Tue, 31 Aug 2021 10:45:11 +0300 Subject: Parse QTLReaper outputs Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/computations/qtlreaper.py: pass output files * tests/unit/computations/data/qtlreaper/main_output_sample.txt: sample test data * tests/unit/computations/data/qtlreaper/permu_output_sample.txt: sample test data * tests/unit/computations/test_qtlreaper.py: add tests Add code to parse the QTLReaper output data files. --- gn3/computations/qtlreaper.py | 18 ++++++ .../data/qtlreaper/main_output_sample.txt | 11 ++++ .../data/qtlreaper/permu_output_sample.txt | 27 ++++++++ tests/unit/computations/test_qtlreaper.py | 74 ++++++++++++++++++++++ 4 files changed, 130 insertions(+) create mode 100644 tests/unit/computations/data/qtlreaper/main_output_sample.txt create mode 100644 tests/unit/computations/data/qtlreaper/permu_output_sample.txt create mode 100644 tests/unit/computations/test_qtlreaper.py diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index c058e14..3b8e4db 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -90,3 +90,21 @@ def run_reaper( subprocess.run(command_list, check=True) return (output_filename, permu_output_filename) + + +def parse_reaper_main_results(results_file): + with open(results_file, "r") as infile: + lines = infile.readlines() + + def __parse_line(line): + items = line.strip().split("\t") + return items[0:2] + [float(item) for item in items[2:]] + + header = lines[0].strip().split("\t") + return [dict(zip(header, __parse_line(line))) for line in lines[1:]] + +def parse_reaper_permutation_results(results_file): + with open(results_file, "r") as infile: + lines = infile.readlines() + + return [float(line.strip()) for line in lines] diff --git a/tests/unit/computations/data/qtlreaper/main_output_sample.txt b/tests/unit/computations/data/qtlreaper/main_output_sample.txt new file mode 100644 index 0000000..12b11b4 --- /dev/null +++ b/tests/unit/computations/data/qtlreaper/main_output_sample.txt @@ -0,0 +1,11 @@ +ID Locus Chr cM Mb LRS Additive pValue +T1 rs31443144 1 1.500 3.010 0.500 -0.074 1.000 +T1 rs6269442 1 1.500 3.492 0.500 -0.074 1.000 +T1 rs32285189 1 1.630 3.511 0.500 -0.074 1.000 +T1 rs258367496 1 1.630 3.660 0.500 -0.074 1.000 +T1 rs32430919 1 1.750 3.777 0.500 -0.074 1.000 +T1 rs36251697 1 1.880 3.812 0.500 -0.074 1.000 +T1 rs30658298 1 2.010 4.431 0.500 -0.074 1.000 +T1 rs51852623 1 2.010 4.447 0.500 -0.074 1.000 +T1 rs31879829 1 2.140 4.519 0.500 -0.074 1.000 +T1 rs36742481 1 2.140 4.776 0.500 -0.074 1.000 diff --git a/tests/unit/computations/data/qtlreaper/permu_output_sample.txt b/tests/unit/computations/data/qtlreaper/permu_output_sample.txt new file mode 100644 index 0000000..64cff07 --- /dev/null +++ b/tests/unit/computations/data/qtlreaper/permu_output_sample.txt @@ -0,0 +1,27 @@ +4.44174 +5.03825 +5.08167 +5.18119 +5.18578 +5.24563 +5.24619 +5.24619 +5.27961 +5.28228 +5.43903 +5.50188 +5.51694 +5.56830 +5.63874 +5.71346 +5.71936 +5.74275 +5.76764 +5.79815 +5.81671 +5.82775 +5.89659 +5.92117 +5.93396 +5.93396 +5.94957 diff --git a/tests/unit/computations/test_qtlreaper.py b/tests/unit/computations/test_qtlreaper.py new file mode 100644 index 0000000..ec23664 --- /dev/null +++ b/tests/unit/computations/test_qtlreaper.py @@ -0,0 +1,74 @@ +"""Module contains tests for gn3.computations.qtlreaper""" +import os +from unittest import TestCase +from gn3.computations.qtlreaper import ( + parse_reaper_main_results, parse_reaper_permutation_results) + +class TestQTLReaper(TestCase): + """Class for testing qtlreaper interface functions.""" + + def test_parse_reaper_main_results(self): + self.assertEqual( + parse_reaper_main_results( + "tests/unit/computations/data/qtlreaper/main_output_sample.txt"), + [ + { + "ID": "T1", "Locus": "rs31443144", "Chr": 1, "cM": 1.500, + "Mb": 3.010, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": "rs6269442", "Chr": 1, "cM": 1.500, + "Mb": 3.492, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": "rs32285189", "Chr": 1, "cM": 1.630, + "Mb": 3.511, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": "rs258367496", "Chr": 1, "cM": 1.630, + "Mb": 3.660, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": "rs32430919", "Chr": 1, "cM": 1.750, + "Mb": 3.777, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": "rs36251697", "Chr": 1, "cM": 1.880, + "Mb": 3.812, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": "rs30658298", "Chr": 1, "cM": 2.010, + "Mb": 4.431, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": "rs51852623", "Chr": 1, "cM": 2.010, + "Mb": 4.447, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": "rs31879829", "Chr": 1, "cM": 2.140, + "Mb": 4.519, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": "rs36742481", "Chr": 1, "cM": 2.140, + "Mb": 4.776, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + } + ]) + + def test_parse_reaper_permutation_results(self): + self.assertEqual( + parse_reaper_permutation_results( + "tests/unit/computations/data/qtlreaper/permu_output_sample.txt"), + [4.44174, 5.03825, 5.08167, 5.18119, 5.18578, 5.24563, 5.24619, + 5.24619, 5.27961, 5.28228, 5.43903, 5.50188, 5.51694, 5.56830, + 5.63874, 5.71346, 5.71936, 5.74275, 5.76764, 5.79815, 5.81671, + 5.82775, 5.89659, 5.92117, 5.93396, 5.93396, 5.94957]) -- cgit v1.2.3 From c3f8013347e3e8850c90cb787edb2bec1f367f7d Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Tue, 31 Aug 2021 10:48:30 +0300 Subject: Fix test Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * The number of the arguments to the function changed, and so the tests for the function needed to be updated. --- tests/unit/computations/test_heatmap.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/tests/unit/computations/test_heatmap.py b/tests/unit/computations/test_heatmap.py index 686288d..87f8e45 100644 --- a/tests/unit/computations/test_heatmap.py +++ b/tests/unit/computations/test_heatmap.py @@ -168,11 +168,25 @@ class TestHeatmap(TestCase): def test_retrieve_strains_and_values(self): """Test retrieval of strains and values.""" - for slist, tdata, expected in [ - [["s1", "s2", "s3", "s4"], [9, None, 5, 4], - (("s1", "s3", "s4"), (9, 5, 4))], - [["s1", "s2", "s3", "s4", "s5"], [6, None, None, 4, None], - (("s1", "s4"), (6, 4))]]: + for orders, slist, tdata, expected in [ + [ + [(60, 2)], + ["s1", "s2", "s3", "s4"], + [[2, 9, 6, None, 4], + [7, 5, None, None, 4], + [9, None, 5, 4, 7], + [6, None, None, 4, None]], + [[(60, 2), ["s1", "s3", "s4"], [9, 5, 4]]] + ], + [ + [(60, 3)], + ["s1", "s2", "s3", "s4", "s5"], + [[2, 9, 6, None, 4], + [7, 5, None, None, 4], + [9, None, 5, 4, 7], + [6, None, None, 4, None]], + [[(60, 3), ["s1", "s4"], [6, 4]]] + ]]: with self.subTest(strainlist=slist, traitdata=tdata): self.assertEqual( - retrieve_strains_and_values(slist, tdata), expected) + retrieve_strains_and_values(orders, slist, tdata), expected) -- cgit v1.2.3 From e441509a59c20a051fd5ab94710513f1968a5e02 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Tue, 31 Aug 2021 10:50:56 +0300 Subject: Update `heatmap_data` function: remove extraneous data Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/computations/heatmap.py: update function * gn3/db/traits.py: new function Remove extraneous data and arguments from the function. - Load the genotype file - Generate traits file - Provide both raw traits data, and exported traits data in return --- gn3/computations/heatmap.py | 42 ++++++++++++++++++++++-------------------- gn3/db/traits.py | 5 +++++ 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py index e0ff05b..92014cf 100644 --- a/gn3/computations/heatmap.py +++ b/gn3/computations/heatmap.py @@ -6,8 +6,12 @@ generate various kinds of heatmaps. from functools import reduce from typing import Any, Dict, Sequence from gn3.computations.slink import slink -from gn3.db.traits import retrieve_trait_data, retrieve_trait_info from gn3.computations.correlations2 import compute_correlation +from gn3.db.genotypes import build_genotype_file, load_genotype_samples +from gn3.db.traits import ( + retrieve_trait_data, + retrieve_trait_info, + generate_traits_filename) def export_trait_data( trait_data: dict, strainlist: Sequence[str], dtype: str = "val", @@ -125,7 +129,7 @@ def cluster_traits(traits_data_list: Sequence[Dict]): return tuple(__cluster(tdata_i) for tdata_i in enumerate(traits_data_list)) -def heatmap_data(formd, search_result, conn: Any): +def heatmap_data(traits_names, conn: Any): """ heatmap function @@ -142,39 +146,37 @@ def heatmap_data(formd, search_result, conn: Any): TODO: Elaborate on the parameters here... """ threshold = 0 # webqtlConfig.PUBLICTHRESH - cluster_checked = formd.formdata.getvalue("clusterCheck", "") - strainlist = [ - strain for strain in formd.strainlist if strain not in formd.parlist] - genotype = formd.genotype - def __retrieve_traitlist_and_datalist(threshold, fullname): trait = retrieve_trait_info(threshold, fullname, conn) return (trait, retrieve_trait_data(trait, conn)) traits_details = [ __retrieve_traitlist_and_datalist(threshold, fullname) - for fullname in search_result] + for fullname in traits_names] traits_list = tuple(x[0] for x in traits_details) traits_data_list = [x[1] for x in traits_details] exported_traits_data_list = tuple( export_trait_data(td, strainlist) for td in traits_data_list) + genotype_filename = build_genotype_file(traits_list[0]["riset"]) + strainlist = load_genotype_samples(genotype_filename) + slink_data = slink(cluster_traits(exported_traits_data_list)) + ordering_data = compute_heatmap_order(slink_data) + strains_and_values = retrieve_strains_and_values( + orders, strainlist, exported_traits_data_list) + strains_values = strains_and_values[0][1] + trait_values = [t[2] for t in strains_and_values] + traits_filename = generate_traits_filename() + generate_traits_file(strains_values, trait_values, traits_filename) return { - "target_description_checked": formd.formdata.getvalue( - "targetDescriptionCheck", ""), - "cluster_checked": cluster_checked, - "slink_data": ( - slink(cluster_traits(exported_traits_data_list)) - if cluster_checked else False), - "sessionfile": formd.formdata.getvalue("session"), - "genotype": genotype, - "nLoci": sum(map(len, genotype)), + "slink_data": slink_data, + "ordering_data": ordering_data, "strainlist": strainlist, - "ppolar": formd.ppolar, - "mpolar":formd.mpolar, + "genotype_filename": genotype_filename, "traits_list": traits_list, "traits_data_list": traits_data_list, - "exported_traits_data_list": exported_traits_data_list + "exported_traits_data_list": exported_traits_data_list, + "traits_filename": traits_filename } def compute_heatmap_order( diff --git a/gn3/db/traits.py b/gn3/db/traits.py index 1031e44..ccb101a 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -1,4 +1,5 @@ """This class contains functions relating to trait data manipulation""" +from gn3.settings import TMPDIR from typing import Any, Dict, Union, Sequence from gn3.function_helpers import compose from gn3.db.datasets import retrieve_trait_dataset @@ -666,3 +667,7 @@ def retrieve_trait_data(trait: dict, conn: Any, strainlist: Sequence[str] = tupl {k:v for k, v in x.items() if x != "strain_name"}), data))} return {} + +def generate_traits_filename(base_path: str = TMPDIR): + return "{}/traits_test_file_{}.txt".format( + os.path.abspath(base_path), random_string(10)) -- cgit v1.2.3 From b5e1d1176f1bf4f7c0b68b27beb15e99418f1650 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Tue, 31 Aug 2021 11:16:29 +0300 Subject: Fix linting errors, minor bugs and reorganise code * Fix some linting errors and some minor bugs caught by the linter. Move the `random_string` function to separate module for use in multiple places in the code. --- gn3/computations/heatmap.py | 7 ++++--- gn3/computations/qtlreaper.py | 27 ++++++++++++++------------- gn3/db/traits.py | 5 ++++- gn3/heatmaps/heatmaps.py | 25 +++++++++++++++++++------ gn3/random.py | 11 +++++++++++ tests/unit/computations/test_qtlreaper.py | 5 +++-- 6 files changed, 55 insertions(+), 25 deletions(-) create mode 100644 gn3/random.py diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py index 92014cf..1143450 100644 --- a/gn3/computations/heatmap.py +++ b/gn3/computations/heatmap.py @@ -6,6 +6,7 @@ generate various kinds of heatmaps. from functools import reduce from typing import Any, Dict, Sequence from gn3.computations.slink import slink +from gn3.computations.qtlreaper import generate_traits_file from gn3.computations.correlations2 import compute_correlation from gn3.db.genotypes import build_genotype_file, load_genotype_samples from gn3.db.traits import ( @@ -155,14 +156,14 @@ def heatmap_data(traits_names, conn: Any): for fullname in traits_names] traits_list = tuple(x[0] for x in traits_details) traits_data_list = [x[1] for x in traits_details] - exported_traits_data_list = tuple( - export_trait_data(td, strainlist) for td in traits_data_list) genotype_filename = build_genotype_file(traits_list[0]["riset"]) strainlist = load_genotype_samples(genotype_filename) + exported_traits_data_list = tuple( + export_trait_data(td, strainlist) for td in traits_data_list) slink_data = slink(cluster_traits(exported_traits_data_list)) ordering_data = compute_heatmap_order(slink_data) strains_and_values = retrieve_strains_and_values( - orders, strainlist, exported_traits_data_list) + ordering_data, strainlist, exported_traits_data_list) strains_values = strains_and_values[0][1] trait_values = [t[2] for t in strains_and_values] traits_filename = generate_traits_filename() diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index 3b8e4db..30c7051 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -3,17 +3,10 @@ This module contains functions to interact with the `qtlreaper` utility for computation of QTLs. """ import os -import random -import string import subprocess +from gn3.random import random_string from gn3.settings import TMPDIR, REAPER_COMMAND -def random_string(length): - """Generate a random string of length `length`.""" - return "".join( - random.choices( - string.ascii_letters + string.digits, k=length)) - def generate_traits_file(strains, trait_values, traits_filename): """ Generate a traits file for use with `qtlreaper`. @@ -25,11 +18,13 @@ def generate_traits_file(strains, trait_values, traits_filename): computation of QTLs. """ header = "Trait\t{}\n".format("\t".join(strains)) - data = [header] + [ - "T{}\t{}\n".format(i+1, "\t".join([str(i) for i in t])) - for i, t in enumerate(trait_values[:-1])] + [ - "T{}\t{}".format(len(trait_values), "\t".join([str(i) for i in t])) - for t in trait_values[-1:]] + data = ( + [header] + + ["T{}\t{}\n".format(i+1, "\t".join([str(i) for i in t])) + for i, t in enumerate(trait_values[:-1])] + + ["T{}\t{}".format( + len(trait_values), "\t".join([str(i) for i in t])) + for t in trait_values[-1:]]) with open(traits_filename, "w") as outfile: outfile.writelines(data) @@ -93,6 +88,9 @@ def run_reaper( def parse_reaper_main_results(results_file): + """ + Parse the results file of running QTLReaper into a list of dicts. + """ with open(results_file, "r") as infile: lines = infile.readlines() @@ -104,6 +102,9 @@ def parse_reaper_main_results(results_file): return [dict(zip(header, __parse_line(line))) for line in lines[1:]] def parse_reaper_permutation_results(results_file): + """ + Parse the results QTLReaper permutations into a list of values. + """ with open(results_file, "r") as infile: lines = infile.readlines() diff --git a/gn3/db/traits.py b/gn3/db/traits.py index ccb101a..bfe887e 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -1,6 +1,8 @@ """This class contains functions relating to trait data manipulation""" -from gn3.settings import TMPDIR +import os from typing import Any, Dict, Union, Sequence +from gn3.settings import TMPDIR +from gn3.random import random_string from gn3.function_helpers import compose from gn3.db.datasets import retrieve_trait_dataset @@ -669,5 +671,6 @@ def retrieve_trait_data(trait: dict, conn: Any, strainlist: Sequence[str] = tupl return {} def generate_traits_filename(base_path: str = TMPDIR): + """Generate a unique filename for use with generated traits files.""" return "{}/traits_test_file_{}.txt".format( os.path.abspath(base_path), random_string(10)) diff --git a/gn3/heatmaps/heatmaps.py b/gn3/heatmaps/heatmaps.py index 3bf7917..88f546d 100644 --- a/gn3/heatmaps/heatmaps.py +++ b/gn3/heatmaps/heatmaps.py @@ -14,6 +14,19 @@ def generate_random_data(data_stop: float = 2, width: int = 10, height: int = 30 return [[random.uniform(0,data_stop) for i in range(0, width)] for j in range(0, height)] +def generate_random_data2(data_stop: float = 2, width: int = 10, height: int = 30): + """ + This is mostly a utility function to be used to generate random data, useful + for development of the heatmap generation code, without access to the actual + database data. + """ + return [ + [{ + "value": item, + "category": random.choice(["C57BL/6J +", "DBA/2J +"])} + for item in axis] + for axis in generate_random_data(data_stop, width, height)] + def heatmap_x_axis_names(): return [ "UCLA_BXDBXH_CARTILAGE_V2::ILM103710672", @@ -30,13 +43,14 @@ def heatmap_x_axis_names(): # Grey + Blue + Red def generate_heatmap(): - rows = 20 - data = generate_random_data(height=rows) - y = (["%s"%x for x in range(1, rows+1)][:-1] + ["X"]) #replace last item with x for now + cols = 20 + y_axis = (["%s"%x for x in range(1, cols+1)][:-1] + ["X"]) #replace last item with x for now + x_axis = heatmap_x_axis_names() + data = generate_random_data(height=cols, width=len(x_axis)) fig = px.imshow( data, - x=heatmap_x_axis_names(), - y=y, + x=x_axis, + y=y_axis, width=500) fig.update_traces(xtype="array") fig.update_traces(ytype="array") @@ -49,6 +63,5 @@ def generate_heatmap(): coloraxis_colorscale=[ [0.0, '#3B3B3B'], [0.4999999999999999, '#ABABAB'], [0.5, '#F5DE11'], [1.0, '#FF0D00']]) - fig.write_html("%s/%s"%(heatmap_dir, "test_image.html")) return fig diff --git a/gn3/random.py b/gn3/random.py new file mode 100644 index 0000000..f0ba574 --- /dev/null +++ b/gn3/random.py @@ -0,0 +1,11 @@ +""" +Functions to generate complex random data. +""" +import random +import string + +def random_string(length): + """Generate a random string of length `length`.""" + return "".join( + random.choices( + string.ascii_letters + string.digits, k=length)) diff --git a/tests/unit/computations/test_qtlreaper.py b/tests/unit/computations/test_qtlreaper.py index ec23664..6c3b64d 100644 --- a/tests/unit/computations/test_qtlreaper.py +++ b/tests/unit/computations/test_qtlreaper.py @@ -1,5 +1,4 @@ """Module contains tests for gn3.computations.qtlreaper""" -import os from unittest import TestCase from gn3.computations.qtlreaper import ( parse_reaper_main_results, parse_reaper_permutation_results) @@ -8,6 +7,7 @@ class TestQTLReaper(TestCase): """Class for testing qtlreaper interface functions.""" def test_parse_reaper_main_results(self): + """Test that the main results file is parsed correctly.""" self.assertEqual( parse_reaper_main_results( "tests/unit/computations/data/qtlreaper/main_output_sample.txt"), @@ -65,9 +65,10 @@ class TestQTLReaper(TestCase): ]) def test_parse_reaper_permutation_results(self): + """Test that the permutations results file is parsed correctly.""" self.assertEqual( parse_reaper_permutation_results( - "tests/unit/computations/data/qtlreaper/permu_output_sample.txt"), + "tests/unit/computations/data/qtlreaper/permu_output_sample.txt"), [4.44174, 5.03825, 5.08167, 5.18119, 5.18578, 5.24563, 5.24619, 5.24619, 5.27961, 5.28228, 5.43903, 5.50188, 5.51694, 5.56830, 5.63874, 5.71346, 5.71936, 5.74275, 5.76764, 5.79815, 5.81671, -- cgit v1.2.3 From 221c773daea839ecf0e50c196484bb91e3a6db33 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Wed, 1 Sep 2021 06:18:20 +0300 Subject: Implement parsing of genotype labels Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/db/genotypes.py: parse genotype labels * tests/unit/db/test_genotypes.py: test that genotype labels are parsed correctly As part of parsing the genotype files into usable python data structures, this commit adds a function to parse the label lines (beginning with "@") into the appropriate values. --- gn3/db/genotypes.py | 20 ++++++++++++++++++++ tests/unit/db/test_genotypes.py | 17 +++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 tests/unit/db/test_genotypes.py diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py index 610ddde..2be3e1a 100644 --- a/gn3/db/genotypes.py +++ b/gn3/db/genotypes.py @@ -67,3 +67,23 @@ def __load_genotype_samples_from_plink(genotype_filename: str): """ genofile = open(genotype_filename) return [line.split(" ")[1] for line in genofile] + +def parse_genotype_labels(lines: list): + """ + Parse label lines into usable genotype values + + DESCRIPTION: + Reworks + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/utility/gen_geno_ob.py#L75-L93 + """ + acceptable_labels = ["name", "filler", "type", "mat", "pat", "het", "unk"] + def __parse_label(line): + label, value = [l.strip() for l in line[1:].split(":")] + if label not in acceptable_labels: + return None + if label == "name": + return ("group", value) + return (label, value) + return tuple( + item for item in (__parse_label(line) for line in lines) + if item is not None) diff --git a/tests/unit/db/test_genotypes.py b/tests/unit/db/test_genotypes.py new file mode 100644 index 0000000..0264764 --- /dev/null +++ b/tests/unit/db/test_genotypes.py @@ -0,0 +1,17 @@ +"""Tests gn3.db.genotypes""" +from unittest import TestCase +from gn3.db.genotypes import parse_genotype_labels + +class TestGenotypes(TestCase): + """Tests for functions in `gn3.db.genotypes`.""" + + def test_parse_genotype_labels(self): + self.assertEqual( + parse_genotype_labels([ + "@name: test_group\t", "@filler: test_filler ", + "@type:test_type", "@mat:test_mat \t", "@pat:test_pat ", + "@het: test_het ", "@unk: test_unk", "@other: test_other", + "@brrr: test_brrr "]), + (("group", "test_group"), ("filler", "test_filler"), + ("type", "test_type"), ("mat", "test_mat"), ("pat", "test_pat"), + ("het", "test_het"), ("unk", "test_unk"))) -- cgit v1.2.3 From b975e0cfd1d0adc5f51e66292d29d4651d3f053f Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Wed, 1 Sep 2021 07:35:40 +0300 Subject: Parse the genotype file's data header * gn3/db/genotypes.py: parse data header * tests/unit/db/test_genotypes.py: check that header's parse works correctly. Add tests to check that the parser works as expected. Add code to implement the parsing and pass the tests. --- gn3/db/genotypes.py | 19 +++++++++++++++++++ tests/unit/db/test_genotypes.py | 22 +++++++++++++++++++++- 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py index 2be3e1a..be0dfc2 100644 --- a/gn3/db/genotypes.py +++ b/gn3/db/genotypes.py @@ -87,3 +87,22 @@ def parse_genotype_labels(lines: list): return tuple( item for item in (__parse_label(line) for line in lines) if item is not None) + +def parse_genotype_header(line: str, parlist = tuple()): + """ + Parse the genotype file header line + + DESCRIPTION: + Reworks + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/utility/gen_geno_ob.py#L94-L114 + """ + items = [item.strip() for item in line.split("\t")] + Mbmap = "Mb" in items + prgy = ((parlist + tuple(items[4:])) if Mbmap + else (parlist + tuple(items[3:]))) + return ( + ("Mbmap", Mbmap), + ("cm_column", items.index("cM")), + ("mb_column", None if not Mbmap else items.index("Mb")), + ("prgy", prgy), + ("nprgy", len(prgy))) diff --git a/tests/unit/db/test_genotypes.py b/tests/unit/db/test_genotypes.py index 0264764..4fa8a53 100644 --- a/tests/unit/db/test_genotypes.py +++ b/tests/unit/db/test_genotypes.py @@ -1,6 +1,6 @@ """Tests gn3.db.genotypes""" from unittest import TestCase -from gn3.db.genotypes import parse_genotype_labels +from gn3.db.genotypes import parse_genotype_labels, parse_genotype_header class TestGenotypes(TestCase): """Tests for functions in `gn3.db.genotypes`.""" @@ -15,3 +15,23 @@ class TestGenotypes(TestCase): (("group", "test_group"), ("filler", "test_filler"), ("type", "test_type"), ("mat", "test_mat"), ("pat", "test_pat"), ("het", "test_het"), ("unk", "test_unk"))) + + def test_parse_genotype_header(self): + for header, expected in [ + [("Chr\tLocus\tcM\tMb\tBXD1\tBXD2\tBXD5\tBXD6\tBXD8\tBXD9\t" + "BXD11\tBXD12\tBXD13\tBXD14\tBXD15\tBXD16\tBXD18\tBXD19"), + (("Mbmap", True), ("cm_column", 2), ("mb_column", 3), + ("prgy", + ("BXD1", "BXD2", "BXD5", "BXD6", "BXD8", "BXD9", "BXD11", + "BXD12", "BXD13", "BXD14", "BXD15", "BXD16", "BXD18", + "BXD19")), + ("nprgy", 14))], + [("Chr\tLocus\tcM\tBXD1\tBXD2\tBXD5\tBXD6\tBXD8\tBXD9\tBXD11" + "\tBXD12\tBXD13\tBXD14\tBXD15\tBXD16\tBXD18"), + (("Mbmap", False), ("cm_column", 2), ("mb_column", None), + ("prgy", + ("BXD1", "BXD2", "BXD5", "BXD6", "BXD8", "BXD9", "BXD11", + "BXD12", "BXD13", "BXD14", "BXD15", "BXD16", "BXD18")), + ("nprgy", 13))]]: + with self.subTest(header=header): + self.assertEqual(parse_genotype_header(header), expected) -- cgit v1.2.3 From a1c217cf277feda3815a8435d6c8909f1b5546a1 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Wed, 1 Sep 2021 09:11:17 +0300 Subject: Parse data lines into markers Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/db/genotypes.py: parse data lines in file to genetic markers. * tests/unit/db/test_genotypes.py: test that parsing works. Add some tests to check that the parsing of the markers works as expected, and add the code to actually parse the markers. --- gn3/db/genotypes.py | 37 +++++++++++++++++++++++++++++++++++++ tests/unit/db/test_genotypes.py | 38 +++++++++++++++++++++++++++++++++++++- 2 files changed, 74 insertions(+), 1 deletion(-) diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py index be0dfc2..8710d2e 100644 --- a/gn3/db/genotypes.py +++ b/gn3/db/genotypes.py @@ -106,3 +106,40 @@ def parse_genotype_header(line: str, parlist = tuple()): ("mb_column", None if not Mbmap else items.index("Mb")), ("prgy", prgy), ("nprgy", len(prgy))) + +def parse_genotype_data_line(line: str, geno_obj: dict, parlist: list): + """ + Parse a data line in a genotype file + + DESCRIPTION: + Reworks + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/utility/gen_geno_ob.py#L143-L190 + """ + marker_row = [item.strip() for item in line.split("\t")] + geno_table = { + geno_obj["mat"]: -1, geno_obj["pat"]: 1, geno_obj["het"]: 0, + geno_obj["unk"]: "U" + } + start_pos = 4 if geno_obj["Mbmap"] else 3 + if len(parlist) > 0: + start_pos = start_pos + 2 + + alleles = marker_row[start_pos:] + genotype = tuple( + (geno_table[allele] if allele in geno_table.keys() else "U") + for allele in alleles) + if len(parlist) > 0: + genotype = (-1, 1) + genotype + try: + cM = float(geno_obj["cm_column"]) + except: + if geno_obj["Mbmap"]: + cM = float(geno_obj["mb_column"]) + else: + cM = 0 + return ( + ("chr", marker_row[0]), + ("name", marker_row[1]), + ("cM", cM), + ("Mb", float(geno_obj["mb_column"]) if geno_obj["Mbmap"] else None), + ("genotype", genotype)) diff --git a/tests/unit/db/test_genotypes.py b/tests/unit/db/test_genotypes.py index 4fa8a53..ba90191 100644 --- a/tests/unit/db/test_genotypes.py +++ b/tests/unit/db/test_genotypes.py @@ -1,11 +1,13 @@ """Tests gn3.db.genotypes""" from unittest import TestCase -from gn3.db.genotypes import parse_genotype_labels, parse_genotype_header +from gn3.db.genotypes import ( + parse_genotype_labels, parse_genotype_header, parse_genotype_data_line) class TestGenotypes(TestCase): """Tests for functions in `gn3.db.genotypes`.""" def test_parse_genotype_labels(self): + """Test that the genotype labels are parsed correctly.""" self.assertEqual( parse_genotype_labels([ "@name: test_group\t", "@filler: test_filler ", @@ -17,6 +19,7 @@ class TestGenotypes(TestCase): ("het", "test_het"), ("unk", "test_unk"))) def test_parse_genotype_header(self): + """Test that the genotype header is parsed correctly.""" for header, expected in [ [("Chr\tLocus\tcM\tMb\tBXD1\tBXD2\tBXD5\tBXD6\tBXD8\tBXD9\t" "BXD11\tBXD12\tBXD13\tBXD14\tBXD15\tBXD16\tBXD18\tBXD19"), @@ -35,3 +38,36 @@ class TestGenotypes(TestCase): ("nprgy", 13))]]: with self.subTest(header=header): self.assertEqual(parse_genotype_header(header), expected) + + def test_parse_genotype_data_line(self): + """Test parsing of data lines.""" + for line, geno_obj, parlist, expected in [ + ["1\trs31443144\t1.50\t3.010274\tB\tB\tD\tD\tD\tB\tB\tD\tB\tB", + {"mat": "test_mat", "pat": "test_pat", "het": "test_het", + "unk": "test_unk", "cm_column": 2, "Mbmap": True, + "mb_column": 3}, + tuple(), + (("chr", "1"), ("name", "rs31443144"), ("cM", 2.0), + ("Mb", 3.0), + ("genotype", + ("U", "U", "U", "U", "U", "U", "U", "U", "U", "U")))], + ["1\trs31443144\t1.50\t3.010274\tB\tB\tD\tD\tD\tB\tB\tD\tB\tB", + {"mat": "test_mat", "pat": "test_pat", "het": "test_het", + "unk": "test_unk", "cm_column": 2, "Mbmap": True, + "mb_column": 3}, + ("some", "parlist", "content"), + (("chr", "1"), ("name", "rs31443144"), ("cM", 2.0), + ("Mb", 3.0), + ("genotype", + (-1, 1, "U", "U", "U", "U", "U", "U", "U", "U")))], + ["1\trs31443144\t1.50\t3.010274\tB\tB\tD\tH\tD\tB\tU\tD\tB\tB", + {"mat": "B", "pat": "D", "het": "H", "unk": "U", + "cm_column": 2, "Mbmap": True, "mb_column": 3}, + tuple(), + (("chr", "1"), ("name", "rs31443144"), ("cM", 2.0), + ("Mb", 3.0), + ("genotype", (-1, -1, 1, 0, 1, -1, "U", 1, -1, -1)))]]: + with self.subTest(line = line): + self.assertEqual( + parse_genotype_data_line(line, geno_obj, parlist), + expected) -- cgit v1.2.3 From abfc0410a2385d8c3d6ee1915fc99b708e1d0dbc Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Wed, 1 Sep 2021 10:49:52 +0300 Subject: Built top-level genotype file parsing function Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/db/genotypes.py: parse genotype files * tests/unit/db/test_genotypes.py: test parsing is correct Add the overall genotype files parsing function and tests to check that the parsing works as expected. --- gn3/db/genotypes.py | 38 ++++++++++++++- tests/unit/db/test_genotypes.py | 101 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 136 insertions(+), 3 deletions(-) diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py index 8710d2e..b5d14a5 100644 --- a/gn3/db/genotypes.py +++ b/gn3/db/genotypes.py @@ -107,7 +107,7 @@ def parse_genotype_header(line: str, parlist = tuple()): ("prgy", prgy), ("nprgy", len(prgy))) -def parse_genotype_data_line(line: str, geno_obj: dict, parlist: list): +def parse_genotype_marker(line: str, geno_obj: dict, parlist: list): """ Parse a data line in a genotype file @@ -143,3 +143,39 @@ def parse_genotype_data_line(line: str, geno_obj: dict, parlist: list): ("cM", cM), ("Mb", float(geno_obj["mb_column"]) if geno_obj["Mbmap"] else None), ("genotype", genotype)) + +def build_genotype_chromosomes(geno_obj, markers): + """ + Build up the chromosomes from the given markers and partially built geno + object + """ + mrks = [dict(marker) for marker in markers] + chr_names = {marker["chr"] for marker in mrks} + return tuple(( + ("name", chr_name), ("mb_exists", geno_obj["Mbmap"]), ("cm_column", 2), + ("mb_column", geno_obj["mb_column"]), + ("loci", tuple(marker for marker in mrks if marker["chr"] == chr_name))) + for chr_name in sorted(chr_names)) + +def parse_genotype_file(filename: str, parlist = tuple()): + """ + Parse the provided genotype file into a usable pytho3 data structure. + """ + with open(filename, "r") as infile: + contents = infile.readlines() + + lines = tuple(line for line in contents if + ((not line.strip().startswith("#")) and + (not line.strip() == ""))) + labels = parse_genotype_labels( + line for line in lines if line.startswith("@")) + data_lines = tuple(line for line in lines if not line.startswith("@")) + header = parse_genotype_header(data_lines[0], parlist) + geno_obj = dict(labels + header) + markers = tuple( + parse_genotype_marker(line, geno_obj, parlist) + for line in data_lines[1:]) + chromosomes = tuple( + dict(chromosome) for chromosome in + build_genotype_chromosomes(geno_obj, markers)) + return {**geno_obj, "chromosomes": chromosomes} diff --git a/tests/unit/db/test_genotypes.py b/tests/unit/db/test_genotypes.py index ba90191..a05ce48 100644 --- a/tests/unit/db/test_genotypes.py +++ b/tests/unit/db/test_genotypes.py @@ -1,7 +1,11 @@ """Tests gn3.db.genotypes""" from unittest import TestCase from gn3.db.genotypes import ( - parse_genotype_labels, parse_genotype_header, parse_genotype_data_line) + parse_genotype_file, + parse_genotype_labels, + parse_genotype_header, + parse_genotype_marker, + build_genotype_chromosomes) class TestGenotypes(TestCase): """Tests for functions in `gn3.db.genotypes`.""" @@ -69,5 +73,98 @@ class TestGenotypes(TestCase): ("genotype", (-1, -1, 1, 0, 1, -1, "U", 1, -1, -1)))]]: with self.subTest(line = line): self.assertEqual( - parse_genotype_data_line(line, geno_obj, parlist), + parse_genotype_marker(line, geno_obj, parlist), expected) + + def test_build_genotype_chromosomes(self): + """ + Given `markers` and `geno_obj`, test that `build_genotype_chromosomes` + builds a sequence of chromosomes with the given markers ordered + according to the `chr` value.""" + for markers, geno_obj, expected in [ + [[(("chr", "1"), ("name", "rs31443144"), ("cM", 2.0), + ("Mb", 3.0), + ("genotype", (-1, -1, 1, 0, 1, -1, "U", 1, -1, -1))), + (("chr", "2"), ("name", "rs31443144"), ("cM", 2.0), + ("Mb", 3.0), + ("genotype", (-1, -1, 1, 0, 1, -1, "U", 1, -1, -1)))], + {"mat": "B", "pat": "D", "het": "H", "unk": "U", + "cm_column": 2, "Mbmap": True, "mb_column": 3}, + ((("name", "1"), ("mb_exists", True), ("cm_column", 2), + ("mb_column", 3), + ("loci", + ({"chr": "1", "name": "rs31443144", "cM": 2.0, "Mb": 3.0, + "genotype": (-1, -1, 1, 0, 1, -1, "U", 1, -1, -1)},))), + (("name", "2"), ("mb_exists", True), ("cm_column", 2), + ("mb_column", 3), + ("loci", + ({"chr": "2", "name": "rs31443144", "cM": 2.0, "Mb": 3.0, + "genotype": (-1, -1, 1, 0, 1, -1, "U", 1, -1, -1)},))))], + [[(("chr", "1"), ("name", "rs31443144"), ("cM", 2.0), + ("Mb", None), + ("genotype", (-1, 1, 1, 0, 1, -1, "U", 1, -1, -1)))], + {"mat": "B", "pat": "D", "het": "H", "unk": "U", + "cm_column": 2, "Mbmap": False, "mb_column": None}, + ((("name", "1"), ("mb_exists", False), ("cm_column", 2), + ("mb_column", None), + ("loci", + ({"chr": "1", "name": "rs31443144", "cM": 2.0, "Mb": None, + "genotype": (-1, 1, 1, 0, 1, -1, "U", 1, -1, -1)},))),)]]: + with self.subTest(markers = markers): + self.assertEqual( + build_genotype_chromosomes(geno_obj, markers), + expected) + + def test_parse_genotype_file(self): + """Test the parsing of genotype files. """ + self.assertEqual( + parse_genotype_file( + "tests/unit/db/data/genotypes/genotype_sample1.geno"), + {"group": "BXD", + "type": "riset", + "mat": "B", + "pat": "D", + "het": "H", + "unk": "U", + "Mbmap": True, + "cm_column": 2, + "mb_column": 3, + "prgy": ("BXD1", "BXD2", "BXD5", "BXD6", "BXD8", "BXD9"), + "nprgy": 6, + "chromosomes": ( + {"name": "1", + "mb_exists": True, + "cm_column": 2, + "mb_column": 3, + "loci": ( + {"chr": "1", + "name": "rs31443144", + "cM": 2.0, + "Mb": 3.0, + "genotype": (-1, -1, 1, 1, 1, -1) + }, + {"chr": "1", + "name": "rs6269442", + "cM": 2.0, + "Mb": 3.0, + "genotype": (-1, -1, 1, 1, 0, "U")}, + {"chr": "1", + "name": "rs32285189", + "cM": 2.0, + "Mb": 3.0, + "genotype": (-1, "U", 1, 1, 1, -1)})}, + {"name": "2", + "mb_exists": True, + "cm_column": 2, + "mb_column": 3, + "loci": ( + {"chr": "2", + "name": "rs31443144", + "cM": 2.0, + "Mb": 3.0, + "genotype": (-1, -1, 1, 1, 1, -1)}, + {"chr": "2", + "name": "rs6269442", + "cM": 2.0, + "Mb": 3.0, + "genotype": (-1, -1, 1, 1, 0, "U")})})}) -- cgit v1.2.3 From 3ded952f40f486d9aa69746eac2afe7f67fef790 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Wed, 1 Sep 2021 11:08:38 +0300 Subject: Fix linting and typing issues Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi --- gn3/db/genotypes.py | 32 ++++++++++++++++---------------- tests/unit/db/test_genotypes.py | 10 +++++----- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py index b5d14a5..b03d55c 100644 --- a/gn3/db/genotypes.py +++ b/gn3/db/genotypes.py @@ -88,7 +88,7 @@ def parse_genotype_labels(lines: list): item for item in (__parse_label(line) for line in lines) if item is not None) -def parse_genotype_header(line: str, parlist = tuple()): +def parse_genotype_header(line: str, parlist: tuple = tuple()): """ Parse the genotype file header line @@ -97,13 +97,13 @@ def parse_genotype_header(line: str, parlist = tuple()): https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/utility/gen_geno_ob.py#L94-L114 """ items = [item.strip() for item in line.split("\t")] - Mbmap = "Mb" in items - prgy = ((parlist + tuple(items[4:])) if Mbmap + mbmap = "Mb" in items + prgy = ((parlist + tuple(items[4:])) if mbmap else (parlist + tuple(items[3:]))) return ( - ("Mbmap", Mbmap), + ("Mbmap", mbmap), ("cm_column", items.index("cM")), - ("mb_column", None if not Mbmap else items.index("Mb")), + ("mb_column", None if not mbmap else items.index("Mb")), ("prgy", prgy), ("nprgy", len(prgy))) @@ -131,16 +131,16 @@ def parse_genotype_marker(line: str, geno_obj: dict, parlist: list): if len(parlist) > 0: genotype = (-1, 1) + genotype try: - cM = float(geno_obj["cm_column"]) + cm_val = float(geno_obj["cm_column"]) except: if geno_obj["Mbmap"]: - cM = float(geno_obj["mb_column"]) + cm_val = float(geno_obj["mb_column"]) else: - cM = 0 + cm_val = 0 return ( ("chr", marker_row[0]), ("name", marker_row[1]), - ("cM", cM), + ("cM", cm_val), ("Mb", float(geno_obj["mb_column"]) if geno_obj["Mbmap"] else None), ("genotype", genotype)) @@ -155,9 +155,9 @@ def build_genotype_chromosomes(geno_obj, markers): ("name", chr_name), ("mb_exists", geno_obj["Mbmap"]), ("cm_column", 2), ("mb_column", geno_obj["mb_column"]), ("loci", tuple(marker for marker in mrks if marker["chr"] == chr_name))) - for chr_name in sorted(chr_names)) + for chr_name in sorted(chr_names)) -def parse_genotype_file(filename: str, parlist = tuple()): +def parse_genotype_file(filename: str, parlist: tuple = tuple()): """ Parse the provided genotype file into a usable pytho3 data structure. """ @@ -165,16 +165,16 @@ def parse_genotype_file(filename: str, parlist = tuple()): contents = infile.readlines() lines = tuple(line for line in contents if - ((not line.strip().startswith("#")) and - (not line.strip() == ""))) + ((not line.strip().startswith("#")) and + (not line.strip() == ""))) labels = parse_genotype_labels( - line for line in lines if line.startswith("@")) + [line for line in lines if line.startswith("@")]) data_lines = tuple(line for line in lines if not line.startswith("@")) header = parse_genotype_header(data_lines[0], parlist) geno_obj = dict(labels + header) markers = tuple( - parse_genotype_marker(line, geno_obj, parlist) - for line in data_lines[1:]) + [parse_genotype_marker(line, geno_obj, parlist) + for line in data_lines[1:]]) chromosomes = tuple( dict(chromosome) for chromosome in build_genotype_chromosomes(geno_obj, markers)) diff --git a/tests/unit/db/test_genotypes.py b/tests/unit/db/test_genotypes.py index a05ce48..c125224 100644 --- a/tests/unit/db/test_genotypes.py +++ b/tests/unit/db/test_genotypes.py @@ -18,9 +18,9 @@ class TestGenotypes(TestCase): "@type:test_type", "@mat:test_mat \t", "@pat:test_pat ", "@het: test_het ", "@unk: test_unk", "@other: test_other", "@brrr: test_brrr "]), - (("group", "test_group"), ("filler", "test_filler"), - ("type", "test_type"), ("mat", "test_mat"), ("pat", "test_pat"), - ("het", "test_het"), ("unk", "test_unk"))) + (("group", "test_group"), ("filler", "test_filler"), + ("type", "test_type"), ("mat", "test_mat"), ("pat", "test_pat"), + ("het", "test_het"), ("unk", "test_unk"))) def test_parse_genotype_header(self): """Test that the genotype header is parsed correctly.""" @@ -71,7 +71,7 @@ class TestGenotypes(TestCase): (("chr", "1"), ("name", "rs31443144"), ("cM", 2.0), ("Mb", 3.0), ("genotype", (-1, -1, 1, 0, 1, -1, "U", 1, -1, -1)))]]: - with self.subTest(line = line): + with self.subTest(line=line): self.assertEqual( parse_genotype_marker(line, geno_obj, parlist), expected) @@ -110,7 +110,7 @@ class TestGenotypes(TestCase): ("loci", ({"chr": "1", "name": "rs31443144", "cM": 2.0, "Mb": None, "genotype": (-1, 1, 1, 0, 1, -1, "U", 1, -1, -1)},))),)]]: - with self.subTest(markers = markers): + with self.subTest(markers=markers): self.assertEqual( build_genotype_chromosomes(geno_obj, markers), expected) -- cgit v1.2.3 From 608ff9c6ff668d18f0c42aebf658ef80b517a6de Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Mon, 6 Sep 2021 06:45:18 +0300 Subject: Find nearest marker Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Migrate the `web.webqtl.heatmap.Heatmap.getNearestMarker` function in GN1 to GN3. --- gn3/computations/heatmap.py | 49 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py index 1143450..ccce385 100644 --- a/gn3/computations/heatmap.py +++ b/gn3/computations/heatmap.py @@ -30,7 +30,7 @@ def export_trait_data( The dictionary of key-value pairs representing a trait strainlist: (list) A list of strain names - type: (str) + dtype: (str) ... verify what this is ... var_exists: (bool) A flag indicating existence of variance @@ -232,3 +232,50 @@ def retrieve_strains_and_values(orders, strainlist, traits_data_list): values = [] return rets + +def nearest_marker_finder(genotype): + """ + Returns a function to be used with `genotype` to compute the nearest marker + to the trait passed to the returned function. + + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L425-434 + """ + def __compute_distances(chromo, trait): + loci = chromo.get("loci", None) + if not loci: + return None + return tuple( + { + "name": locus["name"], + "distance": abs(locus["Mb"] - trait["mb"]) + } for locus in loci) + + def __finder(trait): + _chrs = tuple( + _chr for _chr in genotype["chromosomes"] + if str(_chr["name"]) == str(trait["chr"])) + if len(_chrs) == 0: + return None + distances = tuple( + distance for dists in + filter( + lambda x: x is not None, + (__compute_distances(_chr, trait) for _chr in _chrs)) + for distance in dists) + nearest = min(distances, key=lambda d: d["distance"]) + return nearest["name"] + return __finder + +def get_nearest_marker(traits_list, genotype): + """ + Retrieves the nearest marker for each of the traits in the list. + + DESCRIPTION: + This migrates the code in + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L419-L438 + """ + if not genotype["Mbmap"]: + return [None] * len(trait_list) + + marker_finder = nearest_marker_finder(genotype) + return [marker_finder(trait) for trait in traits_list] -- cgit v1.2.3 From 4ce5695a35e92a704add8d497266bb2986a593f6 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Mon, 6 Sep 2021 06:47:52 +0300 Subject: Handle type-coercion exceptions * gn3/computations/qtlreaper.py: handle exceptions Sometimes, the values being parsed are plain strings and cannot be cast to the float types. This commit handles that by casting only those values that can be cast to float, and returning the others as strings. --- gn3/computations/qtlreaper.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index 30c7051..eff2a80 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -94,9 +94,15 @@ def parse_reaper_main_results(results_file): with open(results_file, "r") as infile: lines = infile.readlines() + def __parse_column_value(value): + try: + return float(value) + except: + return value + def __parse_line(line): items = line.strip().split("\t") - return items[0:2] + [float(item) for item in items[2:]] + return items[0:2] + [__parse_column_value(item) for item in items[2:]] header = lines[0].strip().split("\t") return [dict(zip(header, __parse_line(line))) for line in lines[1:]] -- cgit v1.2.3 From 679a1af832ad9585c7cf72996043edb08e1b0d10 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Mon, 6 Sep 2021 08:06:14 +0300 Subject: Leave "Chr" value as string when parsing Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * The "Chr" value seems to be mostly a name of some sort, despite it being, seemingly an number. This commit parses the "Chr" value as a string. It also updates the tests to expec a string, rather than a number for "Chr" values. --- gn3/computations/qtlreaper.py | 5 +++-- tests/unit/computations/test_qtlreaper.py | 20 ++++++++++---------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index eff2a80..9b20309 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -94,7 +94,7 @@ def parse_reaper_main_results(results_file): with open(results_file, "r") as infile: lines = infile.readlines() - def __parse_column_value(value): + def __parse_column_float_value(value): try: return float(value) except: @@ -102,7 +102,8 @@ def parse_reaper_main_results(results_file): def __parse_line(line): items = line.strip().split("\t") - return items[0:2] + [__parse_column_value(item) for item in items[2:]] + return items[0:3] + [ + __parse_column_float_value(item) for item in items[3:]] header = lines[0].strip().split("\t") return [dict(zip(header, __parse_line(line))) for line in lines[1:]] diff --git a/tests/unit/computations/test_qtlreaper.py b/tests/unit/computations/test_qtlreaper.py index 6c3b64d..fd3434a 100644 --- a/tests/unit/computations/test_qtlreaper.py +++ b/tests/unit/computations/test_qtlreaper.py @@ -13,52 +13,52 @@ class TestQTLReaper(TestCase): "tests/unit/computations/data/qtlreaper/main_output_sample.txt"), [ { - "ID": "T1", "Locus": "rs31443144", "Chr": 1, "cM": 1.500, + "ID": "T1", "Locus": "rs31443144", "Chr": "1", "cM": 1.500, "Mb": 3.010, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs6269442", "Chr": 1, "cM": 1.500, + "ID": "T1", "Locus": "rs6269442", "Chr": "1", "cM": 1.500, "Mb": 3.492, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs32285189", "Chr": 1, "cM": 1.630, + "ID": "T1", "Locus": "rs32285189", "Chr": "1", "cM": 1.630, "Mb": 3.511, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs258367496", "Chr": 1, "cM": 1.630, + "ID": "T1", "Locus": "rs258367496", "Chr": "1", "cM": 1.630, "Mb": 3.660, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs32430919", "Chr": 1, "cM": 1.750, + "ID": "T1", "Locus": "rs32430919", "Chr": "1", "cM": 1.750, "Mb": 3.777, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs36251697", "Chr": 1, "cM": 1.880, + "ID": "T1", "Locus": "rs36251697", "Chr": "1", "cM": 1.880, "Mb": 3.812, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs30658298", "Chr": 1, "cM": 2.010, + "ID": "T1", "Locus": "rs30658298", "Chr": "1", "cM": 2.010, "Mb": 4.431, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs51852623", "Chr": 1, "cM": 2.010, + "ID": "T1", "Locus": "rs51852623", "Chr": "1", "cM": 2.010, "Mb": 4.447, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs31879829", "Chr": 1, "cM": 2.140, + "ID": "T1", "Locus": "rs31879829", "Chr": "1", "cM": 2.140, "Mb": 4.519, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs36742481", "Chr": 1, "cM": 2.140, + "ID": "T1", "Locus": "rs36742481", "Chr": "1", "cM": 2.140, "Mb": 4.776, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 } -- cgit v1.2.3 From d4943f1d01d89a3928c905f80914a23144126c8e Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Mon, 6 Sep 2021 08:09:20 +0300 Subject: Provide function to organise parsed QTLReaper results * gn3/computations/qtlreaper.py: Provide a function to organise the results by trait for easier use down the line. * tests/unit/computations/test_qtlreaper.py: provide a test to ensure that the organising function works as expected. --- gn3/computations/qtlreaper.py | 25 +++++++ tests/unit/computations/test_qtlreaper.py | 105 +++++++++++++++++++++++++++++- 2 files changed, 129 insertions(+), 1 deletion(-) diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index 9b20309..8c0e6de 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -86,6 +86,31 @@ def run_reaper( subprocess.run(command_list, check=True) return (output_filename, permu_output_filename) +def organise_reaper_main_results(parsed_results): + def __organise_by_chromosome(chr_name, items): + chr_items = [item for item in items if item["Chr"] == chr_name] + return { + "Chr": str(chr_name), + "loci": [{ + "Locus": locus["Locus"], + "cM": locus["cM"], + "Mb": locus["Mb"], + "LRS": locus["LRS"], + "Additive": locus["Additive"], + "pValue": locus["pValue"] + } for locus in chr_items]} + + def __organise_by_id(identifier, items): + id_items = [item for item in items if item["ID"] == identifier] + unique_chromosomes = {item["Chr"] for item in id_items} + return { + "ID": identifier, + "chromosomes": [ + __organise_by_chromosome(chromo, id_items) + for chromo in sorted(unique_chromosomes)]} + + unique_ids = {res["ID"] for res in parsed_results} + return [__organise_by_id(_id, parsed_results) for _id in sorted(unique_ids)] def parse_reaper_main_results(results_file): """ diff --git a/tests/unit/computations/test_qtlreaper.py b/tests/unit/computations/test_qtlreaper.py index fd3434a..1d7347f 100644 --- a/tests/unit/computations/test_qtlreaper.py +++ b/tests/unit/computations/test_qtlreaper.py @@ -1,7 +1,9 @@ """Module contains tests for gn3.computations.qtlreaper""" from unittest import TestCase from gn3.computations.qtlreaper import ( - parse_reaper_main_results, parse_reaper_permutation_results) + parse_reaper_main_results, + organise_reaper_main_results, + parse_reaper_permutation_results) class TestQTLReaper(TestCase): """Class for testing qtlreaper interface functions.""" @@ -73,3 +75,104 @@ class TestQTLReaper(TestCase): 5.24619, 5.27961, 5.28228, 5.43903, 5.50188, 5.51694, 5.56830, 5.63874, 5.71346, 5.71936, 5.74275, 5.76764, 5.79815, 5.81671, 5.82775, 5.89659, 5.92117, 5.93396, 5.93396, 5.94957]) + + def test_organise_reaper_main_results(self): + self.assertEqual( + organise_reaper_main_results([ + { + "ID": "T1", "Locus": "rs31443144", "Chr": 1, "cM": 1.500, + "Mb": 3.010, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": "rs6269442", "Chr": 1, "cM": 1.500, + "Mb": 3.492, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": "rs32285189", "Chr": 1, "cM": 1.630, + "Mb": 3.511, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": "rs258367496", "Chr": 1, "cM": 1.630, + "Mb": 3.660, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": "rs32430919", "Chr": 1, "cM": 1.750, + "Mb": 3.777, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": "rs36251697", "Chr": 1, "cM": 1.880, + "Mb": 3.812, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": "rs30658298", "Chr": 1, "cM": 2.010, + "Mb": 4.431, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": "rs51852623", "Chr": 2, "cM": 2.010, + "Mb": 4.447, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": "rs31879829", "Chr": 2, "cM": 2.140, + "Mb": 4.519, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + }, + { + "ID": "T1", "Locus": "rs36742481", "Chr": 2, "cM": 2.140, + "Mb": 4.776, "LRS": 0.500, "Additive": -0.074, + "pValue": 1.000 + } + ]), + [{"ID": "T1", + "chromosomes": [ + {"Chr": "1", + "loci": [ + { + "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs6269442", "cM": 1.500, "Mb": 3.492, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs32285189", "cM": 1.630, "Mb": 3.511, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs258367496", "cM": 1.630, "Mb": 3.660, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs32430919", "cM": 1.750, "Mb": 3.777, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs36251697", "cM": 1.880, "Mb": 3.812, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs30658298", "cM": 2.010, "Mb": 4.431, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }]}, + {"Chr": "2", + "loci": [ + { + "Locus": "rs51852623", "cM": 2.010, "Mb": 4.447, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs31879829", "cM": 2.140, "Mb": 4.519, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs36742481", "cM": 2.140, "Mb": 4.776, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }]}]}]) -- cgit v1.2.3 From 31ca02d1f095c2cc667e5b7d49131d702982f321 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Wed, 8 Sep 2021 06:52:01 +0300 Subject: Fix the traits order computations for clustering Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/computations/heatmap.py: Fix ordering function * tests/unit/computations/test_heatmap.py: update test The order of the traits is important for the clustering algorithm, since the clustering seems to use the distance of one trait from another to determine how to order them. This commit also gets rid of the xoffset argument that is not important to the ordering, and was used in the older GN1 to determine how to draw the clustering lines. --- gn3/computations/heatmap.py | 16 ++++++---------- tests/unit/computations/test_heatmap.py | 11 +++-------- 2 files changed, 9 insertions(+), 18 deletions(-) diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py index ccce385..8727c92 100644 --- a/gn3/computations/heatmap.py +++ b/gn3/computations/heatmap.py @@ -180,28 +180,24 @@ def heatmap_data(traits_names, conn: Any): "traits_filename": traits_filename } -def compute_heatmap_order( - slink_data, xoffset: int = 40, neworder: tuple = tuple()): +def compute_traits_order(slink_data, neworder: tuple = tuple()): """ - Compute the data used for drawing the heatmap proper from `slink_data`. + Compute the order of the traits for clustering from `slink_data`. This function tries to reproduce the creation and update of the `neworder` variable in https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L120 and in the `web.webqtl.heatmap.Heatmap.draw` function in GN1 """ - d_1 = (0, 0, 0) # returned from self.draw in lines 391 and 399. This is just a placeholder - def __order_maker(norder, slnk_dt): if isinstance(slnk_dt[0], int) and isinstance(slnk_dt[1], int): - return norder + ( - (xoffset+20, slnk_dt[0]), (xoffset + 40, slnk_dt[1])) + return norder + (slnk_dt[0], slnk_dt[1]) if isinstance(slnk_dt[0], int): - return norder + ((xoffset + 20, slnk_dt[0]), ) + return __order_maker((norder + (slnk_dt[0], )), slnk_dt[1]) if isinstance(slnk_dt[1], int): - return norder + ((xoffset + d_1[0] + 20, slnk_dt[1]), ) + return __order_maker(norder, slnk_dt[0]) + (slnk_dt[1], ) return __order_maker(__order_maker(norder, slnk_dt[0]), slnk_dt[1]) @@ -222,7 +218,7 @@ def retrieve_strains_and_values(orders, strainlist, traits_data_list): values = [] rets = [] for order in orders: - temp_val = traits_data_list[order[1]] + temp_val = traits_data_list[order] for i, strain in enumerate(strainlist): if temp_val[i] is not None: strains.append(strain) diff --git a/tests/unit/computations/test_heatmap.py b/tests/unit/computations/test_heatmap.py index 87f8e45..f1bbefc 100644 --- a/tests/unit/computations/test_heatmap.py +++ b/tests/unit/computations/test_heatmap.py @@ -3,7 +3,7 @@ from unittest import TestCase from gn3.computations.heatmap import ( cluster_traits, export_trait_data, - compute_heatmap_order, + compute_traits_order, retrieve_strains_and_values) strainlist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] @@ -158,13 +158,8 @@ class TestHeatmap(TestCase): def test_compute_heatmap_order(self): """Test the orders.""" - for xoff, expected in [ - (40, ((60, 9), (60, 4))), - (30, ((50, 9), (50, 4))), - (20, ((40, 9), (40, 4)))]: - with self.subTest(xoffset=xoff): - self.assertEqual( - compute_heatmap_order(slinked, xoffset=xoff), expected) + self.assertEqual( + compute_traits_order(slinked), (0, 2, 1, 7, 5, 9, 3, 6, 8, 4)) def test_retrieve_strains_and_values(self): """Test retrieval of strains and values.""" -- cgit v1.2.3 From f360cc62cc156af90d3283ae7b6db9e8250fa43c Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Wed, 8 Sep 2021 10:51:57 +0300 Subject: Remove extraneous text to ease sorting Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Change the id from 'T' to simply '' to ease sorting of the trait results by numerical order rather than string order. --- gn3/computations/qtlreaper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index 8c0e6de..ec215e5 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -20,9 +20,9 @@ def generate_traits_file(strains, trait_values, traits_filename): header = "Trait\t{}\n".format("\t".join(strains)) data = ( [header] + - ["T{}\t{}\n".format(i+1, "\t".join([str(i) for i in t])) + ["{}\t{}\n".format(i+1, "\t".join([str(i) for i in t])) for i, t in enumerate(trait_values[:-1])] + - ["T{}\t{}".format( + ["{}\t{}".format( len(trait_values), "\t".join([str(i) for i in t])) for t in trait_values[-1:]]) with open(traits_filename, "w") as outfile: -- cgit v1.2.3 From 3f323734fcf258d28f3f7d33fdc1518ef9ec24a8 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Wed, 8 Sep 2021 10:54:48 +0300 Subject: Parse Chr value as int where possible * To ease sorting of data by numerical order down the line, sort the "Chr" values by numerical order. --- gn3/computations/qtlreaper.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index ec215e5..02d6572 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -86,11 +86,16 @@ def run_reaper( subprocess.run(command_list, check=True) return (output_filename, permu_output_filename) +def chromosome_sorter_key_fn(val): + if isinstance(val, int): + return val + return ord(val) + def organise_reaper_main_results(parsed_results): def __organise_by_chromosome(chr_name, items): chr_items = [item for item in items if item["Chr"] == chr_name] return { - "Chr": str(chr_name), + "Chr": chr_name, "loci": [{ "Locus": locus["Locus"], "cM": locus["cM"], @@ -125,9 +130,15 @@ def parse_reaper_main_results(results_file): except: return value + def __parse_column_int_value(value): + try: + return int(value) + except: + return value + def __parse_line(line): items = line.strip().split("\t") - return items[0:3] + [ + return items[0:2] + [__parse_column_int_value(items[2])] + [ __parse_column_float_value(item) for item in items[3:]] header = lines[0].strip().split("\t") -- cgit v1.2.3 From a718069c757bea9f7ecbaee25e23bd581750f906 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Wed, 8 Sep 2021 10:56:56 +0300 Subject: Ease search for traits and chromosomes Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Return a dict of values rather than list for the traits and chromosomes to ease searching through the data. --- gn3/computations/qtlreaper.py | 9 ++- tests/unit/computations/test_qtlreaper.py | 92 +++++++++++++++---------------- 2 files changed, 52 insertions(+), 49 deletions(-) diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index 02d6572..5180853 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -110,12 +110,15 @@ def organise_reaper_main_results(parsed_results): unique_chromosomes = {item["Chr"] for item in id_items} return { "ID": identifier, - "chromosomes": [ + "chromosomes": {_chr["Chr"]: _chr for _chr in [ __organise_by_chromosome(chromo, id_items) - for chromo in sorted(unique_chromosomes)]} + for chromo in sorted( + unique_chromosomes, key=chromosome_sorter_key_fn)]}} unique_ids = {res["ID"] for res in parsed_results} - return [__organise_by_id(_id, parsed_results) for _id in sorted(unique_ids)] + return { + trait["ID"]: trait for trait in + [__organise_by_id(_id, parsed_results) for _id in sorted(unique_ids)]} def parse_reaper_main_results(results_file): """ diff --git a/tests/unit/computations/test_qtlreaper.py b/tests/unit/computations/test_qtlreaper.py index 1d7347f..495ed97 100644 --- a/tests/unit/computations/test_qtlreaper.py +++ b/tests/unit/computations/test_qtlreaper.py @@ -130,49 +130,49 @@ class TestQTLReaper(TestCase): "pValue": 1.000 } ]), - [{"ID": "T1", - "chromosomes": [ - {"Chr": "1", - "loci": [ - { - "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs6269442", "cM": 1.500, "Mb": 3.492, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs32285189", "cM": 1.630, "Mb": 3.511, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs258367496", "cM": 1.630, "Mb": 3.660, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs32430919", "cM": 1.750, "Mb": 3.777, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs36251697", "cM": 1.880, "Mb": 3.812, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs30658298", "cM": 2.010, "Mb": 4.431, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }]}, - {"Chr": "2", - "loci": [ - { - "Locus": "rs51852623", "cM": 2.010, "Mb": 4.447, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs31879829", "cM": 2.140, "Mb": 4.519, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs36742481", "cM": 2.140, "Mb": 4.776, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }]}]}]) + {"T1": {"ID": "T1", + "chromosomes": { + 1: {"Chr": 1, + "loci": [ + { + "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs6269442", "cM": 1.500, "Mb": 3.492, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs32285189", "cM": 1.630, "Mb": 3.511, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs258367496", "cM": 1.630, "Mb": 3.660, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs32430919", "cM": 1.750, "Mb": 3.777, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs36251697", "cM": 1.880, "Mb": 3.812, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs30658298", "cM": 2.010, "Mb": 4.431, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }]}, + 2: {"Chr": 2, + "loci": [ + { + "Locus": "rs51852623", "cM": 2.010, "Mb": 4.447, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs31879829", "cM": 2.140, "Mb": 4.519, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs36742481", "cM": 2.140, "Mb": 4.776, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }]}}}}) -- cgit v1.2.3 From 0d4e21d74d54554cf278142d0a31d35bec873ac1 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Thu, 9 Sep 2021 10:59:31 +0300 Subject: Update proof-of-concept code * Add individual heatmaps * Add dendograms * Merge multiple heatmaps to single plot Updated the proof of concept code to provide a sample of what is needed to generate the appropriate heatmaps. To generate the sample heatmaps, one can run something like: env SQL_URI="mysql://webqtlout:webqtlout@127.0.0.1:3306/db_webqtl" \ python3 qtlfilesexport.py assuming that the database can be accessed at 127.0.0.1:3306 --- qtlfilesexport.py | 199 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 192 insertions(+), 7 deletions(-) diff --git a/qtlfilesexport.py b/qtlfilesexport.py index 799de31..100fa75 100644 --- a/qtlfilesexport.py +++ b/qtlfilesexport.py @@ -7,19 +7,38 @@ Run with: replacing the variables in the angled brackets with the appropriate values """ +from gn3.random import random_string from gn3.computations.slink import slink from gn3.db_utils import database_connector from gn3.computations.qtlreaper import run_reaper -from gn3.computations.heatmap import export_trait_data from gn3.db.traits import retrieve_trait_data, retrieve_trait_info -from gn3.db.genotypes import build_genotype_file, load_genotype_samples -from gn3.computations.qtlreaper import random_string, generate_traits_file +from gn3.computations.heatmap import export_trait_data, get_nearest_marker +from gn3.db.genotypes import ( + build_genotype_file, + parse_genotype_file, + load_genotype_samples) from gn3.computations.heatmap import ( cluster_traits, - compute_heatmap_order, + compute_traits_order, retrieve_strains_and_values) +from gn3.computations.qtlreaper import ( + generate_traits_file, + chromosome_sorter_key_fn, + parse_reaper_main_results, + organise_reaper_main_results, + parse_reaper_permutation_results) -TMPDIR = "tmp/qtltests" +import plotly.express as px + +## for dendrogram +import numpy as np +import plotly.graph_objects as go +import plotly.figure_factory as ff + +# for single heatmap +from plotly.subplots import make_subplots + +TMPDIR = "tmp/" def trait_fullnames(): """Return sample names for traits""" @@ -35,6 +54,104 @@ def trait_fullnames(): "UCLA_BXDBXH_CARTILAGE_V2::ILM4200064", "UCLA_BXDBXH_CARTILAGE_V2::ILM3140463"] +def get_lrs_from_chr(trait, chr_name): + chromosome = trait["chromosomes"].get(chr_name) + if chromosome: + return [ + locus["LRS"] for locus in + sorted(chromosome["loci"], key=lambda loc: loc["Locus"])] + return [None] + +def process_traits_data_for_heatmap(data, trait_names, chromosome_names): + print("TRAIT_NAMES: {}".format(trait_names)) + print("chromosome names: {}".format(chromosome_names)) + print("data keys: {}".format(data.keys())) + hdata = [ + [get_lrs_from_chr(data[trait], chr_name) for trait in trait_names] + for chr_name in chromosome_names] + # print("hdata: {}".format(hdata)) + return hdata + +def generate_heatmap( + data, image_filename_prefix, x_axis = None, x_label: str = "", + y_axis = None, y_label: str = "", output_dir: str = TMPDIR): + """Generate single heatmap section.""" + print("X-AXIS:({}, {}), Y-AXIS: ({}, {}), ROWS:{}, COLS:{}".format( + x_axis, (len(x_axis) if x_axis else 0), + y_axis, (len(y_axis) if y_axis else 0), + len(data), len(data[0]))) + fig = px.imshow( + data, + x = x_axis, + y = y_axis, + width=1000 + ) + fig.update_yaxes(title=y_label) + fig.update_xaxes(title=x_label) + image_filename = "{}/{}.html".format(output_dir, image_filename_prefix) + fig.write_html(image_filename) + return image_filename, fig + +def generate_dendrogram( + data, image_filename_prefix, x_axis = None, x_label: str = "", + y_axis = None, y_label: str = "", output_dir: str = TMPDIR): + fig = ff.create_dendrogram( + np.array(data), orientation="right", labels=y_axis) + + heatmap = go.Heatmap( + x=fig['layout']['xaxis']['ticktext'], + y=fig['layout']['yaxis']['ticktext'], + z=data) + + # print("HEAMAP:{}".format(heatmap)) + fig.add_trace(heatmap) + + fig.update_layout({"width": 1000, "height": 500}) + image_filename = "{}/{}.html".format(output_dir, image_filename_prefix) + fig.write_html(image_filename) + return image_filename, fig + +def generate_single_heatmap( + data, image_filename_prefix, x_axis = None, x_label: str = "", + y_axis = None, y_label: str = "", output_dir: str = TMPDIR): + """Generate single heatmap section.""" + # fig = go.Figure({"type": "heatmap"}) + num_cols = len(x_axis) + fig = make_subplots( + rows=1, + cols=num_cols, + shared_yaxes="rows", + # horizontal_spacing=(1 / (num_cols - 1)), + subplot_titles=x_axis + ) + hms = [go.Heatmap( + name=chromo, + y = y_axis, + z = data_array, + showscale=False) for chromo, data_array in zip(x_axis, data)] + for col, hm in enumerate(hms): + fig.add_trace(hm, row=1, col=(col + 1)) + + fig.update_traces( + showlegend=False, + colorscale=[ + [0.0, '#3B3B3B'], [0.4999999999999999, '#ABABAB'], + [0.5, '#F5DE11'], [1.0, '#FF0D00']], + selector={"type": "heatmap"}) + fig.update_traces( + showlegend=True, + showscale=True, + selector={"name": x_axis[-1]}) + fig.update_layout( + coloraxis_colorscale=[ + [0.0, '#3B3B3B'], [0.4999999999999999, '#ABABAB'], + [0.5, '#F5DE11'], [1.0, '#FF0D00']] + ) + print(fig) + image_filename = "{}/{}.html".format(output_dir, image_filename_prefix) + fig.write_html(image_filename) + return image_filename, fig + def main(): """entrypoint function""" conn = database_connector()[0] @@ -44,13 +161,19 @@ def main(): for fullname in trait_fullnames()] traits_data_list = [retrieve_trait_data(t, conn) for t in traits] genotype_filename = build_genotype_file(traits[0]["riset"]) + genotype = parse_genotype_file(genotype_filename) strains = load_genotype_samples(genotype_filename) exported_traits_data_list = [ export_trait_data(td, strains) for td in traits_data_list] slinked = slink(cluster_traits(exported_traits_data_list)) - orders = compute_heatmap_order(slinked) + print("SLINKED: {}".format(slinked)) + traits_order = compute_traits_order(slinked) + print("KEYS: {}".format(traits[0].keys())) + ordered_traits_names = [ + traits[idx]["trait_fullname"] for idx in traits_order] + print("ORDERS: {}".format(traits_order)) strains_and_values = retrieve_strains_and_values( - orders, strains, exported_traits_data_list) + traits_order, strains, exported_traits_data_list) strains_values = strains_and_values[0][1] trait_values = [t[2] for t in strains_and_values] traits_filename = "{}/traits_test_file_{}.txt".format( @@ -64,5 +187,67 @@ def main(): print("Main output: {}, Permutation output: {}".format( main_output, permutations_output)) + qtlresults = parse_reaper_main_results(main_output) + permudata = parse_reaper_permutation_results(permutations_output) + # print("QTLRESULTS: {}".format(qtlresults)) + # print("PERMUDATA: {}".format(permudata)) + + nearest = get_nearest_marker(traits, genotype) + print("NEAREST: {}".format(nearest)) + + organised = organise_reaper_main_results(qtlresults) + + traits_ids = [# sort numerically, but retain the ids as strings + str(i) for i in sorted({int(row["ID"]) for row in qtlresults})] + chromosome_names = sorted( + {row["Chr"] for row in qtlresults}, key = chromosome_sorter_key_fn) + loci_names = sorted({row["Locus"] for row in qtlresults}) + ordered_traits_names = { + res_id: trait for res_id, trait in + zip(traits_ids, + [traits[idx]["trait_fullname"] for idx in traits_order])} + # print("ordered:{}, original: {}".format( + # ordered_traits_names, [t["trait_fullname"] for t in traits])) + # print("chromosome_names:{}".format(chromosome_names)) + # print("trait_ids:{}".format(traits_ids)) + # print("loci names:{}".format(loci_names)) + hdata = process_traits_data_for_heatmap(organised, traits_ids, chromosome_names) + + # print("ZIPPED: {}".format(zip(tuple(ordered_traits_names.keys()), hdata))) + # print("HDATA LENGTH:{}, ORDERED TRAITS LENGTH:{}".format(len(hdata), len(ordered_traits_names.keys()))) + heatmaps_data = [ + generate_heatmap( + data, + "heatmap_chr{}_{}".format(chromo, random_string(10)), + y_axis=tuple( + ordered_traits_names[traits_ids[order]] + for order in traits_order), + x_label=chromo, + output_dir=TMPDIR) + for chromo, data in zip(chromosome_names, hdata)] + print("IMAGES FILENAMES: {}".format([img[0] for img in heatmaps_data])) + + dendograms_data = [ + generate_dendrogram( + data, + "dendo_chr{}_{}".format(chromo, random_string(10)), + y_axis=tuple( + ordered_traits_names[traits_ids[order]] + for order in traits_order), + x_label=chromo, + output_dir=TMPDIR) + for chromo, data in zip(chromosome_names, hdata)] + + res = generate_single_heatmap( + hdata, + "single_heatmap_{}".format(random_string(10)), + y_axis=tuple( + ordered_traits_names[traits_ids[order]] + for order in traits_order), + y_label="Traits", + x_axis=[chromo for chromo in chromosome_names], + x_label="Chromosomes", + output_dir=TMPDIR) + if __name__ == "__main__": main() -- cgit v1.2.3 From ed2e4c0f9d68cfb720da95eba559d69359f7b5fc Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 15 Sep 2021 05:35:34 +0300 Subject: Add missing sample file for tests * tests/unit/db/data/genotypes/genotype_sample1.geno: new file Add a missing sample data file needed for unit tests. --- tests/unit/db/data/genotypes/genotype_sample1.geno | 23 ++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 tests/unit/db/data/genotypes/genotype_sample1.geno diff --git a/tests/unit/db/data/genotypes/genotype_sample1.geno b/tests/unit/db/data/genotypes/genotype_sample1.geno new file mode 100644 index 0000000..2a55964 --- /dev/null +++ b/tests/unit/db/data/genotypes/genotype_sample1.geno @@ -0,0 +1,23 @@ +# File name: genotype_sample for testing + +# Metadata: Please retain this header information with file. + + +@name: BXD +@type: riset +@mat: B +@pat: D +@het:H +@unk: U + + + + + + +Chr Locus cM Mb BXD1 BXD2 BXD5 BXD6 BXD8 BXD9 +1 rs31443144 1.50 3.010274 B B D D D B +1 rs6269442 1.50 3.492195 B B D D H Y +1 rs32285189 1.63 3.511204 B U D D D B +2 rs31443144 1.50 3.010274 B B D D D B +2 rs6269442 1.50 3.492195 B B D D H Y \ No newline at end of file -- cgit v1.2.3 From f17b489c8eb94050b81b1a59fb43954d036f7c38 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 15 Sep 2021 06:01:44 +0300 Subject: Fix format of arguments and expected values * tests/unit/computations/test_heatmap.py: ordering is not longer provided as a list of tuples; the ordering values are just a list of numbers now. This commit updates the test to take this into consideration. * tests/unit/computations/test_qtlreaper.py: the 'Chr' value if numeric, is represented by an actual number, not a string. This commit updates the code to take this into consideration. --- tests/unit/computations/test_heatmap.py | 8 ++++---- tests/unit/computations/test_qtlreaper.py | 20 ++++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/unit/computations/test_heatmap.py b/tests/unit/computations/test_heatmap.py index f1bbefc..156af45 100644 --- a/tests/unit/computations/test_heatmap.py +++ b/tests/unit/computations/test_heatmap.py @@ -165,22 +165,22 @@ class TestHeatmap(TestCase): """Test retrieval of strains and values.""" for orders, slist, tdata, expected in [ [ - [(60, 2)], + [2], ["s1", "s2", "s3", "s4"], [[2, 9, 6, None, 4], [7, 5, None, None, 4], [9, None, 5, 4, 7], [6, None, None, 4, None]], - [[(60, 2), ["s1", "s3", "s4"], [9, 5, 4]]] + [[2, ["s1", "s3", "s4"], [9, 5, 4]]] ], [ - [(60, 3)], + [3], ["s1", "s2", "s3", "s4", "s5"], [[2, 9, 6, None, 4], [7, 5, None, None, 4], [9, None, 5, 4, 7], [6, None, None, 4, None]], - [[(60, 3), ["s1", "s4"], [6, 4]]] + [[3, ["s1", "s4"], [6, 4]]] ]]: with self.subTest(strainlist=slist, traitdata=tdata): self.assertEqual( diff --git a/tests/unit/computations/test_qtlreaper.py b/tests/unit/computations/test_qtlreaper.py index 495ed97..1d67827 100644 --- a/tests/unit/computations/test_qtlreaper.py +++ b/tests/unit/computations/test_qtlreaper.py @@ -15,52 +15,52 @@ class TestQTLReaper(TestCase): "tests/unit/computations/data/qtlreaper/main_output_sample.txt"), [ { - "ID": "T1", "Locus": "rs31443144", "Chr": "1", "cM": 1.500, + "ID": "T1", "Locus": "rs31443144", "Chr": 1, "cM": 1.500, "Mb": 3.010, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs6269442", "Chr": "1", "cM": 1.500, + "ID": "T1", "Locus": "rs6269442", "Chr": 1, "cM": 1.500, "Mb": 3.492, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs32285189", "Chr": "1", "cM": 1.630, + "ID": "T1", "Locus": "rs32285189", "Chr": 1, "cM": 1.630, "Mb": 3.511, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs258367496", "Chr": "1", "cM": 1.630, + "ID": "T1", "Locus": "rs258367496", "Chr": 1, "cM": 1.630, "Mb": 3.660, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs32430919", "Chr": "1", "cM": 1.750, + "ID": "T1", "Locus": "rs32430919", "Chr": 1, "cM": 1.750, "Mb": 3.777, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs36251697", "Chr": "1", "cM": 1.880, + "ID": "T1", "Locus": "rs36251697", "Chr": 1, "cM": 1.880, "Mb": 3.812, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs30658298", "Chr": "1", "cM": 2.010, + "ID": "T1", "Locus": "rs30658298", "Chr": 1, "cM": 2.010, "Mb": 4.431, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs51852623", "Chr": "1", "cM": 2.010, + "ID": "T1", "Locus": "rs51852623", "Chr": 1, "cM": 2.010, "Mb": 4.447, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs31879829", "Chr": "1", "cM": 2.140, + "ID": "T1", "Locus": "rs31879829", "Chr": 1, "cM": 2.140, "Mb": 4.519, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs36742481", "Chr": "1", "cM": 2.140, + "ID": "T1", "Locus": "rs36742481", "Chr": 1, "cM": 2.140, "Mb": 4.776, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 } -- cgit v1.2.3 From e3e18950cfcdec918429dcbb5d5ed2e9616b7a20 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 15 Sep 2021 11:19:56 +0300 Subject: Reorganise modules Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * The heatmap generation does not fall cleanly within the computations or db modules. This commit moves it to the higher level gn3 module. --- gn3/computations/heatmap.py | 277 ----------------------------- gn3/heatmaps.py | 302 ++++++++++++++++++++++++++++++++ gn3/heatmaps/heatmaps.py | 67 ------- tests/unit/computations/test_heatmap.py | 187 -------------------- tests/unit/test_heatmaps.py | 187 ++++++++++++++++++++ 5 files changed, 489 insertions(+), 531 deletions(-) delete mode 100644 gn3/computations/heatmap.py create mode 100644 gn3/heatmaps.py delete mode 100644 gn3/heatmaps/heatmaps.py delete mode 100644 tests/unit/computations/test_heatmap.py create mode 100644 tests/unit/test_heatmaps.py diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py deleted file mode 100644 index 8727c92..0000000 --- a/gn3/computations/heatmap.py +++ /dev/null @@ -1,277 +0,0 @@ -""" -This module will contain functions to be used in computation of the data used to -generate various kinds of heatmaps. -""" - -from functools import reduce -from typing import Any, Dict, Sequence -from gn3.computations.slink import slink -from gn3.computations.qtlreaper import generate_traits_file -from gn3.computations.correlations2 import compute_correlation -from gn3.db.genotypes import build_genotype_file, load_genotype_samples -from gn3.db.traits import ( - retrieve_trait_data, - retrieve_trait_info, - generate_traits_filename) - -def export_trait_data( - trait_data: dict, strainlist: Sequence[str], dtype: str = "val", - var_exists: bool = False, n_exists: bool = False): - """ - Export data according to `strainlist`. Mostly used in calculating - correlations. - - DESCRIPTION: - Migrated from - https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L166-L211 - - PARAMETERS - trait: (dict) - The dictionary of key-value pairs representing a trait - strainlist: (list) - A list of strain names - dtype: (str) - ... verify what this is ... - var_exists: (bool) - A flag indicating existence of variance - n_exists: (bool) - A flag indicating existence of ndata - """ - def __export_all_types(tdata, strain): - sample_data = [] - if tdata[strain]["value"]: - sample_data.append(tdata[strain]["value"]) - if var_exists: - if tdata[strain]["variance"]: - sample_data.append(tdata[strain]["variance"]) - else: - sample_data.append(None) - if n_exists: - if tdata[strain]["ndata"]: - sample_data.append(tdata[strain]["ndata"]) - else: - sample_data.append(None) - else: - if var_exists and n_exists: - sample_data += [None, None, None] - elif var_exists or n_exists: - sample_data += [None, None] - else: - sample_data.append(None) - - return tuple(sample_data) - - def __exporter(accumulator, strain): - # pylint: disable=[R0911] - if strain in trait_data["data"]: - if dtype == "val": - return accumulator + (trait_data["data"][strain]["value"], ) - if dtype == "var": - return accumulator + (trait_data["data"][strain]["variance"], ) - if dtype == "N": - return accumulator + (trait_data["data"][strain]["ndata"], ) - if dtype == "all": - return accumulator + __export_all_types(trait_data["data"], strain) - raise KeyError("Type `%s` is incorrect" % dtype) - if var_exists and n_exists: - return accumulator + (None, None, None) - if var_exists or n_exists: - return accumulator + (None, None) - return accumulator + (None,) - - return reduce(__exporter, strainlist, tuple()) - -def trait_display_name(trait: Dict): - """ - Given a trait, return a name to use to display the trait on a heatmap. - - DESCRIPTION - Migrated from - https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L141-L157 - """ - if trait.get("db", None) and trait.get("trait_name", None): - if trait["db"]["dataset_type"] == "Temp": - desc = trait["description"] - if desc.find("PCA") >= 0: - return "%s::%s" % ( - trait["db"]["displayname"], - desc[desc.rindex(':')+1:].strip()) - return "%s::%s" % ( - trait["db"]["displayname"], - desc[:desc.index('entered')].strip()) - prefix = "%s::%s" % ( - trait["db"]["dataset_name"], trait["trait_name"]) - if trait["cellid"]: - return "%s::%s" % (prefix, trait["cellid"]) - return prefix - return trait["description"] - -def cluster_traits(traits_data_list: Sequence[Dict]): - """ - Clusters the trait values. - - DESCRIPTION - Attempts to replicate the clustering of the traits, as done at - https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L138-L162 - """ - def __compute_corr(tdata_i, tdata_j): - if tdata_i[0] == tdata_j[0]: - return 0.0 - corr_vals = compute_correlation(tdata_i[1], tdata_j[1]) - corr = corr_vals[0] - if (1 - corr) < 0: - return 0.0 - return 1 - corr - - def __cluster(tdata_i): - return tuple( - __compute_corr(tdata_i, tdata_j) - for tdata_j in enumerate(traits_data_list)) - - return tuple(__cluster(tdata_i) for tdata_i in enumerate(traits_data_list)) - -def heatmap_data(traits_names, conn: Any): - """ - heatmap function - - DESCRIPTION - This function is an attempt to reproduce the initialisation at - https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L46-L64 - and also the clustering and slink computations at - https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L138-L165 - with the help of the `gn3.computations.heatmap.cluster_traits` function. - - It does not try to actually draw the heatmap image. - - PARAMETERS: - TODO: Elaborate on the parameters here... - """ - threshold = 0 # webqtlConfig.PUBLICTHRESH - def __retrieve_traitlist_and_datalist(threshold, fullname): - trait = retrieve_trait_info(threshold, fullname, conn) - return (trait, retrieve_trait_data(trait, conn)) - - traits_details = [ - __retrieve_traitlist_and_datalist(threshold, fullname) - for fullname in traits_names] - traits_list = tuple(x[0] for x in traits_details) - traits_data_list = [x[1] for x in traits_details] - genotype_filename = build_genotype_file(traits_list[0]["riset"]) - strainlist = load_genotype_samples(genotype_filename) - exported_traits_data_list = tuple( - export_trait_data(td, strainlist) for td in traits_data_list) - slink_data = slink(cluster_traits(exported_traits_data_list)) - ordering_data = compute_heatmap_order(slink_data) - strains_and_values = retrieve_strains_and_values( - ordering_data, strainlist, exported_traits_data_list) - strains_values = strains_and_values[0][1] - trait_values = [t[2] for t in strains_and_values] - traits_filename = generate_traits_filename() - generate_traits_file(strains_values, trait_values, traits_filename) - - return { - "slink_data": slink_data, - "ordering_data": ordering_data, - "strainlist": strainlist, - "genotype_filename": genotype_filename, - "traits_list": traits_list, - "traits_data_list": traits_data_list, - "exported_traits_data_list": exported_traits_data_list, - "traits_filename": traits_filename - } - -def compute_traits_order(slink_data, neworder: tuple = tuple()): - """ - Compute the order of the traits for clustering from `slink_data`. - - This function tries to reproduce the creation and update of the `neworder` - variable in - https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L120 - and in the `web.webqtl.heatmap.Heatmap.draw` function in GN1 - """ - def __order_maker(norder, slnk_dt): - if isinstance(slnk_dt[0], int) and isinstance(slnk_dt[1], int): - return norder + (slnk_dt[0], slnk_dt[1]) - - if isinstance(slnk_dt[0], int): - return __order_maker((norder + (slnk_dt[0], )), slnk_dt[1]) - - if isinstance(slnk_dt[1], int): - return __order_maker(norder, slnk_dt[0]) + (slnk_dt[1], ) - - return __order_maker(__order_maker(norder, slnk_dt[0]), slnk_dt[1]) - - return __order_maker(neworder, slink_data) - -def retrieve_strains_and_values(orders, strainlist, traits_data_list): - """ - Get the strains and their corresponding values from `strainlist` and - `traits_data_list`. - - This migrates the code in - https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L215-221 - """ - # This feels nasty! There's a lot of mutation of values here, that might - # indicate something untoward in the design of this function and its - # dependents ==> Review - strains = [] - values = [] - rets = [] - for order in orders: - temp_val = traits_data_list[order] - for i, strain in enumerate(strainlist): - if temp_val[i] is not None: - strains.append(strain) - values.append(temp_val[i]) - rets.append([order, strains[:], values[:]]) - strains = [] - values = [] - - return rets - -def nearest_marker_finder(genotype): - """ - Returns a function to be used with `genotype` to compute the nearest marker - to the trait passed to the returned function. - - https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L425-434 - """ - def __compute_distances(chromo, trait): - loci = chromo.get("loci", None) - if not loci: - return None - return tuple( - { - "name": locus["name"], - "distance": abs(locus["Mb"] - trait["mb"]) - } for locus in loci) - - def __finder(trait): - _chrs = tuple( - _chr for _chr in genotype["chromosomes"] - if str(_chr["name"]) == str(trait["chr"])) - if len(_chrs) == 0: - return None - distances = tuple( - distance for dists in - filter( - lambda x: x is not None, - (__compute_distances(_chr, trait) for _chr in _chrs)) - for distance in dists) - nearest = min(distances, key=lambda d: d["distance"]) - return nearest["name"] - return __finder - -def get_nearest_marker(traits_list, genotype): - """ - Retrieves the nearest marker for each of the traits in the list. - - DESCRIPTION: - This migrates the code in - https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L419-L438 - """ - if not genotype["Mbmap"]: - return [None] * len(trait_list) - - marker_finder = nearest_marker_finder(genotype) - return [marker_finder(trait) for trait in traits_list] diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py new file mode 100644 index 0000000..198fb45 --- /dev/null +++ b/gn3/heatmaps.py @@ -0,0 +1,302 @@ +""" +This module will contain functions to be used in computation of the data used to +generate various kinds of heatmaps. +""" + +from functools import reduce +from typing import Any, Dict, Sequence +from gn3.computations.slink import slink +from gn3.computations.qtlreaper import generate_traits_file +from gn3.computations.correlations2 import compute_correlation +from gn3.db.genotypes import build_genotype_file, load_genotype_samples +from gn3.db.traits import ( + retrieve_trait_data, + retrieve_trait_info, + generate_traits_filename) + +def export_trait_data( + trait_data: dict, strainlist: Sequence[str], dtype: str = "val", + var_exists: bool = False, n_exists: bool = False): + """ + Export data according to `strainlist`. Mostly used in calculating + correlations. + + DESCRIPTION: + Migrated from + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L166-L211 + + PARAMETERS + trait: (dict) + The dictionary of key-value pairs representing a trait + strainlist: (list) + A list of strain names + dtype: (str) + ... verify what this is ... + var_exists: (bool) + A flag indicating existence of variance + n_exists: (bool) + A flag indicating existence of ndata + """ + def __export_all_types(tdata, strain): + sample_data = [] + if tdata[strain]["value"]: + sample_data.append(tdata[strain]["value"]) + if var_exists: + if tdata[strain]["variance"]: + sample_data.append(tdata[strain]["variance"]) + else: + sample_data.append(None) + if n_exists: + if tdata[strain]["ndata"]: + sample_data.append(tdata[strain]["ndata"]) + else: + sample_data.append(None) + else: + if var_exists and n_exists: + sample_data += [None, None, None] + elif var_exists or n_exists: + sample_data += [None, None] + else: + sample_data.append(None) + + return tuple(sample_data) + + def __exporter(accumulator, strain): + # pylint: disable=[R0911] + if strain in trait_data["data"]: + if dtype == "val": + return accumulator + (trait_data["data"][strain]["value"], ) + if dtype == "var": + return accumulator + (trait_data["data"][strain]["variance"], ) + if dtype == "N": + return accumulator + (trait_data["data"][strain]["ndata"], ) + if dtype == "all": + return accumulator + __export_all_types(trait_data["data"], strain) + raise KeyError("Type `%s` is incorrect" % dtype) + if var_exists and n_exists: + return accumulator + (None, None, None) + if var_exists or n_exists: + return accumulator + (None, None) + return accumulator + (None,) + + return reduce(__exporter, strainlist, tuple()) + +def trait_display_name(trait: Dict): + """ + Given a trait, return a name to use to display the trait on a heatmap. + + DESCRIPTION + Migrated from + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L141-L157 + """ + if trait.get("db", None) and trait.get("trait_name", None): + if trait["db"]["dataset_type"] == "Temp": + desc = trait["description"] + if desc.find("PCA") >= 0: + return "%s::%s" % ( + trait["db"]["displayname"], + desc[desc.rindex(':')+1:].strip()) + return "%s::%s" % ( + trait["db"]["displayname"], + desc[:desc.index('entered')].strip()) + prefix = "%s::%s" % ( + trait["db"]["dataset_name"], trait["trait_name"]) + if trait["cellid"]: + return "%s::%s" % (prefix, trait["cellid"]) + return prefix + return trait["description"] + +def cluster_traits(traits_data_list: Sequence[Dict]): + """ + Clusters the trait values. + + DESCRIPTION + Attempts to replicate the clustering of the traits, as done at + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L138-L162 + """ + def __compute_corr(tdata_i, tdata_j): + if tdata_i[0] == tdata_j[0]: + return 0.0 + corr_vals = compute_correlation(tdata_i[1], tdata_j[1]) + corr = corr_vals[0] + if (1 - corr) < 0: + return 0.0 + return 1 - corr + + def __cluster(tdata_i): + return tuple( + __compute_corr(tdata_i, tdata_j) + for tdata_j in enumerate(traits_data_list)) + + return tuple(__cluster(tdata_i) for tdata_i in enumerate(traits_data_list)) + +def heatmap_data(traits_names, conn: Any): + """ + heatmap function + + DESCRIPTION + This function is an attempt to reproduce the initialisation at + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L46-L64 + and also the clustering and slink computations at + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L138-L165 + with the help of the `gn3.computations.heatmap.cluster_traits` function. + + It does not try to actually draw the heatmap image. + + PARAMETERS: + TODO: Elaborate on the parameters here... + """ + threshold = 0 # webqtlConfig.PUBLICTHRESH + def __retrieve_traitlist_and_datalist(threshold, fullname): + trait = retrieve_trait_info(threshold, fullname, conn) + return (trait, retrieve_trait_data(trait, conn)) + + traits_details = [ + __retrieve_traitlist_and_datalist(threshold, fullname) + for fullname in traits_names] + traits_list = tuple(x[0] for x in traits_details) + traits_data_list = [x[1] for x in traits_details] + genotype_filename = build_genotype_file(traits_list[0]["riset"]) + strainlist = load_genotype_samples(genotype_filename) + exported_traits_data_list = tuple( + export_trait_data(td, strainlist) for td in traits_data_list) + slink_data = slink(cluster_traits(exported_traits_data_list)) + ordering_data = compute_heatmap_order(slink_data) + strains_and_values = retrieve_strains_and_values( + ordering_data, strainlist, exported_traits_data_list) + strains_values = strains_and_values[0][1] + trait_values = [t[2] for t in strains_and_values] + traits_filename = generate_traits_filename() + generate_traits_file(strains_values, trait_values, traits_filename) + + return { + "slink_data": slink_data, + "ordering_data": ordering_data, + "strainlist": strainlist, + "genotype_filename": genotype_filename, + "traits_list": traits_list, + "traits_data_list": traits_data_list, + "exported_traits_data_list": exported_traits_data_list, + "traits_filename": traits_filename + } + +def compute_traits_order(slink_data, neworder: tuple = tuple()): + """ + Compute the order of the traits for clustering from `slink_data`. + + This function tries to reproduce the creation and update of the `neworder` + variable in + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L120 + and in the `web.webqtl.heatmap.Heatmap.draw` function in GN1 + """ + def __order_maker(norder, slnk_dt): + if isinstance(slnk_dt[0], int) and isinstance(slnk_dt[1], int): + return norder + (slnk_dt[0], slnk_dt[1]) + + if isinstance(slnk_dt[0], int): + return __order_maker((norder + (slnk_dt[0], )), slnk_dt[1]) + + if isinstance(slnk_dt[1], int): + return __order_maker(norder, slnk_dt[0]) + (slnk_dt[1], ) + + return __order_maker(__order_maker(norder, slnk_dt[0]), slnk_dt[1]) + + return __order_maker(neworder, slink_data) + +def retrieve_strains_and_values(orders, strainlist, traits_data_list): + """ + Get the strains and their corresponding values from `strainlist` and + `traits_data_list`. + + This migrates the code in + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L215-221 + """ + # This feels nasty! There's a lot of mutation of values here, that might + # indicate something untoward in the design of this function and its + # dependents ==> Review + strains = [] + values = [] + rets = [] + for order in orders: + temp_val = traits_data_list[order] + for i, strain in enumerate(strainlist): + if temp_val[i] is not None: + strains.append(strain) + values.append(temp_val[i]) + rets.append([order, strains[:], values[:]]) + strains = [] + values = [] + + return rets + +def nearest_marker_finder(genotype): + """ + Returns a function to be used with `genotype` to compute the nearest marker + to the trait passed to the returned function. + + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L425-434 + """ + def __compute_distances(chromo, trait): + loci = chromo.get("loci", None) + if not loci: + return None + return tuple( + { + "name": locus["name"], + "distance": abs(locus["Mb"] - trait["mb"]) + } for locus in loci) + + def __finder(trait): + _chrs = tuple( + _chr for _chr in genotype["chromosomes"] + if str(_chr["name"]) == str(trait["chr"])) + if len(_chrs) == 0: + return None + distances = tuple( + distance for dists in + filter( + lambda x: x is not None, + (__compute_distances(_chr, trait) for _chr in _chrs)) + for distance in dists) + nearest = min(distances, key=lambda d: d["distance"]) + return nearest["name"] + return __finder + +def get_nearest_marker(traits_list, genotype): + """ + Retrieves the nearest marker for each of the traits in the list. + + DESCRIPTION: + This migrates the code in + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L419-L438 + """ + if not genotype["Mbmap"]: + return [None] * len(trait_list) + + marker_finder = nearest_marker_finder(genotype) + return [marker_finder(trait) for trait in traits_list] + +# # Grey + Blue + Red +# def generate_heatmap(): +# cols = 20 +# y_axis = (["%s"%x for x in range(1, cols+1)][:-1] + ["X"]) #replace last item with x for now +# x_axis = heatmap_x_axis_names() +# data = generate_random_data(height=cols, width=len(x_axis)) +# fig = px.imshow( +# data, +# x=x_axis, +# y=y_axis, +# width=500) +# fig.update_traces(xtype="array") +# fig.update_traces(ytype="array") +# # fig.update_traces(xgap=10) +# fig.update_xaxes( +# visible=True, +# title_text="Traits", +# title_font_size=16) +# fig.update_layout( +# coloraxis_colorscale=[ +# [0.0, '#3B3B3B'], [0.4999999999999999, '#ABABAB'], +# [0.5, '#F5DE11'], [1.0, '#FF0D00']]) +# fig.write_html("%s/%s"%(heatmap_dir, "test_image.html")) +# return fig diff --git a/gn3/heatmaps/heatmaps.py b/gn3/heatmaps/heatmaps.py deleted file mode 100644 index 88f546d..0000000 --- a/gn3/heatmaps/heatmaps.py +++ /dev/null @@ -1,67 +0,0 @@ -import random -import plotly.express as px - -#### Remove these #### - -heatmap_dir = "heatmap_images" - -def generate_random_data(data_stop: float = 2, width: int = 10, height: int = 30): - """ - This is mostly a utility function to be used to generate random data, useful - for development of the heatmap generation code, without access to the actual - database data. - """ - return [[random.uniform(0,data_stop) for i in range(0, width)] - for j in range(0, height)] - -def generate_random_data2(data_stop: float = 2, width: int = 10, height: int = 30): - """ - This is mostly a utility function to be used to generate random data, useful - for development of the heatmap generation code, without access to the actual - database data. - """ - return [ - [{ - "value": item, - "category": random.choice(["C57BL/6J +", "DBA/2J +"])} - for item in axis] - for axis in generate_random_data(data_stop, width, height)] - -def heatmap_x_axis_names(): - return [ - "UCLA_BXDBXH_CARTILAGE_V2::ILM103710672", - "UCLA_BXDBXH_CARTILAGE_V2::ILM2260338", - "UCLA_BXDBXH_CARTILAGE_V2::ILM3140576", - "UCLA_BXDBXH_CARTILAGE_V2::ILM5670577", - "UCLA_BXDBXH_CARTILAGE_V2::ILM2070121", - "UCLA_BXDBXH_CARTILAGE_V2::ILM103990541", - "UCLA_BXDBXH_CARTILAGE_V2::ILM1190722", - "UCLA_BXDBXH_CARTILAGE_V2::ILM6590722", - "UCLA_BXDBXH_CARTILAGE_V2::ILM4200064", - "UCLA_BXDBXH_CARTILAGE_V2::ILM3140463"] -#### END: Remove these #### - -# Grey + Blue + Red -def generate_heatmap(): - cols = 20 - y_axis = (["%s"%x for x in range(1, cols+1)][:-1] + ["X"]) #replace last item with x for now - x_axis = heatmap_x_axis_names() - data = generate_random_data(height=cols, width=len(x_axis)) - fig = px.imshow( - data, - x=x_axis, - y=y_axis, - width=500) - fig.update_traces(xtype="array") - fig.update_traces(ytype="array") - # fig.update_traces(xgap=10) - fig.update_xaxes( - visible=True, - title_text="Traits", - title_font_size=16) - fig.update_layout( - coloraxis_colorscale=[ - [0.0, '#3B3B3B'], [0.4999999999999999, '#ABABAB'], - [0.5, '#F5DE11'], [1.0, '#FF0D00']]) - fig.write_html("%s/%s"%(heatmap_dir, "test_image.html")) - return fig diff --git a/tests/unit/computations/test_heatmap.py b/tests/unit/computations/test_heatmap.py deleted file mode 100644 index 156af45..0000000 --- a/tests/unit/computations/test_heatmap.py +++ /dev/null @@ -1,187 +0,0 @@ -"""Module contains tests for gn3.computations.heatmap""" -from unittest import TestCase -from gn3.computations.heatmap import ( - cluster_traits, - export_trait_data, - compute_traits_order, - retrieve_strains_and_values) - -strainlist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] -trait_data = { - "mysqlid": 36688172, - "data": { - "B6cC3-1": {"strain_name": "B6cC3-1", "value": 7.51879, "variance": None, "ndata": None}, - "BXD1": {"strain_name": "BXD1", "value": 7.77141, "variance": None, "ndata": None}, - "BXD12": {"strain_name": "BXD12", "value": 8.39265, "variance": None, "ndata": None}, - "BXD16": {"strain_name": "BXD16", "value": 8.17443, "variance": None, "ndata": None}, - "BXD19": {"strain_name": "BXD19", "value": 8.30401, "variance": None, "ndata": None}, - "BXD2": {"strain_name": "BXD2", "value": 7.80944, "variance": None, "ndata": None}, - "BXD21": {"strain_name": "BXD21", "value": 8.93809, "variance": None, "ndata": None}, - "BXD24": {"strain_name": "BXD24", "value": 7.99415, "variance": None, "ndata": None}, - "BXD27": {"strain_name": "BXD27", "value": 8.12177, "variance": None, "ndata": None}, - "BXD28": {"strain_name": "BXD28", "value": 7.67688, "variance": None, "ndata": None}, - "BXD32": {"strain_name": "BXD32", "value": 7.79062, "variance": None, "ndata": None}, - "BXD39": {"strain_name": "BXD39", "value": 8.27641, "variance": None, "ndata": None}, - "BXD40": {"strain_name": "BXD40", "value": 8.18012, "variance": None, "ndata": None}, - "BXD42": {"strain_name": "BXD42", "value": 7.82433, "variance": None, "ndata": None}, - "BXD6": {"strain_name": "BXD6", "value": 8.09718, "variance": None, "ndata": None}, - "BXH14": {"strain_name": "BXH14", "value": 7.97475, "variance": None, "ndata": None}, - "BXH19": {"strain_name": "BXH19", "value": 7.67223, "variance": None, "ndata": None}, - "BXH2": {"strain_name": "BXH2", "value": 7.93622, "variance": None, "ndata": None}, - "BXH22": {"strain_name": "BXH22", "value": 7.43692, "variance": None, "ndata": None}, - "BXH4": {"strain_name": "BXH4", "value": 7.96336, "variance": None, "ndata": None}, - "BXH6": {"strain_name": "BXH6", "value": 7.75132, "variance": None, "ndata": None}, - "BXH7": {"strain_name": "BXH7", "value": 8.12927, "variance": None, "ndata": None}, - "BXH8": {"strain_name": "BXH8", "value": 6.77338, "variance": None, "ndata": None}, - "BXH9": {"strain_name": "BXH9", "value": 8.03836, "variance": None, "ndata": None}, - "C3H/HeJ": {"strain_name": "C3H/HeJ", "value": 7.42795, "variance": None, "ndata": None}, - "C57BL/6J": {"strain_name": "C57BL/6J", "value": 7.50606, "variance": None, "ndata": None}, - "DBA/2J": {"strain_name": "DBA/2J", "value": 7.72588, "variance": None, "ndata": None}}} - -slinked = ( - (((0, 2, 0.16381088984330505), - ((1, 7, 0.06024619831474998), 5, 0.19179284676938602), - 0.20337048635536847), - 9, - 0.23451785425383564), - ((3, (6, 8, 0.2140799896286565), 0.25879514152086425), - 4, 0.8968250491499363), - 0.9313185954797953) - -class TestHeatmap(TestCase): - """Class for testing heatmap computation functions""" - - def test_export_trait_data_dtype(self): - """ - Test `export_trait_data` with different values for the `dtype` keyword - argument - """ - for dtype, expected in [ - ["val", (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], - ["var", (None, None, None, None, None, None)], - ["N", (None, None, None, None, None, None)], - ["all", (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)]]: - with self.subTest(dtype=dtype): - self.assertEqual( - export_trait_data(trait_data, strainlist, dtype=dtype), - expected) - - def test_export_trait_data_dtype_all_flags(self): - """ - Test `export_trait_data` with different values for the `dtype` keyword - argument and the different flags set up - """ - for dtype, vflag, nflag, expected in [ - ["val", False, False, - (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], - ["val", False, True, - (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], - ["val", True, False, - (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], - ["val", True, True, - (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], - ["var", False, False, (None, None, None, None, None, None)], - ["var", False, True, (None, None, None, None, None, None)], - ["var", True, False, (None, None, None, None, None, None)], - ["var", True, True, (None, None, None, None, None, None)], - ["N", False, False, (None, None, None, None, None, None)], - ["N", False, True, (None, None, None, None, None, None)], - ["N", True, False, (None, None, None, None, None, None)], - ["N", True, True, (None, None, None, None, None, None)], - ["all", False, False, - (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], - ["all", False, True, - (7.51879, None, 7.77141, None, 8.39265, None, 8.17443, None, - 8.30401, None, 7.80944, None)], - ["all", True, False, - (7.51879, None, 7.77141, None, 8.39265, None, 8.17443, None, - 8.30401, None, 7.80944, None)], - ["all", True, True, - (7.51879, None, None, 7.77141, None, None, 8.39265, None, None, - 8.17443, None, None, 8.30401, None, None, 7.80944, None, None)] - ]: - with self.subTest(dtype=dtype, vflag=vflag, nflag=nflag): - self.assertEqual( - export_trait_data( - trait_data, strainlist, dtype=dtype, var_exists=vflag, - n_exists=nflag), - expected) - - def test_cluster_traits(self): - """ - Test that the clustering is working as expected. - """ - traits_data_list = [ - (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944), - (6.1427, 6.50588, 7.73705, 6.68328, 7.49293, 7.27398), - (8.4211, 8.30581, 9.24076, 8.51173, 9.18455, 8.36077), - (10.0904, 10.6509, 9.36716, 9.91202, 8.57444, 10.5731), - (10.188, 9.76652, 9.54813, 9.05074, 9.52319, 9.10505), - (6.74676, 7.01029, 7.54169, 6.48574, 7.01427, 7.26815), - (6.39359, 6.85321, 5.78337, 7.11141, 6.22101, 6.16544), - (6.84118, 7.08432, 7.59844, 7.08229, 7.26774, 7.24991), - (9.45215, 10.6943, 8.64719, 10.1592, 7.75044, 8.78615), - (7.04737, 6.87185, 7.58586, 6.92456, 6.84243, 7.36913)] - self.assertEqual( - cluster_traits(traits_data_list), - ((0.0, 0.20337048635536847, 0.16381088984330505, 1.7388553629398245, - 1.5025235756329178, 0.6952839500255574, 1.271661230252733, - 0.2100487290977544, 1.4699690641062024, 0.7934461515867415), - (0.20337048635536847, 0.0, 0.2198321044997198, 1.5753041735592204, - 1.4815755944537086, 0.26087293140686374, 1.6939790104301427, - 0.06024619831474998, 1.7430082449189215, 0.4497104244247795), - (0.16381088984330505, 0.2198321044997198, 0.0, 1.9073926868549234, - 1.0396738891139845, 0.5278328671176757, 1.6275069061182947, - 0.2636503792482082, 1.739617877037615, 0.7127042590637039), - (1.7388553629398245, 1.5753041735592204, 1.9073926868549234, 0.0, - 0.9936846292920328, 1.1169999189889366, 0.6007483980555253, - 1.430209221053372, 0.25879514152086425, 0.9313185954797953), - (1.5025235756329178, 1.4815755944537086, 1.0396738891139845, - 0.9936846292920328, 0.0, 1.027827186339337, 1.1441743109173244, - 1.4122477962364253, 0.8968250491499363, 1.1683723389247052), - (0.6952839500255574, 0.26087293140686374, 0.5278328671176757, - 1.1169999189889366, 1.027827186339337, 0.0, 1.8420471110023269, - 0.19179284676938602, 1.4875072385631605, 0.23451785425383564), - (1.271661230252733, 1.6939790104301427, 1.6275069061182947, - 0.6007483980555253, 1.1441743109173244, 1.8420471110023269, 0.0, - 1.6540234785929928, 0.2140799896286565, 1.7413442197913358), - (0.2100487290977544, 0.06024619831474998, 0.2636503792482082, - 1.430209221053372, 1.4122477962364253, 0.19179284676938602, - 1.6540234785929928, 0.0, 1.5225640692832796, 0.33370067057028485), - (1.4699690641062024, 1.7430082449189215, 1.739617877037615, - 0.25879514152086425, 0.8968250491499363, 1.4875072385631605, - 0.2140799896286565, 1.5225640692832796, 0.0, 1.3256191648260216), - (0.7934461515867415, 0.4497104244247795, 0.7127042590637039, - 0.9313185954797953, 1.1683723389247052, 0.23451785425383564, - 1.7413442197913358, 0.33370067057028485, 1.3256191648260216, - 0.0))) - - def test_compute_heatmap_order(self): - """Test the orders.""" - self.assertEqual( - compute_traits_order(slinked), (0, 2, 1, 7, 5, 9, 3, 6, 8, 4)) - - def test_retrieve_strains_and_values(self): - """Test retrieval of strains and values.""" - for orders, slist, tdata, expected in [ - [ - [2], - ["s1", "s2", "s3", "s4"], - [[2, 9, 6, None, 4], - [7, 5, None, None, 4], - [9, None, 5, 4, 7], - [6, None, None, 4, None]], - [[2, ["s1", "s3", "s4"], [9, 5, 4]]] - ], - [ - [3], - ["s1", "s2", "s3", "s4", "s5"], - [[2, 9, 6, None, 4], - [7, 5, None, None, 4], - [9, None, 5, 4, 7], - [6, None, None, 4, None]], - [[3, ["s1", "s4"], [6, 4]]] - ]]: - with self.subTest(strainlist=slist, traitdata=tdata): - self.assertEqual( - retrieve_strains_and_values(orders, slist, tdata), expected) diff --git a/tests/unit/test_heatmaps.py b/tests/unit/test_heatmaps.py new file mode 100644 index 0000000..265d5a8 --- /dev/null +++ b/tests/unit/test_heatmaps.py @@ -0,0 +1,187 @@ +"""Module contains tests for gn3.heatmaps.heatmaps""" +from unittest import TestCase +from gn3.heatmaps import ( + cluster_traits, + export_trait_data, + compute_traits_order, + retrieve_strains_and_values) + +strainlist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] +trait_data = { + "mysqlid": 36688172, + "data": { + "B6cC3-1": {"strain_name": "B6cC3-1", "value": 7.51879, "variance": None, "ndata": None}, + "BXD1": {"strain_name": "BXD1", "value": 7.77141, "variance": None, "ndata": None}, + "BXD12": {"strain_name": "BXD12", "value": 8.39265, "variance": None, "ndata": None}, + "BXD16": {"strain_name": "BXD16", "value": 8.17443, "variance": None, "ndata": None}, + "BXD19": {"strain_name": "BXD19", "value": 8.30401, "variance": None, "ndata": None}, + "BXD2": {"strain_name": "BXD2", "value": 7.80944, "variance": None, "ndata": None}, + "BXD21": {"strain_name": "BXD21", "value": 8.93809, "variance": None, "ndata": None}, + "BXD24": {"strain_name": "BXD24", "value": 7.99415, "variance": None, "ndata": None}, + "BXD27": {"strain_name": "BXD27", "value": 8.12177, "variance": None, "ndata": None}, + "BXD28": {"strain_name": "BXD28", "value": 7.67688, "variance": None, "ndata": None}, + "BXD32": {"strain_name": "BXD32", "value": 7.79062, "variance": None, "ndata": None}, + "BXD39": {"strain_name": "BXD39", "value": 8.27641, "variance": None, "ndata": None}, + "BXD40": {"strain_name": "BXD40", "value": 8.18012, "variance": None, "ndata": None}, + "BXD42": {"strain_name": "BXD42", "value": 7.82433, "variance": None, "ndata": None}, + "BXD6": {"strain_name": "BXD6", "value": 8.09718, "variance": None, "ndata": None}, + "BXH14": {"strain_name": "BXH14", "value": 7.97475, "variance": None, "ndata": None}, + "BXH19": {"strain_name": "BXH19", "value": 7.67223, "variance": None, "ndata": None}, + "BXH2": {"strain_name": "BXH2", "value": 7.93622, "variance": None, "ndata": None}, + "BXH22": {"strain_name": "BXH22", "value": 7.43692, "variance": None, "ndata": None}, + "BXH4": {"strain_name": "BXH4", "value": 7.96336, "variance": None, "ndata": None}, + "BXH6": {"strain_name": "BXH6", "value": 7.75132, "variance": None, "ndata": None}, + "BXH7": {"strain_name": "BXH7", "value": 8.12927, "variance": None, "ndata": None}, + "BXH8": {"strain_name": "BXH8", "value": 6.77338, "variance": None, "ndata": None}, + "BXH9": {"strain_name": "BXH9", "value": 8.03836, "variance": None, "ndata": None}, + "C3H/HeJ": {"strain_name": "C3H/HeJ", "value": 7.42795, "variance": None, "ndata": None}, + "C57BL/6J": {"strain_name": "C57BL/6J", "value": 7.50606, "variance": None, "ndata": None}, + "DBA/2J": {"strain_name": "DBA/2J", "value": 7.72588, "variance": None, "ndata": None}}} + +slinked = ( + (((0, 2, 0.16381088984330505), + ((1, 7, 0.06024619831474998), 5, 0.19179284676938602), + 0.20337048635536847), + 9, + 0.23451785425383564), + ((3, (6, 8, 0.2140799896286565), 0.25879514152086425), + 4, 0.8968250491499363), + 0.9313185954797953) + +class TestHeatmap(TestCase): + """Class for testing heatmap computation functions""" + + def test_export_trait_data_dtype(self): + """ + Test `export_trait_data` with different values for the `dtype` keyword + argument + """ + for dtype, expected in [ + ["val", (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["var", (None, None, None, None, None, None)], + ["N", (None, None, None, None, None, None)], + ["all", (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)]]: + with self.subTest(dtype=dtype): + self.assertEqual( + export_trait_data(trait_data, strainlist, dtype=dtype), + expected) + + def test_export_trait_data_dtype_all_flags(self): + """ + Test `export_trait_data` with different values for the `dtype` keyword + argument and the different flags set up + """ + for dtype, vflag, nflag, expected in [ + ["val", False, False, + (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["val", False, True, + (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["val", True, False, + (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["val", True, True, + (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["var", False, False, (None, None, None, None, None, None)], + ["var", False, True, (None, None, None, None, None, None)], + ["var", True, False, (None, None, None, None, None, None)], + ["var", True, True, (None, None, None, None, None, None)], + ["N", False, False, (None, None, None, None, None, None)], + ["N", False, True, (None, None, None, None, None, None)], + ["N", True, False, (None, None, None, None, None, None)], + ["N", True, True, (None, None, None, None, None, None)], + ["all", False, False, + (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["all", False, True, + (7.51879, None, 7.77141, None, 8.39265, None, 8.17443, None, + 8.30401, None, 7.80944, None)], + ["all", True, False, + (7.51879, None, 7.77141, None, 8.39265, None, 8.17443, None, + 8.30401, None, 7.80944, None)], + ["all", True, True, + (7.51879, None, None, 7.77141, None, None, 8.39265, None, None, + 8.17443, None, None, 8.30401, None, None, 7.80944, None, None)] + ]: + with self.subTest(dtype=dtype, vflag=vflag, nflag=nflag): + self.assertEqual( + export_trait_data( + trait_data, strainlist, dtype=dtype, var_exists=vflag, + n_exists=nflag), + expected) + + def test_cluster_traits(self): + """ + Test that the clustering is working as expected. + """ + traits_data_list = [ + (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944), + (6.1427, 6.50588, 7.73705, 6.68328, 7.49293, 7.27398), + (8.4211, 8.30581, 9.24076, 8.51173, 9.18455, 8.36077), + (10.0904, 10.6509, 9.36716, 9.91202, 8.57444, 10.5731), + (10.188, 9.76652, 9.54813, 9.05074, 9.52319, 9.10505), + (6.74676, 7.01029, 7.54169, 6.48574, 7.01427, 7.26815), + (6.39359, 6.85321, 5.78337, 7.11141, 6.22101, 6.16544), + (6.84118, 7.08432, 7.59844, 7.08229, 7.26774, 7.24991), + (9.45215, 10.6943, 8.64719, 10.1592, 7.75044, 8.78615), + (7.04737, 6.87185, 7.58586, 6.92456, 6.84243, 7.36913)] + self.assertEqual( + cluster_traits(traits_data_list), + ((0.0, 0.20337048635536847, 0.16381088984330505, 1.7388553629398245, + 1.5025235756329178, 0.6952839500255574, 1.271661230252733, + 0.2100487290977544, 1.4699690641062024, 0.7934461515867415), + (0.20337048635536847, 0.0, 0.2198321044997198, 1.5753041735592204, + 1.4815755944537086, 0.26087293140686374, 1.6939790104301427, + 0.06024619831474998, 1.7430082449189215, 0.4497104244247795), + (0.16381088984330505, 0.2198321044997198, 0.0, 1.9073926868549234, + 1.0396738891139845, 0.5278328671176757, 1.6275069061182947, + 0.2636503792482082, 1.739617877037615, 0.7127042590637039), + (1.7388553629398245, 1.5753041735592204, 1.9073926868549234, 0.0, + 0.9936846292920328, 1.1169999189889366, 0.6007483980555253, + 1.430209221053372, 0.25879514152086425, 0.9313185954797953), + (1.5025235756329178, 1.4815755944537086, 1.0396738891139845, + 0.9936846292920328, 0.0, 1.027827186339337, 1.1441743109173244, + 1.4122477962364253, 0.8968250491499363, 1.1683723389247052), + (0.6952839500255574, 0.26087293140686374, 0.5278328671176757, + 1.1169999189889366, 1.027827186339337, 0.0, 1.8420471110023269, + 0.19179284676938602, 1.4875072385631605, 0.23451785425383564), + (1.271661230252733, 1.6939790104301427, 1.6275069061182947, + 0.6007483980555253, 1.1441743109173244, 1.8420471110023269, 0.0, + 1.6540234785929928, 0.2140799896286565, 1.7413442197913358), + (0.2100487290977544, 0.06024619831474998, 0.2636503792482082, + 1.430209221053372, 1.4122477962364253, 0.19179284676938602, + 1.6540234785929928, 0.0, 1.5225640692832796, 0.33370067057028485), + (1.4699690641062024, 1.7430082449189215, 1.739617877037615, + 0.25879514152086425, 0.8968250491499363, 1.4875072385631605, + 0.2140799896286565, 1.5225640692832796, 0.0, 1.3256191648260216), + (0.7934461515867415, 0.4497104244247795, 0.7127042590637039, + 0.9313185954797953, 1.1683723389247052, 0.23451785425383564, + 1.7413442197913358, 0.33370067057028485, 1.3256191648260216, + 0.0))) + + def test_compute_heatmap_order(self): + """Test the orders.""" + self.assertEqual( + compute_traits_order(slinked), (0, 2, 1, 7, 5, 9, 3, 6, 8, 4)) + + def test_retrieve_strains_and_values(self): + """Test retrieval of strains and values.""" + for orders, slist, tdata, expected in [ + [ + [2], + ["s1", "s2", "s3", "s4"], + [[2, 9, 6, None, 4], + [7, 5, None, None, 4], + [9, None, 5, 4, 7], + [6, None, None, 4, None]], + [[2, ["s1", "s3", "s4"], [9, 5, 4]]] + ], + [ + [3], + ["s1", "s2", "s3", "s4", "s5"], + [[2, 9, 6, None, 4], + [7, 5, None, None, 4], + [9, None, 5, 4, 7], + [6, None, None, 4, None]], + [[3, ["s1", "s4"], [6, 4]]] + ]]: + with self.subTest(strainlist=slist, traitdata=tdata): + self.assertEqual( + retrieve_strains_and_values(orders, slist, tdata), expected) -- cgit v1.2.3 From b1eb0451578c53afabe4f2054ce08665dec4bb82 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 15 Sep 2021 11:41:36 +0300 Subject: Integrate get_lsr_from_chr function * gn3/heatmaps.py: copy over function * tests/unit/test_heatmaps.py: add tests Copy function over from proof of concept and add some tests to ensure it works as expected. --- gn3/heatmaps.py | 8 ++++++++ tests/unit/test_heatmaps.py | 14 ++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index 198fb45..991ddec 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -276,6 +276,14 @@ def get_nearest_marker(traits_list, genotype): marker_finder = nearest_marker_finder(genotype) return [marker_finder(trait) for trait in traits_list] +def get_lrs_from_chr(trait, chr_name): + chromosome = trait["chromosomes"].get(chr_name) + if chromosome: + return [ + locus["LRS"] for locus in + sorted(chromosome["loci"], key=lambda loc: loc["Locus"])] + return [None] + # # Grey + Blue + Red # def generate_heatmap(): # cols = 20 diff --git a/tests/unit/test_heatmaps.py b/tests/unit/test_heatmaps.py index 265d5a8..cfdde1e 100644 --- a/tests/unit/test_heatmaps.py +++ b/tests/unit/test_heatmaps.py @@ -2,6 +2,7 @@ from unittest import TestCase from gn3.heatmaps import ( cluster_traits, + get_lrs_from_chr, export_trait_data, compute_traits_order, retrieve_strains_and_values) @@ -185,3 +186,16 @@ class TestHeatmap(TestCase): with self.subTest(strainlist=slist, traitdata=tdata): self.assertEqual( retrieve_strains_and_values(orders, slist, tdata), expected) + + def test_get_lrs_from_chr(self): + for trait, chromosome, expected in [ + [{"chromosomes": {}}, 3, [None]], + [{"chromosomes": {3: {"loci": [ + {"Locus": "b", "LRS": 1.9}, + {"Locus": "a", "LRS": 13.2}, + {"Locus": "d", "LRS": 53.21}, + {"Locus": "c", "LRS": 2.22}]}}}, + 3, + [13.2, 1.9, 2.22, 53.21]]]: + with self.subTest(trait=trait, chromosome=chromosome): + self.assertEqual(get_lrs_from_chr(trait, chromosome), expected) -- cgit v1.2.3 From 11632a565a6f901eca852a5a40a6f9fd3170152a Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 15 Sep 2021 12:08:56 +0300 Subject: Process data into format usable by heatmaps * gn3/heatmaps.py: implement `process_traits_data_for_heatmap` function, that will process the data into a form usable by heatmaps. * tests/unit/test_heatmaps.py: check that the function processes the data into the correct form. --- gn3/heatmaps.py | 12 +++++ tests/unit/test_heatmaps.py | 107 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 118 insertions(+), 1 deletion(-) diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index 991ddec..0c00d6c 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -277,6 +277,9 @@ def get_nearest_marker(traits_list, genotype): return [marker_finder(trait) for trait in traits_list] def get_lrs_from_chr(trait, chr_name): + """ + Retrieve the LRS values for a specific chromosome in the given trait. + """ chromosome = trait["chromosomes"].get(chr_name) if chromosome: return [ @@ -284,6 +287,15 @@ def get_lrs_from_chr(trait, chr_name): sorted(chromosome["loci"], key=lambda loc: loc["Locus"])] return [None] +def process_traits_data_for_heatmap(data, trait_names, chromosome_names): + """ + Process the traits data in a format useful for generating heatmap diagrams. + """ + hdata = [ + [get_lrs_from_chr(data[trait], chr_name) for trait in trait_names] + for chr_name in chromosome_names] + return hdata + # # Grey + Blue + Red # def generate_heatmap(): # cols = 20 diff --git a/tests/unit/test_heatmaps.py b/tests/unit/test_heatmaps.py index cfdde1e..f3a81c5 100644 --- a/tests/unit/test_heatmaps.py +++ b/tests/unit/test_heatmaps.py @@ -5,7 +5,8 @@ from gn3.heatmaps import ( get_lrs_from_chr, export_trait_data, compute_traits_order, - retrieve_strains_and_values) + retrieve_strains_and_values, + process_traits_data_for_heatmap) strainlist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] trait_data = { @@ -199,3 +200,107 @@ class TestHeatmap(TestCase): [13.2, 1.9, 2.22, 53.21]]]: with self.subTest(trait=trait, chromosome=chromosome): self.assertEqual(get_lrs_from_chr(trait, chromosome), expected) + + def test_process_traits_data_for_heatmap(self): + self.assertEqual( + process_traits_data_for_heatmap( + {"1": { + "ID": "T1", + "chromosomes": { + 1: {"Chr": 1, + "loci": [ + { + "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs6269442", "cM": 1.500, "Mb": 3.492, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs32285189", "cM": 1.630, "Mb": 3.511, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs258367496", "cM": 1.630, "Mb": 3.660, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs32430919", "cM": 1.750, "Mb": 3.777, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs36251697", "cM": 1.880, "Mb": 3.812, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs30658298", "cM": 2.010, "Mb": 4.431, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }]}, + 2: {"Chr": 2, + "loci": [ + { + "Locus": "rs51852623", "cM": 2.010, "Mb": 4.447, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs31879829", "cM": 2.140, "Mb": 4.519, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs36742481", "cM": 2.140, "Mb": 4.776, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }]}}}, + "2": { + "ID": "T1", + "chromosomes": { + 1: {"Chr": 1, + "loci": [ + { + "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs6269442", "cM": 1.500, "Mb": 3.492, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs32285189", "cM": 1.630, "Mb": 3.511, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs258367496", "cM": 1.630, "Mb": 3.660, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs32430919", "cM": 1.750, "Mb": 3.777, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs36251697", "cM": 1.880, "Mb": 3.812, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs30658298", "cM": 2.010, "Mb": 4.431, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }]}, + 2: {"Chr": 2, + "loci": [ + { + "Locus": "rs51852623", "cM": 2.010, "Mb": 4.447, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs31879829", "cM": 2.140, "Mb": 4.519, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs36742481", "cM": 2.140, "Mb": 4.776, + "LRS": 0.579, "Additive": -0.074, "pValue": 1.000 + }]}}}}, + ["2", "1"], + [1, 2]), + [[[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5], + [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]], + [[0.5, 0.579, 0.5], + [0.5, 0.5, 0.5]]]) -- cgit v1.2.3 From e9fb4e45cfc52c5d86ef534b0e7f42ba8f4c84d3 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 15 Sep 2021 12:28:56 +0300 Subject: Generate heatmaps in a single plot Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Add a function to generate the heatmaps for each chromosome into a single plot. --- gn3/heatmaps.py | 65 ++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 24 deletions(-) diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index 0c00d6c..f3d7d25 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -4,6 +4,7 @@ generate various kinds of heatmaps. """ from functools import reduce +from gn3.settings import TMPDIR from typing import Any, Dict, Sequence from gn3.computations.slink import slink from gn3.computations.qtlreaper import generate_traits_file @@ -296,27 +297,43 @@ def process_traits_data_for_heatmap(data, trait_names, chromosome_names): for chr_name in chromosome_names] return hdata -# # Grey + Blue + Red -# def generate_heatmap(): -# cols = 20 -# y_axis = (["%s"%x for x in range(1, cols+1)][:-1] + ["X"]) #replace last item with x for now -# x_axis = heatmap_x_axis_names() -# data = generate_random_data(height=cols, width=len(x_axis)) -# fig = px.imshow( -# data, -# x=x_axis, -# y=y_axis, -# width=500) -# fig.update_traces(xtype="array") -# fig.update_traces(ytype="array") -# # fig.update_traces(xgap=10) -# fig.update_xaxes( -# visible=True, -# title_text="Traits", -# title_font_size=16) -# fig.update_layout( -# coloraxis_colorscale=[ -# [0.0, '#3B3B3B'], [0.4999999999999999, '#ABABAB'], -# [0.5, '#F5DE11'], [1.0, '#FF0D00']]) -# fig.write_html("%s/%s"%(heatmap_dir, "test_image.html")) -# return fig +def generate_clustered_heatmap( + data, image_filename_prefix, x_axis = None, x_label: str = "", + y_axis = None, y_label: str = "", output_dir: str = TMPDIR, + colorscale = [ + [0.0, '#3B3B3B'], [0.4999999999999999, '#ABABAB'], + [0.5, '#F5DE11']], [1.0, '#FF0D00']): + """ + Generate a dendrogram, and heatmaps for each chromosome, and put them all + into one plot. + """ + num_cols = len(x_axis) + fig = make_subplots( + rows=1, + cols=num_cols, + shared_yaxes="rows", + # horizontal_spacing=(1 / (num_cols - 1)), + subplot_titles=x_axis + ) + hms = [go.Heatmap( + name=chromo, + y = y_axis, + z = data_array, + showscale=False) for chromo, data_array in zip(x_axis, data)] + for col, hm in enumerate(hms): + fig.add_trace(hm, row=1, col=(col + 1)) + + fig.update_traces( + showlegend=False, + colorscale=colorscale, + selector={"type": "heatmap"}) + fig.update_traces( + showlegend=True, + showscale=True, + selector={"name": x_axis[-1]}) + fig.update_layout( + coloraxis_colorscale=colorscale + ) + image_filename = "{}/{}.html".format(output_dir, image_filename_prefix) + fig.write_html(image_filename) + return image_filename, fig -- cgit v1.2.3 From 347301c1bc60ba5036625364ee48a5d72eeb1186 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 15 Sep 2021 12:42:38 +0300 Subject: Update entry-point function for heatmap generation Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Copy over code from the proof-of-concept implementation and clean it up a little for the entry-point function for heatmap generation via the API --- gn3/heatmaps.py | 70 ++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 49 insertions(+), 21 deletions(-) diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index f3d7d25..4349ee0 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -131,7 +131,7 @@ def cluster_traits(traits_data_list: Sequence[Dict]): return tuple(__cluster(tdata_i) for tdata_i in enumerate(traits_data_list)) -def heatmap_data(traits_names, conn: Any): +def build_heatmap(traits_names, conn: Any): """ heatmap function @@ -148,27 +148,55 @@ def heatmap_data(traits_names, conn: Any): TODO: Elaborate on the parameters here... """ threshold = 0 # webqtlConfig.PUBLICTHRESH - def __retrieve_traitlist_and_datalist(threshold, fullname): - trait = retrieve_trait_info(threshold, fullname, conn) - return (trait, retrieve_trait_data(trait, conn)) - - traits_details = [ - __retrieve_traitlist_and_datalist(threshold, fullname) - for fullname in traits_names] - traits_list = tuple(x[0] for x in traits_details) - traits_data_list = [x[1] for x in traits_details] - genotype_filename = build_genotype_file(traits_list[0]["riset"]) - strainlist = load_genotype_samples(genotype_filename) - exported_traits_data_list = tuple( - export_trait_data(td, strainlist) for td in traits_data_list) - slink_data = slink(cluster_traits(exported_traits_data_list)) - ordering_data = compute_heatmap_order(slink_data) + traits = [ + retrieve_trait_info(threshold, fullname, conn) + for fullname in trait_fullnames()] + traits_data_list = [retrieve_trait_data(t, conn) for t in traits] + genotype_filename = build_genotype_file(traits[0]["riset"]) + genotype = parse_genotype_file(genotype_filename) + strains = load_genotype_samples(genotype_filename) + exported_traits_data_list = [ + export_trait_data(td, strains) for td in traits_data_list] + slinked = slink(cluster_traits(exported_traits_data_list)) + traits_order = compute_traits_order(slinked) + ordered_traits_names = [ + traits[idx]["trait_fullname"] for idx in traits_order] strains_and_values = retrieve_strains_and_values( - ordering_data, strainlist, exported_traits_data_list) - strains_values = strains_and_values[0][1] - trait_values = [t[2] for t in strains_and_values] - traits_filename = generate_traits_filename() - generate_traits_file(strains_values, trait_values, traits_filename) + traits_order, strains, exported_traits_data_list) + traits_filename = "{}/traits_test_file_{}.txt".format( + TMPDIR, random_string(10)) + generate_traits_file( + strains_and_values[0][1], + [t[2] for t in strains_and_values], + traits_filename) + + main_output, permutations_output = run_reaper( + genotype_filename, traits_filename, separate_nperm_output=True) + + qtlresults = parse_reaper_main_results(main_output) + permudata = parse_reaper_permutation_results(permutations_output) + organised = organise_reaper_main_results(qtlresults) + + traits_ids = [# sort numerically, but retain the ids as strings + str(i) for i in sorted({int(row["ID"]) for row in qtlresults})] + chromosome_names = sorted( + {row["Chr"] for row in qtlresults}, key = chromosome_sorter_key_fn) + loci_names = sorted({row["Locus"] for row in qtlresults}) + ordered_traits_names = { + res_id: trait for res_id, trait in + zip(traits_ids, + [traits[idx]["trait_fullname"] for idx in traits_order])} + + return generate_clustered_heatmap( + process_traits_data_for_heatmap( + organised, traits_ids, chromosome_names), + "single_heatmap_{}".format(random_string(10)), + y_axis=tuple( + ordered_traits_names[traits_ids[order]] + for order in traits_order), + y_label="Traits", + x_axis=[chromo for chromo in chromosome_names], + x_label="Chromosomes") return { "slink_data": slink_data, -- cgit v1.2.3 From 78e37440844a0397d29135a8ba215e54fd92c86d Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Thu, 16 Sep 2021 11:35:23 +0300 Subject: Add missing imports Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Add missing imports that are needed in the code. --- gn3/heatmaps.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index 4349ee0..c48a2d3 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -5,15 +5,25 @@ generate various kinds of heatmaps. from functools import reduce from gn3.settings import TMPDIR +import plotly.graph_objects as go +from gn3.random import random_string from typing import Any, Dict, Sequence from gn3.computations.slink import slink -from gn3.computations.qtlreaper import generate_traits_file +from plotly.subplots import make_subplots from gn3.computations.correlations2 import compute_correlation -from gn3.db.genotypes import build_genotype_file, load_genotype_samples +from gn3.db.genotypes import ( + build_genotype_file, load_genotype_samples, parse_genotype_file) from gn3.db.traits import ( retrieve_trait_data, retrieve_trait_info, generate_traits_filename) +from gn3.computations.qtlreaper import ( + run_reaper, + generate_traits_file, + chromosome_sorter_key_fn, + parse_reaper_main_results, + organise_reaper_main_results, + parse_reaper_permutation_results) def export_trait_data( trait_data: dict, strainlist: Sequence[str], dtype: str = "val", -- cgit v1.2.3 From 2cc9f382e199dbdbaab98c7e06deabd72e244adb Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Thu, 16 Sep 2021 11:36:42 +0300 Subject: Fix minor bugs Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Fix a few minor bugs left over from the integration of code from the proof-of-concept code. --- gn3/heatmaps.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index c48a2d3..170b0cd 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -160,7 +160,7 @@ def build_heatmap(traits_names, conn: Any): threshold = 0 # webqtlConfig.PUBLICTHRESH traits = [ retrieve_trait_info(threshold, fullname, conn) - for fullname in trait_fullnames()] + for fullname in traits_names] traits_data_list = [retrieve_trait_data(t, conn) for t in traits] genotype_filename = build_genotype_file(traits[0]["riset"]) genotype = parse_genotype_file(genotype_filename) @@ -338,9 +338,9 @@ def process_traits_data_for_heatmap(data, trait_names, chromosome_names): def generate_clustered_heatmap( data, image_filename_prefix, x_axis = None, x_label: str = "", y_axis = None, y_label: str = "", output_dir: str = TMPDIR, - colorscale = [ - [0.0, '#3B3B3B'], [0.4999999999999999, '#ABABAB'], - [0.5, '#F5DE11']], [1.0, '#FF0D00']): + colorscale = ( + (0.0, '#3B3B3B'), (0.4999999999999999, '#ABABAB'), + (0.5, '#F5DE11'), (1.0, '#FF0D00'))): """ Generate a dendrogram, and heatmaps for each chromosome, and put them all into one plot. -- cgit v1.2.3 From 056171a0a2f127e90ab803b74635495fb0c079a2 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Thu, 16 Sep 2021 13:06:04 +0300 Subject: Intergrate the heatmap generation with the API Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Intergrate the heatmap generation code on the /api/heatmaps/clustered endpoint. The endpoint should take a json query of the form: {"traits_names": [ ... ] } where the "traits_name" value is a list of the full names of traits. A sample query to the endpoint could be something like the following: curl -i -X POST "http://localhost:8080/api/heatmaps/clustered" \ -H "Accept: application/json" \ -H "Content-Type: application/json" \ -d '{ "traits_names": [ "UCLA_BXDBXH_CARTILAGE_V2::ILM103710672", "UCLA_BXDBXH_CARTILAGE_V2::ILM2260338", "UCLA_BXDBXH_CARTILAGE_V2::ILM3140576", "UCLA_BXDBXH_CARTILAGE_V2::ILM5670577", "UCLA_BXDBXH_CARTILAGE_V2::ILM2070121", "UCLA_BXDBXH_CARTILAGE_V2::ILM103990541", "UCLA_BXDBXH_CARTILAGE_V2::ILM1190722", "UCLA_BXDBXH_CARTILAGE_V2::ILM6590722", "UCLA_BXDBXH_CARTILAGE_V2::ILM4200064", "UCLA_BXDBXH_CARTILAGE_V2::ILM3140463" ] }' which should respond with a json response containing the raw binary string for the png format and possibly another for the svg format. --- gn3/api/heatmaps.py | 28 ++++++++++++++++++++++++++++ gn3/app.py | 2 ++ 2 files changed, 30 insertions(+) create mode 100644 gn3/api/heatmaps.py diff --git a/gn3/api/heatmaps.py b/gn3/api/heatmaps.py new file mode 100644 index 0000000..cac9c71 --- /dev/null +++ b/gn3/api/heatmaps.py @@ -0,0 +1,28 @@ +from flask import jsonify +from flask import request +from flask import Blueprint +from gn3.heatmaps import build_heatmap +from gn3.db_utils import database_connector + +heatmaps = Blueprint("heatmaps", __name__) + +@heatmaps.route("/clustered", methods=("POST",)) +def clustered_heatmaps(): + heatmap_request = request.get_json() + traits_names = heatmap_request.get("traits_names", tuple()) + if len(traits_names) < 1: + return jsonify({ + "message": "You need to provide at least one trait name." + }), 400 + conn, _cursor = database_connector() + _heatmap_file, heatmap_fig = build_heatmap(traits_names, conn) + + # stream the heatmap data somehow here. + # Can plotly actually stream the figure data in a way that can be used on + # remote end to display the image without necessarily being html? + return jsonify( + { + "query": heatmap_request, + "output_png": heatmap_fig.to_image(format="png"), + "output_svg": heatmap_fig.to_image(format="svg") + }), 200 diff --git a/gn3/app.py b/gn3/app.py index 046b5de..b4b08d0 100644 --- a/gn3/app.py +++ b/gn3/app.py @@ -7,6 +7,7 @@ from flask import Flask from gn3.api.gemma import gemma from gn3.api.rqtl import rqtl from gn3.api.general import general +from gn3.api.heatmaps import heatmaps from gn3.api.correlation import correlation from gn3.api.data_entry import data_entry @@ -30,6 +31,7 @@ def create_app(config: Union[Dict, str, None] = None) -> Flask: app.register_blueprint(general, url_prefix="/api/") app.register_blueprint(gemma, url_prefix="/api/gemma") app.register_blueprint(rqtl, url_prefix="/api/rqtl") + app.register_blueprint(heatmaps, url_prefix="/api/heatmaps") app.register_blueprint(correlation, url_prefix="/api/correlation") app.register_blueprint(data_entry, url_prefix="/api/dataentry") return app -- cgit v1.2.3 From 78871ef396f394c54072960e985476e418220fe3 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Fri, 17 Sep 2021 10:30:16 +0300 Subject: Create dendrogram to show clustering tree Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Provide the clustering data to be used for the creation of the clustering dendrogram in the final clustered heatmap plot. --- gn3/heatmaps.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index 170b0cd..bf69d9b 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -3,9 +3,11 @@ This module will contain functions to be used in computation of the data used to generate various kinds of heatmaps. """ +import numpy as np from functools import reduce from gn3.settings import TMPDIR import plotly.graph_objects as go +import plotly.figure_factory as ff from gn3.random import random_string from typing import Any, Dict, Sequence from gn3.computations.slink import slink @@ -167,7 +169,8 @@ def build_heatmap(traits_names, conn: Any): strains = load_genotype_samples(genotype_filename) exported_traits_data_list = [ export_trait_data(td, strains) for td in traits_data_list] - slinked = slink(cluster_traits(exported_traits_data_list)) + clustered = cluster_traits(exported_traits_data_list) + slinked = slink(clustered) traits_order = compute_traits_order(slinked) ordered_traits_names = [ traits[idx]["trait_fullname"] for idx in traits_order] @@ -200,6 +203,7 @@ def build_heatmap(traits_names, conn: Any): return generate_clustered_heatmap( process_traits_data_for_heatmap( organised, traits_ids, chromosome_names), + clustered, "single_heatmap_{}".format(random_string(10)), y_axis=tuple( ordered_traits_names[traits_ids[order]] @@ -336,8 +340,9 @@ def process_traits_data_for_heatmap(data, trait_names, chromosome_names): return hdata def generate_clustered_heatmap( - data, image_filename_prefix, x_axis = None, x_label: str = "", - y_axis = None, y_label: str = "", output_dir: str = TMPDIR, + data, clustering_data, image_filename_prefix, x_axis = None, + x_label: str = "", y_axis = None, y_label: str = "", + output_dir: str = TMPDIR, colorscale = ( (0.0, '#3B3B3B'), (0.4999999999999999, '#ABABAB'), (0.5, '#F5DE11'), (1.0, '#FF0D00'))): @@ -345,21 +350,22 @@ def generate_clustered_heatmap( Generate a dendrogram, and heatmaps for each chromosome, and put them all into one plot. """ - num_cols = len(x_axis) + num_cols = 1 + len(x_axis) fig = make_subplots( rows=1, cols=num_cols, shared_yaxes="rows", - # horizontal_spacing=(1 / (num_cols - 1)), - subplot_titles=x_axis - ) + horizontal_spacing=0.001, + subplot_titles=["distance"] + x_axis, + figure = ff.create_dendrogram( + np.array(clustering_data), orientation="right", labels=y_axis)) hms = [go.Heatmap( name=chromo, y = y_axis, z = data_array, showscale=False) for chromo, data_array in zip(x_axis, data)] - for col, hm in enumerate(hms): - fig.add_trace(hm, row=1, col=(col + 1)) + for i, hm in enumerate(hms): + fig.add_trace(hm, row=1, col=(i + 2)) fig.update_traces( showlegend=False, -- cgit v1.2.3 From e17540bf8a57837fcf3241ea0b694250b53294fc Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Fri, 17 Sep 2021 10:32:23 +0300 Subject: Fix some layout issues and update colorscale Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Update the plot layouts and size to display the dendrogram and individual chromosome heatmaps side by side * Update the colour scale to begin with the grays rather than absolute black --- gn3/heatmaps.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index bf69d9b..2859dde 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -344,7 +344,7 @@ def generate_clustered_heatmap( x_label: str = "", y_axis = None, y_label: str = "", output_dir: str = TMPDIR, colorscale = ( - (0.0, '#3B3B3B'), (0.4999999999999999, '#ABABAB'), + (0.0, '#5D5D5D'), (0.4999999999999999, '#ABABAB'), (0.5, '#F5DE11'), (1.0, '#FF0D00'))): """ Generate a dendrogram, and heatmaps for each chromosome, and put them all @@ -367,6 +367,33 @@ def generate_clustered_heatmap( for i, hm in enumerate(hms): fig.add_trace(hm, row=1, col=(i + 2)) + fig.update_layout( + { + "width": 1500, + "height": 800, + "xaxis": { + "mirror": False, + "showgrid": True + } + }) + + x_axes_layouts = { + "xaxis{}".format(i+1 if i > 0 else ""): { + "mirror": False, + "showticklabels": True if i==0 else False, + "ticks": "outside" if i==0 else "" + } + for i in range(num_cols)} + + fig.update_layout( + { + "width": 4000, + "height": 800, + "yaxis": { + "mirror": False, + "ticks": "" + }, + **x_axes_layouts}) fig.update_traces( showlegend=False, colorscale=colorscale, @@ -375,9 +402,6 @@ def generate_clustered_heatmap( showlegend=True, showscale=True, selector={"name": x_axis[-1]}) - fig.update_layout( - coloraxis_colorscale=colorscale - ) image_filename = "{}/{}.html".format(output_dir, image_filename_prefix) fig.write_html(image_filename) return image_filename, fig -- cgit v1.2.3 From 8ac3194f06084dfe5d0cfb141f178d83d937fcc3 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Fri, 17 Sep 2021 10:35:15 +0300 Subject: Return path to generated filename for now Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * To help with demonstrating that the code is producing the expected output, for now, we return the path to the generated html file that displays the interactive heatmap. At this point, it is mostly useful in the development environment. Moving forward, we might have to actually stream the raw html, or if we can get the Kaleido library packaged for GNU Guix, stream the images binary data instead. --- gn3/api/heatmaps.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/gn3/api/heatmaps.py b/gn3/api/heatmaps.py index cac9c71..f053241 100644 --- a/gn3/api/heatmaps.py +++ b/gn3/api/heatmaps.py @@ -20,9 +20,10 @@ def clustered_heatmaps(): # stream the heatmap data somehow here. # Can plotly actually stream the figure data in a way that can be used on # remote end to display the image without necessarily being html? - return jsonify( - { - "query": heatmap_request, - "output_png": heatmap_fig.to_image(format="png"), - "output_svg": heatmap_fig.to_image(format="svg") - }), 200 + # return jsonify( + # { + # "query": heatmap_request, + # "output_png": heatmap_fig.to_image(format="png"), + # "output_svg": heatmap_fig.to_image(format="svg") + # }), 200 + return jsonify({"output_filename": _heatmap_file}), 200 -- cgit v1.2.3 From 1e2357049adc72808fbf8eaac3da9411d3c78c66 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Fri, 17 Sep 2021 11:20:16 +0300 Subject: Fix a number of linting issues Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi --- gn3/computations/qtlreaper.py | 7 ++-- gn3/db/genotypes.py | 2 +- gn3/heatmaps.py | 54 ++++++++++++------------------- tests/unit/computations/test_qtlreaper.py | 3 +- tests/unit/test_heatmaps.py | 6 ++-- 5 files changed, 32 insertions(+), 40 deletions(-) diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index 5180853..377db9b 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -110,9 +110,10 @@ def organise_reaper_main_results(parsed_results): unique_chromosomes = {item["Chr"] for item in id_items} return { "ID": identifier, - "chromosomes": {_chr["Chr"]: _chr for _chr in [ - __organise_by_chromosome(chromo, id_items) - for chromo in sorted( + "chromosomes": { + _chr["Chr"]: _chr for _chr in [ + __organise_by_chromosome(chromo, id_items) + for chromo in sorted( unique_chromosomes, key=chromosome_sorter_key_fn)]}} unique_ids = {res["ID"] for res in parsed_results} diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py index b03d55c..9d052d9 100644 --- a/gn3/db/genotypes.py +++ b/gn3/db/genotypes.py @@ -174,7 +174,7 @@ def parse_genotype_file(filename: str, parlist: tuple = tuple()): geno_obj = dict(labels + header) markers = tuple( [parse_genotype_marker(line, geno_obj, parlist) - for line in data_lines[1:]]) + for line in data_lines[1:]]) chromosomes = tuple( dict(chromosome) for chromosome in build_genotype_chromosomes(geno_obj, markers)) diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index 2859dde..c4fc67d 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -3,13 +3,13 @@ This module will contain functions to be used in computation of the data used to generate various kinds of heatmaps. """ +from typing import Any, Dict, Sequence import numpy as np from functools import reduce from gn3.settings import TMPDIR import plotly.graph_objects as go import plotly.figure_factory as ff from gn3.random import random_string -from typing import Any, Dict, Sequence from gn3.computations.slink import slink from plotly.subplots import make_subplots from gn3.computations.correlations2 import compute_correlation @@ -165,7 +165,7 @@ def build_heatmap(traits_names, conn: Any): for fullname in traits_names] traits_data_list = [retrieve_trait_data(t, conn) for t in traits] genotype_filename = build_genotype_file(traits[0]["riset"]) - genotype = parse_genotype_file(genotype_filename) + # genotype = parse_genotype_file(genotype_filename) strains = load_genotype_samples(genotype_filename) exported_traits_data_list = [ export_trait_data(td, strains) for td in traits_data_list] @@ -183,22 +183,21 @@ def build_heatmap(traits_names, conn: Any): [t[2] for t in strains_and_values], traits_filename) - main_output, permutations_output = run_reaper( + main_output, _permutations_output = run_reaper( genotype_filename, traits_filename, separate_nperm_output=True) qtlresults = parse_reaper_main_results(main_output) - permudata = parse_reaper_permutation_results(permutations_output) + # permudata = parse_reaper_permutation_results(permutations_output) organised = organise_reaper_main_results(qtlresults) traits_ids = [# sort numerically, but retain the ids as strings str(i) for i in sorted({int(row["ID"]) for row in qtlresults})] chromosome_names = sorted( - {row["Chr"] for row in qtlresults}, key = chromosome_sorter_key_fn) - loci_names = sorted({row["Locus"] for row in qtlresults}) - ordered_traits_names = { - res_id: trait for res_id, trait in + {row["Chr"] for row in qtlresults}, key=chromosome_sorter_key_fn) + # loci_names = sorted({row["Locus"] for row in qtlresults}) + ordered_traits_names = dict( zip(traits_ids, - [traits[idx]["trait_fullname"] for idx in traits_order])} + [traits[idx]["trait_fullname"] for idx in traits_order])) return generate_clustered_heatmap( process_traits_data_for_heatmap( @@ -207,22 +206,11 @@ def build_heatmap(traits_names, conn: Any): "single_heatmap_{}".format(random_string(10)), y_axis=tuple( ordered_traits_names[traits_ids[order]] - for order in traits_order), + for order in traits_order), y_label="Traits", - x_axis=[chromo for chromo in chromosome_names], + x_axis=chromosome_names, x_label="Chromosomes") - return { - "slink_data": slink_data, - "ordering_data": ordering_data, - "strainlist": strainlist, - "genotype_filename": genotype_filename, - "traits_list": traits_list, - "traits_data_list": traits_data_list, - "exported_traits_data_list": exported_traits_data_list, - "traits_filename": traits_filename - } - def compute_traits_order(slink_data, neworder: tuple = tuple()): """ Compute the order of the traits for clustering from `slink_data`. @@ -314,7 +302,7 @@ def get_nearest_marker(traits_list, genotype): https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L419-L438 """ if not genotype["Mbmap"]: - return [None] * len(trait_list) + return [None] * len(traits_list) marker_finder = nearest_marker_finder(genotype) return [marker_finder(trait) for trait in traits_list] @@ -340,10 +328,10 @@ def process_traits_data_for_heatmap(data, trait_names, chromosome_names): return hdata def generate_clustered_heatmap( - data, clustering_data, image_filename_prefix, x_axis = None, - x_label: str = "", y_axis = None, y_label: str = "", + data, clustering_data, image_filename_prefix, x_axis=None, + x_label: str = "", y_axis=None, y_label: str = "", output_dir: str = TMPDIR, - colorscale = ( + colorscale=( (0.0, '#5D5D5D'), (0.4999999999999999, '#ABABAB'), (0.5, '#F5DE11'), (1.0, '#FF0D00'))): """ @@ -357,15 +345,15 @@ def generate_clustered_heatmap( shared_yaxes="rows", horizontal_spacing=0.001, subplot_titles=["distance"] + x_axis, - figure = ff.create_dendrogram( + figure=ff.create_dendrogram( np.array(clustering_data), orientation="right", labels=y_axis)) hms = [go.Heatmap( name=chromo, - y = y_axis, - z = data_array, + y=y_axis, + z=data_array, showscale=False) for chromo, data_array in zip(x_axis, data)] - for i, hm in enumerate(hms): - fig.add_trace(hm, row=1, col=(i + 2)) + for i, heatmap in enumerate(hms): + fig.add_trace(heatmap, row=1, col=(i + 2)) fig.update_layout( { @@ -380,8 +368,8 @@ def generate_clustered_heatmap( x_axes_layouts = { "xaxis{}".format(i+1 if i > 0 else ""): { "mirror": False, - "showticklabels": True if i==0 else False, - "ticks": "outside" if i==0 else "" + "showticklabels": True if i == 0 else False, + "ticks": "outside" if i == 0 else "" } for i in range(num_cols)} diff --git a/tests/unit/computations/test_qtlreaper.py b/tests/unit/computations/test_qtlreaper.py index 1d67827..d420470 100644 --- a/tests/unit/computations/test_qtlreaper.py +++ b/tests/unit/computations/test_qtlreaper.py @@ -77,6 +77,7 @@ class TestQTLReaper(TestCase): 5.82775, 5.89659, 5.92117, 5.93396, 5.93396, 5.94957]) def test_organise_reaper_main_results(self): + """Check that results are organised correctly.""" self.assertEqual( organise_reaper_main_results([ { @@ -135,7 +136,7 @@ class TestQTLReaper(TestCase): 1: {"Chr": 1, "loci": [ { - "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, + "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { diff --git a/tests/unit/test_heatmaps.py b/tests/unit/test_heatmaps.py index f3a81c5..c0a496b 100644 --- a/tests/unit/test_heatmaps.py +++ b/tests/unit/test_heatmaps.py @@ -189,6 +189,7 @@ class TestHeatmap(TestCase): retrieve_strains_and_values(orders, slist, tdata), expected) def test_get_lrs_from_chr(self): + """Check that function gets correct LRS values""" for trait, chromosome, expected in [ [{"chromosomes": {}}, 3, [None]], [{"chromosomes": {3: {"loci": [ @@ -202,6 +203,7 @@ class TestHeatmap(TestCase): self.assertEqual(get_lrs_from_chr(trait, chromosome), expected) def test_process_traits_data_for_heatmap(self): + """Check for correct processing of data for heatmap generation.""" self.assertEqual( process_traits_data_for_heatmap( {"1": { @@ -210,7 +212,7 @@ class TestHeatmap(TestCase): 1: {"Chr": 1, "loci": [ { - "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, + "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { @@ -257,7 +259,7 @@ class TestHeatmap(TestCase): 1: {"Chr": 1, "loci": [ { - "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, + "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { -- cgit v1.2.3 From b7fb10586b956a8b0389e7925e4c0cff28cde82f Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 20 Sep 2021 06:36:00 +0300 Subject: Return only the data Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/api/heatmaps.py: Parse incoming data to build up correct trait names and respond with only the computed heatmap data. * gn3/heatmaps.py: Return only the computed data for heatmaps and clustering. Since GN3 is supposed to handle only the data, and db-access, this commit ensures that GN3 responds to the client with only the computed heatmap data, and does not try to generate the heatmaps themselves. The generation of the heatmaps will be delegated to the UI clients, such as GeneNetwork2. --- gn3/api/heatmaps.py | 18 ++++++------------ gn3/heatmaps.py | 25 +++++++++++++++++-------- 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/gn3/api/heatmaps.py b/gn3/api/heatmaps.py index f053241..eea3ebe 100644 --- a/gn3/api/heatmaps.py +++ b/gn3/api/heatmaps.py @@ -15,15 +15,9 @@ def clustered_heatmaps(): "message": "You need to provide at least one trait name." }), 400 conn, _cursor = database_connector() - _heatmap_file, heatmap_fig = build_heatmap(traits_names, conn) - - # stream the heatmap data somehow here. - # Can plotly actually stream the figure data in a way that can be used on - # remote end to display the image without necessarily being html? - # return jsonify( - # { - # "query": heatmap_request, - # "output_png": heatmap_fig.to_image(format="png"), - # "output_svg": heatmap_fig.to_image(format="svg") - # }), 200 - return jsonify({"output_filename": _heatmap_file}), 200 + def setup_trait_fullname(trait): + name_parts = trait.split(":") + return "{dataset_name}::{trait_name}".format( + dataset_name=trait[1], trait_name=trait[0]) + traits_fullnames = [parse_trait_fullname(trait) for trait in traits_names] + return jsonify(build_heatmap(traits_fullnames, conn)), 200 diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index c4fc67d..205a3b3 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -199,17 +199,26 @@ def build_heatmap(traits_names, conn: Any): zip(traits_ids, [traits[idx]["trait_fullname"] for idx in traits_order])) - return generate_clustered_heatmap( - process_traits_data_for_heatmap( + # return generate_clustered_heatmap( + # process_traits_data_for_heatmap( + # organised, traits_ids, chromosome_names), + # clustered, + # "single_heatmap_{}".format(random_string(10)), + # y_axis=tuple( + # ordered_traits_names[traits_ids[order]] + # for order in traits_order), + # y_label="Traits", + # x_axis=chromosome_names, + # x_label="Chromosomes") + return { + "clustering_data": clustered, + "heatmap_data": process_traits_data_for_heatmap( organised, traits_ids, chromosome_names), - clustered, - "single_heatmap_{}".format(random_string(10)), - y_axis=tuple( + "traits": tuple( ordered_traits_names[traits_ids[order]] for order in traits_order), - y_label="Traits", - x_axis=chromosome_names, - x_label="Chromosomes") + "chromosomes": chromosome_names + } def compute_traits_order(slink_data, neworder: tuple = tuple()): """ -- cgit v1.2.3 From f5415f41d7f682771555e73f36ac4ee7ef51a1d3 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 20 Sep 2021 06:42:05 +0300 Subject: Remove proof-of-concept test code Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Remove the proof-of-concept CLI-only module that was used to learn how the heatmaps work and identify the appropriate data for use with them. --- qtlfilesexport.py | 253 ------------------------------------------------------ 1 file changed, 253 deletions(-) delete mode 100644 qtlfilesexport.py diff --git a/qtlfilesexport.py b/qtlfilesexport.py deleted file mode 100644 index 100fa75..0000000 --- a/qtlfilesexport.py +++ /dev/null @@ -1,253 +0,0 @@ -""" -Test the qtlfiles export of traits files - -Run with: - - env SQL_URI="mysql://:@:/db_webqtl" python3 qtlfilesexport.py - -replacing the variables in the angled brackets with the appropriate values -""" -from gn3.random import random_string -from gn3.computations.slink import slink -from gn3.db_utils import database_connector -from gn3.computations.qtlreaper import run_reaper -from gn3.db.traits import retrieve_trait_data, retrieve_trait_info -from gn3.computations.heatmap import export_trait_data, get_nearest_marker -from gn3.db.genotypes import ( - build_genotype_file, - parse_genotype_file, - load_genotype_samples) -from gn3.computations.heatmap import ( - cluster_traits, - compute_traits_order, - retrieve_strains_and_values) -from gn3.computations.qtlreaper import ( - generate_traits_file, - chromosome_sorter_key_fn, - parse_reaper_main_results, - organise_reaper_main_results, - parse_reaper_permutation_results) - -import plotly.express as px - -## for dendrogram -import numpy as np -import plotly.graph_objects as go -import plotly.figure_factory as ff - -# for single heatmap -from plotly.subplots import make_subplots - -TMPDIR = "tmp/" - -def trait_fullnames(): - """Return sample names for traits""" - return [ - "UCLA_BXDBXH_CARTILAGE_V2::ILM103710672", - "UCLA_BXDBXH_CARTILAGE_V2::ILM2260338", - "UCLA_BXDBXH_CARTILAGE_V2::ILM3140576", - "UCLA_BXDBXH_CARTILAGE_V2::ILM5670577", - "UCLA_BXDBXH_CARTILAGE_V2::ILM2070121", - "UCLA_BXDBXH_CARTILAGE_V2::ILM103990541", - "UCLA_BXDBXH_CARTILAGE_V2::ILM1190722", - "UCLA_BXDBXH_CARTILAGE_V2::ILM6590722", - "UCLA_BXDBXH_CARTILAGE_V2::ILM4200064", - "UCLA_BXDBXH_CARTILAGE_V2::ILM3140463"] - -def get_lrs_from_chr(trait, chr_name): - chromosome = trait["chromosomes"].get(chr_name) - if chromosome: - return [ - locus["LRS"] for locus in - sorted(chromosome["loci"], key=lambda loc: loc["Locus"])] - return [None] - -def process_traits_data_for_heatmap(data, trait_names, chromosome_names): - print("TRAIT_NAMES: {}".format(trait_names)) - print("chromosome names: {}".format(chromosome_names)) - print("data keys: {}".format(data.keys())) - hdata = [ - [get_lrs_from_chr(data[trait], chr_name) for trait in trait_names] - for chr_name in chromosome_names] - # print("hdata: {}".format(hdata)) - return hdata - -def generate_heatmap( - data, image_filename_prefix, x_axis = None, x_label: str = "", - y_axis = None, y_label: str = "", output_dir: str = TMPDIR): - """Generate single heatmap section.""" - print("X-AXIS:({}, {}), Y-AXIS: ({}, {}), ROWS:{}, COLS:{}".format( - x_axis, (len(x_axis) if x_axis else 0), - y_axis, (len(y_axis) if y_axis else 0), - len(data), len(data[0]))) - fig = px.imshow( - data, - x = x_axis, - y = y_axis, - width=1000 - ) - fig.update_yaxes(title=y_label) - fig.update_xaxes(title=x_label) - image_filename = "{}/{}.html".format(output_dir, image_filename_prefix) - fig.write_html(image_filename) - return image_filename, fig - -def generate_dendrogram( - data, image_filename_prefix, x_axis = None, x_label: str = "", - y_axis = None, y_label: str = "", output_dir: str = TMPDIR): - fig = ff.create_dendrogram( - np.array(data), orientation="right", labels=y_axis) - - heatmap = go.Heatmap( - x=fig['layout']['xaxis']['ticktext'], - y=fig['layout']['yaxis']['ticktext'], - z=data) - - # print("HEAMAP:{}".format(heatmap)) - fig.add_trace(heatmap) - - fig.update_layout({"width": 1000, "height": 500}) - image_filename = "{}/{}.html".format(output_dir, image_filename_prefix) - fig.write_html(image_filename) - return image_filename, fig - -def generate_single_heatmap( - data, image_filename_prefix, x_axis = None, x_label: str = "", - y_axis = None, y_label: str = "", output_dir: str = TMPDIR): - """Generate single heatmap section.""" - # fig = go.Figure({"type": "heatmap"}) - num_cols = len(x_axis) - fig = make_subplots( - rows=1, - cols=num_cols, - shared_yaxes="rows", - # horizontal_spacing=(1 / (num_cols - 1)), - subplot_titles=x_axis - ) - hms = [go.Heatmap( - name=chromo, - y = y_axis, - z = data_array, - showscale=False) for chromo, data_array in zip(x_axis, data)] - for col, hm in enumerate(hms): - fig.add_trace(hm, row=1, col=(col + 1)) - - fig.update_traces( - showlegend=False, - colorscale=[ - [0.0, '#3B3B3B'], [0.4999999999999999, '#ABABAB'], - [0.5, '#F5DE11'], [1.0, '#FF0D00']], - selector={"type": "heatmap"}) - fig.update_traces( - showlegend=True, - showscale=True, - selector={"name": x_axis[-1]}) - fig.update_layout( - coloraxis_colorscale=[ - [0.0, '#3B3B3B'], [0.4999999999999999, '#ABABAB'], - [0.5, '#F5DE11'], [1.0, '#FF0D00']] - ) - print(fig) - image_filename = "{}/{}.html".format(output_dir, image_filename_prefix) - fig.write_html(image_filename) - return image_filename, fig - -def main(): - """entrypoint function""" - conn = database_connector()[0] - threshold = 0 - traits = [ - retrieve_trait_info(threshold, fullname, conn) - for fullname in trait_fullnames()] - traits_data_list = [retrieve_trait_data(t, conn) for t in traits] - genotype_filename = build_genotype_file(traits[0]["riset"]) - genotype = parse_genotype_file(genotype_filename) - strains = load_genotype_samples(genotype_filename) - exported_traits_data_list = [ - export_trait_data(td, strains) for td in traits_data_list] - slinked = slink(cluster_traits(exported_traits_data_list)) - print("SLINKED: {}".format(slinked)) - traits_order = compute_traits_order(slinked) - print("KEYS: {}".format(traits[0].keys())) - ordered_traits_names = [ - traits[idx]["trait_fullname"] for idx in traits_order] - print("ORDERS: {}".format(traits_order)) - strains_and_values = retrieve_strains_and_values( - traits_order, strains, exported_traits_data_list) - strains_values = strains_and_values[0][1] - trait_values = [t[2] for t in strains_and_values] - traits_filename = "{}/traits_test_file_{}.txt".format( - TMPDIR, random_string(10)) - generate_traits_file(strains_values, trait_values, traits_filename) - print("Generated file: {}".format(traits_filename)) - - main_output, permutations_output = run_reaper( - genotype_filename, traits_filename, separate_nperm_output=True) - - print("Main output: {}, Permutation output: {}".format( - main_output, permutations_output)) - - qtlresults = parse_reaper_main_results(main_output) - permudata = parse_reaper_permutation_results(permutations_output) - # print("QTLRESULTS: {}".format(qtlresults)) - # print("PERMUDATA: {}".format(permudata)) - - nearest = get_nearest_marker(traits, genotype) - print("NEAREST: {}".format(nearest)) - - organised = organise_reaper_main_results(qtlresults) - - traits_ids = [# sort numerically, but retain the ids as strings - str(i) for i in sorted({int(row["ID"]) for row in qtlresults})] - chromosome_names = sorted( - {row["Chr"] for row in qtlresults}, key = chromosome_sorter_key_fn) - loci_names = sorted({row["Locus"] for row in qtlresults}) - ordered_traits_names = { - res_id: trait for res_id, trait in - zip(traits_ids, - [traits[idx]["trait_fullname"] for idx in traits_order])} - # print("ordered:{}, original: {}".format( - # ordered_traits_names, [t["trait_fullname"] for t in traits])) - # print("chromosome_names:{}".format(chromosome_names)) - # print("trait_ids:{}".format(traits_ids)) - # print("loci names:{}".format(loci_names)) - hdata = process_traits_data_for_heatmap(organised, traits_ids, chromosome_names) - - # print("ZIPPED: {}".format(zip(tuple(ordered_traits_names.keys()), hdata))) - # print("HDATA LENGTH:{}, ORDERED TRAITS LENGTH:{}".format(len(hdata), len(ordered_traits_names.keys()))) - heatmaps_data = [ - generate_heatmap( - data, - "heatmap_chr{}_{}".format(chromo, random_string(10)), - y_axis=tuple( - ordered_traits_names[traits_ids[order]] - for order in traits_order), - x_label=chromo, - output_dir=TMPDIR) - for chromo, data in zip(chromosome_names, hdata)] - print("IMAGES FILENAMES: {}".format([img[0] for img in heatmaps_data])) - - dendograms_data = [ - generate_dendrogram( - data, - "dendo_chr{}_{}".format(chromo, random_string(10)), - y_axis=tuple( - ordered_traits_names[traits_ids[order]] - for order in traits_order), - x_label=chromo, - output_dir=TMPDIR) - for chromo, data in zip(chromosome_names, hdata)] - - res = generate_single_heatmap( - hdata, - "single_heatmap_{}".format(random_string(10)), - y_axis=tuple( - ordered_traits_names[traits_ids[order]] - for order in traits_order), - y_label="Traits", - x_axis=[chromo for chromo in chromosome_names], - x_label="Chromosomes", - output_dir=TMPDIR) - -if __name__ == "__main__": - main() -- cgit v1.2.3 From 8442204492a28153e995f3147e06c9758cd3bd28 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 20 Sep 2021 08:43:38 +0300 Subject: Enable Cross-Origin Resource Sharing Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/api/heatmaps.py: Fix bugs in data parsing * gn3/app.py: enable CORS * gn3/settings.py: add flask-cors configurations * guix.scm: Add flask-cors dependency For easier testing of the heatmaps generation feature, this commit activates the cross-origin resource sharing for all "localhost" origins. --- gn3/api/heatmaps.py | 4 ++-- gn3/app.py | 7 +++++++ gn3/settings.py | 12 ++++++++++++ guix.scm | 3 ++- 4 files changed, 23 insertions(+), 3 deletions(-) diff --git a/gn3/api/heatmaps.py b/gn3/api/heatmaps.py index eea3ebe..43ac580 100644 --- a/gn3/api/heatmaps.py +++ b/gn3/api/heatmaps.py @@ -15,9 +15,9 @@ def clustered_heatmaps(): "message": "You need to provide at least one trait name." }), 400 conn, _cursor = database_connector() - def setup_trait_fullname(trait): + def parse_trait_fullname(trait): name_parts = trait.split(":") return "{dataset_name}::{trait_name}".format( - dataset_name=trait[1], trait_name=trait[0]) + dataset_name=name_parts[1], trait_name=name_parts[0]) traits_fullnames = [parse_trait_fullname(trait) for trait in traits_names] return jsonify(build_heatmap(traits_fullnames, conn)), 200 diff --git a/gn3/app.py b/gn3/app.py index b4b08d0..6b4c57e 100644 --- a/gn3/app.py +++ b/gn3/app.py @@ -11,6 +11,7 @@ from gn3.api.heatmaps import heatmaps from gn3.api.correlation import correlation from gn3.api.data_entry import data_entry +from flask_cors import CORS def create_app(config: Union[Dict, str, None] = None) -> Flask: """Create a new flask object""" @@ -18,6 +19,12 @@ def create_app(config: Union[Dict, str, None] = None) -> Flask: # Load default configuration app.config.from_object("gn3.settings") + CORS( + app, + origins=app.config["CORS_ORIGINS"], + allow_headers=app.config["CORS_HEADERS"], + supports_credentials=True, intercept_exceptions=False) + # Load environment configuration if "GN3_CONF" in os.environ: app.config.from_envvar('GN3_CONF') diff --git a/gn3/settings.py b/gn3/settings.py index a08f846..9d7bba3 100644 --- a/gn3/settings.py +++ b/gn3/settings.py @@ -31,3 +31,15 @@ REAPER_COMMAND = "{}/bin/qtlreaper".format(os.environ.get("GUIX_ENVIRONMENT")) # genotype files GENOTYPE_FILES = os.environ.get( "GENOTYPE_FILES", "{}/genotype_files/genotype".format(os.environ.get("HOME"))) + +# CROSS-ORIGIN SETUP +CORS_ORIGINS = [ + "http://localhost:*", + "http://127.0.0.1:*" +] + +CORS_HEADERS = [ + "Content-Type", + "Authorization", + "Access-Control-Allow-Credentials" +] diff --git a/guix.scm b/guix.scm index 8e1cf79..fb97142 100644 --- a/guix.scm +++ b/guix.scm @@ -106,7 +106,8 @@ ("r-stringi" ,r-stringi) ("python-plotly" ,python-plotly) ("python-pandas" ,python-pandas) - ("rust-qtlreaper" ,rust-qtlreaper))) + ("rust-qtlreaper" ,rust-qtlreaper) + ("python-flask-cors" ,python-flask-cors))) (build-system python-build-system) (home-page "https://github.com/genenetwork/genenetwork3") (synopsis "GeneNetwork3 API for data science and machine learning.") -- cgit v1.2.3 From 920be820e9cefe1dcde86d9a252f098c67a2bb8b Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 22 Sep 2021 07:03:53 +0300 Subject: Return serialized plotly figure Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/api/heatmaps.py: Serialize the figure to JSON * gn3/heatmaps.py: Return the figure object Serialize the Plotly figure into JSON, and return that, so that it can be used on the client to display the image. --- gn3/api/heatmaps.py | 8 +++++++- gn3/heatmaps.py | 27 ++++++++------------------- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/gn3/api/heatmaps.py b/gn3/api/heatmaps.py index 43ac580..0493f8a 100644 --- a/gn3/api/heatmaps.py +++ b/gn3/api/heatmaps.py @@ -1,3 +1,4 @@ +import io from flask import jsonify from flask import request from flask import Blueprint @@ -20,4 +21,9 @@ def clustered_heatmaps(): return "{dataset_name}::{trait_name}".format( dataset_name=name_parts[1], trait_name=name_parts[0]) traits_fullnames = [parse_trait_fullname(trait) for trait in traits_names] - return jsonify(build_heatmap(traits_fullnames, conn)), 200 + + with io.StringIO() as io_str: + _filename, figure = build_heatmap(traits_fullnames, conn) + figure.write_json(io_str) + fig_json = io_str.getvalue() + return fig_json, 200 diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index 205a3b3..cd93b3f 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -187,38 +187,27 @@ def build_heatmap(traits_names, conn: Any): genotype_filename, traits_filename, separate_nperm_output=True) qtlresults = parse_reaper_main_results(main_output) - # permudata = parse_reaper_permutation_results(permutations_output) organised = organise_reaper_main_results(qtlresults) traits_ids = [# sort numerically, but retain the ids as strings str(i) for i in sorted({int(row["ID"]) for row in qtlresults})] chromosome_names = sorted( {row["Chr"] for row in qtlresults}, key=chromosome_sorter_key_fn) - # loci_names = sorted({row["Locus"] for row in qtlresults}) ordered_traits_names = dict( zip(traits_ids, [traits[idx]["trait_fullname"] for idx in traits_order])) - # return generate_clustered_heatmap( - # process_traits_data_for_heatmap( - # organised, traits_ids, chromosome_names), - # clustered, - # "single_heatmap_{}".format(random_string(10)), - # y_axis=tuple( - # ordered_traits_names[traits_ids[order]] - # for order in traits_order), - # y_label="Traits", - # x_axis=chromosome_names, - # x_label="Chromosomes") - return { - "clustering_data": clustered, - "heatmap_data": process_traits_data_for_heatmap( + return generate_clustered_heatmap( + process_traits_data_for_heatmap( organised, traits_ids, chromosome_names), - "traits": tuple( + clustered, + "single_heatmap_{}".format(random_string(10)), + y_axis=tuple( ordered_traits_names[traits_ids[order]] for order in traits_order), - "chromosomes": chromosome_names - } + y_label="Traits", + x_axis=chromosome_names, + x_label="Chromosomes") def compute_traits_order(slink_data, neworder: tuple = tuple()): """ -- cgit v1.2.3 From 5892ffc7488b0c9cbb4ea08fd5c5f8648e0baea8 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 22 Sep 2021 07:06:14 +0300 Subject: Update check: Heatmaps need at least 2 items Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Update the check to look for at least 2 traits before trying to generate the heatmap. --- gn3/api/heatmaps.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gn3/api/heatmaps.py b/gn3/api/heatmaps.py index 0493f8a..1022a35 100644 --- a/gn3/api/heatmaps.py +++ b/gn3/api/heatmaps.py @@ -11,9 +11,9 @@ heatmaps = Blueprint("heatmaps", __name__) def clustered_heatmaps(): heatmap_request = request.get_json() traits_names = heatmap_request.get("traits_names", tuple()) - if len(traits_names) < 1: + if len(traits_names) < 2: return jsonify({ - "message": "You need to provide at least one trait name." + "message": "You need to provide at least two trait names." }), 400 conn, _cursor = database_connector() def parse_trait_fullname(trait): -- cgit v1.2.3 From cd7f301688fd9780df1f842f8bd2b7602775ba1f Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 22 Sep 2021 07:53:53 +0300 Subject: Fix pylint errors * Add missing function and module docstrings * Remove unused imports * Fix import order * Rework some code sections to fix issues * Disable some pylint errors. --- gn3/api/heatmaps.py | 8 ++++++++ gn3/app.py | 5 +++-- gn3/computations/qtlreaper.py | 8 ++++++++ gn3/db/genotypes.py | 1 + gn3/db/traits.py | 2 +- gn3/heatmaps.py | 28 ++++++++++++++++------------ 6 files changed, 37 insertions(+), 15 deletions(-) diff --git a/gn3/api/heatmaps.py b/gn3/api/heatmaps.py index 1022a35..fe47aee 100644 --- a/gn3/api/heatmaps.py +++ b/gn3/api/heatmaps.py @@ -1,3 +1,7 @@ +""" +Module to hold the entrypoint functions that generate heatmaps +""" + import io from flask import jsonify from flask import request @@ -9,6 +13,10 @@ heatmaps = Blueprint("heatmaps", __name__) @heatmaps.route("/clustered", methods=("POST",)) def clustered_heatmaps(): + """ + Parses the incoming data and responds with the JSON-serialized plotly figure + representing the clustered heatmap. + """ heatmap_request = request.get_json() traits_names = heatmap_request.get("traits_names", tuple()) if len(traits_names) < 2: diff --git a/gn3/app.py b/gn3/app.py index 6b4c57e..8badb65 100644 --- a/gn3/app.py +++ b/gn3/app.py @@ -3,7 +3,10 @@ import os from typing import Dict from typing import Union + from flask import Flask +from flask_cors import CORS + from gn3.api.gemma import gemma from gn3.api.rqtl import rqtl from gn3.api.general import general @@ -11,8 +14,6 @@ from gn3.api.heatmaps import heatmaps from gn3.api.correlation import correlation from gn3.api.data_entry import data_entry -from flask_cors import CORS - def create_app(config: Union[Dict, str, None] = None) -> Flask: """Create a new flask object""" app = Flask(__name__) diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index 377db9b..5d17fed 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -87,11 +87,17 @@ def run_reaper( return (output_filename, permu_output_filename) def chromosome_sorter_key_fn(val): + """ + Useful for sorting the chromosomes + """ if isinstance(val, int): return val return ord(val) def organise_reaper_main_results(parsed_results): + """ + Provide the results of running reaper in a format that is easier to use. + """ def __organise_by_chromosome(chr_name, items): chr_items = [item for item in items if item["Chr"] == chr_name] return { @@ -129,12 +135,14 @@ def parse_reaper_main_results(results_file): lines = infile.readlines() def __parse_column_float_value(value): + # pylint: disable=W0702 try: return float(value) except: return value def __parse_column_int_value(value): + # pylint: disable=W0702 try: return int(value) except: diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py index 9d052d9..919c539 100644 --- a/gn3/db/genotypes.py +++ b/gn3/db/genotypes.py @@ -115,6 +115,7 @@ def parse_genotype_marker(line: str, geno_obj: dict, parlist: list): Reworks https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/utility/gen_geno_ob.py#L143-L190 """ + # pylint: disable=W0702 marker_row = [item.strip() for item in line.split("\t")] geno_table = { geno_obj["mat"]: -1, geno_obj["pat"]: 1, geno_obj["het"]: 0, diff --git a/gn3/db/traits.py b/gn3/db/traits.py index bfe887e..747ed27 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -46,7 +46,7 @@ def update_sample_data(conn: Any, count: Union[int, str]): """Given the right parameters, update sample-data from the relevant table.""" - # pylint: disable=[R0913, R0914] + # pylint: disable=[R0913, R0914, C0103] STRAIN_ID_SQL: str = "UPDATE Strain SET Name = %s WHERE Id = %s" PUBLISH_DATA_SQL: str = ("UPDATE PublishData SET value = %s " "WHERE StrainId = %s AND Id = %s") diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index cd93b3f..9d82fb2 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -3,29 +3,28 @@ This module will contain functions to be used in computation of the data used to generate various kinds of heatmaps. """ +from functools import reduce from typing import Any, Dict, Sequence + import numpy as np -from functools import reduce -from gn3.settings import TMPDIR import plotly.graph_objects as go import plotly.figure_factory as ff +from plotly.subplots import make_subplots + +from gn3.settings import TMPDIR from gn3.random import random_string from gn3.computations.slink import slink -from plotly.subplots import make_subplots from gn3.computations.correlations2 import compute_correlation from gn3.db.genotypes import ( - build_genotype_file, load_genotype_samples, parse_genotype_file) + build_genotype_file, load_genotype_samples) from gn3.db.traits import ( - retrieve_trait_data, - retrieve_trait_info, - generate_traits_filename) + retrieve_trait_data, retrieve_trait_info) from gn3.computations.qtlreaper import ( run_reaper, generate_traits_file, chromosome_sorter_key_fn, parse_reaper_main_results, - organise_reaper_main_results, - parse_reaper_permutation_results) + organise_reaper_main_results) def export_trait_data( trait_data: dict, strainlist: Sequence[str], dtype: str = "val", @@ -159,13 +158,13 @@ def build_heatmap(traits_names, conn: Any): PARAMETERS: TODO: Elaborate on the parameters here... """ + # pylint: disable=[R0914] threshold = 0 # webqtlConfig.PUBLICTHRESH traits = [ retrieve_trait_info(threshold, fullname, conn) for fullname in traits_names] traits_data_list = [retrieve_trait_data(t, conn) for t in traits] genotype_filename = build_genotype_file(traits[0]["riset"]) - # genotype = parse_genotype_file(genotype_filename) strains = load_genotype_samples(genotype_filename) exported_traits_data_list = [ export_trait_data(td, strains) for td in traits_data_list] @@ -336,6 +335,7 @@ def generate_clustered_heatmap( Generate a dendrogram, and heatmaps for each chromosome, and put them all into one plot. """ + # pylint: disable=[R0913, R0914] num_cols = 1 + len(x_axis) fig = make_subplots( rows=1, @@ -359,14 +359,18 @@ def generate_clustered_heatmap( "height": 800, "xaxis": { "mirror": False, - "showgrid": True + "showgrid": True, + "title": x_label + }, + "yaxis": { + "title": y_label } }) x_axes_layouts = { "xaxis{}".format(i+1 if i > 0 else ""): { "mirror": False, - "showticklabels": True if i == 0 else False, + "showticklabels": i == 0, "ticks": "outside" if i == 0 else "" } for i in range(num_cols)} -- cgit v1.2.3 From 71cc35e5178904b512b9007e33be17a36f6656f2 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 22 Sep 2021 08:36:11 +0300 Subject: Fix typing issues * Ignore some errors * Update typing definitions for some portions of code * Add missing imports --- gn3/app.py | 2 +- gn3/computations/qtlreaper.py | 6 ++++-- gn3/db/genotypes.py | 10 ++++++---- gn3/db/traits.py | 8 ++++---- gn3/heatmaps.py | 8 +++----- 5 files changed, 18 insertions(+), 16 deletions(-) diff --git a/gn3/app.py b/gn3/app.py index 8badb65..5e852e1 100644 --- a/gn3/app.py +++ b/gn3/app.py @@ -5,7 +5,7 @@ from typing import Dict from typing import Union from flask import Flask -from flask_cors import CORS +from flask_cors import CORS # type: ignore from gn3.api.gemma import gemma from gn3.api.rqtl import rqtl diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index 5d17fed..5ddea76 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -4,6 +4,8 @@ computation of QTLs. """ import os import subprocess +from typing import Union + from gn3.random import random_string from gn3.settings import TMPDIR, REAPER_COMMAND @@ -70,9 +72,9 @@ def run_reaper( output_dir, random_string(10)) output_list = ["--main_output", output_filename] if separate_nperm_output: - permu_output_filename = "{}/qtlreaper/permu_output_{}.txt".format( + permu_output_filename: Union[None, str] = "{}/qtlreaper/permu_output_{}.txt".format( output_dir, random_string(10)) - output_list = output_list + ["--permu_output", permu_output_filename] + output_list = output_list + ["--permu_output", permu_output_filename] # type: ignore[list-item] else: permu_output_filename = None diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py index 919c539..9ea9f20 100644 --- a/gn3/db/genotypes.py +++ b/gn3/db/genotypes.py @@ -2,6 +2,8 @@ import os import gzip +from typing import Union, TextIO + from gn3.settings import GENOTYPE_FILES def build_genotype_file( @@ -44,17 +46,17 @@ def __load_genotype_samples_from_geno(genotype_filename: str): """ gzipped_filename = "{}.gz".format(genotype_filename) if os.path.isfile(gzipped_filename): - genofile = gzip.open(gzipped_filename) + genofile: Union[TextIO, gzip.GzipFile] = gzip.open(gzipped_filename) else: genofile = open(genotype_filename) for row in genofile: line = row.strip() - if (not line) or (line.startswith(("#", "@"))): + if (not line) or (line.startswith(("#", "@"))): # type: ignore[arg-type] continue break - headers = line.split("\t") + headers = line.split("\t" ) # type: ignore[arg-type] if headers[3] == "Mb": return headers[4:] return headers[3:] @@ -107,7 +109,7 @@ def parse_genotype_header(line: str, parlist: tuple = tuple()): ("prgy", prgy), ("nprgy", len(prgy))) -def parse_genotype_marker(line: str, geno_obj: dict, parlist: list): +def parse_genotype_marker(line: str, geno_obj: dict, parlist: tuple): """ Parse a data line in a genotype file diff --git a/gn3/db/traits.py b/gn3/db/traits.py index 747ed27..4fc47c3 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -63,22 +63,22 @@ def update_sample_data(conn: Any, with conn.cursor() as cursor: # Update the Strains table cursor.execute(STRAIN_ID_SQL, (strain_name, strain_id)) - updated_strains: int = cursor.rowcount + updated_strains = cursor.rowcount # Update the PublishData table cursor.execute(PUBLISH_DATA_SQL, (None if value == "x" else value, strain_id, publish_data_id)) - updated_published_data: int = cursor.rowcount + updated_published_data = cursor.rowcount # Update the PublishSE table cursor.execute(PUBLISH_SE_SQL, (None if error == "x" else error, strain_id, publish_data_id)) - updated_se_data: int = cursor.rowcount + updated_se_data = cursor.rowcount # Update the NStrain table cursor.execute(N_STRAIN_SQL, (None if count == "x" else count, strain_id, publish_data_id)) - updated_n_strains: int = cursor.rowcount + updated_n_strains = cursor.rowcount return (updated_strains, updated_published_data, updated_se_data, updated_n_strains) diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index 9d82fb2..45d0c22 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -7,9 +7,9 @@ from functools import reduce from typing import Any, Dict, Sequence import numpy as np -import plotly.graph_objects as go -import plotly.figure_factory as ff -from plotly.subplots import make_subplots +import plotly.graph_objects as go # type: ignore +import plotly.figure_factory as ff # type: ignore +from plotly.subplots import make_subplots # type: ignore from gn3.settings import TMPDIR from gn3.random import random_string @@ -171,8 +171,6 @@ def build_heatmap(traits_names, conn: Any): clustered = cluster_traits(exported_traits_data_list) slinked = slink(clustered) traits_order = compute_traits_order(slinked) - ordered_traits_names = [ - traits[idx]["trait_fullname"] for idx in traits_order] strains_and_values = retrieve_strains_and_values( traits_order, strains, exported_traits_data_list) traits_filename = "{}/traits_test_file_{}.txt".format( -- cgit v1.2.3 From 56c73324c285d896567268370f3955bbd15754b0 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 22 Sep 2021 09:02:46 +0300 Subject: Fix more pylint errors --- gn3/computations/qtlreaper.py | 3 ++- gn3/db/genotypes.py | 2 +- tests/unit/db/test_traits.py | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index 5ddea76..8b2893e 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -74,7 +74,8 @@ def run_reaper( if separate_nperm_output: permu_output_filename: Union[None, str] = "{}/qtlreaper/permu_output_{}.txt".format( output_dir, random_string(10)) - output_list = output_list + ["--permu_output", permu_output_filename] # type: ignore[list-item] + output_list = output_list + [ + "--permu_output", permu_output_filename] # type: ignore[list-item] else: permu_output_filename = None diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py index 9ea9f20..9987320 100644 --- a/gn3/db/genotypes.py +++ b/gn3/db/genotypes.py @@ -56,7 +56,7 @@ def __load_genotype_samples_from_geno(genotype_filename: str): continue break - headers = line.split("\t" ) # type: ignore[arg-type] + headers = line.split("\t") # type: ignore[arg-type] if headers[3] == "Mb": return headers[4:] return headers[3:] diff --git a/tests/unit/db/test_traits.py b/tests/unit/db/test_traits.py index ee98893..baa2af3 100644 --- a/tests/unit/db/test_traits.py +++ b/tests/unit/db/test_traits.py @@ -166,6 +166,7 @@ class TestTraitsDBFunctions(TestCase): the right calls. """ + # pylint: disable=C0103 db_mock = mock.MagicMock() STRAIN_ID_SQL: str = "UPDATE Strain SET Name = %s WHERE Id = %s" -- cgit v1.2.3 From 95c5c0e73bffbf0287a17309e703063ee54d25ba Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Thu, 23 Sep 2021 03:45:19 +0300 Subject: Refactor: Move common sample data to separate file Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Move common sample test data into a separate file where it can be imported from, to prevent pylint error R0801 which proved tricky to silence in any other way. --- tests/unit/computations/test_qtlreaper.py | 68 ++++-------------- tests/unit/db/test_traits.py | 15 ++-- tests/unit/sample_test_data.py | 111 ++++++++++++++++++++++++++++++ tests/unit/test_heatmaps.py | 96 +------------------------- 4 files changed, 134 insertions(+), 156 deletions(-) create mode 100644 tests/unit/sample_test_data.py diff --git a/tests/unit/computations/test_qtlreaper.py b/tests/unit/computations/test_qtlreaper.py index d420470..742d106 100644 --- a/tests/unit/computations/test_qtlreaper.py +++ b/tests/unit/computations/test_qtlreaper.py @@ -4,6 +4,7 @@ from gn3.computations.qtlreaper import ( parse_reaper_main_results, organise_reaper_main_results, parse_reaper_permutation_results) +from tests.unit.sample_test_data import organised_trait_1 class TestQTLReaper(TestCase): """Class for testing qtlreaper interface functions.""" @@ -81,99 +82,54 @@ class TestQTLReaper(TestCase): self.assertEqual( organise_reaper_main_results([ { - "ID": "T1", "Locus": "rs31443144", "Chr": 1, "cM": 1.500, + "ID": "1", "Locus": "rs31443144", "Chr": 1, "cM": 1.500, "Mb": 3.010, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs6269442", "Chr": 1, "cM": 1.500, + "ID": "1", "Locus": "rs6269442", "Chr": 1, "cM": 1.500, "Mb": 3.492, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs32285189", "Chr": 1, "cM": 1.630, + "ID": "1", "Locus": "rs32285189", "Chr": 1, "cM": 1.630, "Mb": 3.511, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs258367496", "Chr": 1, "cM": 1.630, + "ID": "1", "Locus": "rs258367496", "Chr": 1, "cM": 1.630, "Mb": 3.660, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs32430919", "Chr": 1, "cM": 1.750, + "ID": "1", "Locus": "rs32430919", "Chr": 1, "cM": 1.750, "Mb": 3.777, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs36251697", "Chr": 1, "cM": 1.880, + "ID": "1", "Locus": "rs36251697", "Chr": 1, "cM": 1.880, "Mb": 3.812, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs30658298", "Chr": 1, "cM": 2.010, + "ID": "1", "Locus": "rs30658298", "Chr": 1, "cM": 2.010, "Mb": 4.431, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs51852623", "Chr": 2, "cM": 2.010, + "ID": "1", "Locus": "rs51852623", "Chr": 2, "cM": 2.010, "Mb": 4.447, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs31879829", "Chr": 2, "cM": 2.140, + "ID": "1", "Locus": "rs31879829", "Chr": 2, "cM": 2.140, "Mb": 4.519, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs36742481", "Chr": 2, "cM": 2.140, + "ID": "1", "Locus": "rs36742481", "Chr": 2, "cM": 2.140, "Mb": 4.776, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 } ]), - {"T1": {"ID": "T1", - "chromosomes": { - 1: {"Chr": 1, - "loci": [ - { - "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs6269442", "cM": 1.500, "Mb": 3.492, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs32285189", "cM": 1.630, "Mb": 3.511, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs258367496", "cM": 1.630, "Mb": 3.660, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs32430919", "cM": 1.750, "Mb": 3.777, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs36251697", "cM": 1.880, "Mb": 3.812, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs30658298", "cM": 2.010, "Mb": 4.431, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }]}, - 2: {"Chr": 2, - "loci": [ - { - "Locus": "rs51852623", "cM": 2.010, "Mb": 4.447, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs31879829", "cM": 2.140, "Mb": 4.519, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs36742481", "cM": 2.140, "Mb": 4.776, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }]}}}}) + organised_trait_1) diff --git a/tests/unit/db/test_traits.py b/tests/unit/db/test_traits.py index baa2af3..8af8e82 100644 --- a/tests/unit/db/test_traits.py +++ b/tests/unit/db/test_traits.py @@ -170,12 +170,15 @@ class TestTraitsDBFunctions(TestCase): db_mock = mock.MagicMock() STRAIN_ID_SQL: str = "UPDATE Strain SET Name = %s WHERE Id = %s" - PUBLISH_DATA_SQL: str = ("UPDATE PublishData SET value = %s " - "WHERE StrainId = %s AND Id = %s") - PUBLISH_SE_SQL: str = ("UPDATE PublishSE SET error = %s " - "WHERE StrainId = %s AND DataId = %s") - N_STRAIN_SQL: str = ("UPDATE NStrain SET count = %s " - "WHERE StrainId = %s AND DataId = %s") + PUBLISH_DATA_SQL: str = ( + "UPDATE PublishData SET value = %s " + "WHERE StrainId = %s AND Id = %s") + PUBLISH_SE_SQL: str = ( + "UPDATE PublishSE SET error = %s " + "WHERE StrainId = %s AND DataId = %s") + N_STRAIN_SQL: str = ( + "UPDATE NStrain SET count = %s " + "WHERE StrainId = %s AND DataId = %s") with db_mock.cursor() as cursor: type(cursor).rowcount = 1 diff --git a/tests/unit/sample_test_data.py b/tests/unit/sample_test_data.py new file mode 100644 index 0000000..407d074 --- /dev/null +++ b/tests/unit/sample_test_data.py @@ -0,0 +1,111 @@ +""" +This module holds a collection of sample data variables, used in more than one + test. + +This is mostly to avoid the `duplicate-code` pylint error that gets raised if +the same data is defined in more than one file. It has been found that adding +the `# pylint: disable=R0801` or `# pylint: disable=duplicate-code` to the top +of the file seems to not work as expected. + +Adding these same declarations to .pylintrc is not an option, since that, +seemingly, would deactivate the warnings for all code in the project: We do not +want that. +""" + +organised_trait_1 = { + "1": { + "ID": "1", + "chromosomes": { + 1: {"Chr": 1, + "loci": [ + { + "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs6269442", "cM": 1.500, "Mb": 3.492, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs32285189", "cM": 1.630, "Mb": 3.511, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs258367496", "cM": 1.630, "Mb": 3.660, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs32430919", "cM": 1.750, "Mb": 3.777, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs36251697", "cM": 1.880, "Mb": 3.812, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs30658298", "cM": 2.010, "Mb": 4.431, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }]}, + 2: {"Chr": 2, + "loci": [ + { + "Locus": "rs51852623", "cM": 2.010, "Mb": 4.447, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs31879829", "cM": 2.140, "Mb": 4.519, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs36742481", "cM": 2.140, "Mb": 4.776, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }]}}}} + +organised_trait_2 = { + "2": { + "ID": "2", + "chromosomes": { + 1: {"Chr": 1, + "loci": [ + { + "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs6269442", "cM": 1.500, "Mb": 3.492, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs32285189", "cM": 1.630, "Mb": 3.511, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs258367496", "cM": 1.630, "Mb": 3.660, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs32430919", "cM": 1.750, "Mb": 3.777, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs36251697", "cM": 1.880, "Mb": 3.812, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs30658298", "cM": 2.010, "Mb": 4.431, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }]}, + 2: {"Chr": 2, + "loci": [ + { + "Locus": "rs51852623", "cM": 2.010, "Mb": 4.447, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs31879829", "cM": 2.140, "Mb": 4.519, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs36742481", "cM": 2.140, "Mb": 4.776, + "LRS": 0.579, "Additive": -0.074, "pValue": 1.000 + }]}}}} diff --git a/tests/unit/test_heatmaps.py b/tests/unit/test_heatmaps.py index c0a496b..fd91cf9 100644 --- a/tests/unit/test_heatmaps.py +++ b/tests/unit/test_heatmaps.py @@ -7,6 +7,7 @@ from gn3.heatmaps import ( compute_traits_order, retrieve_strains_and_values, process_traits_data_for_heatmap) +from tests.unit.sample_test_data import organised_trait_1, organised_trait_2 strainlist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] trait_data = { @@ -206,100 +207,7 @@ class TestHeatmap(TestCase): """Check for correct processing of data for heatmap generation.""" self.assertEqual( process_traits_data_for_heatmap( - {"1": { - "ID": "T1", - "chromosomes": { - 1: {"Chr": 1, - "loci": [ - { - "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs6269442", "cM": 1.500, "Mb": 3.492, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs32285189", "cM": 1.630, "Mb": 3.511, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs258367496", "cM": 1.630, "Mb": 3.660, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs32430919", "cM": 1.750, "Mb": 3.777, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs36251697", "cM": 1.880, "Mb": 3.812, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs30658298", "cM": 2.010, "Mb": 4.431, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }]}, - 2: {"Chr": 2, - "loci": [ - { - "Locus": "rs51852623", "cM": 2.010, "Mb": 4.447, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs31879829", "cM": 2.140, "Mb": 4.519, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs36742481", "cM": 2.140, "Mb": 4.776, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }]}}}, - "2": { - "ID": "T1", - "chromosomes": { - 1: {"Chr": 1, - "loci": [ - { - "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs6269442", "cM": 1.500, "Mb": 3.492, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs32285189", "cM": 1.630, "Mb": 3.511, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs258367496", "cM": 1.630, "Mb": 3.660, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs32430919", "cM": 1.750, "Mb": 3.777, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs36251697", "cM": 1.880, "Mb": 3.812, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs30658298", "cM": 2.010, "Mb": 4.431, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }]}, - 2: {"Chr": 2, - "loci": [ - { - "Locus": "rs51852623", "cM": 2.010, "Mb": 4.447, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs31879829", "cM": 2.140, "Mb": 4.519, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs36742481", "cM": 2.140, "Mb": 4.776, - "LRS": 0.579, "Additive": -0.074, "pValue": 1.000 - }]}}}}, + {**organised_trait_1, **organised_trait_2}, ["2", "1"], [1, 2]), [[[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5], -- cgit v1.2.3 From 8d9bc0f29ce9208306915b079818e6f0c31785e2 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Thu, 23 Sep 2021 03:55:44 +0300 Subject: Add missing dependencies causing pylint to fail * Add some dependencies used by the system that were missing in the test environment, leading to the pylint step failing. --- requirements.txt | 2 ++ setup.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/requirements.txt b/requirements.txt index f94c86f..d332a96 100644 --- a/requirements.txt +++ b/requirements.txt @@ -33,3 +33,5 @@ urllib3==1.26.4 varint==1.0.2 Werkzeug==1.0.1 wrapt==1.12.1 +plotly==4.14.3 +flask-cors==3.0.9 diff --git a/setup.py b/setup.py index 3f0922b..98a076f 100644 --- a/setup.py +++ b/setup.py @@ -20,6 +20,8 @@ setup(author='Bonface M. K.', "requests==2.25.1" "scipy==1.6.0" "sqlalchemy-stubs==0.4" + "plotly==4.14.3" + "flask-cors==3.0.9" ], license='GPLV3', long_description=open('README.md').read(), -- cgit v1.2.3 From 19783a18c2bc7941fc5980e593f19fb1d18c3623 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 27 Sep 2021 04:48:53 +0300 Subject: Update terminology: `strain` to `sample` Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Update the terminology used: use `sample` in place of `strain` according to Zachary's direction at https://github.com/genenetwork/genenetwork3/pull/37#issuecomment-926043306 --- gn3/computations/parsers.py | 10 ++--- gn3/computations/qtlreaper.py | 8 ++-- gn3/db/genotypes.py | 8 ++-- gn3/db/traits.py | 44 ++++++++++----------- gn3/heatmaps.py | 62 ++++++++++++++--------------- tests/unit/computations/test_parsers.py | 4 +- tests/unit/test_heatmaps.py | 70 ++++++++++++++++----------------- 7 files changed, 103 insertions(+), 103 deletions(-) diff --git a/gn3/computations/parsers.py b/gn3/computations/parsers.py index 94387ff..1af35d6 100644 --- a/gn3/computations/parsers.py +++ b/gn3/computations/parsers.py @@ -14,7 +14,7 @@ def parse_genofile(file_path: str) -> Tuple[List[str], 'h': 0, 'u': None, } - genotypes, strains = [], [] + genotypes, samples = [], [] with open(file_path, "r") as _genofile: for line in _genofile: line = line.strip() @@ -22,8 +22,8 @@ def parse_genofile(file_path: str) -> Tuple[List[str], continue cells = line.split() if line.startswith("Chr"): - strains = cells[4:] - strains = [strain.lower() for strain in strains] + samples = cells[4:] + samples = [sample.lower() for sample in samples] continue values = [__map.get(value.lower(), None) for value in cells[4:]] genotype = { @@ -32,7 +32,7 @@ def parse_genofile(file_path: str) -> Tuple[List[str], "cm": cells[2], "mb": cells[3], "values": values, - "dicvalues": dict(zip(strains, values)), + "dicvalues": dict(zip(samples, values)), } genotypes.append(genotype) - return strains, genotypes + return samples, genotypes diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index 8b2893e..166d2dd 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -9,17 +9,17 @@ from typing import Union from gn3.random import random_string from gn3.settings import TMPDIR, REAPER_COMMAND -def generate_traits_file(strains, trait_values, traits_filename): +def generate_traits_file(samples, trait_values, traits_filename): """ Generate a traits file for use with `qtlreaper`. PARAMETERS: - strains: A list of strains to use as the headers for the various columns. - trait_values: A list of lists of values for each trait and strain. + samples: A list of samples to use as the headers for the various columns. + trait_values: A list of lists of values for each trait and sample. traits_filename: The tab-separated value to put the values in for computation of QTLs. """ - header = "Trait\t{}\n".format("\t".join(strains)) + header = "Trait\t{}\n".format("\t".join(samples)) data = ( [header] + ["{}\t{}\n".format(i+1, "\t".join([str(i) for i in t])) diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py index 9987320..8f18cac 100644 --- a/gn3/db/genotypes.py +++ b/gn3/db/genotypes.py @@ -14,16 +14,16 @@ def build_genotype_file( def load_genotype_samples(genotype_filename: str, file_type: str = "geno"): """ - Load sample of strains from genotype files. + Load sample of samples from genotype files. DESCRIPTION: - Traits can contain a varied number of strains, some of which do not exist in + Traits can contain a varied number of samples, some of which do not exist in certain genotypes. In order to compute QTLs, GEMMAs, etc, we need to ensure - to pick only those strains that exist in the genotype under consideration + to pick only those samples that exist in the genotype under consideration for the traits used in the computation. This function loads a list of samples from the genotype files for use in - filtering out unusable strains. + filtering out unusable samples. PARAMETERS: diff --git a/gn3/db/traits.py b/gn3/db/traits.py index 4fc47c3..c9d05d7 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -445,7 +445,7 @@ def retrieve_temp_trait_data(trait_info: dict, conn: Any): query, {"trait_name": trait_info["trait_name"]}) return [dict(zip( - ["strain_name", "value", "se_error", "nstrain", "id"], row)) + ["sample_name", "value", "se_error", "nstrain", "id"], row)) for row in cursor.fetchall()] return [] @@ -484,7 +484,7 @@ def retrieve_geno_trait_data(trait_info: Dict, conn: Any): "species_id": retrieve_species_id( trait_info["db"]["riset"], conn)}) return [dict(zip( - ["strain_name", "value", "se_error", "id"], row)) + ["sample_name", "value", "se_error", "id"], row)) for row in cursor.fetchall()] return [] @@ -515,7 +515,7 @@ def retrieve_publish_trait_data(trait_info: Dict, conn: Any): {"trait_name": trait_info["trait_name"], "dataset_id": trait_info["db"]["dataset_id"]}) return [dict(zip( - ["strain_name", "value", "se_error", "nstrain", "id"], row)) + ["sample_name", "value", "se_error", "nstrain", "id"], row)) for row in cursor.fetchall()] return [] @@ -548,7 +548,7 @@ def retrieve_cellid_trait_data(trait_info: Dict, conn: Any): "trait_name": trait_info["trait_name"], "dataset_id": trait_info["db"]["dataset_id"]}) return [dict(zip( - ["strain_name", "value", "se_error", "id"], row)) + ["sample_name", "value", "se_error", "id"], row)) for row in cursor.fetchall()] return [] @@ -577,29 +577,29 @@ def retrieve_probeset_trait_data(trait_info: Dict, conn: Any): {"trait_name": trait_info["trait_name"], "dataset_name": trait_info["db"]["dataset_name"]}) return [dict(zip( - ["strain_name", "value", "se_error", "id"], row)) + ["sample_name", "value", "se_error", "id"], row)) for row in cursor.fetchall()] return [] -def with_strainlist_data_setup(strainlist: Sequence[str]): +def with_samplelist_data_setup(samplelist: Sequence[str]): """ - Build function that computes the trait data from provided list of strains. + Build function that computes the trait data from provided list of samples. PARAMETERS - strainlist: (list) - A list of strain names + samplelist: (list) + A list of sample names RETURNS: Returns a function that given some data from the database, computes the - strain's value, variance and ndata values, only if the strain is present - in the provided `strainlist` variable. + sample's value, variance and ndata values, only if the sample is present + in the provided `samplelist` variable. """ def setup_fn(tdata): - if tdata["strain_name"] in strainlist: + if tdata["sample_name"] in samplelist: val = tdata["value"] if val is not None: return { - "strain_name": tdata["strain_name"], + "sample_name": tdata["sample_name"], "value": val, "variance": tdata["se_error"], "ndata": tdata.get("nstrain", None) @@ -607,19 +607,19 @@ def with_strainlist_data_setup(strainlist: Sequence[str]): return None return setup_fn -def without_strainlist_data_setup(): +def without_samplelist_data_setup(): """ Build function that computes the trait data. RETURNS: Returns a function that given some data from the database, computes the - strain's value, variance and ndata values. + sample's value, variance and ndata values. """ def setup_fn(tdata): val = tdata["value"] if val is not None: return { - "strain_name": tdata["strain_name"], + "sample_name": tdata["sample_name"], "value": val, "variance": tdata["se_error"], "ndata": tdata.get("nstrain", None) @@ -627,7 +627,7 @@ def without_strainlist_data_setup(): return None return setup_fn -def retrieve_trait_data(trait: dict, conn: Any, strainlist: Sequence[str] = tuple()): +def retrieve_trait_data(trait: dict, conn: Any, samplelist: Sequence[str] = tuple()): """ Retrieve trait data @@ -650,23 +650,23 @@ def retrieve_trait_data(trait: dict, conn: Any, strainlist: Sequence[str] = tupl if results: # do something with mysqlid mysqlid = results[0]["id"] - if strainlist: + if samplelist: data = [ item for item in - map(with_strainlist_data_setup(strainlist), results) + map(with_samplelist_data_setup(samplelist), results) if item is not None] else: data = [ item for item in - map(without_strainlist_data_setup(), results) + map(without_samplelist_data_setup(), results) if item is not None] return { "mysqlid": mysqlid, "data": dict(map( lambda x: ( - x["strain_name"], - {k:v for k, v in x.items() if x != "strain_name"}), + x["sample_name"], + {k:v for k, v in x.items() if x != "sample_name"}), data))} return {} diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index 45d0c22..b6fc6d3 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -27,10 +27,10 @@ from gn3.computations.qtlreaper import ( organise_reaper_main_results) def export_trait_data( - trait_data: dict, strainlist: Sequence[str], dtype: str = "val", + trait_data: dict, samplelist: Sequence[str], dtype: str = "val", var_exists: bool = False, n_exists: bool = False): """ - Export data according to `strainlist`. Mostly used in calculating + Export data according to `samplelist`. Mostly used in calculating correlations. DESCRIPTION: @@ -40,8 +40,8 @@ def export_trait_data( PARAMETERS trait: (dict) The dictionary of key-value pairs representing a trait - strainlist: (list) - A list of strain names + samplelist: (list) + A list of sample names dtype: (str) ... verify what this is ... var_exists: (bool) @@ -49,18 +49,18 @@ def export_trait_data( n_exists: (bool) A flag indicating existence of ndata """ - def __export_all_types(tdata, strain): + def __export_all_types(tdata, sample): sample_data = [] - if tdata[strain]["value"]: - sample_data.append(tdata[strain]["value"]) + if tdata[sample]["value"]: + sample_data.append(tdata[sample]["value"]) if var_exists: - if tdata[strain]["variance"]: - sample_data.append(tdata[strain]["variance"]) + if tdata[sample]["variance"]: + sample_data.append(tdata[sample]["variance"]) else: sample_data.append(None) if n_exists: - if tdata[strain]["ndata"]: - sample_data.append(tdata[strain]["ndata"]) + if tdata[sample]["ndata"]: + sample_data.append(tdata[sample]["ndata"]) else: sample_data.append(None) else: @@ -73,17 +73,17 @@ def export_trait_data( return tuple(sample_data) - def __exporter(accumulator, strain): + def __exporter(accumulator, sample): # pylint: disable=[R0911] - if strain in trait_data["data"]: + if sample in trait_data["data"]: if dtype == "val": - return accumulator + (trait_data["data"][strain]["value"], ) + return accumulator + (trait_data["data"][sample]["value"], ) if dtype == "var": - return accumulator + (trait_data["data"][strain]["variance"], ) + return accumulator + (trait_data["data"][sample]["variance"], ) if dtype == "N": - return accumulator + (trait_data["data"][strain]["ndata"], ) + return accumulator + (trait_data["data"][sample]["ndata"], ) if dtype == "all": - return accumulator + __export_all_types(trait_data["data"], strain) + return accumulator + __export_all_types(trait_data["data"], sample) raise KeyError("Type `%s` is incorrect" % dtype) if var_exists and n_exists: return accumulator + (None, None, None) @@ -91,7 +91,7 @@ def export_trait_data( return accumulator + (None, None) return accumulator + (None,) - return reduce(__exporter, strainlist, tuple()) + return reduce(__exporter, samplelist, tuple()) def trait_display_name(trait: Dict): """ @@ -165,19 +165,19 @@ def build_heatmap(traits_names, conn: Any): for fullname in traits_names] traits_data_list = [retrieve_trait_data(t, conn) for t in traits] genotype_filename = build_genotype_file(traits[0]["riset"]) - strains = load_genotype_samples(genotype_filename) + samples = load_genotype_samples(genotype_filename) exported_traits_data_list = [ - export_trait_data(td, strains) for td in traits_data_list] + export_trait_data(td, samples) for td in traits_data_list] clustered = cluster_traits(exported_traits_data_list) slinked = slink(clustered) traits_order = compute_traits_order(slinked) - strains_and_values = retrieve_strains_and_values( - traits_order, strains, exported_traits_data_list) + samples_and_values = retrieve_samples_and_values( + traits_order, samples, exported_traits_data_list) traits_filename = "{}/traits_test_file_{}.txt".format( TMPDIR, random_string(10)) generate_traits_file( - strains_and_values[0][1], - [t[2] for t in strains_and_values], + samples_and_values[0][1], + [t[2] for t in samples_and_values], traits_filename) main_output, _permutations_output = run_reaper( @@ -229,9 +229,9 @@ def compute_traits_order(slink_data, neworder: tuple = tuple()): return __order_maker(neworder, slink_data) -def retrieve_strains_and_values(orders, strainlist, traits_data_list): +def retrieve_samples_and_values(orders, samplelist, traits_data_list): """ - Get the strains and their corresponding values from `strainlist` and + Get the samples and their corresponding values from `samplelist` and `traits_data_list`. This migrates the code in @@ -240,17 +240,17 @@ def retrieve_strains_and_values(orders, strainlist, traits_data_list): # This feels nasty! There's a lot of mutation of values here, that might # indicate something untoward in the design of this function and its # dependents ==> Review - strains = [] + samples = [] values = [] rets = [] for order in orders: temp_val = traits_data_list[order] - for i, strain in enumerate(strainlist): + for i, sample in enumerate(samplelist): if temp_val[i] is not None: - strains.append(strain) + samples.append(sample) values.append(temp_val[i]) - rets.append([order, strains[:], values[:]]) - strains = [] + rets.append([order, samples[:], values[:]]) + samples = [] values = [] return rets diff --git a/tests/unit/computations/test_parsers.py b/tests/unit/computations/test_parsers.py index 19c3067..b51b0bf 100644 --- a/tests/unit/computations/test_parsers.py +++ b/tests/unit/computations/test_parsers.py @@ -15,7 +15,7 @@ class TestParsers(unittest.TestCase): def test_parse_genofile_with_existing_file(self): """Test that a genotype file is parsed correctly""" - strains = ["bxd1", "bxd2"] + samples = ["bxd1", "bxd2"] genotypes = [ {"chr": "1", "locus": "rs31443144", "cm": "1.50", "mb": "3.010274", @@ -51,4 +51,4 @@ class TestParsers(unittest.TestCase): "../test_data/genotype.txt" )) self.assertEqual(parse_genofile( - test_genotype_file), (strains, genotypes)) + test_genotype_file), (samples, genotypes)) diff --git a/tests/unit/test_heatmaps.py b/tests/unit/test_heatmaps.py index fd91cf9..b54e2f3 100644 --- a/tests/unit/test_heatmaps.py +++ b/tests/unit/test_heatmaps.py @@ -5,41 +5,41 @@ from gn3.heatmaps import ( get_lrs_from_chr, export_trait_data, compute_traits_order, - retrieve_strains_and_values, + retrieve_samples_and_values, process_traits_data_for_heatmap) from tests.unit.sample_test_data import organised_trait_1, organised_trait_2 -strainlist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] +samplelist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] trait_data = { "mysqlid": 36688172, "data": { - "B6cC3-1": {"strain_name": "B6cC3-1", "value": 7.51879, "variance": None, "ndata": None}, - "BXD1": {"strain_name": "BXD1", "value": 7.77141, "variance": None, "ndata": None}, - "BXD12": {"strain_name": "BXD12", "value": 8.39265, "variance": None, "ndata": None}, - "BXD16": {"strain_name": "BXD16", "value": 8.17443, "variance": None, "ndata": None}, - "BXD19": {"strain_name": "BXD19", "value": 8.30401, "variance": None, "ndata": None}, - "BXD2": {"strain_name": "BXD2", "value": 7.80944, "variance": None, "ndata": None}, - "BXD21": {"strain_name": "BXD21", "value": 8.93809, "variance": None, "ndata": None}, - "BXD24": {"strain_name": "BXD24", "value": 7.99415, "variance": None, "ndata": None}, - "BXD27": {"strain_name": "BXD27", "value": 8.12177, "variance": None, "ndata": None}, - "BXD28": {"strain_name": "BXD28", "value": 7.67688, "variance": None, "ndata": None}, - "BXD32": {"strain_name": "BXD32", "value": 7.79062, "variance": None, "ndata": None}, - "BXD39": {"strain_name": "BXD39", "value": 8.27641, "variance": None, "ndata": None}, - "BXD40": {"strain_name": "BXD40", "value": 8.18012, "variance": None, "ndata": None}, - "BXD42": {"strain_name": "BXD42", "value": 7.82433, "variance": None, "ndata": None}, - "BXD6": {"strain_name": "BXD6", "value": 8.09718, "variance": None, "ndata": None}, - "BXH14": {"strain_name": "BXH14", "value": 7.97475, "variance": None, "ndata": None}, - "BXH19": {"strain_name": "BXH19", "value": 7.67223, "variance": None, "ndata": None}, - "BXH2": {"strain_name": "BXH2", "value": 7.93622, "variance": None, "ndata": None}, - "BXH22": {"strain_name": "BXH22", "value": 7.43692, "variance": None, "ndata": None}, - "BXH4": {"strain_name": "BXH4", "value": 7.96336, "variance": None, "ndata": None}, - "BXH6": {"strain_name": "BXH6", "value": 7.75132, "variance": None, "ndata": None}, - "BXH7": {"strain_name": "BXH7", "value": 8.12927, "variance": None, "ndata": None}, - "BXH8": {"strain_name": "BXH8", "value": 6.77338, "variance": None, "ndata": None}, - "BXH9": {"strain_name": "BXH9", "value": 8.03836, "variance": None, "ndata": None}, - "C3H/HeJ": {"strain_name": "C3H/HeJ", "value": 7.42795, "variance": None, "ndata": None}, - "C57BL/6J": {"strain_name": "C57BL/6J", "value": 7.50606, "variance": None, "ndata": None}, - "DBA/2J": {"strain_name": "DBA/2J", "value": 7.72588, "variance": None, "ndata": None}}} + "B6cC3-1": {"sample_name": "B6cC3-1", "value": 7.51879, "variance": None, "ndata": None}, + "BXD1": {"sample_name": "BXD1", "value": 7.77141, "variance": None, "ndata": None}, + "BXD12": {"sample_name": "BXD12", "value": 8.39265, "variance": None, "ndata": None}, + "BXD16": {"sample_name": "BXD16", "value": 8.17443, "variance": None, "ndata": None}, + "BXD19": {"sample_name": "BXD19", "value": 8.30401, "variance": None, "ndata": None}, + "BXD2": {"sample_name": "BXD2", "value": 7.80944, "variance": None, "ndata": None}, + "BXD21": {"sample_name": "BXD21", "value": 8.93809, "variance": None, "ndata": None}, + "BXD24": {"sample_name": "BXD24", "value": 7.99415, "variance": None, "ndata": None}, + "BXD27": {"sample_name": "BXD27", "value": 8.12177, "variance": None, "ndata": None}, + "BXD28": {"sample_name": "BXD28", "value": 7.67688, "variance": None, "ndata": None}, + "BXD32": {"sample_name": "BXD32", "value": 7.79062, "variance": None, "ndata": None}, + "BXD39": {"sample_name": "BXD39", "value": 8.27641, "variance": None, "ndata": None}, + "BXD40": {"sample_name": "BXD40", "value": 8.18012, "variance": None, "ndata": None}, + "BXD42": {"sample_name": "BXD42", "value": 7.82433, "variance": None, "ndata": None}, + "BXD6": {"sample_name": "BXD6", "value": 8.09718, "variance": None, "ndata": None}, + "BXH14": {"sample_name": "BXH14", "value": 7.97475, "variance": None, "ndata": None}, + "BXH19": {"sample_name": "BXH19", "value": 7.67223, "variance": None, "ndata": None}, + "BXH2": {"sample_name": "BXH2", "value": 7.93622, "variance": None, "ndata": None}, + "BXH22": {"sample_name": "BXH22", "value": 7.43692, "variance": None, "ndata": None}, + "BXH4": {"sample_name": "BXH4", "value": 7.96336, "variance": None, "ndata": None}, + "BXH6": {"sample_name": "BXH6", "value": 7.75132, "variance": None, "ndata": None}, + "BXH7": {"sample_name": "BXH7", "value": 8.12927, "variance": None, "ndata": None}, + "BXH8": {"sample_name": "BXH8", "value": 6.77338, "variance": None, "ndata": None}, + "BXH9": {"sample_name": "BXH9", "value": 8.03836, "variance": None, "ndata": None}, + "C3H/HeJ": {"sample_name": "C3H/HeJ", "value": 7.42795, "variance": None, "ndata": None}, + "C57BL/6J": {"sample_name": "C57BL/6J", "value": 7.50606, "variance": None, "ndata": None}, + "DBA/2J": {"sample_name": "DBA/2J", "value": 7.72588, "variance": None, "ndata": None}}} slinked = ( (((0, 2, 0.16381088984330505), @@ -66,7 +66,7 @@ class TestHeatmap(TestCase): ["all", (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)]]: with self.subTest(dtype=dtype): self.assertEqual( - export_trait_data(trait_data, strainlist, dtype=dtype), + export_trait_data(trait_data, samplelist, dtype=dtype), expected) def test_export_trait_data_dtype_all_flags(self): @@ -106,7 +106,7 @@ class TestHeatmap(TestCase): with self.subTest(dtype=dtype, vflag=vflag, nflag=nflag): self.assertEqual( export_trait_data( - trait_data, strainlist, dtype=dtype, var_exists=vflag, + trait_data, samplelist, dtype=dtype, var_exists=vflag, n_exists=nflag), expected) @@ -164,8 +164,8 @@ class TestHeatmap(TestCase): self.assertEqual( compute_traits_order(slinked), (0, 2, 1, 7, 5, 9, 3, 6, 8, 4)) - def test_retrieve_strains_and_values(self): - """Test retrieval of strains and values.""" + def test_retrieve_samples_and_values(self): + """Test retrieval of samples and values.""" for orders, slist, tdata, expected in [ [ [2], @@ -185,9 +185,9 @@ class TestHeatmap(TestCase): [6, None, None, 4, None]], [[3, ["s1", "s4"], [6, 4]]] ]]: - with self.subTest(strainlist=slist, traitdata=tdata): + with self.subTest(samplelist=slist, traitdata=tdata): self.assertEqual( - retrieve_strains_and_values(orders, slist, tdata), expected) + retrieve_samples_and_values(orders, slist, tdata), expected) def test_get_lrs_from_chr(self): """Check that function gets correct LRS values""" -- cgit v1.2.3 From 1d09a9222f8c661da3abd6d61c09ae19eeb5d793 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 27 Sep 2021 05:02:09 +0300 Subject: Update terminology: `riset` to `group` Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Update terminology to use the appropriate domain terminology according to Zachary's direction at https://github.com/genenetwork/genenetwork3/pull/37#issuecomment-926041744 --- gn3/db/datasets.py | 52 +++++++++++++++++++++--------------------- gn3/db/traits.py | 16 ++++++------- gn3/heatmaps.py | 2 +- tests/unit/db/test_datasets.py | 42 +++++++++++++++++----------------- 4 files changed, 56 insertions(+), 56 deletions(-) diff --git a/gn3/db/datasets.py b/gn3/db/datasets.py index 4a05499..6c328f5 100644 --- a/gn3/db/datasets.py +++ b/gn3/db/datasets.py @@ -119,9 +119,9 @@ def retrieve_dataset_name( return fn_map[trait_type](threshold, dataset_name, conn) -def retrieve_geno_riset_fields(name, conn): +def retrieve_geno_group_fields(name, conn): """ - Retrieve the RISet, and RISetID values for various Geno trait types. + Retrieve the Group, and GroupID values for various Geno trait types. """ query = ( "SELECT InbredSet.Name, InbredSet.Id " @@ -130,12 +130,12 @@ def retrieve_geno_riset_fields(name, conn): "AND GenoFreeze.Name = %(name)s") with conn.cursor() as cursor: cursor.execute(query, {"name": name}) - return dict(zip(["riset", "risetid"], cursor.fetchone())) + return dict(zip(["group", "groupid"], cursor.fetchone())) return {} -def retrieve_publish_riset_fields(name, conn): +def retrieve_publish_group_fields(name, conn): """ - Retrieve the RISet, and RISetID values for various Publish trait types. + Retrieve the Group, and GroupID values for various Publish trait types. """ query = ( "SELECT InbredSet.Name, InbredSet.Id " @@ -144,12 +144,12 @@ def retrieve_publish_riset_fields(name, conn): "AND PublishFreeze.Name = %(name)s") with conn.cursor() as cursor: cursor.execute(query, {"name": name}) - return dict(zip(["riset", "risetid"], cursor.fetchone())) + return dict(zip(["group", "groupid"], cursor.fetchone())) return {} -def retrieve_probeset_riset_fields(name, conn): +def retrieve_probeset_group_fields(name, conn): """ - Retrieve the RISet, and RISetID values for various ProbeSet trait types. + Retrieve the Group, and GroupID values for various ProbeSet trait types. """ query = ( "SELECT InbredSet.Name, InbredSet.Id " @@ -159,12 +159,12 @@ def retrieve_probeset_riset_fields(name, conn): "AND ProbeSetFreeze.Name = %(name)s") with conn.cursor() as cursor: cursor.execute(query, {"name": name}) - return dict(zip(["riset", "risetid"], cursor.fetchone())) + return dict(zip(["group", "groupid"], cursor.fetchone())) return {} -def retrieve_temp_riset_fields(name, conn): +def retrieve_temp_group_fields(name, conn): """ - Retrieve the RISet, and RISetID values for `Temp` trait types. + Retrieve the Group, and GroupID values for `Temp` trait types. """ query = ( "SELECT InbredSet.Name, InbredSet.Id " @@ -173,30 +173,30 @@ def retrieve_temp_riset_fields(name, conn): "AND Temp.Name = %(name)s") with conn.cursor() as cursor: cursor.execute(query, {"name": name}) - return dict(zip(["riset", "risetid"], cursor.fetchone())) + return dict(zip(["group", "groupid"], cursor.fetchone())) return {} -def retrieve_riset_fields(trait_type, trait_name, dataset_info, conn): +def retrieve_group_fields(trait_type, trait_name, dataset_info, conn): """ - Retrieve the RISet, and RISetID values for various trait types. + Retrieve the Group, and GroupID values for various trait types. """ - riset_fns_map = { - "Geno": retrieve_geno_riset_fields, - "Publish": retrieve_publish_riset_fields, - "ProbeSet": retrieve_probeset_riset_fields + group_fns_map = { + "Geno": retrieve_geno_group_fields, + "Publish": retrieve_publish_group_fields, + "ProbeSet": retrieve_probeset_group_fields } if trait_type == "Temp": - riset_info = retrieve_temp_riset_fields(trait_name, conn) + group_info = retrieve_temp_group_fields(trait_name, conn) else: - riset_info = riset_fns_map[trait_type](dataset_info["dataset_name"], conn) + group_info = group_fns_map[trait_type](dataset_info["dataset_name"], conn) return { **dataset_info, - **riset_info, - "riset": ( - "BXD" if riset_info.get("riset") == "BXD300" - else riset_info.get("riset", "")) + **group_info, + "group": ( + "BXD" if group_info.get("group") == "BXD300" + else group_info.get("group", "")) } def retrieve_temp_trait_dataset(): @@ -281,11 +281,11 @@ def retrieve_trait_dataset(trait_type, trait, threshold, conn): trait_type, threshold, trait["trait_name"], trait["db"]["dataset_name"], conn) } - riset = retrieve_riset_fields( + group = retrieve_group_fields( trait_type, trait["trait_name"], dataset_name_info, conn) return { "display_name": dataset_name_info["dataset_name"], **dataset_name_info, **dataset_fns[trait_type](), - **riset + **group } diff --git a/gn3/db/traits.py b/gn3/db/traits.py index c9d05d7..f2673c8 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -226,7 +226,7 @@ def set_homologene_id_field_probeset(trait_info, conn): """ query = ( "SELECT HomologeneId FROM Homologene, Species, InbredSet" - " WHERE Homologene.GeneId = %(geneid)s AND InbredSet.Name = %(riset)s" + " WHERE Homologene.GeneId = %(geneid)s AND InbredSet.Name = %(group)s" " AND InbredSet.SpeciesId = Species.Id AND" " Species.TaxonomyId = Homologene.TaxonomyId") with conn.cursor() as cursor: @@ -234,7 +234,7 @@ def set_homologene_id_field_probeset(trait_info, conn): query, { k:v for k, v in trait_info.items() - if k in ["geneid", "riset"] + if k in ["geneid", "group"] }) res = cursor.fetchone() if res: @@ -422,7 +422,7 @@ def retrieve_trait_info( if trait_info["haveinfo"]: return { **trait_post_processing_functions_table[trait_dataset_type]( - {**trait_info, "riset": trait_dataset["riset"]}), + {**trait_info, "group": trait_dataset["group"]}), "db": {**trait["db"], **trait_dataset} } return trait_info @@ -449,14 +449,14 @@ def retrieve_temp_trait_data(trait_info: dict, conn: Any): for row in cursor.fetchall()] return [] -def retrieve_species_id(riset, conn: Any): +def retrieve_species_id(group, conn: Any): """ - Retrieve a species id given the RISet value + Retrieve a species id given the Group value """ with conn.cursor as cursor: cursor.execute( - "SELECT SpeciesId from InbredSet WHERE Name = %(riset)s", - {"riset": riset}) + "SELECT SpeciesId from InbredSet WHERE Name = %(group)s", + {"group": group}) return cursor.fetchone()[0] return None @@ -482,7 +482,7 @@ def retrieve_geno_trait_data(trait_info: Dict, conn: Any): {"trait_name": trait_info["trait_name"], "dataset_name": trait_info["db"]["dataset_name"], "species_id": retrieve_species_id( - trait_info["db"]["riset"], conn)}) + trait_info["db"]["group"], conn)}) return [dict(zip( ["sample_name", "value", "se_error", "id"], row)) for row in cursor.fetchall()] diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index b6fc6d3..a36940d 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -164,7 +164,7 @@ def build_heatmap(traits_names, conn: Any): retrieve_trait_info(threshold, fullname, conn) for fullname in traits_names] traits_data_list = [retrieve_trait_data(t, conn) for t in traits] - genotype_filename = build_genotype_file(traits[0]["riset"]) + genotype_filename = build_genotype_file(traits[0]["group"]) samples = load_genotype_samples(genotype_filename) exported_traits_data_list = [ export_trait_data(td, samples) for td in traits_data_list] diff --git a/tests/unit/db/test_datasets.py b/tests/unit/db/test_datasets.py index 38de0e2..39f4af9 100644 --- a/tests/unit/db/test_datasets.py +++ b/tests/unit/db/test_datasets.py @@ -3,10 +3,10 @@ from unittest import mock, TestCase from gn3.db.datasets import ( retrieve_dataset_name, - retrieve_riset_fields, - retrieve_geno_riset_fields, - retrieve_publish_riset_fields, - retrieve_probeset_riset_fields) + retrieve_group_fields, + retrieve_geno_group_fields, + retrieve_publish_group_fields, + retrieve_probeset_group_fields) class TestDatasetsDBFunctions(TestCase): """Test cases for datasets functions.""" @@ -40,9 +40,9 @@ class TestDatasetsDBFunctions(TestCase): table=table, cols=columns), {"threshold": thresh, "name": dataset_name}) - def test_retrieve_probeset_riset_fields(self): + def test_retrieve_probeset_group_fields(self): """ - Test that the `riset` and `riset_id` fields are retrieved appropriately + Test that the `group` and `group_id` fields are retrieved appropriately for the 'ProbeSet' trait type. """ for trait_name, expected in [ @@ -52,7 +52,7 @@ class TestDatasetsDBFunctions(TestCase): with db_mock.cursor() as cursor: cursor.execute.return_value = () self.assertEqual( - retrieve_probeset_riset_fields(trait_name, db_mock), + retrieve_probeset_group_fields(trait_name, db_mock), expected) cursor.execute.assert_called_once_with( ( @@ -63,34 +63,34 @@ class TestDatasetsDBFunctions(TestCase): " AND ProbeSetFreeze.Name = %(name)s"), {"name": trait_name}) - def test_retrieve_riset_fields(self): + def test_retrieve_group_fields(self): """ - Test that the riset fields are set up correctly for the different trait + Test that the group fields are set up correctly for the different trait types. """ for trait_type, trait_name, dataset_info, expected in [ ["Publish", "pubTraitName01", {"dataset_name": "pubDBName01"}, - {"dataset_name": "pubDBName01", "riset": ""}], + {"dataset_name": "pubDBName01", "group": ""}], ["ProbeSet", "prbTraitName01", {"dataset_name": "prbDBName01"}, - {"dataset_name": "prbDBName01", "riset": ""}], + {"dataset_name": "prbDBName01", "group": ""}], ["Geno", "genoTraitName01", {"dataset_name": "genoDBName01"}, - {"dataset_name": "genoDBName01", "riset": ""}], - ["Temp", "tempTraitName01", {}, {"riset": ""}], + {"dataset_name": "genoDBName01", "group": ""}], + ["Temp", "tempTraitName01", {}, {"group": ""}], ]: db_mock = mock.MagicMock() with self.subTest( trait_type=trait_type, trait_name=trait_name, dataset_info=dataset_info): with db_mock.cursor() as cursor: - cursor.execute.return_value = ("riset_name", 0) + cursor.execute.return_value = ("group_name", 0) self.assertEqual( - retrieve_riset_fields( + retrieve_group_fields( trait_type, trait_name, dataset_info, db_mock), expected) - def test_retrieve_publish_riset_fields(self): + def test_retrieve_publish_group_fields(self): """ - Test that the `riset` and `riset_id` fields are retrieved appropriately + Test that the `group` and `group_id` fields are retrieved appropriately for the 'Publish' trait type. """ for trait_name, expected in [ @@ -100,7 +100,7 @@ class TestDatasetsDBFunctions(TestCase): with db_mock.cursor() as cursor: cursor.execute.return_value = () self.assertEqual( - retrieve_publish_riset_fields(trait_name, db_mock), + retrieve_publish_group_fields(trait_name, db_mock), expected) cursor.execute.assert_called_once_with( ( @@ -110,9 +110,9 @@ class TestDatasetsDBFunctions(TestCase): " AND PublishFreeze.Name = %(name)s"), {"name": trait_name}) - def test_retrieve_geno_riset_fields(self): + def test_retrieve_geno_group_fields(self): """ - Test that the `riset` and `riset_id` fields are retrieved appropriately + Test that the `group` and `group_id` fields are retrieved appropriately for the 'Geno' trait type. """ for trait_name, expected in [ @@ -122,7 +122,7 @@ class TestDatasetsDBFunctions(TestCase): with db_mock.cursor() as cursor: cursor.execute.return_value = () self.assertEqual( - retrieve_geno_riset_fields(trait_name, db_mock), + retrieve_geno_group_fields(trait_name, db_mock), expected) cursor.execute.assert_called_once_with( ( -- cgit v1.2.3 From 60d54d8de466c179a93b6d46ad05ec1b9ba5f4a1 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 27 Sep 2021 05:13:19 +0300 Subject: Narrow the exception and add comments Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Only catch the `FileExistsError` allowing any other exception to pass through. This tries to conform a little to the review at https://github.com/genenetwork/genenetwork3/pull/37#discussion_r714552696 --- gn3/computations/qtlreaper.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index 166d2dd..d1ff4ac 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -34,7 +34,8 @@ def create_output_directory(path: str): """Create the output directory at `path` if it does not exist.""" try: os.mkdir(path) - except OSError: + except FileExistsError: + # If the directory already exists, do nothing. pass def run_reaper( -- cgit v1.2.3 From a9fc9814760d205674904f8feb700eadae480fb1 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 27 Sep 2021 05:25:58 +0300 Subject: Remove unnecessary variable. Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Fix issue according to review https://github.com/genenetwork/genenetwork3/pull/37#discussion_r714549781 --- gn3/api/heatmaps.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gn3/api/heatmaps.py b/gn3/api/heatmaps.py index fe47aee..62ca2ad 100644 --- a/gn3/api/heatmaps.py +++ b/gn3/api/heatmaps.py @@ -17,8 +17,7 @@ def clustered_heatmaps(): Parses the incoming data and responds with the JSON-serialized plotly figure representing the clustered heatmap. """ - heatmap_request = request.get_json() - traits_names = heatmap_request.get("traits_names", tuple()) + traits_names = request.get_json().get("traits_names", tuple()) if len(traits_names) < 2: return jsonify({ "message": "You need to provide at least two trait names." -- cgit v1.2.3