From e3e18950cfcdec918429dcbb5d5ed2e9616b7a20 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 15 Sep 2021 11:19:56 +0300 Subject: Reorganise modules Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * The heatmap generation does not fall cleanly within the computations or db modules. This commit moves it to the higher level gn3 module. --- tests/unit/test_heatmaps.py | 187 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 187 insertions(+) create mode 100644 tests/unit/test_heatmaps.py (limited to 'tests/unit/test_heatmaps.py') diff --git a/tests/unit/test_heatmaps.py b/tests/unit/test_heatmaps.py new file mode 100644 index 0000000..265d5a8 --- /dev/null +++ b/tests/unit/test_heatmaps.py @@ -0,0 +1,187 @@ +"""Module contains tests for gn3.heatmaps.heatmaps""" +from unittest import TestCase +from gn3.heatmaps import ( + cluster_traits, + export_trait_data, + compute_traits_order, + retrieve_strains_and_values) + +strainlist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] +trait_data = { + "mysqlid": 36688172, + "data": { + "B6cC3-1": {"strain_name": "B6cC3-1", "value": 7.51879, "variance": None, "ndata": None}, + "BXD1": {"strain_name": "BXD1", "value": 7.77141, "variance": None, "ndata": None}, + "BXD12": {"strain_name": "BXD12", "value": 8.39265, "variance": None, "ndata": None}, + "BXD16": {"strain_name": "BXD16", "value": 8.17443, "variance": None, "ndata": None}, + "BXD19": {"strain_name": "BXD19", "value": 8.30401, "variance": None, "ndata": None}, + "BXD2": {"strain_name": "BXD2", "value": 7.80944, "variance": None, "ndata": None}, + "BXD21": {"strain_name": "BXD21", "value": 8.93809, "variance": None, "ndata": None}, + "BXD24": {"strain_name": "BXD24", "value": 7.99415, "variance": None, "ndata": None}, + "BXD27": {"strain_name": "BXD27", "value": 8.12177, "variance": None, "ndata": None}, + "BXD28": {"strain_name": "BXD28", "value": 7.67688, "variance": None, "ndata": None}, + "BXD32": {"strain_name": "BXD32", "value": 7.79062, "variance": None, "ndata": None}, + "BXD39": {"strain_name": "BXD39", "value": 8.27641, "variance": None, "ndata": None}, + "BXD40": {"strain_name": "BXD40", "value": 8.18012, "variance": None, "ndata": None}, + "BXD42": {"strain_name": "BXD42", "value": 7.82433, "variance": None, "ndata": None}, + "BXD6": {"strain_name": "BXD6", "value": 8.09718, "variance": None, "ndata": None}, + "BXH14": {"strain_name": "BXH14", "value": 7.97475, "variance": None, "ndata": None}, + "BXH19": {"strain_name": "BXH19", "value": 7.67223, "variance": None, "ndata": None}, + "BXH2": {"strain_name": "BXH2", "value": 7.93622, "variance": None, "ndata": None}, + "BXH22": {"strain_name": "BXH22", "value": 7.43692, "variance": None, "ndata": None}, + "BXH4": {"strain_name": "BXH4", "value": 7.96336, "variance": None, "ndata": None}, + "BXH6": {"strain_name": "BXH6", "value": 7.75132, "variance": None, "ndata": None}, + "BXH7": {"strain_name": "BXH7", "value": 8.12927, "variance": None, "ndata": None}, + "BXH8": {"strain_name": "BXH8", "value": 6.77338, "variance": None, "ndata": None}, + "BXH9": {"strain_name": "BXH9", "value": 8.03836, "variance": None, "ndata": None}, + "C3H/HeJ": {"strain_name": "C3H/HeJ", "value": 7.42795, "variance": None, "ndata": None}, + "C57BL/6J": {"strain_name": "C57BL/6J", "value": 7.50606, "variance": None, "ndata": None}, + "DBA/2J": {"strain_name": "DBA/2J", "value": 7.72588, "variance": None, "ndata": None}}} + +slinked = ( + (((0, 2, 0.16381088984330505), + ((1, 7, 0.06024619831474998), 5, 0.19179284676938602), + 0.20337048635536847), + 9, + 0.23451785425383564), + ((3, (6, 8, 0.2140799896286565), 0.25879514152086425), + 4, 0.8968250491499363), + 0.9313185954797953) + +class TestHeatmap(TestCase): + """Class for testing heatmap computation functions""" + + def test_export_trait_data_dtype(self): + """ + Test `export_trait_data` with different values for the `dtype` keyword + argument + """ + for dtype, expected in [ + ["val", (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["var", (None, None, None, None, None, None)], + ["N", (None, None, None, None, None, None)], + ["all", (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)]]: + with self.subTest(dtype=dtype): + self.assertEqual( + export_trait_data(trait_data, strainlist, dtype=dtype), + expected) + + def test_export_trait_data_dtype_all_flags(self): + """ + Test `export_trait_data` with different values for the `dtype` keyword + argument and the different flags set up + """ + for dtype, vflag, nflag, expected in [ + ["val", False, False, + (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["val", False, True, + (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["val", True, False, + (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["val", True, True, + (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["var", False, False, (None, None, None, None, None, None)], + ["var", False, True, (None, None, None, None, None, None)], + ["var", True, False, (None, None, None, None, None, None)], + ["var", True, True, (None, None, None, None, None, None)], + ["N", False, False, (None, None, None, None, None, None)], + ["N", False, True, (None, None, None, None, None, None)], + ["N", True, False, (None, None, None, None, None, None)], + ["N", True, True, (None, None, None, None, None, None)], + ["all", False, False, + (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["all", False, True, + (7.51879, None, 7.77141, None, 8.39265, None, 8.17443, None, + 8.30401, None, 7.80944, None)], + ["all", True, False, + (7.51879, None, 7.77141, None, 8.39265, None, 8.17443, None, + 8.30401, None, 7.80944, None)], + ["all", True, True, + (7.51879, None, None, 7.77141, None, None, 8.39265, None, None, + 8.17443, None, None, 8.30401, None, None, 7.80944, None, None)] + ]: + with self.subTest(dtype=dtype, vflag=vflag, nflag=nflag): + self.assertEqual( + export_trait_data( + trait_data, strainlist, dtype=dtype, var_exists=vflag, + n_exists=nflag), + expected) + + def test_cluster_traits(self): + """ + Test that the clustering is working as expected. + """ + traits_data_list = [ + (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944), + (6.1427, 6.50588, 7.73705, 6.68328, 7.49293, 7.27398), + (8.4211, 8.30581, 9.24076, 8.51173, 9.18455, 8.36077), + (10.0904, 10.6509, 9.36716, 9.91202, 8.57444, 10.5731), + (10.188, 9.76652, 9.54813, 9.05074, 9.52319, 9.10505), + (6.74676, 7.01029, 7.54169, 6.48574, 7.01427, 7.26815), + (6.39359, 6.85321, 5.78337, 7.11141, 6.22101, 6.16544), + (6.84118, 7.08432, 7.59844, 7.08229, 7.26774, 7.24991), + (9.45215, 10.6943, 8.64719, 10.1592, 7.75044, 8.78615), + (7.04737, 6.87185, 7.58586, 6.92456, 6.84243, 7.36913)] + self.assertEqual( + cluster_traits(traits_data_list), + ((0.0, 0.20337048635536847, 0.16381088984330505, 1.7388553629398245, + 1.5025235756329178, 0.6952839500255574, 1.271661230252733, + 0.2100487290977544, 1.4699690641062024, 0.7934461515867415), + (0.20337048635536847, 0.0, 0.2198321044997198, 1.5753041735592204, + 1.4815755944537086, 0.26087293140686374, 1.6939790104301427, + 0.06024619831474998, 1.7430082449189215, 0.4497104244247795), + (0.16381088984330505, 0.2198321044997198, 0.0, 1.9073926868549234, + 1.0396738891139845, 0.5278328671176757, 1.6275069061182947, + 0.2636503792482082, 1.739617877037615, 0.7127042590637039), + (1.7388553629398245, 1.5753041735592204, 1.9073926868549234, 0.0, + 0.9936846292920328, 1.1169999189889366, 0.6007483980555253, + 1.430209221053372, 0.25879514152086425, 0.9313185954797953), + (1.5025235756329178, 1.4815755944537086, 1.0396738891139845, + 0.9936846292920328, 0.0, 1.027827186339337, 1.1441743109173244, + 1.4122477962364253, 0.8968250491499363, 1.1683723389247052), + (0.6952839500255574, 0.26087293140686374, 0.5278328671176757, + 1.1169999189889366, 1.027827186339337, 0.0, 1.8420471110023269, + 0.19179284676938602, 1.4875072385631605, 0.23451785425383564), + (1.271661230252733, 1.6939790104301427, 1.6275069061182947, + 0.6007483980555253, 1.1441743109173244, 1.8420471110023269, 0.0, + 1.6540234785929928, 0.2140799896286565, 1.7413442197913358), + (0.2100487290977544, 0.06024619831474998, 0.2636503792482082, + 1.430209221053372, 1.4122477962364253, 0.19179284676938602, + 1.6540234785929928, 0.0, 1.5225640692832796, 0.33370067057028485), + (1.4699690641062024, 1.7430082449189215, 1.739617877037615, + 0.25879514152086425, 0.8968250491499363, 1.4875072385631605, + 0.2140799896286565, 1.5225640692832796, 0.0, 1.3256191648260216), + (0.7934461515867415, 0.4497104244247795, 0.7127042590637039, + 0.9313185954797953, 1.1683723389247052, 0.23451785425383564, + 1.7413442197913358, 0.33370067057028485, 1.3256191648260216, + 0.0))) + + def test_compute_heatmap_order(self): + """Test the orders.""" + self.assertEqual( + compute_traits_order(slinked), (0, 2, 1, 7, 5, 9, 3, 6, 8, 4)) + + def test_retrieve_strains_and_values(self): + """Test retrieval of strains and values.""" + for orders, slist, tdata, expected in [ + [ + [2], + ["s1", "s2", "s3", "s4"], + [[2, 9, 6, None, 4], + [7, 5, None, None, 4], + [9, None, 5, 4, 7], + [6, None, None, 4, None]], + [[2, ["s1", "s3", "s4"], [9, 5, 4]]] + ], + [ + [3], + ["s1", "s2", "s3", "s4", "s5"], + [[2, 9, 6, None, 4], + [7, 5, None, None, 4], + [9, None, 5, 4, 7], + [6, None, None, 4, None]], + [[3, ["s1", "s4"], [6, 4]]] + ]]: + with self.subTest(strainlist=slist, traitdata=tdata): + self.assertEqual( + retrieve_strains_and_values(orders, slist, tdata), expected) -- cgit v1.2.3 From b1eb0451578c53afabe4f2054ce08665dec4bb82 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 15 Sep 2021 11:41:36 +0300 Subject: Integrate get_lsr_from_chr function * gn3/heatmaps.py: copy over function * tests/unit/test_heatmaps.py: add tests Copy function over from proof of concept and add some tests to ensure it works as expected. --- gn3/heatmaps.py | 8 ++++++++ tests/unit/test_heatmaps.py | 14 ++++++++++++++ 2 files changed, 22 insertions(+) (limited to 'tests/unit/test_heatmaps.py') diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index 198fb45..991ddec 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -276,6 +276,14 @@ def get_nearest_marker(traits_list, genotype): marker_finder = nearest_marker_finder(genotype) return [marker_finder(trait) for trait in traits_list] +def get_lrs_from_chr(trait, chr_name): + chromosome = trait["chromosomes"].get(chr_name) + if chromosome: + return [ + locus["LRS"] for locus in + sorted(chromosome["loci"], key=lambda loc: loc["Locus"])] + return [None] + # # Grey + Blue + Red # def generate_heatmap(): # cols = 20 diff --git a/tests/unit/test_heatmaps.py b/tests/unit/test_heatmaps.py index 265d5a8..cfdde1e 100644 --- a/tests/unit/test_heatmaps.py +++ b/tests/unit/test_heatmaps.py @@ -2,6 +2,7 @@ from unittest import TestCase from gn3.heatmaps import ( cluster_traits, + get_lrs_from_chr, export_trait_data, compute_traits_order, retrieve_strains_and_values) @@ -185,3 +186,16 @@ class TestHeatmap(TestCase): with self.subTest(strainlist=slist, traitdata=tdata): self.assertEqual( retrieve_strains_and_values(orders, slist, tdata), expected) + + def test_get_lrs_from_chr(self): + for trait, chromosome, expected in [ + [{"chromosomes": {}}, 3, [None]], + [{"chromosomes": {3: {"loci": [ + {"Locus": "b", "LRS": 1.9}, + {"Locus": "a", "LRS": 13.2}, + {"Locus": "d", "LRS": 53.21}, + {"Locus": "c", "LRS": 2.22}]}}}, + 3, + [13.2, 1.9, 2.22, 53.21]]]: + with self.subTest(trait=trait, chromosome=chromosome): + self.assertEqual(get_lrs_from_chr(trait, chromosome), expected) -- cgit v1.2.3 From 11632a565a6f901eca852a5a40a6f9fd3170152a Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 15 Sep 2021 12:08:56 +0300 Subject: Process data into format usable by heatmaps * gn3/heatmaps.py: implement `process_traits_data_for_heatmap` function, that will process the data into a form usable by heatmaps. * tests/unit/test_heatmaps.py: check that the function processes the data into the correct form. --- gn3/heatmaps.py | 12 +++++ tests/unit/test_heatmaps.py | 107 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 118 insertions(+), 1 deletion(-) (limited to 'tests/unit/test_heatmaps.py') diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index 991ddec..0c00d6c 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -277,6 +277,9 @@ def get_nearest_marker(traits_list, genotype): return [marker_finder(trait) for trait in traits_list] def get_lrs_from_chr(trait, chr_name): + """ + Retrieve the LRS values for a specific chromosome in the given trait. + """ chromosome = trait["chromosomes"].get(chr_name) if chromosome: return [ @@ -284,6 +287,15 @@ def get_lrs_from_chr(trait, chr_name): sorted(chromosome["loci"], key=lambda loc: loc["Locus"])] return [None] +def process_traits_data_for_heatmap(data, trait_names, chromosome_names): + """ + Process the traits data in a format useful for generating heatmap diagrams. + """ + hdata = [ + [get_lrs_from_chr(data[trait], chr_name) for trait in trait_names] + for chr_name in chromosome_names] + return hdata + # # Grey + Blue + Red # def generate_heatmap(): # cols = 20 diff --git a/tests/unit/test_heatmaps.py b/tests/unit/test_heatmaps.py index cfdde1e..f3a81c5 100644 --- a/tests/unit/test_heatmaps.py +++ b/tests/unit/test_heatmaps.py @@ -5,7 +5,8 @@ from gn3.heatmaps import ( get_lrs_from_chr, export_trait_data, compute_traits_order, - retrieve_strains_and_values) + retrieve_strains_and_values, + process_traits_data_for_heatmap) strainlist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] trait_data = { @@ -199,3 +200,107 @@ class TestHeatmap(TestCase): [13.2, 1.9, 2.22, 53.21]]]: with self.subTest(trait=trait, chromosome=chromosome): self.assertEqual(get_lrs_from_chr(trait, chromosome), expected) + + def test_process_traits_data_for_heatmap(self): + self.assertEqual( + process_traits_data_for_heatmap( + {"1": { + "ID": "T1", + "chromosomes": { + 1: {"Chr": 1, + "loci": [ + { + "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs6269442", "cM": 1.500, "Mb": 3.492, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs32285189", "cM": 1.630, "Mb": 3.511, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs258367496", "cM": 1.630, "Mb": 3.660, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs32430919", "cM": 1.750, "Mb": 3.777, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs36251697", "cM": 1.880, "Mb": 3.812, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs30658298", "cM": 2.010, "Mb": 4.431, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }]}, + 2: {"Chr": 2, + "loci": [ + { + "Locus": "rs51852623", "cM": 2.010, "Mb": 4.447, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs31879829", "cM": 2.140, "Mb": 4.519, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs36742481", "cM": 2.140, "Mb": 4.776, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }]}}}, + "2": { + "ID": "T1", + "chromosomes": { + 1: {"Chr": 1, + "loci": [ + { + "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs6269442", "cM": 1.500, "Mb": 3.492, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs32285189", "cM": 1.630, "Mb": 3.511, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs258367496", "cM": 1.630, "Mb": 3.660, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs32430919", "cM": 1.750, "Mb": 3.777, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs36251697", "cM": 1.880, "Mb": 3.812, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs30658298", "cM": 2.010, "Mb": 4.431, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }]}, + 2: {"Chr": 2, + "loci": [ + { + "Locus": "rs51852623", "cM": 2.010, "Mb": 4.447, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs31879829", "cM": 2.140, "Mb": 4.519, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs36742481", "cM": 2.140, "Mb": 4.776, + "LRS": 0.579, "Additive": -0.074, "pValue": 1.000 + }]}}}}, + ["2", "1"], + [1, 2]), + [[[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5], + [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]], + [[0.5, 0.579, 0.5], + [0.5, 0.5, 0.5]]]) -- cgit v1.2.3 From 1e2357049adc72808fbf8eaac3da9411d3c78c66 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Fri, 17 Sep 2021 11:20:16 +0300 Subject: Fix a number of linting issues Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi --- gn3/computations/qtlreaper.py | 7 ++-- gn3/db/genotypes.py | 2 +- gn3/heatmaps.py | 54 ++++++++++++------------------- tests/unit/computations/test_qtlreaper.py | 3 +- tests/unit/test_heatmaps.py | 6 ++-- 5 files changed, 32 insertions(+), 40 deletions(-) (limited to 'tests/unit/test_heatmaps.py') diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index 5180853..377db9b 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -110,9 +110,10 @@ def organise_reaper_main_results(parsed_results): unique_chromosomes = {item["Chr"] for item in id_items} return { "ID": identifier, - "chromosomes": {_chr["Chr"]: _chr for _chr in [ - __organise_by_chromosome(chromo, id_items) - for chromo in sorted( + "chromosomes": { + _chr["Chr"]: _chr for _chr in [ + __organise_by_chromosome(chromo, id_items) + for chromo in sorted( unique_chromosomes, key=chromosome_sorter_key_fn)]}} unique_ids = {res["ID"] for res in parsed_results} diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py index b03d55c..9d052d9 100644 --- a/gn3/db/genotypes.py +++ b/gn3/db/genotypes.py @@ -174,7 +174,7 @@ def parse_genotype_file(filename: str, parlist: tuple = tuple()): geno_obj = dict(labels + header) markers = tuple( [parse_genotype_marker(line, geno_obj, parlist) - for line in data_lines[1:]]) + for line in data_lines[1:]]) chromosomes = tuple( dict(chromosome) for chromosome in build_genotype_chromosomes(geno_obj, markers)) diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index 2859dde..c4fc67d 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -3,13 +3,13 @@ This module will contain functions to be used in computation of the data used to generate various kinds of heatmaps. """ +from typing import Any, Dict, Sequence import numpy as np from functools import reduce from gn3.settings import TMPDIR import plotly.graph_objects as go import plotly.figure_factory as ff from gn3.random import random_string -from typing import Any, Dict, Sequence from gn3.computations.slink import slink from plotly.subplots import make_subplots from gn3.computations.correlations2 import compute_correlation @@ -165,7 +165,7 @@ def build_heatmap(traits_names, conn: Any): for fullname in traits_names] traits_data_list = [retrieve_trait_data(t, conn) for t in traits] genotype_filename = build_genotype_file(traits[0]["riset"]) - genotype = parse_genotype_file(genotype_filename) + # genotype = parse_genotype_file(genotype_filename) strains = load_genotype_samples(genotype_filename) exported_traits_data_list = [ export_trait_data(td, strains) for td in traits_data_list] @@ -183,22 +183,21 @@ def build_heatmap(traits_names, conn: Any): [t[2] for t in strains_and_values], traits_filename) - main_output, permutations_output = run_reaper( + main_output, _permutations_output = run_reaper( genotype_filename, traits_filename, separate_nperm_output=True) qtlresults = parse_reaper_main_results(main_output) - permudata = parse_reaper_permutation_results(permutations_output) + # permudata = parse_reaper_permutation_results(permutations_output) organised = organise_reaper_main_results(qtlresults) traits_ids = [# sort numerically, but retain the ids as strings str(i) for i in sorted({int(row["ID"]) for row in qtlresults})] chromosome_names = sorted( - {row["Chr"] for row in qtlresults}, key = chromosome_sorter_key_fn) - loci_names = sorted({row["Locus"] for row in qtlresults}) - ordered_traits_names = { - res_id: trait for res_id, trait in + {row["Chr"] for row in qtlresults}, key=chromosome_sorter_key_fn) + # loci_names = sorted({row["Locus"] for row in qtlresults}) + ordered_traits_names = dict( zip(traits_ids, - [traits[idx]["trait_fullname"] for idx in traits_order])} + [traits[idx]["trait_fullname"] for idx in traits_order])) return generate_clustered_heatmap( process_traits_data_for_heatmap( @@ -207,22 +206,11 @@ def build_heatmap(traits_names, conn: Any): "single_heatmap_{}".format(random_string(10)), y_axis=tuple( ordered_traits_names[traits_ids[order]] - for order in traits_order), + for order in traits_order), y_label="Traits", - x_axis=[chromo for chromo in chromosome_names], + x_axis=chromosome_names, x_label="Chromosomes") - return { - "slink_data": slink_data, - "ordering_data": ordering_data, - "strainlist": strainlist, - "genotype_filename": genotype_filename, - "traits_list": traits_list, - "traits_data_list": traits_data_list, - "exported_traits_data_list": exported_traits_data_list, - "traits_filename": traits_filename - } - def compute_traits_order(slink_data, neworder: tuple = tuple()): """ Compute the order of the traits for clustering from `slink_data`. @@ -314,7 +302,7 @@ def get_nearest_marker(traits_list, genotype): https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L419-L438 """ if not genotype["Mbmap"]: - return [None] * len(trait_list) + return [None] * len(traits_list) marker_finder = nearest_marker_finder(genotype) return [marker_finder(trait) for trait in traits_list] @@ -340,10 +328,10 @@ def process_traits_data_for_heatmap(data, trait_names, chromosome_names): return hdata def generate_clustered_heatmap( - data, clustering_data, image_filename_prefix, x_axis = None, - x_label: str = "", y_axis = None, y_label: str = "", + data, clustering_data, image_filename_prefix, x_axis=None, + x_label: str = "", y_axis=None, y_label: str = "", output_dir: str = TMPDIR, - colorscale = ( + colorscale=( (0.0, '#5D5D5D'), (0.4999999999999999, '#ABABAB'), (0.5, '#F5DE11'), (1.0, '#FF0D00'))): """ @@ -357,15 +345,15 @@ def generate_clustered_heatmap( shared_yaxes="rows", horizontal_spacing=0.001, subplot_titles=["distance"] + x_axis, - figure = ff.create_dendrogram( + figure=ff.create_dendrogram( np.array(clustering_data), orientation="right", labels=y_axis)) hms = [go.Heatmap( name=chromo, - y = y_axis, - z = data_array, + y=y_axis, + z=data_array, showscale=False) for chromo, data_array in zip(x_axis, data)] - for i, hm in enumerate(hms): - fig.add_trace(hm, row=1, col=(i + 2)) + for i, heatmap in enumerate(hms): + fig.add_trace(heatmap, row=1, col=(i + 2)) fig.update_layout( { @@ -380,8 +368,8 @@ def generate_clustered_heatmap( x_axes_layouts = { "xaxis{}".format(i+1 if i > 0 else ""): { "mirror": False, - "showticklabels": True if i==0 else False, - "ticks": "outside" if i==0 else "" + "showticklabels": True if i == 0 else False, + "ticks": "outside" if i == 0 else "" } for i in range(num_cols)} diff --git a/tests/unit/computations/test_qtlreaper.py b/tests/unit/computations/test_qtlreaper.py index 1d67827..d420470 100644 --- a/tests/unit/computations/test_qtlreaper.py +++ b/tests/unit/computations/test_qtlreaper.py @@ -77,6 +77,7 @@ class TestQTLReaper(TestCase): 5.82775, 5.89659, 5.92117, 5.93396, 5.93396, 5.94957]) def test_organise_reaper_main_results(self): + """Check that results are organised correctly.""" self.assertEqual( organise_reaper_main_results([ { @@ -135,7 +136,7 @@ class TestQTLReaper(TestCase): 1: {"Chr": 1, "loci": [ { - "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, + "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { diff --git a/tests/unit/test_heatmaps.py b/tests/unit/test_heatmaps.py index f3a81c5..c0a496b 100644 --- a/tests/unit/test_heatmaps.py +++ b/tests/unit/test_heatmaps.py @@ -189,6 +189,7 @@ class TestHeatmap(TestCase): retrieve_strains_and_values(orders, slist, tdata), expected) def test_get_lrs_from_chr(self): + """Check that function gets correct LRS values""" for trait, chromosome, expected in [ [{"chromosomes": {}}, 3, [None]], [{"chromosomes": {3: {"loci": [ @@ -202,6 +203,7 @@ class TestHeatmap(TestCase): self.assertEqual(get_lrs_from_chr(trait, chromosome), expected) def test_process_traits_data_for_heatmap(self): + """Check for correct processing of data for heatmap generation.""" self.assertEqual( process_traits_data_for_heatmap( {"1": { @@ -210,7 +212,7 @@ class TestHeatmap(TestCase): 1: {"Chr": 1, "loci": [ { - "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, + "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { @@ -257,7 +259,7 @@ class TestHeatmap(TestCase): 1: {"Chr": 1, "loci": [ { - "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, + "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { -- cgit v1.2.3 From 95c5c0e73bffbf0287a17309e703063ee54d25ba Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Thu, 23 Sep 2021 03:45:19 +0300 Subject: Refactor: Move common sample data to separate file Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Move common sample test data into a separate file where it can be imported from, to prevent pylint error R0801 which proved tricky to silence in any other way. --- tests/unit/computations/test_qtlreaper.py | 68 ++++-------------- tests/unit/db/test_traits.py | 15 ++-- tests/unit/sample_test_data.py | 111 ++++++++++++++++++++++++++++++ tests/unit/test_heatmaps.py | 96 +------------------------- 4 files changed, 134 insertions(+), 156 deletions(-) create mode 100644 tests/unit/sample_test_data.py (limited to 'tests/unit/test_heatmaps.py') diff --git a/tests/unit/computations/test_qtlreaper.py b/tests/unit/computations/test_qtlreaper.py index d420470..742d106 100644 --- a/tests/unit/computations/test_qtlreaper.py +++ b/tests/unit/computations/test_qtlreaper.py @@ -4,6 +4,7 @@ from gn3.computations.qtlreaper import ( parse_reaper_main_results, organise_reaper_main_results, parse_reaper_permutation_results) +from tests.unit.sample_test_data import organised_trait_1 class TestQTLReaper(TestCase): """Class for testing qtlreaper interface functions.""" @@ -81,99 +82,54 @@ class TestQTLReaper(TestCase): self.assertEqual( organise_reaper_main_results([ { - "ID": "T1", "Locus": "rs31443144", "Chr": 1, "cM": 1.500, + "ID": "1", "Locus": "rs31443144", "Chr": 1, "cM": 1.500, "Mb": 3.010, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs6269442", "Chr": 1, "cM": 1.500, + "ID": "1", "Locus": "rs6269442", "Chr": 1, "cM": 1.500, "Mb": 3.492, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs32285189", "Chr": 1, "cM": 1.630, + "ID": "1", "Locus": "rs32285189", "Chr": 1, "cM": 1.630, "Mb": 3.511, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs258367496", "Chr": 1, "cM": 1.630, + "ID": "1", "Locus": "rs258367496", "Chr": 1, "cM": 1.630, "Mb": 3.660, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs32430919", "Chr": 1, "cM": 1.750, + "ID": "1", "Locus": "rs32430919", "Chr": 1, "cM": 1.750, "Mb": 3.777, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs36251697", "Chr": 1, "cM": 1.880, + "ID": "1", "Locus": "rs36251697", "Chr": 1, "cM": 1.880, "Mb": 3.812, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs30658298", "Chr": 1, "cM": 2.010, + "ID": "1", "Locus": "rs30658298", "Chr": 1, "cM": 2.010, "Mb": 4.431, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs51852623", "Chr": 2, "cM": 2.010, + "ID": "1", "Locus": "rs51852623", "Chr": 2, "cM": 2.010, "Mb": 4.447, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs31879829", "Chr": 2, "cM": 2.140, + "ID": "1", "Locus": "rs31879829", "Chr": 2, "cM": 2.140, "Mb": 4.519, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 }, { - "ID": "T1", "Locus": "rs36742481", "Chr": 2, "cM": 2.140, + "ID": "1", "Locus": "rs36742481", "Chr": 2, "cM": 2.140, "Mb": 4.776, "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 } ]), - {"T1": {"ID": "T1", - "chromosomes": { - 1: {"Chr": 1, - "loci": [ - { - "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs6269442", "cM": 1.500, "Mb": 3.492, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs32285189", "cM": 1.630, "Mb": 3.511, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs258367496", "cM": 1.630, "Mb": 3.660, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs32430919", "cM": 1.750, "Mb": 3.777, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs36251697", "cM": 1.880, "Mb": 3.812, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs30658298", "cM": 2.010, "Mb": 4.431, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }]}, - 2: {"Chr": 2, - "loci": [ - { - "Locus": "rs51852623", "cM": 2.010, "Mb": 4.447, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs31879829", "cM": 2.140, "Mb": 4.519, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs36742481", "cM": 2.140, "Mb": 4.776, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }]}}}}) + organised_trait_1) diff --git a/tests/unit/db/test_traits.py b/tests/unit/db/test_traits.py index baa2af3..8af8e82 100644 --- a/tests/unit/db/test_traits.py +++ b/tests/unit/db/test_traits.py @@ -170,12 +170,15 @@ class TestTraitsDBFunctions(TestCase): db_mock = mock.MagicMock() STRAIN_ID_SQL: str = "UPDATE Strain SET Name = %s WHERE Id = %s" - PUBLISH_DATA_SQL: str = ("UPDATE PublishData SET value = %s " - "WHERE StrainId = %s AND Id = %s") - PUBLISH_SE_SQL: str = ("UPDATE PublishSE SET error = %s " - "WHERE StrainId = %s AND DataId = %s") - N_STRAIN_SQL: str = ("UPDATE NStrain SET count = %s " - "WHERE StrainId = %s AND DataId = %s") + PUBLISH_DATA_SQL: str = ( + "UPDATE PublishData SET value = %s " + "WHERE StrainId = %s AND Id = %s") + PUBLISH_SE_SQL: str = ( + "UPDATE PublishSE SET error = %s " + "WHERE StrainId = %s AND DataId = %s") + N_STRAIN_SQL: str = ( + "UPDATE NStrain SET count = %s " + "WHERE StrainId = %s AND DataId = %s") with db_mock.cursor() as cursor: type(cursor).rowcount = 1 diff --git a/tests/unit/sample_test_data.py b/tests/unit/sample_test_data.py new file mode 100644 index 0000000..407d074 --- /dev/null +++ b/tests/unit/sample_test_data.py @@ -0,0 +1,111 @@ +""" +This module holds a collection of sample data variables, used in more than one + test. + +This is mostly to avoid the `duplicate-code` pylint error that gets raised if +the same data is defined in more than one file. It has been found that adding +the `# pylint: disable=R0801` or `# pylint: disable=duplicate-code` to the top +of the file seems to not work as expected. + +Adding these same declarations to .pylintrc is not an option, since that, +seemingly, would deactivate the warnings for all code in the project: We do not +want that. +""" + +organised_trait_1 = { + "1": { + "ID": "1", + "chromosomes": { + 1: {"Chr": 1, + "loci": [ + { + "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs6269442", "cM": 1.500, "Mb": 3.492, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs32285189", "cM": 1.630, "Mb": 3.511, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs258367496", "cM": 1.630, "Mb": 3.660, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs32430919", "cM": 1.750, "Mb": 3.777, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs36251697", "cM": 1.880, "Mb": 3.812, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs30658298", "cM": 2.010, "Mb": 4.431, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }]}, + 2: {"Chr": 2, + "loci": [ + { + "Locus": "rs51852623", "cM": 2.010, "Mb": 4.447, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs31879829", "cM": 2.140, "Mb": 4.519, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs36742481", "cM": 2.140, "Mb": 4.776, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }]}}}} + +organised_trait_2 = { + "2": { + "ID": "2", + "chromosomes": { + 1: {"Chr": 1, + "loci": [ + { + "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs6269442", "cM": 1.500, "Mb": 3.492, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs32285189", "cM": 1.630, "Mb": 3.511, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs258367496", "cM": 1.630, "Mb": 3.660, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs32430919", "cM": 1.750, "Mb": 3.777, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs36251697", "cM": 1.880, "Mb": 3.812, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs30658298", "cM": 2.010, "Mb": 4.431, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }]}, + 2: {"Chr": 2, + "loci": [ + { + "Locus": "rs51852623", "cM": 2.010, "Mb": 4.447, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs31879829", "cM": 2.140, "Mb": 4.519, + "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 + }, + { + "Locus": "rs36742481", "cM": 2.140, "Mb": 4.776, + "LRS": 0.579, "Additive": -0.074, "pValue": 1.000 + }]}}}} diff --git a/tests/unit/test_heatmaps.py b/tests/unit/test_heatmaps.py index c0a496b..fd91cf9 100644 --- a/tests/unit/test_heatmaps.py +++ b/tests/unit/test_heatmaps.py @@ -7,6 +7,7 @@ from gn3.heatmaps import ( compute_traits_order, retrieve_strains_and_values, process_traits_data_for_heatmap) +from tests.unit.sample_test_data import organised_trait_1, organised_trait_2 strainlist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] trait_data = { @@ -206,100 +207,7 @@ class TestHeatmap(TestCase): """Check for correct processing of data for heatmap generation.""" self.assertEqual( process_traits_data_for_heatmap( - {"1": { - "ID": "T1", - "chromosomes": { - 1: {"Chr": 1, - "loci": [ - { - "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs6269442", "cM": 1.500, "Mb": 3.492, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs32285189", "cM": 1.630, "Mb": 3.511, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs258367496", "cM": 1.630, "Mb": 3.660, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs32430919", "cM": 1.750, "Mb": 3.777, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs36251697", "cM": 1.880, "Mb": 3.812, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs30658298", "cM": 2.010, "Mb": 4.431, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }]}, - 2: {"Chr": 2, - "loci": [ - { - "Locus": "rs51852623", "cM": 2.010, "Mb": 4.447, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs31879829", "cM": 2.140, "Mb": 4.519, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs36742481", "cM": 2.140, "Mb": 4.776, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }]}}}, - "2": { - "ID": "T1", - "chromosomes": { - 1: {"Chr": 1, - "loci": [ - { - "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs6269442", "cM": 1.500, "Mb": 3.492, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs32285189", "cM": 1.630, "Mb": 3.511, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs258367496", "cM": 1.630, "Mb": 3.660, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs32430919", "cM": 1.750, "Mb": 3.777, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs36251697", "cM": 1.880, "Mb": 3.812, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs30658298", "cM": 2.010, "Mb": 4.431, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }]}, - 2: {"Chr": 2, - "loci": [ - { - "Locus": "rs51852623", "cM": 2.010, "Mb": 4.447, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs31879829", "cM": 2.140, "Mb": 4.519, - "LRS": 0.500, "Additive": -0.074, "pValue": 1.000 - }, - { - "Locus": "rs36742481", "cM": 2.140, "Mb": 4.776, - "LRS": 0.579, "Additive": -0.074, "pValue": 1.000 - }]}}}}, + {**organised_trait_1, **organised_trait_2}, ["2", "1"], [1, 2]), [[[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5], -- cgit v1.2.3 From 19783a18c2bc7941fc5980e593f19fb1d18c3623 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 27 Sep 2021 04:48:53 +0300 Subject: Update terminology: `strain` to `sample` Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Update the terminology used: use `sample` in place of `strain` according to Zachary's direction at https://github.com/genenetwork/genenetwork3/pull/37#issuecomment-926043306 --- gn3/computations/parsers.py | 10 ++--- gn3/computations/qtlreaper.py | 8 ++-- gn3/db/genotypes.py | 8 ++-- gn3/db/traits.py | 44 ++++++++++----------- gn3/heatmaps.py | 62 ++++++++++++++--------------- tests/unit/computations/test_parsers.py | 4 +- tests/unit/test_heatmaps.py | 70 ++++++++++++++++----------------- 7 files changed, 103 insertions(+), 103 deletions(-) (limited to 'tests/unit/test_heatmaps.py') diff --git a/gn3/computations/parsers.py b/gn3/computations/parsers.py index 94387ff..1af35d6 100644 --- a/gn3/computations/parsers.py +++ b/gn3/computations/parsers.py @@ -14,7 +14,7 @@ def parse_genofile(file_path: str) -> Tuple[List[str], 'h': 0, 'u': None, } - genotypes, strains = [], [] + genotypes, samples = [], [] with open(file_path, "r") as _genofile: for line in _genofile: line = line.strip() @@ -22,8 +22,8 @@ def parse_genofile(file_path: str) -> Tuple[List[str], continue cells = line.split() if line.startswith("Chr"): - strains = cells[4:] - strains = [strain.lower() for strain in strains] + samples = cells[4:] + samples = [sample.lower() for sample in samples] continue values = [__map.get(value.lower(), None) for value in cells[4:]] genotype = { @@ -32,7 +32,7 @@ def parse_genofile(file_path: str) -> Tuple[List[str], "cm": cells[2], "mb": cells[3], "values": values, - "dicvalues": dict(zip(strains, values)), + "dicvalues": dict(zip(samples, values)), } genotypes.append(genotype) - return strains, genotypes + return samples, genotypes diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index 8b2893e..166d2dd 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -9,17 +9,17 @@ from typing import Union from gn3.random import random_string from gn3.settings import TMPDIR, REAPER_COMMAND -def generate_traits_file(strains, trait_values, traits_filename): +def generate_traits_file(samples, trait_values, traits_filename): """ Generate a traits file for use with `qtlreaper`. PARAMETERS: - strains: A list of strains to use as the headers for the various columns. - trait_values: A list of lists of values for each trait and strain. + samples: A list of samples to use as the headers for the various columns. + trait_values: A list of lists of values for each trait and sample. traits_filename: The tab-separated value to put the values in for computation of QTLs. """ - header = "Trait\t{}\n".format("\t".join(strains)) + header = "Trait\t{}\n".format("\t".join(samples)) data = ( [header] + ["{}\t{}\n".format(i+1, "\t".join([str(i) for i in t])) diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py index 9987320..8f18cac 100644 --- a/gn3/db/genotypes.py +++ b/gn3/db/genotypes.py @@ -14,16 +14,16 @@ def build_genotype_file( def load_genotype_samples(genotype_filename: str, file_type: str = "geno"): """ - Load sample of strains from genotype files. + Load sample of samples from genotype files. DESCRIPTION: - Traits can contain a varied number of strains, some of which do not exist in + Traits can contain a varied number of samples, some of which do not exist in certain genotypes. In order to compute QTLs, GEMMAs, etc, we need to ensure - to pick only those strains that exist in the genotype under consideration + to pick only those samples that exist in the genotype under consideration for the traits used in the computation. This function loads a list of samples from the genotype files for use in - filtering out unusable strains. + filtering out unusable samples. PARAMETERS: diff --git a/gn3/db/traits.py b/gn3/db/traits.py index 4fc47c3..c9d05d7 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -445,7 +445,7 @@ def retrieve_temp_trait_data(trait_info: dict, conn: Any): query, {"trait_name": trait_info["trait_name"]}) return [dict(zip( - ["strain_name", "value", "se_error", "nstrain", "id"], row)) + ["sample_name", "value", "se_error", "nstrain", "id"], row)) for row in cursor.fetchall()] return [] @@ -484,7 +484,7 @@ def retrieve_geno_trait_data(trait_info: Dict, conn: Any): "species_id": retrieve_species_id( trait_info["db"]["riset"], conn)}) return [dict(zip( - ["strain_name", "value", "se_error", "id"], row)) + ["sample_name", "value", "se_error", "id"], row)) for row in cursor.fetchall()] return [] @@ -515,7 +515,7 @@ def retrieve_publish_trait_data(trait_info: Dict, conn: Any): {"trait_name": trait_info["trait_name"], "dataset_id": trait_info["db"]["dataset_id"]}) return [dict(zip( - ["strain_name", "value", "se_error", "nstrain", "id"], row)) + ["sample_name", "value", "se_error", "nstrain", "id"], row)) for row in cursor.fetchall()] return [] @@ -548,7 +548,7 @@ def retrieve_cellid_trait_data(trait_info: Dict, conn: Any): "trait_name": trait_info["trait_name"], "dataset_id": trait_info["db"]["dataset_id"]}) return [dict(zip( - ["strain_name", "value", "se_error", "id"], row)) + ["sample_name", "value", "se_error", "id"], row)) for row in cursor.fetchall()] return [] @@ -577,29 +577,29 @@ def retrieve_probeset_trait_data(trait_info: Dict, conn: Any): {"trait_name": trait_info["trait_name"], "dataset_name": trait_info["db"]["dataset_name"]}) return [dict(zip( - ["strain_name", "value", "se_error", "id"], row)) + ["sample_name", "value", "se_error", "id"], row)) for row in cursor.fetchall()] return [] -def with_strainlist_data_setup(strainlist: Sequence[str]): +def with_samplelist_data_setup(samplelist: Sequence[str]): """ - Build function that computes the trait data from provided list of strains. + Build function that computes the trait data from provided list of samples. PARAMETERS - strainlist: (list) - A list of strain names + samplelist: (list) + A list of sample names RETURNS: Returns a function that given some data from the database, computes the - strain's value, variance and ndata values, only if the strain is present - in the provided `strainlist` variable. + sample's value, variance and ndata values, only if the sample is present + in the provided `samplelist` variable. """ def setup_fn(tdata): - if tdata["strain_name"] in strainlist: + if tdata["sample_name"] in samplelist: val = tdata["value"] if val is not None: return { - "strain_name": tdata["strain_name"], + "sample_name": tdata["sample_name"], "value": val, "variance": tdata["se_error"], "ndata": tdata.get("nstrain", None) @@ -607,19 +607,19 @@ def with_strainlist_data_setup(strainlist: Sequence[str]): return None return setup_fn -def without_strainlist_data_setup(): +def without_samplelist_data_setup(): """ Build function that computes the trait data. RETURNS: Returns a function that given some data from the database, computes the - strain's value, variance and ndata values. + sample's value, variance and ndata values. """ def setup_fn(tdata): val = tdata["value"] if val is not None: return { - "strain_name": tdata["strain_name"], + "sample_name": tdata["sample_name"], "value": val, "variance": tdata["se_error"], "ndata": tdata.get("nstrain", None) @@ -627,7 +627,7 @@ def without_strainlist_data_setup(): return None return setup_fn -def retrieve_trait_data(trait: dict, conn: Any, strainlist: Sequence[str] = tuple()): +def retrieve_trait_data(trait: dict, conn: Any, samplelist: Sequence[str] = tuple()): """ Retrieve trait data @@ -650,23 +650,23 @@ def retrieve_trait_data(trait: dict, conn: Any, strainlist: Sequence[str] = tupl if results: # do something with mysqlid mysqlid = results[0]["id"] - if strainlist: + if samplelist: data = [ item for item in - map(with_strainlist_data_setup(strainlist), results) + map(with_samplelist_data_setup(samplelist), results) if item is not None] else: data = [ item for item in - map(without_strainlist_data_setup(), results) + map(without_samplelist_data_setup(), results) if item is not None] return { "mysqlid": mysqlid, "data": dict(map( lambda x: ( - x["strain_name"], - {k:v for k, v in x.items() if x != "strain_name"}), + x["sample_name"], + {k:v for k, v in x.items() if x != "sample_name"}), data))} return {} diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index 45d0c22..b6fc6d3 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -27,10 +27,10 @@ from gn3.computations.qtlreaper import ( organise_reaper_main_results) def export_trait_data( - trait_data: dict, strainlist: Sequence[str], dtype: str = "val", + trait_data: dict, samplelist: Sequence[str], dtype: str = "val", var_exists: bool = False, n_exists: bool = False): """ - Export data according to `strainlist`. Mostly used in calculating + Export data according to `samplelist`. Mostly used in calculating correlations. DESCRIPTION: @@ -40,8 +40,8 @@ def export_trait_data( PARAMETERS trait: (dict) The dictionary of key-value pairs representing a trait - strainlist: (list) - A list of strain names + samplelist: (list) + A list of sample names dtype: (str) ... verify what this is ... var_exists: (bool) @@ -49,18 +49,18 @@ def export_trait_data( n_exists: (bool) A flag indicating existence of ndata """ - def __export_all_types(tdata, strain): + def __export_all_types(tdata, sample): sample_data = [] - if tdata[strain]["value"]: - sample_data.append(tdata[strain]["value"]) + if tdata[sample]["value"]: + sample_data.append(tdata[sample]["value"]) if var_exists: - if tdata[strain]["variance"]: - sample_data.append(tdata[strain]["variance"]) + if tdata[sample]["variance"]: + sample_data.append(tdata[sample]["variance"]) else: sample_data.append(None) if n_exists: - if tdata[strain]["ndata"]: - sample_data.append(tdata[strain]["ndata"]) + if tdata[sample]["ndata"]: + sample_data.append(tdata[sample]["ndata"]) else: sample_data.append(None) else: @@ -73,17 +73,17 @@ def export_trait_data( return tuple(sample_data) - def __exporter(accumulator, strain): + def __exporter(accumulator, sample): # pylint: disable=[R0911] - if strain in trait_data["data"]: + if sample in trait_data["data"]: if dtype == "val": - return accumulator + (trait_data["data"][strain]["value"], ) + return accumulator + (trait_data["data"][sample]["value"], ) if dtype == "var": - return accumulator + (trait_data["data"][strain]["variance"], ) + return accumulator + (trait_data["data"][sample]["variance"], ) if dtype == "N": - return accumulator + (trait_data["data"][strain]["ndata"], ) + return accumulator + (trait_data["data"][sample]["ndata"], ) if dtype == "all": - return accumulator + __export_all_types(trait_data["data"], strain) + return accumulator + __export_all_types(trait_data["data"], sample) raise KeyError("Type `%s` is incorrect" % dtype) if var_exists and n_exists: return accumulator + (None, None, None) @@ -91,7 +91,7 @@ def export_trait_data( return accumulator + (None, None) return accumulator + (None,) - return reduce(__exporter, strainlist, tuple()) + return reduce(__exporter, samplelist, tuple()) def trait_display_name(trait: Dict): """ @@ -165,19 +165,19 @@ def build_heatmap(traits_names, conn: Any): for fullname in traits_names] traits_data_list = [retrieve_trait_data(t, conn) for t in traits] genotype_filename = build_genotype_file(traits[0]["riset"]) - strains = load_genotype_samples(genotype_filename) + samples = load_genotype_samples(genotype_filename) exported_traits_data_list = [ - export_trait_data(td, strains) for td in traits_data_list] + export_trait_data(td, samples) for td in traits_data_list] clustered = cluster_traits(exported_traits_data_list) slinked = slink(clustered) traits_order = compute_traits_order(slinked) - strains_and_values = retrieve_strains_and_values( - traits_order, strains, exported_traits_data_list) + samples_and_values = retrieve_samples_and_values( + traits_order, samples, exported_traits_data_list) traits_filename = "{}/traits_test_file_{}.txt".format( TMPDIR, random_string(10)) generate_traits_file( - strains_and_values[0][1], - [t[2] for t in strains_and_values], + samples_and_values[0][1], + [t[2] for t in samples_and_values], traits_filename) main_output, _permutations_output = run_reaper( @@ -229,9 +229,9 @@ def compute_traits_order(slink_data, neworder: tuple = tuple()): return __order_maker(neworder, slink_data) -def retrieve_strains_and_values(orders, strainlist, traits_data_list): +def retrieve_samples_and_values(orders, samplelist, traits_data_list): """ - Get the strains and their corresponding values from `strainlist` and + Get the samples and their corresponding values from `samplelist` and `traits_data_list`. This migrates the code in @@ -240,17 +240,17 @@ def retrieve_strains_and_values(orders, strainlist, traits_data_list): # This feels nasty! There's a lot of mutation of values here, that might # indicate something untoward in the design of this function and its # dependents ==> Review - strains = [] + samples = [] values = [] rets = [] for order in orders: temp_val = traits_data_list[order] - for i, strain in enumerate(strainlist): + for i, sample in enumerate(samplelist): if temp_val[i] is not None: - strains.append(strain) + samples.append(sample) values.append(temp_val[i]) - rets.append([order, strains[:], values[:]]) - strains = [] + rets.append([order, samples[:], values[:]]) + samples = [] values = [] return rets diff --git a/tests/unit/computations/test_parsers.py b/tests/unit/computations/test_parsers.py index 19c3067..b51b0bf 100644 --- a/tests/unit/computations/test_parsers.py +++ b/tests/unit/computations/test_parsers.py @@ -15,7 +15,7 @@ class TestParsers(unittest.TestCase): def test_parse_genofile_with_existing_file(self): """Test that a genotype file is parsed correctly""" - strains = ["bxd1", "bxd2"] + samples = ["bxd1", "bxd2"] genotypes = [ {"chr": "1", "locus": "rs31443144", "cm": "1.50", "mb": "3.010274", @@ -51,4 +51,4 @@ class TestParsers(unittest.TestCase): "../test_data/genotype.txt" )) self.assertEqual(parse_genofile( - test_genotype_file), (strains, genotypes)) + test_genotype_file), (samples, genotypes)) diff --git a/tests/unit/test_heatmaps.py b/tests/unit/test_heatmaps.py index fd91cf9..b54e2f3 100644 --- a/tests/unit/test_heatmaps.py +++ b/tests/unit/test_heatmaps.py @@ -5,41 +5,41 @@ from gn3.heatmaps import ( get_lrs_from_chr, export_trait_data, compute_traits_order, - retrieve_strains_and_values, + retrieve_samples_and_values, process_traits_data_for_heatmap) from tests.unit.sample_test_data import organised_trait_1, organised_trait_2 -strainlist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] +samplelist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] trait_data = { "mysqlid": 36688172, "data": { - "B6cC3-1": {"strain_name": "B6cC3-1", "value": 7.51879, "variance": None, "ndata": None}, - "BXD1": {"strain_name": "BXD1", "value": 7.77141, "variance": None, "ndata": None}, - "BXD12": {"strain_name": "BXD12", "value": 8.39265, "variance": None, "ndata": None}, - "BXD16": {"strain_name": "BXD16", "value": 8.17443, "variance": None, "ndata": None}, - "BXD19": {"strain_name": "BXD19", "value": 8.30401, "variance": None, "ndata": None}, - "BXD2": {"strain_name": "BXD2", "value": 7.80944, "variance": None, "ndata": None}, - "BXD21": {"strain_name": "BXD21", "value": 8.93809, "variance": None, "ndata": None}, - "BXD24": {"strain_name": "BXD24", "value": 7.99415, "variance": None, "ndata": None}, - "BXD27": {"strain_name": "BXD27", "value": 8.12177, "variance": None, "ndata": None}, - "BXD28": {"strain_name": "BXD28", "value": 7.67688, "variance": None, "ndata": None}, - "BXD32": {"strain_name": "BXD32", "value": 7.79062, "variance": None, "ndata": None}, - "BXD39": {"strain_name": "BXD39", "value": 8.27641, "variance": None, "ndata": None}, - "BXD40": {"strain_name": "BXD40", "value": 8.18012, "variance": None, "ndata": None}, - "BXD42": {"strain_name": "BXD42", "value": 7.82433, "variance": None, "ndata": None}, - "BXD6": {"strain_name": "BXD6", "value": 8.09718, "variance": None, "ndata": None}, - "BXH14": {"strain_name": "BXH14", "value": 7.97475, "variance": None, "ndata": None}, - "BXH19": {"strain_name": "BXH19", "value": 7.67223, "variance": None, "ndata": None}, - "BXH2": {"strain_name": "BXH2", "value": 7.93622, "variance": None, "ndata": None}, - "BXH22": {"strain_name": "BXH22", "value": 7.43692, "variance": None, "ndata": None}, - "BXH4": {"strain_name": "BXH4", "value": 7.96336, "variance": None, "ndata": None}, - "BXH6": {"strain_name": "BXH6", "value": 7.75132, "variance": None, "ndata": None}, - "BXH7": {"strain_name": "BXH7", "value": 8.12927, "variance": None, "ndata": None}, - "BXH8": {"strain_name": "BXH8", "value": 6.77338, "variance": None, "ndata": None}, - "BXH9": {"strain_name": "BXH9", "value": 8.03836, "variance": None, "ndata": None}, - "C3H/HeJ": {"strain_name": "C3H/HeJ", "value": 7.42795, "variance": None, "ndata": None}, - "C57BL/6J": {"strain_name": "C57BL/6J", "value": 7.50606, "variance": None, "ndata": None}, - "DBA/2J": {"strain_name": "DBA/2J", "value": 7.72588, "variance": None, "ndata": None}}} + "B6cC3-1": {"sample_name": "B6cC3-1", "value": 7.51879, "variance": None, "ndata": None}, + "BXD1": {"sample_name": "BXD1", "value": 7.77141, "variance": None, "ndata": None}, + "BXD12": {"sample_name": "BXD12", "value": 8.39265, "variance": None, "ndata": None}, + "BXD16": {"sample_name": "BXD16", "value": 8.17443, "variance": None, "ndata": None}, + "BXD19": {"sample_name": "BXD19", "value": 8.30401, "variance": None, "ndata": None}, + "BXD2": {"sample_name": "BXD2", "value": 7.80944, "variance": None, "ndata": None}, + "BXD21": {"sample_name": "BXD21", "value": 8.93809, "variance": None, "ndata": None}, + "BXD24": {"sample_name": "BXD24", "value": 7.99415, "variance": None, "ndata": None}, + "BXD27": {"sample_name": "BXD27", "value": 8.12177, "variance": None, "ndata": None}, + "BXD28": {"sample_name": "BXD28", "value": 7.67688, "variance": None, "ndata": None}, + "BXD32": {"sample_name": "BXD32", "value": 7.79062, "variance": None, "ndata": None}, + "BXD39": {"sample_name": "BXD39", "value": 8.27641, "variance": None, "ndata": None}, + "BXD40": {"sample_name": "BXD40", "value": 8.18012, "variance": None, "ndata": None}, + "BXD42": {"sample_name": "BXD42", "value": 7.82433, "variance": None, "ndata": None}, + "BXD6": {"sample_name": "BXD6", "value": 8.09718, "variance": None, "ndata": None}, + "BXH14": {"sample_name": "BXH14", "value": 7.97475, "variance": None, "ndata": None}, + "BXH19": {"sample_name": "BXH19", "value": 7.67223, "variance": None, "ndata": None}, + "BXH2": {"sample_name": "BXH2", "value": 7.93622, "variance": None, "ndata": None}, + "BXH22": {"sample_name": "BXH22", "value": 7.43692, "variance": None, "ndata": None}, + "BXH4": {"sample_name": "BXH4", "value": 7.96336, "variance": None, "ndata": None}, + "BXH6": {"sample_name": "BXH6", "value": 7.75132, "variance": None, "ndata": None}, + "BXH7": {"sample_name": "BXH7", "value": 8.12927, "variance": None, "ndata": None}, + "BXH8": {"sample_name": "BXH8", "value": 6.77338, "variance": None, "ndata": None}, + "BXH9": {"sample_name": "BXH9", "value": 8.03836, "variance": None, "ndata": None}, + "C3H/HeJ": {"sample_name": "C3H/HeJ", "value": 7.42795, "variance": None, "ndata": None}, + "C57BL/6J": {"sample_name": "C57BL/6J", "value": 7.50606, "variance": None, "ndata": None}, + "DBA/2J": {"sample_name": "DBA/2J", "value": 7.72588, "variance": None, "ndata": None}}} slinked = ( (((0, 2, 0.16381088984330505), @@ -66,7 +66,7 @@ class TestHeatmap(TestCase): ["all", (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)]]: with self.subTest(dtype=dtype): self.assertEqual( - export_trait_data(trait_data, strainlist, dtype=dtype), + export_trait_data(trait_data, samplelist, dtype=dtype), expected) def test_export_trait_data_dtype_all_flags(self): @@ -106,7 +106,7 @@ class TestHeatmap(TestCase): with self.subTest(dtype=dtype, vflag=vflag, nflag=nflag): self.assertEqual( export_trait_data( - trait_data, strainlist, dtype=dtype, var_exists=vflag, + trait_data, samplelist, dtype=dtype, var_exists=vflag, n_exists=nflag), expected) @@ -164,8 +164,8 @@ class TestHeatmap(TestCase): self.assertEqual( compute_traits_order(slinked), (0, 2, 1, 7, 5, 9, 3, 6, 8, 4)) - def test_retrieve_strains_and_values(self): - """Test retrieval of strains and values.""" + def test_retrieve_samples_and_values(self): + """Test retrieval of samples and values.""" for orders, slist, tdata, expected in [ [ [2], @@ -185,9 +185,9 @@ class TestHeatmap(TestCase): [6, None, None, 4, None]], [[3, ["s1", "s4"], [6, 4]]] ]]: - with self.subTest(strainlist=slist, traitdata=tdata): + with self.subTest(samplelist=slist, traitdata=tdata): self.assertEqual( - retrieve_strains_and_values(orders, slist, tdata), expected) + retrieve_samples_and_values(orders, slist, tdata), expected) def test_get_lrs_from_chr(self): """Check that function gets correct LRS values""" -- cgit v1.2.3 From 4a55971a9be54b399c45a53e211df3348df1c52b Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Tue, 28 Sep 2021 10:15:43 +0300 Subject: Retrieve loci names ordered by chromosomes Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/heatmaps.py: implement function * tests/unit/test_heatmaps.py: add test Add a function to retrieve the loci names from the traits, ordered by chromosomes, in alphabetical order. This is useful to provide the user with more information on hovering over the heatmap cells: each cell will now display the locus name, trait name and value associated with it. --- gn3/heatmaps.py | 28 +++++++++++++++++++++++++++- tests/unit/test_heatmaps.py | 15 +++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) (limited to 'tests/unit/test_heatmaps.py') diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index 2ef2d16..9c10ba3 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -4,7 +4,7 @@ generate various kinds of heatmaps. """ from functools import reduce -from typing import Any, Dict, Sequence +from typing import Any, Dict, Union, Sequence import numpy as np import plotly.graph_objects as go # type: ignore @@ -142,6 +142,32 @@ def cluster_traits(traits_data_list: Sequence[Dict]): return tuple(__cluster(tdata_i) for tdata_i in enumerate(traits_data_list)) +def get_loci_names( + organised: dict, + chromosome_names: Sequence[str]) -> Sequence[Sequence[str]]: + """ + Get the loci names organised by the same order as the `chromosome_names`. + """ + def __get_trait_loci(accumulator, trait): + chrs = tuple(trait["chromosomes"].keys()) + trait_loci = { + _chr: tuple( + locus["Locus"] + for locus in trait["chromosomes"][_chr]["loci"] + ) for _chr in chrs + } + return { + **accumulator, + **{ + _chr: tuple(sorted(set( + trait_loci[_chr] + accumulator.get(_chr, tuple())))) + for _chr in trait_loci.keys() + } + } + loci_dict: Dict[Union[str, int], Sequence[str]] = reduce( + __get_trait_loci, [v[1] for v in organised.items()], {}) + return tuple(loci_dict[_chr] for _chr in chromosome_names) + def build_heatmap(traits_names, conn: Any): """ heatmap function diff --git a/tests/unit/test_heatmaps.py b/tests/unit/test_heatmaps.py index b54e2f3..7b66688 100644 --- a/tests/unit/test_heatmaps.py +++ b/tests/unit/test_heatmaps.py @@ -2,6 +2,7 @@ from unittest import TestCase from gn3.heatmaps import ( cluster_traits, + get_loci_names, get_lrs_from_chr, export_trait_data, compute_traits_order, @@ -214,3 +215,17 @@ class TestHeatmap(TestCase): [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]], [[0.5, 0.579, 0.5], [0.5, 0.5, 0.5]]]) + + def test_get_loci_names(self): + """Check that loci names are retrieved correctly.""" + for organised, expected in ( + (organised_trait_1, + (("rs258367496", "rs30658298", "rs31443144", "rs32285189", + "rs32430919", "rs36251697", "rs6269442"), + ("rs31879829", "rs36742481", "rs51852623"))), + ({**organised_trait_1, **organised_trait_2}, + (("rs258367496", "rs30658298", "rs31443144", "rs32285189", + "rs32430919", "rs36251697", "rs6269442"), + ("rs31879829", "rs36742481", "rs51852623")))): + with self.subTest(organised=organised): + self.assertEqual(get_loci_names(organised, (1, 2)), expected) -- cgit v1.2.3