From 19783a18c2bc7941fc5980e593f19fb1d18c3623 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 27 Sep 2021 04:48:53 +0300 Subject: Update terminology: `strain` to `sample` Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Update the terminology used: use `sample` in place of `strain` according to Zachary's direction at https://github.com/genenetwork/genenetwork3/pull/37#issuecomment-926043306 --- gn3/computations/parsers.py | 10 ++--- gn3/computations/qtlreaper.py | 8 ++-- gn3/db/genotypes.py | 8 ++-- gn3/db/traits.py | 44 ++++++++++----------- gn3/heatmaps.py | 62 ++++++++++++++--------------- tests/unit/computations/test_parsers.py | 4 +- tests/unit/test_heatmaps.py | 70 ++++++++++++++++----------------- 7 files changed, 103 insertions(+), 103 deletions(-) diff --git a/gn3/computations/parsers.py b/gn3/computations/parsers.py index 94387ff..1af35d6 100644 --- a/gn3/computations/parsers.py +++ b/gn3/computations/parsers.py @@ -14,7 +14,7 @@ def parse_genofile(file_path: str) -> Tuple[List[str], 'h': 0, 'u': None, } - genotypes, strains = [], [] + genotypes, samples = [], [] with open(file_path, "r") as _genofile: for line in _genofile: line = line.strip() @@ -22,8 +22,8 @@ def parse_genofile(file_path: str) -> Tuple[List[str], continue cells = line.split() if line.startswith("Chr"): - strains = cells[4:] - strains = [strain.lower() for strain in strains] + samples = cells[4:] + samples = [sample.lower() for sample in samples] continue values = [__map.get(value.lower(), None) for value in cells[4:]] genotype = { @@ -32,7 +32,7 @@ def parse_genofile(file_path: str) -> Tuple[List[str], "cm": cells[2], "mb": cells[3], "values": values, - "dicvalues": dict(zip(strains, values)), + "dicvalues": dict(zip(samples, values)), } genotypes.append(genotype) - return strains, genotypes + return samples, genotypes diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index 8b2893e..166d2dd 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -9,17 +9,17 @@ from typing import Union from gn3.random import random_string from gn3.settings import TMPDIR, REAPER_COMMAND -def generate_traits_file(strains, trait_values, traits_filename): +def generate_traits_file(samples, trait_values, traits_filename): """ Generate a traits file for use with `qtlreaper`. PARAMETERS: - strains: A list of strains to use as the headers for the various columns. - trait_values: A list of lists of values for each trait and strain. + samples: A list of samples to use as the headers for the various columns. + trait_values: A list of lists of values for each trait and sample. traits_filename: The tab-separated value to put the values in for computation of QTLs. """ - header = "Trait\t{}\n".format("\t".join(strains)) + header = "Trait\t{}\n".format("\t".join(samples)) data = ( [header] + ["{}\t{}\n".format(i+1, "\t".join([str(i) for i in t])) diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py index 9987320..8f18cac 100644 --- a/gn3/db/genotypes.py +++ b/gn3/db/genotypes.py @@ -14,16 +14,16 @@ def build_genotype_file( def load_genotype_samples(genotype_filename: str, file_type: str = "geno"): """ - Load sample of strains from genotype files. + Load sample of samples from genotype files. DESCRIPTION: - Traits can contain a varied number of strains, some of which do not exist in + Traits can contain a varied number of samples, some of which do not exist in certain genotypes. In order to compute QTLs, GEMMAs, etc, we need to ensure - to pick only those strains that exist in the genotype under consideration + to pick only those samples that exist in the genotype under consideration for the traits used in the computation. This function loads a list of samples from the genotype files for use in - filtering out unusable strains. + filtering out unusable samples. PARAMETERS: diff --git a/gn3/db/traits.py b/gn3/db/traits.py index 4fc47c3..c9d05d7 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -445,7 +445,7 @@ def retrieve_temp_trait_data(trait_info: dict, conn: Any): query, {"trait_name": trait_info["trait_name"]}) return [dict(zip( - ["strain_name", "value", "se_error", "nstrain", "id"], row)) + ["sample_name", "value", "se_error", "nstrain", "id"], row)) for row in cursor.fetchall()] return [] @@ -484,7 +484,7 @@ def retrieve_geno_trait_data(trait_info: Dict, conn: Any): "species_id": retrieve_species_id( trait_info["db"]["riset"], conn)}) return [dict(zip( - ["strain_name", "value", "se_error", "id"], row)) + ["sample_name", "value", "se_error", "id"], row)) for row in cursor.fetchall()] return [] @@ -515,7 +515,7 @@ def retrieve_publish_trait_data(trait_info: Dict, conn: Any): {"trait_name": trait_info["trait_name"], "dataset_id": trait_info["db"]["dataset_id"]}) return [dict(zip( - ["strain_name", "value", "se_error", "nstrain", "id"], row)) + ["sample_name", "value", "se_error", "nstrain", "id"], row)) for row in cursor.fetchall()] return [] @@ -548,7 +548,7 @@ def retrieve_cellid_trait_data(trait_info: Dict, conn: Any): "trait_name": trait_info["trait_name"], "dataset_id": trait_info["db"]["dataset_id"]}) return [dict(zip( - ["strain_name", "value", "se_error", "id"], row)) + ["sample_name", "value", "se_error", "id"], row)) for row in cursor.fetchall()] return [] @@ -577,29 +577,29 @@ def retrieve_probeset_trait_data(trait_info: Dict, conn: Any): {"trait_name": trait_info["trait_name"], "dataset_name": trait_info["db"]["dataset_name"]}) return [dict(zip( - ["strain_name", "value", "se_error", "id"], row)) + ["sample_name", "value", "se_error", "id"], row)) for row in cursor.fetchall()] return [] -def with_strainlist_data_setup(strainlist: Sequence[str]): +def with_samplelist_data_setup(samplelist: Sequence[str]): """ - Build function that computes the trait data from provided list of strains. + Build function that computes the trait data from provided list of samples. PARAMETERS - strainlist: (list) - A list of strain names + samplelist: (list) + A list of sample names RETURNS: Returns a function that given some data from the database, computes the - strain's value, variance and ndata values, only if the strain is present - in the provided `strainlist` variable. + sample's value, variance and ndata values, only if the sample is present + in the provided `samplelist` variable. """ def setup_fn(tdata): - if tdata["strain_name"] in strainlist: + if tdata["sample_name"] in samplelist: val = tdata["value"] if val is not None: return { - "strain_name": tdata["strain_name"], + "sample_name": tdata["sample_name"], "value": val, "variance": tdata["se_error"], "ndata": tdata.get("nstrain", None) @@ -607,19 +607,19 @@ def with_strainlist_data_setup(strainlist: Sequence[str]): return None return setup_fn -def without_strainlist_data_setup(): +def without_samplelist_data_setup(): """ Build function that computes the trait data. RETURNS: Returns a function that given some data from the database, computes the - strain's value, variance and ndata values. + sample's value, variance and ndata values. """ def setup_fn(tdata): val = tdata["value"] if val is not None: return { - "strain_name": tdata["strain_name"], + "sample_name": tdata["sample_name"], "value": val, "variance": tdata["se_error"], "ndata": tdata.get("nstrain", None) @@ -627,7 +627,7 @@ def without_strainlist_data_setup(): return None return setup_fn -def retrieve_trait_data(trait: dict, conn: Any, strainlist: Sequence[str] = tuple()): +def retrieve_trait_data(trait: dict, conn: Any, samplelist: Sequence[str] = tuple()): """ Retrieve trait data @@ -650,23 +650,23 @@ def retrieve_trait_data(trait: dict, conn: Any, strainlist: Sequence[str] = tupl if results: # do something with mysqlid mysqlid = results[0]["id"] - if strainlist: + if samplelist: data = [ item for item in - map(with_strainlist_data_setup(strainlist), results) + map(with_samplelist_data_setup(samplelist), results) if item is not None] else: data = [ item for item in - map(without_strainlist_data_setup(), results) + map(without_samplelist_data_setup(), results) if item is not None] return { "mysqlid": mysqlid, "data": dict(map( lambda x: ( - x["strain_name"], - {k:v for k, v in x.items() if x != "strain_name"}), + x["sample_name"], + {k:v for k, v in x.items() if x != "sample_name"}), data))} return {} diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index 45d0c22..b6fc6d3 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -27,10 +27,10 @@ from gn3.computations.qtlreaper import ( organise_reaper_main_results) def export_trait_data( - trait_data: dict, strainlist: Sequence[str], dtype: str = "val", + trait_data: dict, samplelist: Sequence[str], dtype: str = "val", var_exists: bool = False, n_exists: bool = False): """ - Export data according to `strainlist`. Mostly used in calculating + Export data according to `samplelist`. Mostly used in calculating correlations. DESCRIPTION: @@ -40,8 +40,8 @@ def export_trait_data( PARAMETERS trait: (dict) The dictionary of key-value pairs representing a trait - strainlist: (list) - A list of strain names + samplelist: (list) + A list of sample names dtype: (str) ... verify what this is ... var_exists: (bool) @@ -49,18 +49,18 @@ def export_trait_data( n_exists: (bool) A flag indicating existence of ndata """ - def __export_all_types(tdata, strain): + def __export_all_types(tdata, sample): sample_data = [] - if tdata[strain]["value"]: - sample_data.append(tdata[strain]["value"]) + if tdata[sample]["value"]: + sample_data.append(tdata[sample]["value"]) if var_exists: - if tdata[strain]["variance"]: - sample_data.append(tdata[strain]["variance"]) + if tdata[sample]["variance"]: + sample_data.append(tdata[sample]["variance"]) else: sample_data.append(None) if n_exists: - if tdata[strain]["ndata"]: - sample_data.append(tdata[strain]["ndata"]) + if tdata[sample]["ndata"]: + sample_data.append(tdata[sample]["ndata"]) else: sample_data.append(None) else: @@ -73,17 +73,17 @@ def export_trait_data( return tuple(sample_data) - def __exporter(accumulator, strain): + def __exporter(accumulator, sample): # pylint: disable=[R0911] - if strain in trait_data["data"]: + if sample in trait_data["data"]: if dtype == "val": - return accumulator + (trait_data["data"][strain]["value"], ) + return accumulator + (trait_data["data"][sample]["value"], ) if dtype == "var": - return accumulator + (trait_data["data"][strain]["variance"], ) + return accumulator + (trait_data["data"][sample]["variance"], ) if dtype == "N": - return accumulator + (trait_data["data"][strain]["ndata"], ) + return accumulator + (trait_data["data"][sample]["ndata"], ) if dtype == "all": - return accumulator + __export_all_types(trait_data["data"], strain) + return accumulator + __export_all_types(trait_data["data"], sample) raise KeyError("Type `%s` is incorrect" % dtype) if var_exists and n_exists: return accumulator + (None, None, None) @@ -91,7 +91,7 @@ def export_trait_data( return accumulator + (None, None) return accumulator + (None,) - return reduce(__exporter, strainlist, tuple()) + return reduce(__exporter, samplelist, tuple()) def trait_display_name(trait: Dict): """ @@ -165,19 +165,19 @@ def build_heatmap(traits_names, conn: Any): for fullname in traits_names] traits_data_list = [retrieve_trait_data(t, conn) for t in traits] genotype_filename = build_genotype_file(traits[0]["riset"]) - strains = load_genotype_samples(genotype_filename) + samples = load_genotype_samples(genotype_filename) exported_traits_data_list = [ - export_trait_data(td, strains) for td in traits_data_list] + export_trait_data(td, samples) for td in traits_data_list] clustered = cluster_traits(exported_traits_data_list) slinked = slink(clustered) traits_order = compute_traits_order(slinked) - strains_and_values = retrieve_strains_and_values( - traits_order, strains, exported_traits_data_list) + samples_and_values = retrieve_samples_and_values( + traits_order, samples, exported_traits_data_list) traits_filename = "{}/traits_test_file_{}.txt".format( TMPDIR, random_string(10)) generate_traits_file( - strains_and_values[0][1], - [t[2] for t in strains_and_values], + samples_and_values[0][1], + [t[2] for t in samples_and_values], traits_filename) main_output, _permutations_output = run_reaper( @@ -229,9 +229,9 @@ def compute_traits_order(slink_data, neworder: tuple = tuple()): return __order_maker(neworder, slink_data) -def retrieve_strains_and_values(orders, strainlist, traits_data_list): +def retrieve_samples_and_values(orders, samplelist, traits_data_list): """ - Get the strains and their corresponding values from `strainlist` and + Get the samples and their corresponding values from `samplelist` and `traits_data_list`. This migrates the code in @@ -240,17 +240,17 @@ def retrieve_strains_and_values(orders, strainlist, traits_data_list): # This feels nasty! There's a lot of mutation of values here, that might # indicate something untoward in the design of this function and its # dependents ==> Review - strains = [] + samples = [] values = [] rets = [] for order in orders: temp_val = traits_data_list[order] - for i, strain in enumerate(strainlist): + for i, sample in enumerate(samplelist): if temp_val[i] is not None: - strains.append(strain) + samples.append(sample) values.append(temp_val[i]) - rets.append([order, strains[:], values[:]]) - strains = [] + rets.append([order, samples[:], values[:]]) + samples = [] values = [] return rets diff --git a/tests/unit/computations/test_parsers.py b/tests/unit/computations/test_parsers.py index 19c3067..b51b0bf 100644 --- a/tests/unit/computations/test_parsers.py +++ b/tests/unit/computations/test_parsers.py @@ -15,7 +15,7 @@ class TestParsers(unittest.TestCase): def test_parse_genofile_with_existing_file(self): """Test that a genotype file is parsed correctly""" - strains = ["bxd1", "bxd2"] + samples = ["bxd1", "bxd2"] genotypes = [ {"chr": "1", "locus": "rs31443144", "cm": "1.50", "mb": "3.010274", @@ -51,4 +51,4 @@ class TestParsers(unittest.TestCase): "../test_data/genotype.txt" )) self.assertEqual(parse_genofile( - test_genotype_file), (strains, genotypes)) + test_genotype_file), (samples, genotypes)) diff --git a/tests/unit/test_heatmaps.py b/tests/unit/test_heatmaps.py index fd91cf9..b54e2f3 100644 --- a/tests/unit/test_heatmaps.py +++ b/tests/unit/test_heatmaps.py @@ -5,41 +5,41 @@ from gn3.heatmaps import ( get_lrs_from_chr, export_trait_data, compute_traits_order, - retrieve_strains_and_values, + retrieve_samples_and_values, process_traits_data_for_heatmap) from tests.unit.sample_test_data import organised_trait_1, organised_trait_2 -strainlist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] +samplelist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] trait_data = { "mysqlid": 36688172, "data": { - "B6cC3-1": {"strain_name": "B6cC3-1", "value": 7.51879, "variance": None, "ndata": None}, - "BXD1": {"strain_name": "BXD1", "value": 7.77141, "variance": None, "ndata": None}, - "BXD12": {"strain_name": "BXD12", "value": 8.39265, "variance": None, "ndata": None}, - "BXD16": {"strain_name": "BXD16", "value": 8.17443, "variance": None, "ndata": None}, - "BXD19": {"strain_name": "BXD19", "value": 8.30401, "variance": None, "ndata": None}, - "BXD2": {"strain_name": "BXD2", "value": 7.80944, "variance": None, "ndata": None}, - "BXD21": {"strain_name": "BXD21", "value": 8.93809, "variance": None, "ndata": None}, - "BXD24": {"strain_name": "BXD24", "value": 7.99415, "variance": None, "ndata": None}, - "BXD27": {"strain_name": "BXD27", "value": 8.12177, "variance": None, "ndata": None}, - "BXD28": {"strain_name": "BXD28", "value": 7.67688, "variance": None, "ndata": None}, - "BXD32": {"strain_name": "BXD32", "value": 7.79062, "variance": None, "ndata": None}, - "BXD39": {"strain_name": "BXD39", "value": 8.27641, "variance": None, "ndata": None}, - "BXD40": {"strain_name": "BXD40", "value": 8.18012, "variance": None, "ndata": None}, - "BXD42": {"strain_name": "BXD42", "value": 7.82433, "variance": None, "ndata": None}, - "BXD6": {"strain_name": "BXD6", "value": 8.09718, "variance": None, "ndata": None}, - "BXH14": {"strain_name": "BXH14", "value": 7.97475, "variance": None, "ndata": None}, - "BXH19": {"strain_name": "BXH19", "value": 7.67223, "variance": None, "ndata": None}, - "BXH2": {"strain_name": "BXH2", "value": 7.93622, "variance": None, "ndata": None}, - "BXH22": {"strain_name": "BXH22", "value": 7.43692, "variance": None, "ndata": None}, - "BXH4": {"strain_name": "BXH4", "value": 7.96336, "variance": None, "ndata": None}, - "BXH6": {"strain_name": "BXH6", "value": 7.75132, "variance": None, "ndata": None}, - "BXH7": {"strain_name": "BXH7", "value": 8.12927, "variance": None, "ndata": None}, - "BXH8": {"strain_name": "BXH8", "value": 6.77338, "variance": None, "ndata": None}, - "BXH9": {"strain_name": "BXH9", "value": 8.03836, "variance": None, "ndata": None}, - "C3H/HeJ": {"strain_name": "C3H/HeJ", "value": 7.42795, "variance": None, "ndata": None}, - "C57BL/6J": {"strain_name": "C57BL/6J", "value": 7.50606, "variance": None, "ndata": None}, - "DBA/2J": {"strain_name": "DBA/2J", "value": 7.72588, "variance": None, "ndata": None}}} + "B6cC3-1": {"sample_name": "B6cC3-1", "value": 7.51879, "variance": None, "ndata": None}, + "BXD1": {"sample_name": "BXD1", "value": 7.77141, "variance": None, "ndata": None}, + "BXD12": {"sample_name": "BXD12", "value": 8.39265, "variance": None, "ndata": None}, + "BXD16": {"sample_name": "BXD16", "value": 8.17443, "variance": None, "ndata": None}, + "BXD19": {"sample_name": "BXD19", "value": 8.30401, "variance": None, "ndata": None}, + "BXD2": {"sample_name": "BXD2", "value": 7.80944, "variance": None, "ndata": None}, + "BXD21": {"sample_name": "BXD21", "value": 8.93809, "variance": None, "ndata": None}, + "BXD24": {"sample_name": "BXD24", "value": 7.99415, "variance": None, "ndata": None}, + "BXD27": {"sample_name": "BXD27", "value": 8.12177, "variance": None, "ndata": None}, + "BXD28": {"sample_name": "BXD28", "value": 7.67688, "variance": None, "ndata": None}, + "BXD32": {"sample_name": "BXD32", "value": 7.79062, "variance": None, "ndata": None}, + "BXD39": {"sample_name": "BXD39", "value": 8.27641, "variance": None, "ndata": None}, + "BXD40": {"sample_name": "BXD40", "value": 8.18012, "variance": None, "ndata": None}, + "BXD42": {"sample_name": "BXD42", "value": 7.82433, "variance": None, "ndata": None}, + "BXD6": {"sample_name": "BXD6", "value": 8.09718, "variance": None, "ndata": None}, + "BXH14": {"sample_name": "BXH14", "value": 7.97475, "variance": None, "ndata": None}, + "BXH19": {"sample_name": "BXH19", "value": 7.67223, "variance": None, "ndata": None}, + "BXH2": {"sample_name": "BXH2", "value": 7.93622, "variance": None, "ndata": None}, + "BXH22": {"sample_name": "BXH22", "value": 7.43692, "variance": None, "ndata": None}, + "BXH4": {"sample_name": "BXH4", "value": 7.96336, "variance": None, "ndata": None}, + "BXH6": {"sample_name": "BXH6", "value": 7.75132, "variance": None, "ndata": None}, + "BXH7": {"sample_name": "BXH7", "value": 8.12927, "variance": None, "ndata": None}, + "BXH8": {"sample_name": "BXH8", "value": 6.77338, "variance": None, "ndata": None}, + "BXH9": {"sample_name": "BXH9", "value": 8.03836, "variance": None, "ndata": None}, + "C3H/HeJ": {"sample_name": "C3H/HeJ", "value": 7.42795, "variance": None, "ndata": None}, + "C57BL/6J": {"sample_name": "C57BL/6J", "value": 7.50606, "variance": None, "ndata": None}, + "DBA/2J": {"sample_name": "DBA/2J", "value": 7.72588, "variance": None, "ndata": None}}} slinked = ( (((0, 2, 0.16381088984330505), @@ -66,7 +66,7 @@ class TestHeatmap(TestCase): ["all", (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)]]: with self.subTest(dtype=dtype): self.assertEqual( - export_trait_data(trait_data, strainlist, dtype=dtype), + export_trait_data(trait_data, samplelist, dtype=dtype), expected) def test_export_trait_data_dtype_all_flags(self): @@ -106,7 +106,7 @@ class TestHeatmap(TestCase): with self.subTest(dtype=dtype, vflag=vflag, nflag=nflag): self.assertEqual( export_trait_data( - trait_data, strainlist, dtype=dtype, var_exists=vflag, + trait_data, samplelist, dtype=dtype, var_exists=vflag, n_exists=nflag), expected) @@ -164,8 +164,8 @@ class TestHeatmap(TestCase): self.assertEqual( compute_traits_order(slinked), (0, 2, 1, 7, 5, 9, 3, 6, 8, 4)) - def test_retrieve_strains_and_values(self): - """Test retrieval of strains and values.""" + def test_retrieve_samples_and_values(self): + """Test retrieval of samples and values.""" for orders, slist, tdata, expected in [ [ [2], @@ -185,9 +185,9 @@ class TestHeatmap(TestCase): [6, None, None, 4, None]], [[3, ["s1", "s4"], [6, 4]]] ]]: - with self.subTest(strainlist=slist, traitdata=tdata): + with self.subTest(samplelist=slist, traitdata=tdata): self.assertEqual( - retrieve_strains_and_values(orders, slist, tdata), expected) + retrieve_samples_and_values(orders, slist, tdata), expected) def test_get_lrs_from_chr(self): """Check that function gets correct LRS values""" -- cgit v1.2.3