about summary refs log tree commit diff
diff options
context:
space:
mode:
author Frederick Muriuki Muriithi 2021-09-27 04:48:53 +0300
committer Frederick Muriuki Muriithi 2021-09-27 04:48:53 +0300
commit 19783a18c2bc7941fc5980e593f19fb1d18c3623 (patch)
tree f3a6e241be3c6224b9647c8258c516a7b741a28c
parent 8d9bc0f29ce9208306915b079818e6f0c31785e2 (diff)
download genenetwork3-19783a18c2bc7941fc5980e593f19fb1d18c3623.tar.gz
Update terminology: `strain` to `sample`
Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi

* Update the terminology used: use `sample` in place of `strain`, according to Zachary's direction at https://github.com/genenetwork/genenetwork3/pull/37#issuecomment-926043306
-rw-r--r-- gn3/computations/parsers.py 10
-rw-r--r-- gn3/computations/qtlreaper.py 8
-rw-r--r-- gn3/db/genotypes.py 8
-rw-r--r-- gn3/db/traits.py 44
-rw-r--r-- gn3/heatmaps.py 62
-rw-r--r-- tests/unit/computations/test_parsers.py 4
-rw-r--r-- tests/unit/test_heatmaps.py 70
7 files changed, 103 insertions, 103 deletions
diff --git a/gn3/computations/parsers.py b/gn3/computations/parsers.py
index 94387ff..1af35d6 100644
--- a/gn3/computations/parsers.py
+++ b/gn3/computations/parsers.py
@@ -14,7 +14,7 @@ def parse_genofile(file_path: str) -> Tuple[List[str],
'h': 0,
'u': None,
}
- genotypes, strains = [], []
+ genotypes, samples = [], []
with open(file_path, "r") as _genofile:
for line in _genofile:
line = line.strip()
@@ -22,8 +22,8 @@ def parse_genofile(file_path: str) -> Tuple[List[str],
continue
cells = line.split()
if line.startswith("Chr"):
- strains = cells[4:]
- strains = [strain.lower() for strain in strains]
+ samples = cells[4:]
+ samples = [sample.lower() for sample in samples]
continue
values = [__map.get(value.lower(), None) for value in cells[4:]]
genotype = {
@@ -32,7 +32,7 @@ def parse_genofile(file_path: str) -> Tuple[List[str],
"cm": cells[2],
"mb": cells[3],
"values": values,
- "dicvalues": dict(zip(strains, values)),
+ "dicvalues": dict(zip(samples, values)),
}
genotypes.append(genotype)
- return strains, genotypes
+ return samples, genotypes
diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py
index 8b2893e..166d2dd 100644
--- a/gn3/computations/qtlreaper.py
+++ b/gn3/computations/qtlreaper.py
@@ -9,17 +9,17 @@ from typing import Union
from gn3.random import random_string
from gn3.settings import TMPDIR, REAPER_COMMAND
-def generate_traits_file(strains, trait_values, traits_filename):
+def generate_traits_file(samples, trait_values, traits_filename):
"""
Generate a traits file for use with `qtlreaper`.
PARAMETERS:
- strains: A list of strains to use as the headers for the various columns.
- trait_values: A list of lists of values for each trait and strain.
+ samples: A list of samples to use as the headers for the various columns.
+ trait_values: A list of lists of values for each trait and sample.
traits_filename: The tab-separated value to put the values in for
computation of QTLs.
"""
- header = "Trait\t{}\n".format("\t".join(strains))
+ header = "Trait\t{}\n".format("\t".join(samples))
data = (
[header] +
["{}\t{}\n".format(i+1, "\t".join([str(i) for i in t]))
diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py
index 9987320..8f18cac 100644
--- a/gn3/db/genotypes.py
+++ b/gn3/db/genotypes.py
@@ -14,16 +14,16 @@ def build_genotype_file(
def load_genotype_samples(genotype_filename: str, file_type: str = "geno"):
"""
- Load sample of strains from genotype files.
+ Load a list of samples from genotype files.
DESCRIPTION:
- Traits can contain a varied number of strains, some of which do not exist in
+ Traits can contain a varied number of samples, some of which do not exist in
certain genotypes. In order to compute QTLs, GEMMAs, etc, we need to ensure
- to pick only those strains that exist in the genotype under consideration
+ to pick only those samples that exist in the genotype under consideration
for the traits used in the computation.
This function loads a list of samples from the genotype files for use in
- filtering out unusable strains.
+ filtering out unusable samples.
PARAMETERS:
diff --git a/gn3/db/traits.py b/gn3/db/traits.py
index 4fc47c3..c9d05d7 100644
--- a/gn3/db/traits.py
+++ b/gn3/db/traits.py
@@ -445,7 +445,7 @@ def retrieve_temp_trait_data(trait_info: dict, conn: Any):
query,
{"trait_name": trait_info["trait_name"]})
return [dict(zip(
- ["strain_name", "value", "se_error", "nstrain", "id"], row))
+ ["sample_name", "value", "se_error", "nstrain", "id"], row))
for row in cursor.fetchall()]
return []
@@ -484,7 +484,7 @@ def retrieve_geno_trait_data(trait_info: Dict, conn: Any):
"species_id": retrieve_species_id(
trait_info["db"]["riset"], conn)})
return [dict(zip(
- ["strain_name", "value", "se_error", "id"], row))
+ ["sample_name", "value", "se_error", "id"], row))
for row in cursor.fetchall()]
return []
@@ -515,7 +515,7 @@ def retrieve_publish_trait_data(trait_info: Dict, conn: Any):
{"trait_name": trait_info["trait_name"],
"dataset_id": trait_info["db"]["dataset_id"]})
return [dict(zip(
- ["strain_name", "value", "se_error", "nstrain", "id"], row))
+ ["sample_name", "value", "se_error", "nstrain", "id"], row))
for row in cursor.fetchall()]
return []
@@ -548,7 +548,7 @@ def retrieve_cellid_trait_data(trait_info: Dict, conn: Any):
"trait_name": trait_info["trait_name"],
"dataset_id": trait_info["db"]["dataset_id"]})
return [dict(zip(
- ["strain_name", "value", "se_error", "id"], row))
+ ["sample_name", "value", "se_error", "id"], row))
for row in cursor.fetchall()]
return []
@@ -577,29 +577,29 @@ def retrieve_probeset_trait_data(trait_info: Dict, conn: Any):
{"trait_name": trait_info["trait_name"],
"dataset_name": trait_info["db"]["dataset_name"]})
return [dict(zip(
- ["strain_name", "value", "se_error", "id"], row))
+ ["sample_name", "value", "se_error", "id"], row))
for row in cursor.fetchall()]
return []
-def with_strainlist_data_setup(strainlist: Sequence[str]):
+def with_samplelist_data_setup(samplelist: Sequence[str]):
"""
- Build function that computes the trait data from provided list of strains.
+ Build function that computes the trait data from provided list of samples.
PARAMETERS
- strainlist: (list)
- A list of strain names
+ samplelist: (list)
+ A list of sample names
RETURNS:
Returns a function that given some data from the database, computes the
- strain's value, variance and ndata values, only if the strain is present
- in the provided `strainlist` variable.
+ sample's value, variance and ndata values, only if the sample is present
+ in the provided `samplelist` variable.
"""
def setup_fn(tdata):
- if tdata["strain_name"] in strainlist:
+ if tdata["sample_name"] in samplelist:
val = tdata["value"]
if val is not None:
return {
- "strain_name": tdata["strain_name"],
+ "sample_name": tdata["sample_name"],
"value": val,
"variance": tdata["se_error"],
"ndata": tdata.get("nstrain", None)
@@ -607,19 +607,19 @@ def with_strainlist_data_setup(strainlist: Sequence[str]):
return None
return setup_fn
-def without_strainlist_data_setup():
+def without_samplelist_data_setup():
"""
Build function that computes the trait data.
RETURNS:
Returns a function that given some data from the database, computes the
- strain's value, variance and ndata values.
+ sample's value, variance and ndata values.
"""
def setup_fn(tdata):
val = tdata["value"]
if val is not None:
return {
- "strain_name": tdata["strain_name"],
+ "sample_name": tdata["sample_name"],
"value": val,
"variance": tdata["se_error"],
"ndata": tdata.get("nstrain", None)
@@ -627,7 +627,7 @@ def without_strainlist_data_setup():
return None
return setup_fn
-def retrieve_trait_data(trait: dict, conn: Any, strainlist: Sequence[str] = tuple()):
+def retrieve_trait_data(trait: dict, conn: Any, samplelist: Sequence[str] = tuple()):
"""
Retrieve trait data
@@ -650,23 +650,23 @@ def retrieve_trait_data(trait: dict, conn: Any, strainlist: Sequence[str] = tupl
if results:
# do something with mysqlid
mysqlid = results[0]["id"]
- if strainlist:
+ if samplelist:
data = [
item for item in
- map(with_strainlist_data_setup(strainlist), results)
+ map(with_samplelist_data_setup(samplelist), results)
if item is not None]
else:
data = [
item for item in
- map(without_strainlist_data_setup(), results)
+ map(without_samplelist_data_setup(), results)
if item is not None]
return {
"mysqlid": mysqlid,
"data": dict(map(
lambda x: (
- x["strain_name"],
- {k:v for k, v in x.items() if x != "strain_name"}),
+ x["sample_name"],
+ {k:v for k, v in x.items() if x != "sample_name"}),
data))}
return {}
diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py
index 45d0c22..b6fc6d3 100644
--- a/gn3/heatmaps.py
+++ b/gn3/heatmaps.py
@@ -27,10 +27,10 @@ from gn3.computations.qtlreaper import (
organise_reaper_main_results)
def export_trait_data(
- trait_data: dict, strainlist: Sequence[str], dtype: str = "val",
+ trait_data: dict, samplelist: Sequence[str], dtype: str = "val",
var_exists: bool = False, n_exists: bool = False):
"""
- Export data according to `strainlist`. Mostly used in calculating
+ Export data according to `samplelist`. Mostly used in calculating
correlations.
DESCRIPTION:
@@ -40,8 +40,8 @@ def export_trait_data(
PARAMETERS
trait: (dict)
The dictionary of key-value pairs representing a trait
- strainlist: (list)
- A list of strain names
+ samplelist: (list)
+ A list of sample names
dtype: (str)
... verify what this is ...
var_exists: (bool)
@@ -49,18 +49,18 @@ def export_trait_data(
n_exists: (bool)
A flag indicating existence of ndata
"""
- def __export_all_types(tdata, strain):
+ def __export_all_types(tdata, sample):
sample_data = []
- if tdata[strain]["value"]:
- sample_data.append(tdata[strain]["value"])
+ if tdata[sample]["value"]:
+ sample_data.append(tdata[sample]["value"])
if var_exists:
- if tdata[strain]["variance"]:
- sample_data.append(tdata[strain]["variance"])
+ if tdata[sample]["variance"]:
+ sample_data.append(tdata[sample]["variance"])
else:
sample_data.append(None)
if n_exists:
- if tdata[strain]["ndata"]:
- sample_data.append(tdata[strain]["ndata"])
+ if tdata[sample]["ndata"]:
+ sample_data.append(tdata[sample]["ndata"])
else:
sample_data.append(None)
else:
@@ -73,17 +73,17 @@ def export_trait_data(
return tuple(sample_data)
- def __exporter(accumulator, strain):
+ def __exporter(accumulator, sample):
# pylint: disable=[R0911]
- if strain in trait_data["data"]:
+ if sample in trait_data["data"]:
if dtype == "val":
- return accumulator + (trait_data["data"][strain]["value"], )
+ return accumulator + (trait_data["data"][sample]["value"], )
if dtype == "var":
- return accumulator + (trait_data["data"][strain]["variance"], )
+ return accumulator + (trait_data["data"][sample]["variance"], )
if dtype == "N":
- return accumulator + (trait_data["data"][strain]["ndata"], )
+ return accumulator + (trait_data["data"][sample]["ndata"], )
if dtype == "all":
- return accumulator + __export_all_types(trait_data["data"], strain)
+ return accumulator + __export_all_types(trait_data["data"], sample)
raise KeyError("Type `%s` is incorrect" % dtype)
if var_exists and n_exists:
return accumulator + (None, None, None)
@@ -91,7 +91,7 @@ def export_trait_data(
return accumulator + (None, None)
return accumulator + (None,)
- return reduce(__exporter, strainlist, tuple())
+ return reduce(__exporter, samplelist, tuple())
def trait_display_name(trait: Dict):
"""
@@ -165,19 +165,19 @@ def build_heatmap(traits_names, conn: Any):
for fullname in traits_names]
traits_data_list = [retrieve_trait_data(t, conn) for t in traits]
genotype_filename = build_genotype_file(traits[0]["riset"])
- strains = load_genotype_samples(genotype_filename)
+ samples = load_genotype_samples(genotype_filename)
exported_traits_data_list = [
- export_trait_data(td, strains) for td in traits_data_list]
+ export_trait_data(td, samples) for td in traits_data_list]
clustered = cluster_traits(exported_traits_data_list)
slinked = slink(clustered)
traits_order = compute_traits_order(slinked)
- strains_and_values = retrieve_strains_and_values(
- traits_order, strains, exported_traits_data_list)
+ samples_and_values = retrieve_samples_and_values(
+ traits_order, samples, exported_traits_data_list)
traits_filename = "{}/traits_test_file_{}.txt".format(
TMPDIR, random_string(10))
generate_traits_file(
- strains_and_values[0][1],
- [t[2] for t in strains_and_values],
+ samples_and_values[0][1],
+ [t[2] for t in samples_and_values],
traits_filename)
main_output, _permutations_output = run_reaper(
@@ -229,9 +229,9 @@ def compute_traits_order(slink_data, neworder: tuple = tuple()):
return __order_maker(neworder, slink_data)
-def retrieve_strains_and_values(orders, strainlist, traits_data_list):
+def retrieve_samples_and_values(orders, samplelist, traits_data_list):
"""
- Get the strains and their corresponding values from `strainlist` and
+ Get the samples and their corresponding values from `samplelist` and
`traits_data_list`.
This migrates the code in
@@ -240,17 +240,17 @@ def retrieve_strains_and_values(orders, strainlist, traits_data_list):
# This feels nasty! There's a lot of mutation of values here, that might
# indicate something untoward in the design of this function and its
# dependents ==> Review
- strains = []
+ samples = []
values = []
rets = []
for order in orders:
temp_val = traits_data_list[order]
- for i, strain in enumerate(strainlist):
+ for i, sample in enumerate(samplelist):
if temp_val[i] is not None:
- strains.append(strain)
+ samples.append(sample)
values.append(temp_val[i])
- rets.append([order, strains[:], values[:]])
- strains = []
+ rets.append([order, samples[:], values[:]])
+ samples = []
values = []
return rets
diff --git a/tests/unit/computations/test_parsers.py b/tests/unit/computations/test_parsers.py
index 19c3067..b51b0bf 100644
--- a/tests/unit/computations/test_parsers.py
+++ b/tests/unit/computations/test_parsers.py
@@ -15,7 +15,7 @@ class TestParsers(unittest.TestCase):
def test_parse_genofile_with_existing_file(self):
"""Test that a genotype file is parsed correctly"""
- strains = ["bxd1", "bxd2"]
+ samples = ["bxd1", "bxd2"]
genotypes = [
{"chr": "1", "locus": "rs31443144",
"cm": "1.50", "mb": "3.010274",
@@ -51,4 +51,4 @@ class TestParsers(unittest.TestCase):
"../test_data/genotype.txt"
))
self.assertEqual(parse_genofile(
- test_genotype_file), (strains, genotypes))
+ test_genotype_file), (samples, genotypes))
diff --git a/tests/unit/test_heatmaps.py b/tests/unit/test_heatmaps.py
index fd91cf9..b54e2f3 100644
--- a/tests/unit/test_heatmaps.py
+++ b/tests/unit/test_heatmaps.py
@@ -5,41 +5,41 @@ from gn3.heatmaps import (
get_lrs_from_chr,
export_trait_data,
compute_traits_order,
- retrieve_strains_and_values,
+ retrieve_samples_and_values,
process_traits_data_for_heatmap)
from tests.unit.sample_test_data import organised_trait_1, organised_trait_2
-strainlist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"]
+samplelist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"]
trait_data = {
"mysqlid": 36688172,
"data": {
- "B6cC3-1": {"strain_name": "B6cC3-1", "value": 7.51879, "variance": None, "ndata": None},
- "BXD1": {"strain_name": "BXD1", "value": 7.77141, "variance": None, "ndata": None},
- "BXD12": {"strain_name": "BXD12", "value": 8.39265, "variance": None, "ndata": None},
- "BXD16": {"strain_name": "BXD16", "value": 8.17443, "variance": None, "ndata": None},
- "BXD19": {"strain_name": "BXD19", "value": 8.30401, "variance": None, "ndata": None},
- "BXD2": {"strain_name": "BXD2", "value": 7.80944, "variance": None, "ndata": None},
- "BXD21": {"strain_name": "BXD21", "value": 8.93809, "variance": None, "ndata": None},
- "BXD24": {"strain_name": "BXD24", "value": 7.99415, "variance": None, "ndata": None},
- "BXD27": {"strain_name": "BXD27", "value": 8.12177, "variance": None, "ndata": None},
- "BXD28": {"strain_name": "BXD28", "value": 7.67688, "variance": None, "ndata": None},
- "BXD32": {"strain_name": "BXD32", "value": 7.79062, "variance": None, "ndata": None},
- "BXD39": {"strain_name": "BXD39", "value": 8.27641, "variance": None, "ndata": None},
- "BXD40": {"strain_name": "BXD40", "value": 8.18012, "variance": None, "ndata": None},
- "BXD42": {"strain_name": "BXD42", "value": 7.82433, "variance": None, "ndata": None},
- "BXD6": {"strain_name": "BXD6", "value": 8.09718, "variance": None, "ndata": None},
- "BXH14": {"strain_name": "BXH14", "value": 7.97475, "variance": None, "ndata": None},
- "BXH19": {"strain_name": "BXH19", "value": 7.67223, "variance": None, "ndata": None},
- "BXH2": {"strain_name": "BXH2", "value": 7.93622, "variance": None, "ndata": None},
- "BXH22": {"strain_name": "BXH22", "value": 7.43692, "variance": None, "ndata": None},
- "BXH4": {"strain_name": "BXH4", "value": 7.96336, "variance": None, "ndata": None},
- "BXH6": {"strain_name": "BXH6", "value": 7.75132, "variance": None, "ndata": None},
- "BXH7": {"strain_name": "BXH7", "value": 8.12927, "variance": None, "ndata": None},
- "BXH8": {"strain_name": "BXH8", "value": 6.77338, "variance": None, "ndata": None},
- "BXH9": {"strain_name": "BXH9", "value": 8.03836, "variance": None, "ndata": None},
- "C3H/HeJ": {"strain_name": "C3H/HeJ", "value": 7.42795, "variance": None, "ndata": None},
- "C57BL/6J": {"strain_name": "C57BL/6J", "value": 7.50606, "variance": None, "ndata": None},
- "DBA/2J": {"strain_name": "DBA/2J", "value": 7.72588, "variance": None, "ndata": None}}}
+ "B6cC3-1": {"sample_name": "B6cC3-1", "value": 7.51879, "variance": None, "ndata": None},
+ "BXD1": {"sample_name": "BXD1", "value": 7.77141, "variance": None, "ndata": None},
+ "BXD12": {"sample_name": "BXD12", "value": 8.39265, "variance": None, "ndata": None},
+ "BXD16": {"sample_name": "BXD16", "value": 8.17443, "variance": None, "ndata": None},
+ "BXD19": {"sample_name": "BXD19", "value": 8.30401, "variance": None, "ndata": None},
+ "BXD2": {"sample_name": "BXD2", "value": 7.80944, "variance": None, "ndata": None},
+ "BXD21": {"sample_name": "BXD21", "value": 8.93809, "variance": None, "ndata": None},
+ "BXD24": {"sample_name": "BXD24", "value": 7.99415, "variance": None, "ndata": None},
+ "BXD27": {"sample_name": "BXD27", "value": 8.12177, "variance": None, "ndata": None},
+ "BXD28": {"sample_name": "BXD28", "value": 7.67688, "variance": None, "ndata": None},
+ "BXD32": {"sample_name": "BXD32", "value": 7.79062, "variance": None, "ndata": None},
+ "BXD39": {"sample_name": "BXD39", "value": 8.27641, "variance": None, "ndata": None},
+ "BXD40": {"sample_name": "BXD40", "value": 8.18012, "variance": None, "ndata": None},
+ "BXD42": {"sample_name": "BXD42", "value": 7.82433, "variance": None, "ndata": None},
+ "BXD6": {"sample_name": "BXD6", "value": 8.09718, "variance": None, "ndata": None},
+ "BXH14": {"sample_name": "BXH14", "value": 7.97475, "variance": None, "ndata": None},
+ "BXH19": {"sample_name": "BXH19", "value": 7.67223, "variance": None, "ndata": None},
+ "BXH2": {"sample_name": "BXH2", "value": 7.93622, "variance": None, "ndata": None},
+ "BXH22": {"sample_name": "BXH22", "value": 7.43692, "variance": None, "ndata": None},
+ "BXH4": {"sample_name": "BXH4", "value": 7.96336, "variance": None, "ndata": None},
+ "BXH6": {"sample_name": "BXH6", "value": 7.75132, "variance": None, "ndata": None},
+ "BXH7": {"sample_name": "BXH7", "value": 8.12927, "variance": None, "ndata": None},
+ "BXH8": {"sample_name": "BXH8", "value": 6.77338, "variance": None, "ndata": None},
+ "BXH9": {"sample_name": "BXH9", "value": 8.03836, "variance": None, "ndata": None},
+ "C3H/HeJ": {"sample_name": "C3H/HeJ", "value": 7.42795, "variance": None, "ndata": None},
+ "C57BL/6J": {"sample_name": "C57BL/6J", "value": 7.50606, "variance": None, "ndata": None},
+ "DBA/2J": {"sample_name": "DBA/2J", "value": 7.72588, "variance": None, "ndata": None}}}
slinked = (
(((0, 2, 0.16381088984330505),
@@ -66,7 +66,7 @@ class TestHeatmap(TestCase):
["all", (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)]]:
with self.subTest(dtype=dtype):
self.assertEqual(
- export_trait_data(trait_data, strainlist, dtype=dtype),
+ export_trait_data(trait_data, samplelist, dtype=dtype),
expected)
def test_export_trait_data_dtype_all_flags(self):
@@ -106,7 +106,7 @@ class TestHeatmap(TestCase):
with self.subTest(dtype=dtype, vflag=vflag, nflag=nflag):
self.assertEqual(
export_trait_data(
- trait_data, strainlist, dtype=dtype, var_exists=vflag,
+ trait_data, samplelist, dtype=dtype, var_exists=vflag,
n_exists=nflag),
expected)
@@ -164,8 +164,8 @@ class TestHeatmap(TestCase):
self.assertEqual(
compute_traits_order(slinked), (0, 2, 1, 7, 5, 9, 3, 6, 8, 4))
- def test_retrieve_strains_and_values(self):
- """Test retrieval of strains and values."""
+ def test_retrieve_samples_and_values(self):
+ """Test retrieval of samples and values."""
for orders, slist, tdata, expected in [
[
[2],
@@ -185,9 +185,9 @@ class TestHeatmap(TestCase):
[6, None, None, 4, None]],
[[3, ["s1", "s4"], [6, 4]]]
]]:
- with self.subTest(strainlist=slist, traitdata=tdata):
+ with self.subTest(samplelist=slist, traitdata=tdata):
self.assertEqual(
- retrieve_strains_and_values(orders, slist, tdata), expected)
+ retrieve_samples_and_values(orders, slist, tdata), expected)
def test_get_lrs_from_chr(self):
"""Check that function gets correct LRS values"""