about summary refs log tree commit diff
path: root/gn3
diff options
context:
space:
mode:
author Frederick Muriuki Muriithi 2021-09-27 04:48:53 +0300
committer Frederick Muriuki Muriithi 2021-09-27 04:48:53 +0300
commit 19783a18c2bc7941fc5980e593f19fb1d18c3623 (patch)
tree f3a6e241be3c6224b9647c8258c516a7b741a28c /gn3
parent 8d9bc0f29ce9208306915b079818e6f0c31785e2 (diff)
download genenetwork3-19783a18c2bc7941fc5980e593f19fb1d18c3623.tar.gz
Update terminology: `strain` to `sample`
Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi

* Update the terminology used: use `sample` in place of `strain`, according to Zachary's direction at https://github.com/genenetwork/genenetwork3/pull/37#issuecomment-926043306
Diffstat (limited to 'gn3')
-rw-r--r-- gn3/computations/parsers.py 10
-rw-r--r-- gn3/computations/qtlreaper.py 8
-rw-r--r-- gn3/db/genotypes.py 8
-rw-r--r-- gn3/db/traits.py 44
-rw-r--r-- gn3/heatmaps.py 62
5 files changed, 66 insertions, 66 deletions
diff --git a/gn3/computations/parsers.py b/gn3/computations/parsers.py
index 94387ff..1af35d6 100644
--- a/gn3/computations/parsers.py
+++ b/gn3/computations/parsers.py
@@ -14,7 +14,7 @@ def parse_genofile(file_path: str) -> Tuple[List[str],
'h': 0,
'u': None,
}
- genotypes, strains = [], []
+ genotypes, samples = [], []
with open(file_path, "r") as _genofile:
for line in _genofile:
line = line.strip()
@@ -22,8 +22,8 @@ def parse_genofile(file_path: str) -> Tuple[List[str],
continue
cells = line.split()
if line.startswith("Chr"):
- strains = cells[4:]
- strains = [strain.lower() for strain in strains]
+ samples = cells[4:]
+ samples = [sample.lower() for sample in samples]
continue
values = [__map.get(value.lower(), None) for value in cells[4:]]
genotype = {
@@ -32,7 +32,7 @@ def parse_genofile(file_path: str) -> Tuple[List[str],
"cm": cells[2],
"mb": cells[3],
"values": values,
- "dicvalues": dict(zip(strains, values)),
+ "dicvalues": dict(zip(samples, values)),
}
genotypes.append(genotype)
- return strains, genotypes
+ return samples, genotypes
diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py
index 8b2893e..166d2dd 100644
--- a/gn3/computations/qtlreaper.py
+++ b/gn3/computations/qtlreaper.py
@@ -9,17 +9,17 @@ from typing import Union
from gn3.random import random_string
from gn3.settings import TMPDIR, REAPER_COMMAND
-def generate_traits_file(strains, trait_values, traits_filename):
+def generate_traits_file(samples, trait_values, traits_filename):
"""
Generate a traits file for use with `qtlreaper`.
PARAMETERS:
- strains: A list of strains to use as the headers for the various columns.
- trait_values: A list of lists of values for each trait and strain.
+ samples: A list of samples to use as the headers for the various columns.
+ trait_values: A list of lists of values for each trait and sample.
traits_filename: The tab-separated value to put the values in for
computation of QTLs.
"""
- header = "Trait\t{}\n".format("\t".join(strains))
+ header = "Trait\t{}\n".format("\t".join(samples))
data = (
[header] +
["{}\t{}\n".format(i+1, "\t".join([str(i) for i in t]))
diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py
index 9987320..8f18cac 100644
--- a/gn3/db/genotypes.py
+++ b/gn3/db/genotypes.py
@@ -14,16 +14,16 @@ def build_genotype_file(
def load_genotype_samples(genotype_filename: str, file_type: str = "geno"):
"""
- Load sample of strains from genotype files.
+ Load sample of samples from genotype files.
DESCRIPTION:
- Traits can contain a varied number of strains, some of which do not exist in
+ Traits can contain a varied number of samples, some of which do not exist in
certain genotypes. In order to compute QTLs, GEMMAs, etc, we need to ensure
- to pick only those strains that exist in the genotype under consideration
+ to pick only those samples that exist in the genotype under consideration
for the traits used in the computation.
This function loads a list of samples from the genotype files for use in
- filtering out unusable strains.
+ filtering out unusable samples.
PARAMETERS:
diff --git a/gn3/db/traits.py b/gn3/db/traits.py
index 4fc47c3..c9d05d7 100644
--- a/gn3/db/traits.py
+++ b/gn3/db/traits.py
@@ -445,7 +445,7 @@ def retrieve_temp_trait_data(trait_info: dict, conn: Any):
query,
{"trait_name": trait_info["trait_name"]})
return [dict(zip(
- ["strain_name", "value", "se_error", "nstrain", "id"], row))
+ ["sample_name", "value", "se_error", "nstrain", "id"], row))
for row in cursor.fetchall()]
return []
@@ -484,7 +484,7 @@ def retrieve_geno_trait_data(trait_info: Dict, conn: Any):
"species_id": retrieve_species_id(
trait_info["db"]["riset"], conn)})
return [dict(zip(
- ["strain_name", "value", "se_error", "id"], row))
+ ["sample_name", "value", "se_error", "id"], row))
for row in cursor.fetchall()]
return []
@@ -515,7 +515,7 @@ def retrieve_publish_trait_data(trait_info: Dict, conn: Any):
{"trait_name": trait_info["trait_name"],
"dataset_id": trait_info["db"]["dataset_id"]})
return [dict(zip(
- ["strain_name", "value", "se_error", "nstrain", "id"], row))
+ ["sample_name", "value", "se_error", "nstrain", "id"], row))
for row in cursor.fetchall()]
return []
@@ -548,7 +548,7 @@ def retrieve_cellid_trait_data(trait_info: Dict, conn: Any):
"trait_name": trait_info["trait_name"],
"dataset_id": trait_info["db"]["dataset_id"]})
return [dict(zip(
- ["strain_name", "value", "se_error", "id"], row))
+ ["sample_name", "value", "se_error", "id"], row))
for row in cursor.fetchall()]
return []
@@ -577,29 +577,29 @@ def retrieve_probeset_trait_data(trait_info: Dict, conn: Any):
{"trait_name": trait_info["trait_name"],
"dataset_name": trait_info["db"]["dataset_name"]})
return [dict(zip(
- ["strain_name", "value", "se_error", "id"], row))
+ ["sample_name", "value", "se_error", "id"], row))
for row in cursor.fetchall()]
return []
-def with_strainlist_data_setup(strainlist: Sequence[str]):
+def with_samplelist_data_setup(samplelist: Sequence[str]):
"""
- Build function that computes the trait data from provided list of strains.
+ Build function that computes the trait data from provided list of samples.
PARAMETERS
- strainlist: (list)
- A list of strain names
+ samplelist: (list)
+ A list of sample names
RETURNS:
Returns a function that given some data from the database, computes the
- strain's value, variance and ndata values, only if the strain is present
- in the provided `strainlist` variable.
+ sample's value, variance and ndata values, only if the sample is present
+ in the provided `samplelist` variable.
"""
def setup_fn(tdata):
- if tdata["strain_name"] in strainlist:
+ if tdata["sample_name"] in samplelist:
val = tdata["value"]
if val is not None:
return {
- "strain_name": tdata["strain_name"],
+ "sample_name": tdata["sample_name"],
"value": val,
"variance": tdata["se_error"],
"ndata": tdata.get("nstrain", None)
@@ -607,19 +607,19 @@ def with_strainlist_data_setup(strainlist: Sequence[str]):
return None
return setup_fn
-def without_strainlist_data_setup():
+def without_samplelist_data_setup():
"""
Build function that computes the trait data.
RETURNS:
Returns a function that given some data from the database, computes the
- strain's value, variance and ndata values.
+ sample's value, variance and ndata values.
"""
def setup_fn(tdata):
val = tdata["value"]
if val is not None:
return {
- "strain_name": tdata["strain_name"],
+ "sample_name": tdata["sample_name"],
"value": val,
"variance": tdata["se_error"],
"ndata": tdata.get("nstrain", None)
@@ -627,7 +627,7 @@ def without_strainlist_data_setup():
return None
return setup_fn
-def retrieve_trait_data(trait: dict, conn: Any, strainlist: Sequence[str] = tuple()):
+def retrieve_trait_data(trait: dict, conn: Any, samplelist: Sequence[str] = tuple()):
"""
Retrieve trait data
@@ -650,23 +650,23 @@ def retrieve_trait_data(trait: dict, conn: Any, strainlist: Sequence[str] = tupl
if results:
# do something with mysqlid
mysqlid = results[0]["id"]
- if strainlist:
+ if samplelist:
data = [
item for item in
- map(with_strainlist_data_setup(strainlist), results)
+ map(with_samplelist_data_setup(samplelist), results)
if item is not None]
else:
data = [
item for item in
- map(without_strainlist_data_setup(), results)
+ map(without_samplelist_data_setup(), results)
if item is not None]
return {
"mysqlid": mysqlid,
"data": dict(map(
lambda x: (
- x["strain_name"],
- {k:v for k, v in x.items() if x != "strain_name"}),
+ x["sample_name"],
+ {k:v for k, v in x.items() if x != "sample_name"}),
data))}
return {}
diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py
index 45d0c22..b6fc6d3 100644
--- a/gn3/heatmaps.py
+++ b/gn3/heatmaps.py
@@ -27,10 +27,10 @@ from gn3.computations.qtlreaper import (
organise_reaper_main_results)
def export_trait_data(
- trait_data: dict, strainlist: Sequence[str], dtype: str = "val",
+ trait_data: dict, samplelist: Sequence[str], dtype: str = "val",
var_exists: bool = False, n_exists: bool = False):
"""
- Export data according to `strainlist`. Mostly used in calculating
+ Export data according to `samplelist`. Mostly used in calculating
correlations.
DESCRIPTION:
@@ -40,8 +40,8 @@ def export_trait_data(
PARAMETERS
trait: (dict)
The dictionary of key-value pairs representing a trait
- strainlist: (list)
- A list of strain names
+ samplelist: (list)
+ A list of sample names
dtype: (str)
... verify what this is ...
var_exists: (bool)
@@ -49,18 +49,18 @@ def export_trait_data(
n_exists: (bool)
A flag indicating existence of ndata
"""
- def __export_all_types(tdata, strain):
+ def __export_all_types(tdata, sample):
sample_data = []
- if tdata[strain]["value"]:
- sample_data.append(tdata[strain]["value"])
+ if tdata[sample]["value"]:
+ sample_data.append(tdata[sample]["value"])
if var_exists:
- if tdata[strain]["variance"]:
- sample_data.append(tdata[strain]["variance"])
+ if tdata[sample]["variance"]:
+ sample_data.append(tdata[sample]["variance"])
else:
sample_data.append(None)
if n_exists:
- if tdata[strain]["ndata"]:
- sample_data.append(tdata[strain]["ndata"])
+ if tdata[sample]["ndata"]:
+ sample_data.append(tdata[sample]["ndata"])
else:
sample_data.append(None)
else:
@@ -73,17 +73,17 @@ def export_trait_data(
return tuple(sample_data)
- def __exporter(accumulator, strain):
+ def __exporter(accumulator, sample):
# pylint: disable=[R0911]
- if strain in trait_data["data"]:
+ if sample in trait_data["data"]:
if dtype == "val":
- return accumulator + (trait_data["data"][strain]["value"], )
+ return accumulator + (trait_data["data"][sample]["value"], )
if dtype == "var":
- return accumulator + (trait_data["data"][strain]["variance"], )
+ return accumulator + (trait_data["data"][sample]["variance"], )
if dtype == "N":
- return accumulator + (trait_data["data"][strain]["ndata"], )
+ return accumulator + (trait_data["data"][sample]["ndata"], )
if dtype == "all":
- return accumulator + __export_all_types(trait_data["data"], strain)
+ return accumulator + __export_all_types(trait_data["data"], sample)
raise KeyError("Type `%s` is incorrect" % dtype)
if var_exists and n_exists:
return accumulator + (None, None, None)
@@ -91,7 +91,7 @@ def export_trait_data(
return accumulator + (None, None)
return accumulator + (None,)
- return reduce(__exporter, strainlist, tuple())
+ return reduce(__exporter, samplelist, tuple())
def trait_display_name(trait: Dict):
"""
@@ -165,19 +165,19 @@ def build_heatmap(traits_names, conn: Any):
for fullname in traits_names]
traits_data_list = [retrieve_trait_data(t, conn) for t in traits]
genotype_filename = build_genotype_file(traits[0]["riset"])
- strains = load_genotype_samples(genotype_filename)
+ samples = load_genotype_samples(genotype_filename)
exported_traits_data_list = [
- export_trait_data(td, strains) for td in traits_data_list]
+ export_trait_data(td, samples) for td in traits_data_list]
clustered = cluster_traits(exported_traits_data_list)
slinked = slink(clustered)
traits_order = compute_traits_order(slinked)
- strains_and_values = retrieve_strains_and_values(
- traits_order, strains, exported_traits_data_list)
+ samples_and_values = retrieve_samples_and_values(
+ traits_order, samples, exported_traits_data_list)
traits_filename = "{}/traits_test_file_{}.txt".format(
TMPDIR, random_string(10))
generate_traits_file(
- strains_and_values[0][1],
- [t[2] for t in strains_and_values],
+ samples_and_values[0][1],
+ [t[2] for t in samples_and_values],
traits_filename)
main_output, _permutations_output = run_reaper(
@@ -229,9 +229,9 @@ def compute_traits_order(slink_data, neworder: tuple = tuple()):
return __order_maker(neworder, slink_data)
-def retrieve_strains_and_values(orders, strainlist, traits_data_list):
+def retrieve_samples_and_values(orders, samplelist, traits_data_list):
"""
- Get the strains and their corresponding values from `strainlist` and
+ Get the samples and their corresponding values from `samplelist` and
`traits_data_list`.
This migrates the code in
@@ -240,17 +240,17 @@ def retrieve_strains_and_values(orders, strainlist, traits_data_list):
# This feels nasty! There's a lot of mutation of values here, that might
# indicate something untoward in the design of this function and its
# dependents ==> Review
- strains = []
+ samples = []
values = []
rets = []
for order in orders:
temp_val = traits_data_list[order]
- for i, strain in enumerate(strainlist):
+ for i, sample in enumerate(samplelist):
if temp_val[i] is not None:
- strains.append(strain)
+ samples.append(sample)
values.append(temp_val[i])
- rets.append([order, strains[:], values[:]])
- strains = []
+ rets.append([order, samples[:], values[:]])
+ samples = []
values = []
return rets