aboutsummaryrefslogtreecommitdiff
path: root/gn3/computations
diff options
context:
space:
mode:
authorzsloan2021-10-12 18:56:34 +0000
committerzsloan2021-10-12 18:56:34 +0000
commit0f396f4a1a753d449cf2975fc425d587d9350689 (patch)
treec9dac243dc05e5cb90ccb7f1d96fd599986bf60a /gn3/computations
parent976660ce9ef915426c7ce5ff9077b439e4102a2c (diff)
parent77c274b79c3ec01de60e90db3299763cb58f715b (diff)
downloadgenenetwork3-0f396f4a1a753d449cf2975fc425d587d9350689.tar.gz
Merge branch 'main' of https://github.com/genenetwork/genenetwork3 into feature/add_rqtl_pairscan
Diffstat (limited to 'gn3/computations')
-rw-r--r--gn3/computations/heatmap.py177
-rw-r--r--gn3/computations/parsers.py10
-rw-r--r--gn3/computations/qtlreaper.py170
-rw-r--r--gn3/computations/wgcna.py51
4 files changed, 226 insertions, 182 deletions
diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py
deleted file mode 100644
index 3c35029..0000000
--- a/gn3/computations/heatmap.py
+++ /dev/null
@@ -1,177 +0,0 @@
-"""
-This module will contain functions to be used in computation of the data used to
-generate various kinds of heatmaps.
-"""
-
-from functools import reduce
-from typing import Any, Dict, Sequence
-from gn3.computations.slink import slink
-from gn3.db.traits import retrieve_trait_data, retrieve_trait_info
-from gn3.computations.correlations2 import compute_correlation
-
-def export_trait_data(
- trait_data: dict, strainlist: Sequence[str], dtype: str = "val",
- var_exists: bool = False, n_exists: bool = False):
- """
- Export data according to `strainlist`. Mostly used in calculating
- correlations.
-
- DESCRIPTION:
- Migrated from
- https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L166-L211
-
- PARAMETERS
- trait: (dict)
- The dictionary of key-value pairs representing a trait
- strainlist: (list)
- A list of strain names
- type: (str)
- ... verify what this is ...
- var_exists: (bool)
- A flag indicating existence of variance
- n_exists: (bool)
- A flag indicating existence of ndata
- """
- def __export_all_types(tdata, strain):
- sample_data = []
- if tdata[strain]["value"]:
- sample_data.append(tdata[strain]["value"])
- if var_exists:
- if tdata[strain]["variance"]:
- sample_data.append(tdata[strain]["variance"])
- else:
- sample_data.append(None)
- if n_exists:
- if tdata[strain]["ndata"]:
- sample_data.append(tdata[strain]["ndata"])
- else:
- sample_data.append(None)
- else:
- if var_exists and n_exists:
- sample_data += [None, None, None]
- elif var_exists or n_exists:
- sample_data += [None, None]
- else:
- sample_data.append(None)
-
- return tuple(sample_data)
-
- def __exporter(accumulator, strain):
- # pylint: disable=[R0911]
- if strain in trait_data["data"]:
- if dtype == "val":
- return accumulator + (trait_data["data"][strain]["value"], )
- if dtype == "var":
- return accumulator + (trait_data["data"][strain]["variance"], )
- if dtype == "N":
- return accumulator + (trait_data["data"][strain]["ndata"], )
- if dtype == "all":
- return accumulator + __export_all_types(trait_data["data"], strain)
- raise KeyError("Type `%s` is incorrect" % dtype)
- if var_exists and n_exists:
- return accumulator + (None, None, None)
- if var_exists or n_exists:
- return accumulator + (None, None)
- return accumulator + (None,)
-
- return reduce(__exporter, strainlist, tuple())
-
-def trait_display_name(trait: Dict):
- """
- Given a trait, return a name to use to display the trait on a heatmap.
-
- DESCRIPTION
- Migrated from
- https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L141-L157
- """
- if trait.get("db", None) and trait.get("trait_name", None):
- if trait["db"]["dataset_type"] == "Temp":
- desc = trait["description"]
- if desc.find("PCA") >= 0:
- return "%s::%s" % (
- trait["db"]["displayname"],
- desc[desc.rindex(':')+1:].strip())
- return "%s::%s" % (
- trait["db"]["displayname"],
- desc[:desc.index('entered')].strip())
- prefix = "%s::%s" % (
- trait["db"]["dataset_name"], trait["trait_name"])
- if trait["cellid"]:
- return "%s::%s" % (prefix, trait["cellid"])
- return prefix
- return trait["description"]
-
-def cluster_traits(traits_data_list: Sequence[Dict]):
- """
- Clusters the trait values.
-
- DESCRIPTION
- Attempts to replicate the clustering of the traits, as done at
- https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L138-L162
- """
- def __compute_corr(tdata_i, tdata_j):
- if tdata_i[0] == tdata_j[0]:
- return 0.0
- corr_vals = compute_correlation(tdata_i[1], tdata_j[1])
- corr = corr_vals[0]
- if (1 - corr) < 0:
- return 0.0
- return 1 - corr
-
- def __cluster(tdata_i):
- return tuple(
- __compute_corr(tdata_i, tdata_j)
- for tdata_j in enumerate(traits_data_list))
-
- return tuple(__cluster(tdata_i) for tdata_i in enumerate(traits_data_list))
-
-def heatmap_data(formd, search_result, conn: Any):
- """
- heatmap function
-
- DESCRIPTION
- This function is an attempt to reproduce the initialisation at
- https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L46-L64
- and also the clustering and slink computations at
- https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L138-L165
- with the help of the `gn3.computations.heatmap.cluster_traits` function.
-
- It does not try to actually draw the heatmap image.
-
- PARAMETERS:
- TODO: Elaborate on the parameters here...
- """
- threshold = 0 # webqtlConfig.PUBLICTHRESH
- cluster_checked = formd.formdata.getvalue("clusterCheck", "")
- strainlist = [
- strain for strain in formd.strainlist if strain not in formd.parlist]
- genotype = formd.genotype
-
- def __retrieve_traitlist_and_datalist(threshold, fullname):
- trait = retrieve_trait_info(threshold, fullname, conn)
- return (
- trait,
- export_trait_data(retrieve_trait_data(trait, conn), strainlist))
-
- traits_details = [
- __retrieve_traitlist_and_datalist(threshold, fullname)
- for fullname in search_result]
- traits_list = map(lambda x: x[0], traits_details)
- traits_data_list = map(lambda x: x[1], traits_details)
-
- return {
- "target_description_checked": formd.formdata.getvalue(
- "targetDescriptionCheck", ""),
- "cluster_checked": cluster_checked,
- "slink_data": (
- slink(cluster_traits(traits_data_list))
- if cluster_checked else False),
- "sessionfile": formd.formdata.getvalue("session"),
- "genotype": genotype,
- "nLoci": sum(map(len, genotype)),
- "strainlist": strainlist,
- "ppolar": formd.ppolar,
- "mpolar":formd.mpolar,
- "traits_list": traits_list,
- "traits_data_list": traits_data_list
- }
diff --git a/gn3/computations/parsers.py b/gn3/computations/parsers.py
index 94387ff..1af35d6 100644
--- a/gn3/computations/parsers.py
+++ b/gn3/computations/parsers.py
@@ -14,7 +14,7 @@ def parse_genofile(file_path: str) -> Tuple[List[str],
'h': 0,
'u': None,
}
- genotypes, strains = [], []
+ genotypes, samples = [], []
with open(file_path, "r") as _genofile:
for line in _genofile:
line = line.strip()
@@ -22,8 +22,8 @@ def parse_genofile(file_path: str) -> Tuple[List[str],
continue
cells = line.split()
if line.startswith("Chr"):
- strains = cells[4:]
- strains = [strain.lower() for strain in strains]
+ samples = cells[4:]
+ samples = [sample.lower() for sample in samples]
continue
values = [__map.get(value.lower(), None) for value in cells[4:]]
genotype = {
@@ -32,7 +32,7 @@ def parse_genofile(file_path: str) -> Tuple[List[str],
"cm": cells[2],
"mb": cells[3],
"values": values,
- "dicvalues": dict(zip(strains, values)),
+ "dicvalues": dict(zip(samples, values)),
}
genotypes.append(genotype)
- return strains, genotypes
+ return samples, genotypes
diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py
new file mode 100644
index 0000000..d1ff4ac
--- /dev/null
+++ b/gn3/computations/qtlreaper.py
@@ -0,0 +1,170 @@
+"""
+This module contains functions to interact with the `qtlreaper` utility for
+computation of QTLs.
+"""
+import os
+import subprocess
+from typing import Union
+
+from gn3.random import random_string
+from gn3.settings import TMPDIR, REAPER_COMMAND
+
+def generate_traits_file(samples, trait_values, traits_filename):
+ """
+ Generate a traits file for use with `qtlreaper`.
+
+ PARAMETERS:
+ samples: A list of samples to use as the headers for the various columns.
+ trait_values: A list of lists of values for each trait and sample.
+ traits_filename: The tab-separated value to put the values in for
+ computation of QTLs.
+ """
+ header = "Trait\t{}\n".format("\t".join(samples))
+ data = (
+ [header] +
+ ["{}\t{}\n".format(i+1, "\t".join([str(i) for i in t]))
+ for i, t in enumerate(trait_values[:-1])] +
+ ["{}\t{}".format(
+ len(trait_values), "\t".join([str(i) for i in t]))
+ for t in trait_values[-1:]])
+ with open(traits_filename, "w") as outfile:
+ outfile.writelines(data)
+
+def create_output_directory(path: str):
+ """Create the output directory at `path` if it does not exist."""
+ try:
+ os.mkdir(path)
+ except FileExistsError:
+ # If the directory already exists, do nothing.
+ pass
+
+def run_reaper(
+ genotype_filename: str, traits_filename: str,
+ other_options: tuple = ("--n_permutations", "1000"),
+ separate_nperm_output: bool = False,
+ output_dir: str = TMPDIR):
+ """
+ Run the QTLReaper command to compute the QTLs.
+
+ PARAMETERS:
+ genotype_filename: The complete path to a genotype file to use in the QTL
+ computation.
+ traits_filename: A path to a file previously generated with the
+ `generate_traits_file` function in this module, to be used in the QTL
+ computation.
+ other_options: Other options to pass to the `qtlreaper` command to modify
+ the QTL computations.
+ separate_nperm_output: A flag indicating whether or not to provide a
+ separate output for the permutations computation. The default is False,
+ which means by default, no separate output file is created.
+ output_dir: A path to the directory where the outputs are put
+
+ RETURNS:
+ The function returns a tuple of the main output file, and the output file
+ for the permutation computations. If the `separate_nperm_output` is `False`,
+ the second value in the tuple returned is `None`.
+
+ RAISES:
+ The function will raise a `subprocess.CalledProcessError` exception in case
+ of any errors running the `qtlreaper` command.
+ """
+ create_output_directory("{}/qtlreaper".format(output_dir))
+ output_filename = "{}/qtlreaper/main_output_{}.txt".format(
+ output_dir, random_string(10))
+ output_list = ["--main_output", output_filename]
+ if separate_nperm_output:
+ permu_output_filename: Union[None, str] = "{}/qtlreaper/permu_output_{}.txt".format(
+ output_dir, random_string(10))
+ output_list = output_list + [
+ "--permu_output", permu_output_filename] # type: ignore[list-item]
+ else:
+ permu_output_filename = None
+
+ command_list = [
+ REAPER_COMMAND, "--geno", genotype_filename,
+ *other_options, # this splices the `other_options` list here
+ "--traits", traits_filename,
+ *output_list # this splices the `output_list` list here
+ ]
+
+ subprocess.run(command_list, check=True)
+ return (output_filename, permu_output_filename)
+
+def chromosome_sorter_key_fn(val):
+ """
+ Useful for sorting the chromosomes
+ """
+ if isinstance(val, int):
+ return val
+ return ord(val)
+
+def organise_reaper_main_results(parsed_results):
+ """
+ Provide the results of running reaper in a format that is easier to use.
+ """
+ def __organise_by_chromosome(chr_name, items):
+ chr_items = [item for item in items if item["Chr"] == chr_name]
+ return {
+ "Chr": chr_name,
+ "loci": [{
+ "Locus": locus["Locus"],
+ "cM": locus["cM"],
+ "Mb": locus["Mb"],
+ "LRS": locus["LRS"],
+ "Additive": locus["Additive"],
+ "pValue": locus["pValue"]
+ } for locus in chr_items]}
+
+ def __organise_by_id(identifier, items):
+ id_items = [item for item in items if item["ID"] == identifier]
+ unique_chromosomes = {item["Chr"] for item in id_items}
+ return {
+ "ID": identifier,
+ "chromosomes": {
+ _chr["Chr"]: _chr for _chr in [
+ __organise_by_chromosome(chromo, id_items)
+ for chromo in sorted(
+ unique_chromosomes, key=chromosome_sorter_key_fn)]}}
+
+ unique_ids = {res["ID"] for res in parsed_results}
+ return {
+ trait["ID"]: trait for trait in
+ [__organise_by_id(_id, parsed_results) for _id in sorted(unique_ids)]}
+
+def parse_reaper_main_results(results_file):
+ """
+ Parse the results file of running QTLReaper into a list of dicts.
+ """
+ with open(results_file, "r") as infile:
+ lines = infile.readlines()
+
+ def __parse_column_float_value(value):
+ # pylint: disable=W0702
+ try:
+ return float(value)
+ except:
+ return value
+
+ def __parse_column_int_value(value):
+ # pylint: disable=W0702
+ try:
+ return int(value)
+ except:
+ return value
+
+ def __parse_line(line):
+ items = line.strip().split("\t")
+ return items[0:2] + [__parse_column_int_value(items[2])] + [
+ __parse_column_float_value(item) for item in items[3:]]
+
+ header = lines[0].strip().split("\t")
+ return [dict(zip(header, __parse_line(line))) for line in lines[1:]]
+
+def parse_reaper_permutation_results(results_file):
+ """
+ Parse the results QTLReaper permutations into a list of values.
+ """
+ with open(results_file, "r") as infile:
+ lines = infile.readlines()
+
+ return [float(line.strip()) for line in lines]
diff --git a/gn3/computations/wgcna.py b/gn3/computations/wgcna.py
new file mode 100644
index 0000000..fd508fa
--- /dev/null
+++ b/gn3/computations/wgcna.py
@@ -0,0 +1,51 @@
+"""module contains code to preprocess and call wgcna script"""
+
+import os
+import json
+import uuid
+from gn3.settings import TMPDIR
+
+from gn3.commands import run_cmd
+
+
+def dump_wgcna_data(request_data: dict):
+ """function to dump request data to json file"""
+ filename = f"{str(uuid.uuid4())}.json"
+
+ temp_file_path = os.path.join(TMPDIR, filename)
+
+ with open(temp_file_path, "w") as output_file:
+ json.dump(request_data, output_file)
+
+ return temp_file_path
+
+
+def compose_wgcna_cmd(rscript_path: str, temp_file_path: str):
+ """function to componse wgcna cmd"""
+ # (todo):issue relative paths to abs paths
+ cmd = f"Rscript ./scripts/{rscript_path} {temp_file_path}"
+ return cmd
+
+
+def call_wgcna_script(rscript_path: str, request_data: dict):
+ """function to call wgcna script"""
+ generated_file = dump_wgcna_data(request_data)
+ cmd = compose_wgcna_cmd(rscript_path, generated_file)
+
+ try:
+
+ run_cmd_results = run_cmd(cmd)
+
+ with open(generated_file, "r") as outputfile:
+
+ if run_cmd_results["code"] != 0:
+ return run_cmd_results
+ return {
+ "data": json.load(outputfile),
+ **run_cmd_results
+ }
+ except FileNotFoundError:
+ # relook at handling errors gn3
+ return {
+ "output": "output file not found"
+ }