Merge branch 'main' of https://github.com/genenetwork/genenetwork3 into feature/add_rqtl_pairscan

author: zsloan 2021-10-12 18:56:34 +0000
committer: zsloan 2021-10-12 18:56:34 +0000
commit: 0f396f4a1a753d449cf2975fc425d587d9350689 (patch)
tree: c9dac243dc05e5cb90ccb7f1d96fd599986bf60a /gn3/computations
parent: 976660ce9ef915426c7ce5ff9077b439e4102a2c (diff)
parent: 77c274b79c3ec01de60e90db3299763cb58f715b (diff)
download: genenetwork3-0f396f4a1a753d449cf2975fc425d587d9350689.tar.gz
4 files changed, 226 insertions, 182 deletions
diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py
deleted file mode 100644
index 3c35029..0000000
--- a/gn3/computations/heatmap.py
+++ /dev/null
@@ -1,177 +0,0 @@
-"""
-This module will contain functions to be used in computation of the data used to
-generate various kinds of heatmaps.
-"""
-
-from functools import reduce
-from typing import Any, Dict, Sequence
-from gn3.computations.slink import slink
-from gn3.db.traits import retrieve_trait_data, retrieve_trait_info
-from gn3.computations.correlations2 import compute_correlation
-
-def export_trait_data(
-        trait_data: dict, strainlist: Sequence[str], dtype: str = "val",
-        var_exists: bool = False, n_exists: bool = False):
-    """
-    Export data according to `strainlist`. Mostly used in calculating
-    correlations.
-
-    DESCRIPTION:
-    Migrated from
-    https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L166-L211
-
-    PARAMETERS
-    trait: (dict)
-      The dictionary of key-value pairs representing a trait
-    strainlist: (list)
-      A list of strain names
-    type: (str)
-      ... verify what this is ...
-    var_exists: (bool)
-      A flag indicating existence of variance
-    n_exists: (bool)
-      A flag indicating existence of ndata
-    """
-    def __export_all_types(tdata, strain):
-        sample_data = []
-        if tdata[strain]["value"]:
-            sample_data.append(tdata[strain]["value"])
-            if var_exists:
-                if tdata[strain]["variance"]:
-                    sample_data.append(tdata[strain]["variance"])
-                else:
-                    sample_data.append(None)
-            if n_exists:
-                if tdata[strain]["ndata"]:
-                    sample_data.append(tdata[strain]["ndata"])
-                else:
-                    sample_data.append(None)
-        else:
-            if var_exists and n_exists:
-                sample_data += [None, None, None]
-            elif var_exists or n_exists:
-                sample_data += [None, None]
-            else:
-                sample_data.append(None)
-
-        return tuple(sample_data)
-
-    def __exporter(accumulator, strain):
-        # pylint: disable=[R0911]
-        if strain in trait_data["data"]:
-            if dtype == "val":
-                return accumulator + (trait_data["data"][strain]["value"], )
-            if dtype == "var":
-                return accumulator + (trait_data["data"][strain]["variance"], )
-            if dtype == "N":
-                return accumulator + (trait_data["data"][strain]["ndata"], )
-            if dtype == "all":
-                return accumulator + __export_all_types(trait_data["data"], strain)
-            raise KeyError("Type `%s` is incorrect" % dtype)
-        if var_exists and n_exists:
-            return accumulator + (None, None, None)
-        if var_exists or n_exists:
-            return accumulator + (None, None)
-        return accumulator + (None,)
-
-    return reduce(__exporter, strainlist, tuple())
-
-def trait_display_name(trait: Dict):
-    """
-    Given a trait, return a name to use to display the trait on a heatmap.
-
-    DESCRIPTION
-    Migrated from
-    https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L141-L157
-    """
-    if trait.get("db", None) and trait.get("trait_name", None):
-        if trait["db"]["dataset_type"] == "Temp":
-            desc = trait["description"]
-            if desc.find("PCA") >= 0:
-                return "%s::%s" % (
-                    trait["db"]["displayname"],
-                    desc[desc.rindex(':')+1:].strip())
-            return "%s::%s" % (
-                trait["db"]["displayname"],
-                desc[:desc.index('entered')].strip())
-        prefix = "%s::%s" % (
-            trait["db"]["dataset_name"], trait["trait_name"])
-        if trait["cellid"]:
-            return "%s::%s" % (prefix, trait["cellid"])
-        return prefix
-    return trait["description"]
-
-def cluster_traits(traits_data_list: Sequence[Dict]):
-    """
-    Clusters the trait values.
-
-    DESCRIPTION
-    Attempts to replicate the clustering of the traits, as done at
-    https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L138-L162
-    """
-    def __compute_corr(tdata_i, tdata_j):
-        if tdata_i[0] == tdata_j[0]:
-            return 0.0
-        corr_vals = compute_correlation(tdata_i[1], tdata_j[1])
-        corr = corr_vals[0]
-        if (1 - corr) < 0:
-            return 0.0
-        return 1 - corr
-
-    def __cluster(tdata_i):
-        return tuple(
-            __compute_corr(tdata_i, tdata_j)
-            for tdata_j in enumerate(traits_data_list))
-
-    return tuple(__cluster(tdata_i) for tdata_i in enumerate(traits_data_list))
-
-def heatmap_data(formd, search_result, conn: Any):
-    """
-    heatmap function
-
-    DESCRIPTION
-    This function is an attempt to reproduce the initialisation at
-    https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L46-L64
-    and also the clustering and slink computations at
-    https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L138-L165
-    with the help of the `gn3.computations.heatmap.cluster_traits` function.
-
-    It does not try to actually draw the heatmap image.
-
-    PARAMETERS:
-    TODO: Elaborate on the parameters here...
-    """
-    threshold = 0 # webqtlConfig.PUBLICTHRESH
-    cluster_checked = formd.formdata.getvalue("clusterCheck", "")
-    strainlist = [
-        strain for strain in formd.strainlist if strain not in formd.parlist]
-    genotype = formd.genotype
-
-    def __retrieve_traitlist_and_datalist(threshold, fullname):
-        trait = retrieve_trait_info(threshold, fullname, conn)
-        return (
-            trait,
-            export_trait_data(retrieve_trait_data(trait, conn), strainlist))
-
-    traits_details = [
-        __retrieve_traitlist_and_datalist(threshold, fullname)
-        for fullname in search_result]
-    traits_list = map(lambda x: x[0], traits_details)
-    traits_data_list = map(lambda x: x[1], traits_details)
-
-    return {
-        "target_description_checked": formd.formdata.getvalue(
-            "targetDescriptionCheck", ""),
-        "cluster_checked": cluster_checked,
-        "slink_data": (
-            slink(cluster_traits(traits_data_list))
-            if cluster_checked else False),
-        "sessionfile": formd.formdata.getvalue("session"),
-        "genotype": genotype,
-        "nLoci": sum(map(len, genotype)),
-        "strainlist": strainlist,
-        "ppolar": formd.ppolar,
-        "mpolar":formd.mpolar,
-        "traits_list": traits_list,
-        "traits_data_list": traits_data_list
-    }
diff --git a/gn3/computations/parsers.py b/gn3/computations/parsers.py
index 94387ff..1af35d6 100644
--- a/gn3/computations/parsers.py
+++ b/gn3/computations/parsers.py
@@ -14,7 +14,7 @@ def parse_genofile(file_path: str) -> Tuple[List[str],
         'h': 0,
         'u': None,
     }
-    genotypes, strains = [], []
+    genotypes, samples = [], []
     with open(file_path, "r") as _genofile:
         for line in _genofile:
             line = line.strip()
@@ -22,8 +22,8 @@ def parse_genofile(file_path: str) -> Tuple[List[str],
                 continue
             cells = line.split()
             if line.startswith("Chr"):
-                strains = cells[4:]
-                strains = [strain.lower() for strain in strains]
+                samples = cells[4:]
+                samples = [sample.lower() for sample in samples]
                 continue
             values = [__map.get(value.lower(), None) for value in cells[4:]]
             genotype = {
@@ -32,7 +32,7 @@ def parse_genofile(file_path: str) -> Tuple[List[str],
                 "cm": cells[2],
                 "mb": cells[3],
                 "values":  values,
-                "dicvalues": dict(zip(strains, values)),
+                "dicvalues": dict(zip(samples, values)),
             }
             genotypes.append(genotype)
-        return strains, genotypes
+        return samples, genotypes
diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py
new file mode 100644
index 0000000..d1ff4ac
--- /dev/null
+++ b/gn3/computations/qtlreaper.py
@@ -0,0 +1,170 @@
+"""
+This module contains functions to interact with the `qtlreaper` utility for
+computation of QTLs.
+"""
+import os
+import subprocess
+from typing import Union
+
+from gn3.random import random_string
+from gn3.settings import TMPDIR, REAPER_COMMAND
+
+def generate_traits_file(samples, trait_values, traits_filename):
+    """
+    Generate a traits file for use with `qtlreaper`.
+
+    PARAMETERS:
+    samples: A list of samples to use as the headers for the various columns.
+    trait_values: A list of lists of values for each trait and sample.
+    traits_filename: The tab-separated value to put the values in for
+        computation of QTLs.
+    """
+    header = "Trait\t{}\n".format("\t".join(samples))
+    data = (
+        [header] +
+        ["{}\t{}\n".format(i+1, "\t".join([str(i) for i in t]))
+         for i, t in enumerate(trait_values[:-1])] +
+        ["{}\t{}".format(
+            len(trait_values), "\t".join([str(i) for i in t]))
+         for t in trait_values[-1:]])
+    with open(traits_filename, "w") as outfile:
+        outfile.writelines(data)
+
+def create_output_directory(path: str):
+    """Create the output directory at `path` if it does not exist."""
+    try:
+        os.mkdir(path)
+    except FileExistsError:
+        # If the directory already exists, do nothing.
+        pass
+
+def run_reaper(
+        genotype_filename: str, traits_filename: str,
+        other_options: tuple = ("--n_permutations", "1000"),
+        separate_nperm_output: bool = False,
+        output_dir: str = TMPDIR):
+    """
+    Run the QTLReaper command to compute the QTLs.
+
+    PARAMETERS:
+    genotype_filename: The complete path to a genotype file to use in the QTL
+        computation.
+    traits_filename: A path to a file previously generated with the
+        `generate_traits_file` function in this module, to be used in the QTL
+        computation.
+    other_options: Other options to pass to the `qtlreaper` command to modify
+        the QTL computations.
+    separate_nperm_output: A flag indicating whether or not to provide a
+        separate output for the permutations computation. The default is False,
+        which means by default, no separate output file is created.
+    output_dir: A path to the directory where the outputs are put
+
+    RETURNS:
+    The function returns a tuple of the main output file, and the output file
+    for the permutation computations. If the `separate_nperm_output` is `False`,
+    the second value in the tuple returned is `None`.
+
+    RAISES:
+    The function will raise a `subprocess.CalledProcessError` exception in case
+    of any errors running the `qtlreaper` command.
+    """
+    create_output_directory("{}/qtlreaper".format(output_dir))
+    output_filename = "{}/qtlreaper/main_output_{}.txt".format(
+        output_dir, random_string(10))
+    output_list = ["--main_output", output_filename]
+    if separate_nperm_output:
+        permu_output_filename: Union[None, str] = "{}/qtlreaper/permu_output_{}.txt".format(
+            output_dir, random_string(10))
+        output_list = output_list + [
+            "--permu_output", permu_output_filename] # type: ignore[list-item]
+    else:
+        permu_output_filename = None
+
+    command_list = [
+        REAPER_COMMAND, "--geno", genotype_filename,
+        *other_options, # this splices the `other_options` list here
+        "--traits", traits_filename,
+        *output_list # this splices the `output_list` list here
+    ]
+
+    subprocess.run(command_list, check=True)
+    return (output_filename, permu_output_filename)
+
+def chromosome_sorter_key_fn(val):
+    """
+    Useful for sorting the chromosomes
+    """
+    if isinstance(val, int):
+        return val
+    return ord(val)
+
+def organise_reaper_main_results(parsed_results):
+    """
+    Provide the results of running reaper in a format that is easier to use.
+    """
+    def __organise_by_chromosome(chr_name, items):
+        chr_items = [item for item in items if item["Chr"] == chr_name]
+        return {
+            "Chr": chr_name,
+            "loci": [{
+                "Locus": locus["Locus"],
+                "cM": locus["cM"],
+                "Mb": locus["Mb"],
+                "LRS": locus["LRS"],
+                "Additive": locus["Additive"],
+                "pValue": locus["pValue"]
+            } for locus in chr_items]}
+
+    def __organise_by_id(identifier, items):
+        id_items = [item for item in items if item["ID"] == identifier]
+        unique_chromosomes = {item["Chr"] for item in id_items}
+        return {
+            "ID": identifier,
+            "chromosomes": {
+                _chr["Chr"]: _chr for _chr in [
+                    __organise_by_chromosome(chromo, id_items)
+                    for chromo in sorted(
+                        unique_chromosomes, key=chromosome_sorter_key_fn)]}}
+
+    unique_ids = {res["ID"] for res in parsed_results}
+    return {
+        trait["ID"]: trait for trait in
+        [__organise_by_id(_id, parsed_results) for _id in sorted(unique_ids)]}
+
+def parse_reaper_main_results(results_file):
+    """
+    Parse the results file of running QTLReaper into a list of dicts.
+    """
+    with open(results_file, "r") as infile:
+        lines = infile.readlines()
+
+    def __parse_column_float_value(value):
+        # pylint: disable=W0702
+        try:
+            return float(value)
+        except:
+            return value
+
+    def __parse_column_int_value(value):
+        # pylint: disable=W0702
+        try:
+            return int(value)
+        except:
+            return value
+
+    def __parse_line(line):
+        items = line.strip().split("\t")
+        return items[0:2] + [__parse_column_int_value(items[2])] + [
+            __parse_column_float_value(item) for item in items[3:]]
+
+    header = lines[0].strip().split("\t")
+    return [dict(zip(header, __parse_line(line))) for line in lines[1:]]
+
+def parse_reaper_permutation_results(results_file):
+    """
+    Parse the results QTLReaper permutations into a list of values.
+    """
+    with open(results_file, "r") as infile:
+        lines = infile.readlines()
+
+    return [float(line.strip()) for line in lines]
diff --git a/gn3/computations/wgcna.py b/gn3/computations/wgcna.py
new file mode 100644
index 0000000..fd508fa
--- /dev/null
+++ b/gn3/computations/wgcna.py
@@ -0,0 +1,51 @@
+"""module contains code to preprocess and call wgcna script"""
+
+import os
+import json
+import uuid
+from gn3.settings import TMPDIR
+
+from gn3.commands import run_cmd
+
+
+def dump_wgcna_data(request_data: dict):
+    """function to dump request data to json file"""
+    filename = f"{str(uuid.uuid4())}.json"
+
+    temp_file_path = os.path.join(TMPDIR, filename)
+
+    with open(temp_file_path, "w") as output_file:
+        json.dump(request_data, output_file)
+
+    return temp_file_path
+
+
+def compose_wgcna_cmd(rscript_path: str, temp_file_path: str):
+    """function to componse wgcna cmd"""
+    # (todo):issue relative paths to abs paths
+    cmd = f"Rscript ./scripts/{rscript_path}  {temp_file_path}"
+    return cmd
+
+
+def call_wgcna_script(rscript_path: str, request_data: dict):
+    """function to call wgcna script"""
+    generated_file = dump_wgcna_data(request_data)
+    cmd = compose_wgcna_cmd(rscript_path, generated_file)
+
+    try:
+
+        run_cmd_results = run_cmd(cmd)
+
+        with open(generated_file, "r") as outputfile:
+
+            if run_cmd_results["code"] != 0:
+                return run_cmd_results
+            return {
+                "data": json.load(outputfile),
+                **run_cmd_results
+            }
+    except FileNotFoundError:
+        # relook  at handling errors gn3
+        return {
+            "output": "output file not found"
+        }
author	zsloan	2021-10-12 18:56:34 +0000
committer	zsloan	2021-10-12 18:56:34 +0000
commit	0f396f4a1a753d449cf2975fc425d587d9350689 (patch)
tree	c9dac243dc05e5cb90ccb7f1d96fd599986bf60a /gn3/computations
parent	976660ce9ef915426c7ce5ff9077b439e4102a2c (diff)
parent	77c274b79c3ec01de60e90db3299763cb58f715b (diff)
download	genenetwork3-0f396f4a1a753d449cf2975fc425d587d9350689.tar.gz