about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--README.md40
-rw-r--r--gn3/computations/heatmap.py68
-rw-r--r--gn3/computations/qtlreaper.py92
-rw-r--r--gn3/db/genotypes.py69
-rw-r--r--gn3/settings.py7
-rw-r--r--guix.scm6
-rw-r--r--qtlfilesexport.py68
-rw-r--r--tests/unit/computations/test_heatmap.py37
8 files changed, 375 insertions, 12 deletions
diff --git a/README.md b/README.md
index 750f55d..84a7a54 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@ guix environment --load=guix.scm
 Also, make sure you have the [guix-bioinformatics](https://git.genenetwork.org/guix-bioinformatics/guix-bioinformatics) channel set up.
 
 ```bash
-env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ ~/.config/guix/current/bin/guix environment --load=guix.scm
+env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ ~/.config/guix/current/bin/guix environment --expose=$HOME/genotype_files/ --load=guix.scm
 python3
   import redis
 ```
@@ -32,7 +32,7 @@ python3
 #### Run a Guix container
 
 ```
-env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ ~/.config/guix/current/bin/guix environment -C --network --load=guix.scm
+env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ ~/.config/guix/current/bin/guix environment -C --network --expose=$HOME/genotype_files/ --load=guix.scm
 ```
 
 
@@ -157,3 +157,39 @@ guix. To freeze dependencies:
 pip freeze --path venv/lib/python3.8/site-packages > requirements.txt
 
 ```
+
+## Genotype Files
+
+You can get the genotype files from http://ipfs.genenetwork.org/ipfs/QmXQy3DAUWJuYxubLHLkPMNCEVq1oV7844xWG2d1GSPFPL and save them on your host machine at, say `$HOME/genotype_files` with something like:
+
+```bash
+$ mkdir -p $HOME/genotype_files
+$ cd $HOME/genotype_files
+$ yes | 7z x genotype_files.tar.7z
+$ tar xf genotype_files.tar
+```
+
+The `genotype_files.tar.7z` file seems to only contain the **BXD.geno** genotype file.
+
+## QTLReaper (rust-qtlreaper) and Trait Files
+
+To run QTL computations, this system makes use of the [rust-qtlreaper](https://github.com/chfi/rust-qtlreaper.git) utility.
+
+To do this, the system needs to export the trait data into a tab-separated file, that can then be passed to the utility using the `--traits` option. For more information about the available options, please [see the rust-qtlreaper](https://github.com/chfi/rust-qtlreaper.git) repository.
+
+### Traits File Format
+
+The traits file begins with a header row/line with the column headers. The first column in the file has the header **"Trait"**. Every other column has a header for one of the strains in consideration.
+
+Under the **"Trait"** column, the traits are numbered from **T1** to **T<n>** where **<n>** is the count of the total number of traits in consideration.
+
+As an example, you could end up with a trait file like the following:
+
+```txt
+Trait	BXD27	BXD32	DBA/2J	BXD21	...
+T1	10.5735	9.27408	9.48255	9.18253	...
+T2	6.4471	6.7191	5.98015	6.68051	...
+...
+```
+
+It is very important that the column header names for the strains correspond to the genotype file used.
diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py
index 3c35029..e0ff05b 100644
--- a/gn3/computations/heatmap.py
+++ b/gn3/computations/heatmap.py
@@ -149,22 +149,22 @@ def heatmap_data(formd, search_result, conn: Any):
 
     def __retrieve_traitlist_and_datalist(threshold, fullname):
         trait = retrieve_trait_info(threshold, fullname, conn)
-        return (
-            trait,
-            export_trait_data(retrieve_trait_data(trait, conn), strainlist))
+        return (trait, retrieve_trait_data(trait, conn))
 
     traits_details = [
         __retrieve_traitlist_and_datalist(threshold, fullname)
         for fullname in search_result]
-    traits_list = map(lambda x: x[0], traits_details)
-    traits_data_list = map(lambda x: x[1], traits_details)
+    traits_list = tuple(x[0] for x in traits_details)
+    traits_data_list = [x[1] for x in traits_details]
+    exported_traits_data_list = tuple(
+        export_trait_data(td, strainlist) for td in traits_data_list)
 
     return {
         "target_description_checked": formd.formdata.getvalue(
             "targetDescriptionCheck", ""),
         "cluster_checked": cluster_checked,
         "slink_data": (
-            slink(cluster_traits(traits_data_list))
+            slink(cluster_traits(exported_traits_data_list))
             if cluster_checked else False),
         "sessionfile": formd.formdata.getvalue("session"),
         "genotype": genotype,
@@ -173,5 +173,59 @@ def heatmap_data(formd, search_result, conn: Any):
         "ppolar": formd.ppolar,
         "mpolar":formd.mpolar,
         "traits_list": traits_list,
-        "traits_data_list": traits_data_list
+        "traits_data_list": traits_data_list,
+        "exported_traits_data_list": exported_traits_data_list
     }
+
+def compute_heatmap_order(
+        slink_data, xoffset: int = 40, neworder: tuple = tuple()):
+    """
+    Compute the data used for drawing the heatmap proper from `slink_data`.
+
+    This function tries to reproduce the creation and update of the `neworder`
+    variable in
+    https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L120
+    and in the `web.webqtl.heatmap.Heatmap.draw` function in GN1
+    """
+    d_1 = (0, 0, 0) # returned from self.draw in lines 391 and 399. This is just a placeholder
+
+    def __order_maker(norder, slnk_dt):
+        if isinstance(slnk_dt[0], int) and isinstance(slnk_dt[1], int):
+            return norder + (
+                (xoffset+20, slnk_dt[0]), (xoffset + 40, slnk_dt[1]))
+
+        if isinstance(slnk_dt[0], int):
+            return norder + ((xoffset + 20, slnk_dt[0]), )
+
+        if isinstance(slnk_dt[1], int):
+            return norder + ((xoffset + d_1[0] + 20, slnk_dt[1]), )
+
+        return __order_maker(__order_maker(norder, slnk_dt[0]), slnk_dt[1])
+
+    return __order_maker(neworder, slink_data)
+
+def retrieve_strains_and_values(orders, strainlist, traits_data_list):
+    """
+    Get the strains and their corresponding values from `strainlist` and
+    `traits_data_list`.
+
+    This migrates the code in
+    https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L215-221
+    """
+    # This feels nasty! There's a lot of mutation of values here, that might
+    # indicate something untoward in the design of this function and its
+    # dependents  ==>  Review
+    strains = []
+    values = []
+    rets = []
+    for order in orders:
+        temp_val = traits_data_list[order[1]]
+        for i, strain in enumerate(strainlist):
+            if temp_val[i] is not None:
+                strains.append(strain)
+                values.append(temp_val[i])
+        rets.append([order, strains[:], values[:]])
+        strains = []
+        values = []
+
+    return rets
diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py
new file mode 100644
index 0000000..c058e14
--- /dev/null
+++ b/gn3/computations/qtlreaper.py
@@ -0,0 +1,92 @@
+"""
+This module contains functions to interact with the `qtlreaper` utility for
+computation of QTLs.
+"""
+import os
+import random
+import string
+import subprocess
+from gn3.settings import TMPDIR, REAPER_COMMAND
+
+def random_string(length):
+    """Generate a random string of length `length`."""
+    return "".join(
+        random.choices(
+            string.ascii_letters + string.digits, k=length))
+
+def generate_traits_file(strains, trait_values, traits_filename):
+    """
+    Generate a traits file for use with `qtlreaper`.
+
+    PARAMETERS:
+    strains: A list of strains to use as the headers for the various columns.
+    trait_values: A list of lists of values for each trait and strain.
+    traits_filename: The tab-separated value to put the values in for
+        computation of QTLs.
+    """
+    header = "Trait\t{}\n".format("\t".join(strains))
+    data = [header] + [
+        "T{}\t{}\n".format(i+1, "\t".join([str(i) for i in t]))
+        for i, t in enumerate(trait_values[:-1])] + [
+        "T{}\t{}".format(len(trait_values), "\t".join([str(i) for i in t]))
+        for t in trait_values[-1:]]
+    with open(traits_filename, "w") as outfile:
+        outfile.writelines(data)
+
+def create_output_directory(path: str):
+    """Create the output directory at `path` if it does not exist."""
+    try:
+        os.mkdir(path)
+    except OSError:
+        pass
+
+def run_reaper(
+        genotype_filename: str, traits_filename: str,
+        other_options: tuple = ("--n_permutations", "1000"),
+        separate_nperm_output: bool = False,
+        output_dir: str = TMPDIR):
+    """
+    Run the QTLReaper command to compute the QTLs.
+
+    PARAMETERS:
+    genotype_filename: The complete path to a genotype file to use in the QTL
+        computation.
+    traits_filename: A path to a file previously generated with the
+        `generate_traits_file` function in this module, to be used in the QTL
+        computation.
+    other_options: Other options to pass to the `qtlreaper` command to modify
+        the QTL computations.
+    separate_nperm_output: A flag indicating whether or not to provide a
+        separate output for the permutations computation. The default is False,
+        which means by default, no separate output file is created.
+    output_dir: A path to the directory where the outputs are put
+
+    RETURNS:
+    The function returns a tuple of the main output file, and the output file
+    for the permutation computations. If the `separate_nperm_output` is `False`,
+    the second value in the tuple returned is `None`.
+
+    RAISES:
+    The function will raise a `subprocess.CalledProcessError` exception in case
+    of any errors running the `qtlreaper` command.
+    """
+    create_output_directory("{}/qtlreaper".format(output_dir))
+    output_filename = "{}/qtlreaper/main_output_{}.txt".format(
+        output_dir, random_string(10))
+    output_list = ["--main_output", output_filename]
+    if separate_nperm_output:
+        permu_output_filename = "{}/qtlreaper/permu_output_{}.txt".format(
+            output_dir, random_string(10))
+        output_list = output_list + ["--permu_output", permu_output_filename]
+    else:
+        permu_output_filename = None
+
+    command_list = [
+        REAPER_COMMAND, "--geno", genotype_filename,
+        *other_options, # this splices the `other_options` list here
+        "--traits", traits_filename,
+        *output_list # this splices the `output_list` list here
+    ]
+
+    subprocess.run(command_list, check=True)
+    return (output_filename, permu_output_filename)
diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py
new file mode 100644
index 0000000..610ddde
--- /dev/null
+++ b/gn3/db/genotypes.py
@@ -0,0 +1,69 @@
+"""Genotype utilities"""
+
+import os
+import gzip
+from gn3.settings import GENOTYPE_FILES
+
+def build_genotype_file(
+        geno_name: str, base_dir: str = GENOTYPE_FILES,
+        extension: str = "geno"):
+    """Build the absolute path for the genotype file."""
+    return "{}/{}.{}".format(os.path.abspath(base_dir), geno_name, extension)
+
+def load_genotype_samples(genotype_filename: str, file_type: str = "geno"):
+    """
+    Load sample of strains from genotype files.
+
+    DESCRIPTION:
+    Traits can contain a varied number of strains, some of which do not exist in
+    certain genotypes. In order to compute QTLs, GEMMAs, etc, we need to ensure
+    to pick only those strains that exist in the genotype under consideration
+    for the traits used in the computation.
+
+    This function loads a list of samples from the genotype files for use in
+    filtering out unusable strains.
+
+
+    PARAMETERS:
+    genotype_filename: The absolute path to the genotype file to load the
+        samples from.
+    file_type: The type of file. Currently supported values are 'geno' and
+        'plink'.
+    """
+    file_type_fns = {
+        "geno": __load_genotype_samples_from_geno,
+        "plink": __load_genotype_samples_from_plink
+    }
+    return file_type_fns[file_type](genotype_filename)
+
+def __load_genotype_samples_from_geno(genotype_filename: str):
+    """
+    Helper function for `load_genotype_samples` function.
+
+    Loads samples from '.geno' files.
+    """
+    gzipped_filename = "{}.gz".format(genotype_filename)
+    if os.path.isfile(gzipped_filename):
+        genofile = gzip.open(gzipped_filename)
+    else:
+        genofile = open(genotype_filename)
+
+    for row in genofile:
+        line = row.strip()
+        if (not line) or (line.startswith(("#", "@"))):
+            continue
+        break
+
+    headers = line.split("\t")
+    if headers[3] == "Mb":
+        return headers[4:]
+    return headers[3:]
+
+def __load_genotype_samples_from_plink(genotype_filename: str):
+    """
+    Helper function for `load_genotype_samples` function.
+
+    Loads samples from '.plink' files.
+    """
+    genofile = open(genotype_filename)
+    return [line.split(" ")[1] for line in genofile]
diff --git a/gn3/settings.py b/gn3/settings.py
index f4866d5..a08f846 100644
--- a/gn3/settings.py
+++ b/gn3/settings.py
@@ -24,3 +24,10 @@ GN2_BASE_URL = "http://www.genenetwork.org/"
 
 # biweight script
 BIWEIGHT_RSCRIPT = "~/genenetwork3/scripts/calculate_biweight.R"
+
+# qtlreaper command
+REAPER_COMMAND = "{}/bin/qtlreaper".format(os.environ.get("GUIX_ENVIRONMENT"))
+
+# genotype files
+GENOTYPE_FILES = os.environ.get(
+    "GENOTYPE_FILES", "{}/genotype_files/genotype".format(os.environ.get("HOME")))
diff --git a/guix.scm b/guix.scm
index 729d089..8e1cf79 100644
--- a/guix.scm
+++ b/guix.scm
@@ -43,7 +43,8 @@
  (gnu packages databases)
  (gnu packages statistics)
  (gnu packages bioconductor)
- (gn packages golang)
+ (gnu packages golang)
+ (gn packages genenetwork)
  (gnu packages python)
  (gnu packages python-check)
  (gnu packages python-crypto)
@@ -104,7 +105,8 @@
                        ("r-qtl" ,r-qtl)
                        ("r-stringi" ,r-stringi)
                        ("python-plotly" ,python-plotly)
-                       ("python-pandas" ,python-pandas)))
+                       ("python-pandas" ,python-pandas)
+                       ("rust-qtlreaper" ,rust-qtlreaper)))
   (build-system python-build-system)
   (home-page "https://github.com/genenetwork/genenetwork3")
   (synopsis "GeneNetwork3 API for data science and machine learning.")
diff --git a/qtlfilesexport.py b/qtlfilesexport.py
new file mode 100644
index 0000000..799de31
--- /dev/null
+++ b/qtlfilesexport.py
@@ -0,0 +1,68 @@
+"""
+Test the qtlfiles export of traits files
+
+Run with:
+
+    env SQL_URI="mysql://<user>:<password>@<host>:<port>/db_webqtl" python3 qtlfilesexport.py
+
+replacing the variables in the angled brackets with the appropriate values
+"""
+from gn3.computations.slink import slink
+from gn3.db_utils import database_connector
+from gn3.computations.qtlreaper import run_reaper
+from gn3.computations.heatmap import export_trait_data
+from gn3.db.traits import retrieve_trait_data, retrieve_trait_info
+from gn3.db.genotypes import build_genotype_file, load_genotype_samples
+from gn3.computations.qtlreaper import random_string, generate_traits_file
+from gn3.computations.heatmap import (
+    cluster_traits,
+    compute_heatmap_order,
+    retrieve_strains_and_values)
+
+TMPDIR = "tmp/qtltests"
+
+def trait_fullnames():
+    """Return sample names for traits"""
+    return [
+        "UCLA_BXDBXH_CARTILAGE_V2::ILM103710672",
+        "UCLA_BXDBXH_CARTILAGE_V2::ILM2260338",
+        "UCLA_BXDBXH_CARTILAGE_V2::ILM3140576",
+        "UCLA_BXDBXH_CARTILAGE_V2::ILM5670577",
+        "UCLA_BXDBXH_CARTILAGE_V2::ILM2070121",
+        "UCLA_BXDBXH_CARTILAGE_V2::ILM103990541",
+        "UCLA_BXDBXH_CARTILAGE_V2::ILM1190722",
+        "UCLA_BXDBXH_CARTILAGE_V2::ILM6590722",
+        "UCLA_BXDBXH_CARTILAGE_V2::ILM4200064",
+        "UCLA_BXDBXH_CARTILAGE_V2::ILM3140463"]
+
+def main():
+    """entrypoint function"""
+    conn = database_connector()[0]
+    threshold = 0
+    traits = [
+        retrieve_trait_info(threshold, fullname, conn)
+        for fullname in trait_fullnames()]
+    traits_data_list = [retrieve_trait_data(t, conn) for t in traits]
+    genotype_filename = build_genotype_file(traits[0]["riset"])
+    strains = load_genotype_samples(genotype_filename)
+    exported_traits_data_list = [
+        export_trait_data(td, strains) for td in traits_data_list]
+    slinked = slink(cluster_traits(exported_traits_data_list))
+    orders = compute_heatmap_order(slinked)
+    strains_and_values = retrieve_strains_and_values(
+        orders, strains, exported_traits_data_list)
+    strains_values = strains_and_values[0][1]
+    trait_values = [t[2] for t in strains_and_values]
+    traits_filename = "{}/traits_test_file_{}.txt".format(
+        TMPDIR, random_string(10))
+    generate_traits_file(strains_values, trait_values, traits_filename)
+    print("Generated file: {}".format(traits_filename))
+
+    main_output, permutations_output = run_reaper(
+        genotype_filename, traits_filename, separate_nperm_output=True)
+
+    print("Main output: {}, Permutation output: {}".format(
+        main_output, permutations_output))
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/unit/computations/test_heatmap.py b/tests/unit/computations/test_heatmap.py
index 650cb45..686288d 100644
--- a/tests/unit/computations/test_heatmap.py
+++ b/tests/unit/computations/test_heatmap.py
@@ -1,6 +1,10 @@
 """Module contains tests for gn3.computations.heatmap"""
 from unittest import TestCase
-from gn3.computations.heatmap import cluster_traits, export_trait_data
+from gn3.computations.heatmap import (
+    cluster_traits,
+    export_trait_data,
+    compute_heatmap_order,
+    retrieve_strains_and_values)
 
 strainlist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"]
 trait_data = {
@@ -34,6 +38,16 @@ trait_data = {
         "C57BL/6J": {"strain_name": "C57BL/6J", "value": 7.50606, "variance": None, "ndata": None},
         "DBA/2J": {"strain_name": "DBA/2J", "value": 7.72588, "variance": None, "ndata": None}}}
 
+slinked = (
+    (((0, 2, 0.16381088984330505),
+      ((1, 7, 0.06024619831474998), 5, 0.19179284676938602),
+      0.20337048635536847),
+     9,
+     0.23451785425383564),
+    ((3, (6, 8, 0.2140799896286565), 0.25879514152086425),
+     4, 0.8968250491499363),
+    0.9313185954797953)
+
 class TestHeatmap(TestCase):
     """Class for testing heatmap computation functions"""
 
@@ -141,3 +155,24 @@ class TestHeatmap(TestCase):
               0.9313185954797953, 1.1683723389247052, 0.23451785425383564,
               1.7413442197913358, 0.33370067057028485, 1.3256191648260216,
               0.0)))
+
+    def test_compute_heatmap_order(self):
+        """Test the orders."""
+        for xoff, expected in [
+                (40, ((60, 9), (60, 4))),
+                (30, ((50, 9), (50, 4))),
+                (20, ((40, 9), (40, 4)))]:
+            with self.subTest(xoffset=xoff):
+                self.assertEqual(
+                    compute_heatmap_order(slinked, xoffset=xoff), expected)
+
+    def test_retrieve_strains_and_values(self):
+        """Test retrieval of strains and values."""
+        for slist, tdata, expected in [
+                [["s1", "s2", "s3", "s4"], [9, None, 5, 4],
+                 (("s1", "s3", "s4"), (9, 5, 4))],
+                [["s1", "s2", "s3", "s4", "s5"], [6, None, None, 4, None],
+                 (("s1", "s4"), (6, 4))]]:
+            with self.subTest(strainlist=slist, traitdata=tdata):
+                self.assertEqual(
+                    retrieve_strains_and_values(slist, tdata), expected)