aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--README.md40
-rw-r--r--gn3/computations/heatmap.py68
-rw-r--r--gn3/computations/qtlreaper.py92
-rw-r--r--gn3/db/genotypes.py69
-rw-r--r--gn3/settings.py7
-rw-r--r--guix.scm6
-rw-r--r--qtlfilesexport.py68
-rw-r--r--tests/unit/computations/test_heatmap.py37
8 files changed, 375 insertions, 12 deletions
diff --git a/README.md b/README.md
index 750f55d..84a7a54 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@ guix environment --load=guix.scm
Also, make sure you have the [guix-bioinformatics](https://git.genenetwork.org/guix-bioinformatics/guix-bioinformatics) channel set up.
```bash
-env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ ~/.config/guix/current/bin/guix environment --load=guix.scm
+env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ ~/.config/guix/current/bin/guix environment --expose=$HOME/genotype_files/ --load=guix.scm
python3
import redis
```
@@ -32,7 +32,7 @@ python3
#### Run a Guix container
```
-env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ ~/.config/guix/current/bin/guix environment -C --network --load=guix.scm
+env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ ~/.config/guix/current/bin/guix environment -C --network --expose=$HOME/genotype_files/ --load=guix.scm
```
@@ -157,3 +157,39 @@ guix. To freeze dependencies:
pip freeze --path venv/lib/python3.8/site-packages > requirements.txt
```
+
+## Genotype Files
+
+You can get the genotype files from http://ipfs.genenetwork.org/ipfs/QmXQy3DAUWJuYxubLHLkPMNCEVq1oV7844xWG2d1GSPFPL and save them on your host machine at, say `$HOME/genotype_files` with something like:
+
+```bash
+$ mkdir -p $HOME/genotype_files
+$ cd $HOME/genotype_files
+$ yes | 7z x genotype_files.tar.7z
+$ tar xf genotype_files.tar
+```
+
+The `genotype_files.tar.7z` file seems to only contain the **BXD.geno** genotype file.
+
+## QTLReaper (rust-qtlreaper) and Trait Files
+
+To run QTL computations, this system makes use of the [rust-qtlreaper](https://github.com/chfi/rust-qtlreaper.git) utility.
+
+To do this, the system needs to export the trait data into a tab-separated file, that can then be passed to the utility using the `--traits` option. For more information about the available options, please [see the rust-qtlreaper](https://github.com/chfi/rust-qtlreaper.git) repository.
+
+### Traits File Format
+
+The traits file begins with a header row/line with the column headers. The first column in the file has the header **"Trait"**. Every other column has a header for one of the strains in consideration.
+
+Under the **"Trait"** column, the traits are numbered from **T1** to **T<n>** where **<n>** is the count of the total number of traits in consideration.
+
+As an example, you could end up with a trait file like the following:
+
+```txt
+Trait BXD27 BXD32 DBA/2J BXD21 ...
+T1 10.5735 9.27408 9.48255 9.18253 ...
+T2 6.4471 6.7191 5.98015 6.68051 ...
+...
+```
+
+It is very important that the column header names for the strains correspond to the genotype file used.
diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py
index 3c35029..e0ff05b 100644
--- a/gn3/computations/heatmap.py
+++ b/gn3/computations/heatmap.py
@@ -149,22 +149,22 @@ def heatmap_data(formd, search_result, conn: Any):
def __retrieve_traitlist_and_datalist(threshold, fullname):
trait = retrieve_trait_info(threshold, fullname, conn)
- return (
- trait,
- export_trait_data(retrieve_trait_data(trait, conn), strainlist))
+ return (trait, retrieve_trait_data(trait, conn))
traits_details = [
__retrieve_traitlist_and_datalist(threshold, fullname)
for fullname in search_result]
- traits_list = map(lambda x: x[0], traits_details)
- traits_data_list = map(lambda x: x[1], traits_details)
+ traits_list = tuple(x[0] for x in traits_details)
+ traits_data_list = [x[1] for x in traits_details]
+ exported_traits_data_list = tuple(
+ export_trait_data(td, strainlist) for td in traits_data_list)
return {
"target_description_checked": formd.formdata.getvalue(
"targetDescriptionCheck", ""),
"cluster_checked": cluster_checked,
"slink_data": (
- slink(cluster_traits(traits_data_list))
+ slink(cluster_traits(exported_traits_data_list))
if cluster_checked else False),
"sessionfile": formd.formdata.getvalue("session"),
"genotype": genotype,
@@ -173,5 +173,59 @@ def heatmap_data(formd, search_result, conn: Any):
"ppolar": formd.ppolar,
"mpolar":formd.mpolar,
"traits_list": traits_list,
- "traits_data_list": traits_data_list
+ "traits_data_list": traits_data_list,
+ "exported_traits_data_list": exported_traits_data_list
}
+
+def compute_heatmap_order(
+ slink_data, xoffset: int = 40, neworder: tuple = tuple()):
+ """
+ Compute the data used for drawing the heatmap proper from `slink_data`.
+
+ This function tries to reproduce the creation and update of the `neworder`
+ variable in
+ https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L120
+ and in the `web.webqtl.heatmap.Heatmap.draw` function in GN1
+ """
+ d_1 = (0, 0, 0) # returned from self.draw in lines 391 and 399. This is just a placeholder
+
+ def __order_maker(norder, slnk_dt):
+ if isinstance(slnk_dt[0], int) and isinstance(slnk_dt[1], int):
+ return norder + (
+ (xoffset+20, slnk_dt[0]), (xoffset + 40, slnk_dt[1]))
+
+ if isinstance(slnk_dt[0], int):
+ return norder + ((xoffset + 20, slnk_dt[0]), )
+
+ if isinstance(slnk_dt[1], int):
+ return norder + ((xoffset + d_1[0] + 20, slnk_dt[1]), )
+
+ return __order_maker(__order_maker(norder, slnk_dt[0]), slnk_dt[1])
+
+ return __order_maker(neworder, slink_data)
+
+def retrieve_strains_and_values(orders, strainlist, traits_data_list):
+ """
+ Get the strains and their corresponding values from `strainlist` and
+ `traits_data_list`.
+
+ This migrates the code in
+ https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L215-221
+ """
+ # This feels nasty! There's a lot of mutation of values here, that might
+ # indicate something untoward in the design of this function and its
+ # dependents ==> Review
+ strains = []
+ values = []
+ rets = []
+ for order in orders:
+ temp_val = traits_data_list[order[1]]
+ for i, strain in enumerate(strainlist):
+ if temp_val[i] is not None:
+ strains.append(strain)
+ values.append(temp_val[i])
+ rets.append([order, strains[:], values[:]])
+ strains = []
+ values = []
+
+ return rets
diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py
new file mode 100644
index 0000000..c058e14
--- /dev/null
+++ b/gn3/computations/qtlreaper.py
@@ -0,0 +1,92 @@
+"""
+This module contains functions to interact with the `qtlreaper` utility for
+computation of QTLs.
+"""
+import os
+import random
+import string
+import subprocess
+from gn3.settings import TMPDIR, REAPER_COMMAND
+
+def random_string(length):
+ """Generate a random string of length `length`."""
+ return "".join(
+ random.choices(
+ string.ascii_letters + string.digits, k=length))
+
+def generate_traits_file(strains, trait_values, traits_filename):
+ """
+ Generate a traits file for use with `qtlreaper`.
+
+ PARAMETERS:
+ strains: A list of strains to use as the headers for the various columns.
+ trait_values: A list of lists of values for each trait and strain.
+ traits_filename: The tab-separated value to put the values in for
+ computation of QTLs.
+ """
+ header = "Trait\t{}\n".format("\t".join(strains))
+ data = [header] + [
+ "T{}\t{}\n".format(i+1, "\t".join([str(i) for i in t]))
+ for i, t in enumerate(trait_values[:-1])] + [
+ "T{}\t{}".format(len(trait_values), "\t".join([str(i) for i in t]))
+ for t in trait_values[-1:]]
+ with open(traits_filename, "w") as outfile:
+ outfile.writelines(data)
+
+def create_output_directory(path: str):
+ """Create the output directory at `path` if it does not exist."""
+ try:
+ os.mkdir(path)
+ except OSError:
+ pass
+
+def run_reaper(
+ genotype_filename: str, traits_filename: str,
+ other_options: tuple = ("--n_permutations", "1000"),
+ separate_nperm_output: bool = False,
+ output_dir: str = TMPDIR):
+ """
+ Run the QTLReaper command to compute the QTLs.
+
+ PARAMETERS:
+ genotype_filename: The complete path to a genotype file to use in the QTL
+ computation.
+ traits_filename: A path to a file previously generated with the
+ `generate_traits_file` function in this module, to be used in the QTL
+ computation.
+ other_options: Other options to pass to the `qtlreaper` command to modify
+ the QTL computations.
+ separate_nperm_output: A flag indicating whether or not to provide a
+ separate output for the permutations computation. The default is False,
+ which means by default, no separate output file is created.
+ output_dir: A path to the directory where the outputs are put
+
+ RETURNS:
+ The function returns a tuple of the main output file, and the output file
+ for the permutation computations. If the `separate_nperm_output` is `False`,
+ the second value in the tuple returned is `None`.
+
+ RAISES:
+ The function will raise a `subprocess.CalledProcessError` exception in case
+ of any errors running the `qtlreaper` command.
+ """
+ create_output_directory("{}/qtlreaper".format(output_dir))
+ output_filename = "{}/qtlreaper/main_output_{}.txt".format(
+ output_dir, random_string(10))
+ output_list = ["--main_output", output_filename]
+ if separate_nperm_output:
+ permu_output_filename = "{}/qtlreaper/permu_output_{}.txt".format(
+ output_dir, random_string(10))
+ output_list = output_list + ["--permu_output", permu_output_filename]
+ else:
+ permu_output_filename = None
+
+ command_list = [
+ REAPER_COMMAND, "--geno", genotype_filename,
+ *other_options, # this splices the `other_options` list here
+ "--traits", traits_filename,
+ *output_list # this splices the `output_list` list here
+ ]
+
+ subprocess.run(command_list, check=True)
+ return (output_filename, permu_output_filename)
diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py
new file mode 100644
index 0000000..610ddde
--- /dev/null
+++ b/gn3/db/genotypes.py
@@ -0,0 +1,69 @@
+"""Genotype utilities"""
+
+import os
+import gzip
+from gn3.settings import GENOTYPE_FILES
+
+def build_genotype_file(
+ geno_name: str, base_dir: str = GENOTYPE_FILES,
+ extension: str = "geno"):
+ """Build the absolute path for the genotype file."""
+ return "{}/{}.{}".format(os.path.abspath(base_dir), geno_name, extension)
+
+def load_genotype_samples(genotype_filename: str, file_type: str = "geno"):
+ """
+ Load sample of strains from genotype files.
+
+ DESCRIPTION:
+ Traits can contain a varied number of strains, some of which do not exist in
+ certain genotypes. In order to compute QTLs, GEMMAs, etc, we need to ensure
+ to pick only those strains that exist in the genotype under consideration
+ for the traits used in the computation.
+
+ This function loads a list of samples from the genotype files for use in
+ filtering out unusable strains.
+
+
+ PARAMETERS:
+ genotype_filename: The absolute path to the genotype file to load the
+ samples from.
+ file_type: The type of file. Currently supported values are 'geno' and
+ 'plink'.
+ """
+ file_type_fns = {
+ "geno": __load_genotype_samples_from_geno,
+ "plink": __load_genotype_samples_from_plink
+ }
+ return file_type_fns[file_type](genotype_filename)
+
+def __load_genotype_samples_from_geno(genotype_filename: str):
+ """
+ Helper function for `load_genotype_samples` function.
+
+ Loads samples from '.geno' files.
+ """
+ gzipped_filename = "{}.gz".format(genotype_filename)
+ if os.path.isfile(gzipped_filename):
+ genofile = gzip.open(gzipped_filename)
+ else:
+ genofile = open(genotype_filename)
+
+ for row in genofile:
+ line = row.strip()
+ if (not line) or (line.startswith(("#", "@"))):
+ continue
+ break
+
+ headers = line.split("\t")
+ if headers[3] == "Mb":
+ return headers[4:]
+ return headers[3:]
+
+def __load_genotype_samples_from_plink(genotype_filename: str):
+ """
+ Helper function for `load_genotype_samples` function.
+
+ Loads samples from '.plink' files.
+ """
+ genofile = open(genotype_filename)
+ return [line.split(" ")[1] for line in genofile]
diff --git a/gn3/settings.py b/gn3/settings.py
index f4866d5..a08f846 100644
--- a/gn3/settings.py
+++ b/gn3/settings.py
@@ -24,3 +24,10 @@ GN2_BASE_URL = "http://www.genenetwork.org/"
# biweight script
BIWEIGHT_RSCRIPT = "~/genenetwork3/scripts/calculate_biweight.R"
+
+# qtlreaper command
+REAPER_COMMAND = "{}/bin/qtlreaper".format(os.environ.get("GUIX_ENVIRONMENT"))
+
+# genotype files
+GENOTYPE_FILES = os.environ.get(
+ "GENOTYPE_FILES", "{}/genotype_files/genotype".format(os.environ.get("HOME")))
diff --git a/guix.scm b/guix.scm
index 729d089..8e1cf79 100644
--- a/guix.scm
+++ b/guix.scm
@@ -43,7 +43,8 @@
(gnu packages databases)
(gnu packages statistics)
(gnu packages bioconductor)
- (gn packages golang)
+ (gnu packages golang)
+ (gn packages genenetwork)
(gnu packages python)
(gnu packages python-check)
(gnu packages python-crypto)
@@ -104,7 +105,8 @@
("r-qtl" ,r-qtl)
("r-stringi" ,r-stringi)
("python-plotly" ,python-plotly)
- ("python-pandas" ,python-pandas)))
+ ("python-pandas" ,python-pandas)
+ ("rust-qtlreaper" ,rust-qtlreaper)))
(build-system python-build-system)
(home-page "https://github.com/genenetwork/genenetwork3")
(synopsis "GeneNetwork3 API for data science and machine learning.")
diff --git a/qtlfilesexport.py b/qtlfilesexport.py
new file mode 100644
index 0000000..799de31
--- /dev/null
+++ b/qtlfilesexport.py
@@ -0,0 +1,68 @@
+"""
+Test the qtlfiles export of traits files
+
+Run with:
+
+ env SQL_URI="mysql://<user>:<password>@<host>:<port>/db_webqtl" python3 qtlfilesexport.py
+
+replacing the variables in the angled brackets with the appropriate values
+"""
+from gn3.computations.slink import slink
+from gn3.db_utils import database_connector
+from gn3.computations.qtlreaper import run_reaper
+from gn3.computations.heatmap import export_trait_data
+from gn3.db.traits import retrieve_trait_data, retrieve_trait_info
+from gn3.db.genotypes import build_genotype_file, load_genotype_samples
+from gn3.computations.qtlreaper import random_string, generate_traits_file
+from gn3.computations.heatmap import (
+ cluster_traits,
+ compute_heatmap_order,
+ retrieve_strains_and_values)
+
+TMPDIR = "tmp/qtltests"
+
+def trait_fullnames():
+ """Return sample names for traits"""
+ return [
+ "UCLA_BXDBXH_CARTILAGE_V2::ILM103710672",
+ "UCLA_BXDBXH_CARTILAGE_V2::ILM2260338",
+ "UCLA_BXDBXH_CARTILAGE_V2::ILM3140576",
+ "UCLA_BXDBXH_CARTILAGE_V2::ILM5670577",
+ "UCLA_BXDBXH_CARTILAGE_V2::ILM2070121",
+ "UCLA_BXDBXH_CARTILAGE_V2::ILM103990541",
+ "UCLA_BXDBXH_CARTILAGE_V2::ILM1190722",
+ "UCLA_BXDBXH_CARTILAGE_V2::ILM6590722",
+ "UCLA_BXDBXH_CARTILAGE_V2::ILM4200064",
+ "UCLA_BXDBXH_CARTILAGE_V2::ILM3140463"]
+
+def main():
+ """entrypoint function"""
+ conn = database_connector()[0]
+ threshold = 0
+ traits = [
+ retrieve_trait_info(threshold, fullname, conn)
+ for fullname in trait_fullnames()]
+ traits_data_list = [retrieve_trait_data(t, conn) for t in traits]
+ genotype_filename = build_genotype_file(traits[0]["riset"])
+ strains = load_genotype_samples(genotype_filename)
+ exported_traits_data_list = [
+ export_trait_data(td, strains) for td in traits_data_list]
+ slinked = slink(cluster_traits(exported_traits_data_list))
+ orders = compute_heatmap_order(slinked)
+ strains_and_values = retrieve_strains_and_values(
+ orders, strains, exported_traits_data_list)
+ strains_values = strains_and_values[0][1]
+ trait_values = [t[2] for t in strains_and_values]
+ traits_filename = "{}/traits_test_file_{}.txt".format(
+ TMPDIR, random_string(10))
+ generate_traits_file(strains_values, trait_values, traits_filename)
+ print("Generated file: {}".format(traits_filename))
+
+ main_output, permutations_output = run_reaper(
+ genotype_filename, traits_filename, separate_nperm_output=True)
+
+ print("Main output: {}, Permutation output: {}".format(
+ main_output, permutations_output))
+
+if __name__ == "__main__":
+ main()
diff --git a/tests/unit/computations/test_heatmap.py b/tests/unit/computations/test_heatmap.py
index 650cb45..686288d 100644
--- a/tests/unit/computations/test_heatmap.py
+++ b/tests/unit/computations/test_heatmap.py
@@ -1,6 +1,10 @@
"""Module contains tests for gn3.computations.heatmap"""
from unittest import TestCase
-from gn3.computations.heatmap import cluster_traits, export_trait_data
+from gn3.computations.heatmap import (
+ cluster_traits,
+ export_trait_data,
+ compute_heatmap_order,
+ retrieve_strains_and_values)
strainlist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"]
trait_data = {
@@ -34,6 +38,16 @@ trait_data = {
"C57BL/6J": {"strain_name": "C57BL/6J", "value": 7.50606, "variance": None, "ndata": None},
"DBA/2J": {"strain_name": "DBA/2J", "value": 7.72588, "variance": None, "ndata": None}}}
+slinked = (
+ (((0, 2, 0.16381088984330505),
+ ((1, 7, 0.06024619831474998), 5, 0.19179284676938602),
+ 0.20337048635536847),
+ 9,
+ 0.23451785425383564),
+ ((3, (6, 8, 0.2140799896286565), 0.25879514152086425),
+ 4, 0.8968250491499363),
+ 0.9313185954797953)
+
class TestHeatmap(TestCase):
"""Class for testing heatmap computation functions"""
@@ -141,3 +155,24 @@ class TestHeatmap(TestCase):
0.9313185954797953, 1.1683723389247052, 0.23451785425383564,
1.7413442197913358, 0.33370067057028485, 1.3256191648260216,
0.0)))
+
+ def test_compute_heatmap_order(self):
+ """Test the orders."""
+ for xoff, expected in [
+ (40, ((60, 9), (60, 4))),
+ (30, ((50, 9), (50, 4))),
+ (20, ((40, 9), (40, 4)))]:
+ with self.subTest(xoffset=xoff):
+ self.assertEqual(
+ compute_heatmap_order(slinked, xoffset=xoff), expected)
+
+ def test_retrieve_strains_and_values(self):
+ """Test retrieval of strains and values."""
+ for slist, tdata, expected in [
+ [["s1", "s2", "s3", "s4"], [9, None, 5, 4],
+ (("s1", "s3", "s4"), (9, 5, 4))],
+ [["s1", "s2", "s3", "s4", "s5"], [6, None, None, 4, None],
+ (("s1", "s4"), (6, 4))]]:
+ with self.subTest(strainlist=slist, traitdata=tdata):
+ self.assertEqual(
+ retrieve_strains_and_values(slist, tdata), expected)