diff options
-rw-r--r-- | README.md | 40 | ||||
-rw-r--r-- | gn3/computations/heatmap.py | 68 | ||||
-rw-r--r-- | gn3/computations/qtlreaper.py | 92 | ||||
-rw-r--r-- | gn3/db/genotypes.py | 69 | ||||
-rw-r--r-- | gn3/settings.py | 7 | ||||
-rw-r--r-- | guix.scm | 6 | ||||
-rw-r--r-- | qtlfilesexport.py | 68 | ||||
-rw-r--r-- | tests/unit/computations/test_heatmap.py | 37 |
8 files changed, 375 insertions, 12 deletions
@@ -24,7 +24,7 @@ guix environment --load=guix.scm Also, make sure you have the [guix-bioinformatics](https://git.genenetwork.org/guix-bioinformatics/guix-bioinformatics) channel set up. ```bash -env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ ~/.config/guix/current/bin/guix environment --load=guix.scm +env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ ~/.config/guix/current/bin/guix environment --expose=$HOME/genotype_files/ --load=guix.scm python3 import redis ``` @@ -32,7 +32,7 @@ python3 #### Run a Guix container ``` -env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ ~/.config/guix/current/bin/guix environment -C --network --load=guix.scm +env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ ~/.config/guix/current/bin/guix environment -C --network --expose=$HOME/genotype_files/ --load=guix.scm ``` @@ -157,3 +157,39 @@ guix. To freeze dependencies: pip freeze --path venv/lib/python3.8/site-packages > requirements.txt ``` + +## Genotype Files + +You can get the genotype files from http://ipfs.genenetwork.org/ipfs/QmXQy3DAUWJuYxubLHLkPMNCEVq1oV7844xWG2d1GSPFPL and save them on your host machine at, say `$HOME/genotype_files` with something like: + +```bash +$ mkdir -p $HOME/genotype_files +$ cd $HOME/genotype_files +$ yes | 7z x genotype_files.tar.7z +$ tar xf genotype_files.tar +``` + +The `genotype_files.tar.7z` file seems to only contain the **BXD.geno** genotype file. + +## QTLReaper (rust-qtlreaper) and Trait Files + +To run QTL computations, this system makes use of the [rust-qtlreaper](https://github.com/chfi/rust-qtlreaper.git) utility. + +To do this, the system needs to export the trait data into a tab-separated file, that can then be passed to the utility using the `--traits` option. For more information about the available options, please [see the rust-qtlreaper](https://github.com/chfi/rust-qtlreaper.git) repository. + +### Traits File Format + +The traits file begins with a header row/line with the column headers. The first column in the file has the header **"Trait"**. Every other column has a header for one of the strains in consideration. + +Under the **"Trait"** column, the traits are numbered from **T1** to **T<n>** where **<n>** is the count of the total number of traits in consideration. + +As an example, you could end up with a trait file like the following: + +```txt +Trait BXD27 BXD32 DBA/2J BXD21 ... +T1 10.5735 9.27408 9.48255 9.18253 ... +T2 6.4471 6.7191 5.98015 6.68051 ... +... +``` + +It is very important that the column header names for the strains correspond to the genotype file used. diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py index 3c35029..e0ff05b 100644 --- a/gn3/computations/heatmap.py +++ b/gn3/computations/heatmap.py @@ -149,22 +149,22 @@ def heatmap_data(formd, search_result, conn: Any): def __retrieve_traitlist_and_datalist(threshold, fullname): trait = retrieve_trait_info(threshold, fullname, conn) - return ( - trait, - export_trait_data(retrieve_trait_data(trait, conn), strainlist)) + return (trait, retrieve_trait_data(trait, conn)) traits_details = [ __retrieve_traitlist_and_datalist(threshold, fullname) for fullname in search_result] - traits_list = map(lambda x: x[0], traits_details) - traits_data_list = map(lambda x: x[1], traits_details) + traits_list = tuple(x[0] for x in traits_details) + traits_data_list = [x[1] for x in traits_details] + exported_traits_data_list = tuple( + export_trait_data(td, strainlist) for td in traits_data_list) return { "target_description_checked": formd.formdata.getvalue( "targetDescriptionCheck", ""), "cluster_checked": cluster_checked, "slink_data": ( - slink(cluster_traits(traits_data_list)) + slink(cluster_traits(exported_traits_data_list)) if cluster_checked else False), "sessionfile": formd.formdata.getvalue("session"), "genotype": genotype, @@ -173,5 +173,59 @@ def heatmap_data(formd, search_result, conn: Any): "ppolar": formd.ppolar, "mpolar":formd.mpolar, "traits_list": traits_list, - "traits_data_list": traits_data_list + "traits_data_list": traits_data_list, + "exported_traits_data_list": exported_traits_data_list } + +def compute_heatmap_order( + slink_data, xoffset: int = 40, neworder: tuple = tuple()): + """ + Compute the data used for drawing the heatmap proper from `slink_data`. + + This function tries to reproduce the creation and update of the `neworder` + variable in + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L120 + and in the `web.webqtl.heatmap.Heatmap.draw` function in GN1 + """ + d_1 = (0, 0, 0) # returned from self.draw in lines 391 and 399. This is just a placeholder + + def __order_maker(norder, slnk_dt): + if isinstance(slnk_dt[0], int) and isinstance(slnk_dt[1], int): + return norder + ( + (xoffset+20, slnk_dt[0]), (xoffset + 40, slnk_dt[1])) + + if isinstance(slnk_dt[0], int): + return norder + ((xoffset + 20, slnk_dt[0]), ) + + if isinstance(slnk_dt[1], int): + return norder + ((xoffset + d_1[0] + 20, slnk_dt[1]), ) + + return __order_maker(__order_maker(norder, slnk_dt[0]), slnk_dt[1]) + + return __order_maker(neworder, slink_data) + +def retrieve_strains_and_values(orders, strainlist, traits_data_list): + """ + Get the strains and their corresponding values from `strainlist` and + `traits_data_list`. + + This migrates the code in + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L215-221 + """ + # This feels nasty! There's a lot of mutation of values here, that might + # indicate something untoward in the design of this function and its + # dependents ==> Review + strains = [] + values = [] + rets = [] + for order in orders: + temp_val = traits_data_list[order[1]] + for i, strain in enumerate(strainlist): + if temp_val[i] is not None: + strains.append(strain) + values.append(temp_val[i]) + rets.append([order, strains[:], values[:]]) + strains = [] + values = [] + + return rets diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py new file mode 100644 index 0000000..c058e14 --- /dev/null +++ b/gn3/computations/qtlreaper.py @@ -0,0 +1,92 @@ +""" +This module contains functions to interact with the `qtlreaper` utility for +computation of QTLs. +""" +import os +import random +import string +import subprocess +from gn3.settings import TMPDIR, REAPER_COMMAND + +def random_string(length): + """Generate a random string of length `length`.""" + return "".join( + random.choices( + string.ascii_letters + string.digits, k=length)) + +def generate_traits_file(strains, trait_values, traits_filename): + """ + Generate a traits file for use with `qtlreaper`. + + PARAMETERS: + strains: A list of strains to use as the headers for the various columns. + trait_values: A list of lists of values for each trait and strain. + traits_filename: The tab-separated value to put the values in for + computation of QTLs. + """ + header = "Trait\t{}\n".format("\t".join(strains)) + data = [header] + [ + "T{}\t{}\n".format(i+1, "\t".join([str(i) for i in t])) + for i, t in enumerate(trait_values[:-1])] + [ + "T{}\t{}".format(len(trait_values), "\t".join([str(i) for i in t])) + for t in trait_values[-1:]] + with open(traits_filename, "w") as outfile: + outfile.writelines(data) + +def create_output_directory(path: str): + """Create the output directory at `path` if it does not exist.""" + try: + os.mkdir(path) + except OSError: + pass + +def run_reaper( + genotype_filename: str, traits_filename: str, + other_options: tuple = ("--n_permutations", "1000"), + separate_nperm_output: bool = False, + output_dir: str = TMPDIR): + """ + Run the QTLReaper command to compute the QTLs. + + PARAMETERS: + genotype_filename: The complete path to a genotype file to use in the QTL + computation. + traits_filename: A path to a file previously generated with the + `generate_traits_file` function in this module, to be used in the QTL + computation. + other_options: Other options to pass to the `qtlreaper` command to modify + the QTL computations. + separate_nperm_output: A flag indicating whether or not to provide a + separate output for the permutations computation. The default is False, + which means by default, no separate output file is created. + output_dir: A path to the directory where the outputs are put + + RETURNS: + The function returns a tuple of the main output file, and the output file + for the permutation computations. If the `separate_nperm_output` is `False`, + the second value in the tuple returned is `None`. + + RAISES: + The function will raise a `subprocess.CalledProcessError` exception in case + of any errors running the `qtlreaper` command. + """ + create_output_directory("{}/qtlreaper".format(output_dir)) + output_filename = "{}/qtlreaper/main_output_{}.txt".format( + output_dir, random_string(10)) + output_list = ["--main_output", output_filename] + if separate_nperm_output: + permu_output_filename = "{}/qtlreaper/permu_output_{}.txt".format( + output_dir, random_string(10)) + output_list = output_list + ["--permu_output", permu_output_filename] + else: + permu_output_filename = None + + command_list = [ + REAPER_COMMAND, "--geno", genotype_filename, + *other_options, # this splices the `other_options` list here + "--traits", traits_filename, + *output_list # this splices the `output_list` list here + ] + + subprocess.run(command_list, check=True) + return (output_filename, permu_output_filename) diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py new file mode 100644 index 0000000..610ddde --- /dev/null +++ b/gn3/db/genotypes.py @@ -0,0 +1,69 @@ +"""Genotype utilities""" + +import os +import gzip +from gn3.settings import GENOTYPE_FILES + +def build_genotype_file( + geno_name: str, base_dir: str = GENOTYPE_FILES, + extension: str = "geno"): + """Build the absolute path for the genotype file.""" + return "{}/{}.{}".format(os.path.abspath(base_dir), geno_name, extension) + +def load_genotype_samples(genotype_filename: str, file_type: str = "geno"): + """ + Load sample of strains from genotype files. + + DESCRIPTION: + Traits can contain a varied number of strains, some of which do not exist in + certain genotypes. In order to compute QTLs, GEMMAs, etc, we need to ensure + to pick only those strains that exist in the genotype under consideration + for the traits used in the computation. + + This function loads a list of samples from the genotype files for use in + filtering out unusable strains. + + + PARAMETERS: + genotype_filename: The absolute path to the genotype file to load the + samples from. + file_type: The type of file. Currently supported values are 'geno' and + 'plink'. + """ + file_type_fns = { + "geno": __load_genotype_samples_from_geno, + "plink": __load_genotype_samples_from_plink + } + return file_type_fns[file_type](genotype_filename) + +def __load_genotype_samples_from_geno(genotype_filename: str): + """ + Helper function for `load_genotype_samples` function. + + Loads samples from '.geno' files. + """ + gzipped_filename = "{}.gz".format(genotype_filename) + if os.path.isfile(gzipped_filename): + genofile = gzip.open(gzipped_filename) + else: + genofile = open(genotype_filename) + + for row in genofile: + line = row.strip() + if (not line) or (line.startswith(("#", "@"))): + continue + break + + headers = line.split("\t") + if headers[3] == "Mb": + return headers[4:] + return headers[3:] + +def __load_genotype_samples_from_plink(genotype_filename: str): + """ + Helper function for `load_genotype_samples` function. + + Loads samples from '.plink' files. + """ + genofile = open(genotype_filename) + return [line.split(" ")[1] for line in genofile] diff --git a/gn3/settings.py b/gn3/settings.py index f4866d5..a08f846 100644 --- a/gn3/settings.py +++ b/gn3/settings.py @@ -24,3 +24,10 @@ GN2_BASE_URL = "http://www.genenetwork.org/" # biweight script BIWEIGHT_RSCRIPT = "~/genenetwork3/scripts/calculate_biweight.R" + +# qtlreaper command +REAPER_COMMAND = "{}/bin/qtlreaper".format(os.environ.get("GUIX_ENVIRONMENT")) + +# genotype files +GENOTYPE_FILES = os.environ.get( + "GENOTYPE_FILES", "{}/genotype_files/genotype".format(os.environ.get("HOME"))) @@ -43,7 +43,8 @@ (gnu packages databases) (gnu packages statistics) (gnu packages bioconductor) - (gn packages golang) + (gnu packages golang) + (gn packages genenetwork) (gnu packages python) (gnu packages python-check) (gnu packages python-crypto) @@ -104,7 +105,8 @@ ("r-qtl" ,r-qtl) ("r-stringi" ,r-stringi) ("python-plotly" ,python-plotly) - ("python-pandas" ,python-pandas))) + ("python-pandas" ,python-pandas) + ("rust-qtlreaper" ,rust-qtlreaper))) (build-system python-build-system) (home-page "https://github.com/genenetwork/genenetwork3") (synopsis "GeneNetwork3 API for data science and machine learning.") diff --git a/qtlfilesexport.py b/qtlfilesexport.py new file mode 100644 index 0000000..799de31 --- /dev/null +++ b/qtlfilesexport.py @@ -0,0 +1,68 @@ +""" +Test the qtlfiles export of traits files + +Run with: + + env SQL_URI="mysql://<user>:<password>@<host>:<port>/db_webqtl" python3 qtlfilesexport.py + +replacing the variables in the angled brackets with the appropriate values +""" +from gn3.computations.slink import slink +from gn3.db_utils import database_connector +from gn3.computations.qtlreaper import run_reaper +from gn3.computations.heatmap import export_trait_data +from gn3.db.traits import retrieve_trait_data, retrieve_trait_info +from gn3.db.genotypes import build_genotype_file, load_genotype_samples +from gn3.computations.qtlreaper import random_string, generate_traits_file +from gn3.computations.heatmap import ( + cluster_traits, + compute_heatmap_order, + retrieve_strains_and_values) + +TMPDIR = "tmp/qtltests" + +def trait_fullnames(): + """Return sample names for traits""" + return [ + "UCLA_BXDBXH_CARTILAGE_V2::ILM103710672", + "UCLA_BXDBXH_CARTILAGE_V2::ILM2260338", + "UCLA_BXDBXH_CARTILAGE_V2::ILM3140576", + "UCLA_BXDBXH_CARTILAGE_V2::ILM5670577", + "UCLA_BXDBXH_CARTILAGE_V2::ILM2070121", + "UCLA_BXDBXH_CARTILAGE_V2::ILM103990541", + "UCLA_BXDBXH_CARTILAGE_V2::ILM1190722", + "UCLA_BXDBXH_CARTILAGE_V2::ILM6590722", + "UCLA_BXDBXH_CARTILAGE_V2::ILM4200064", + "UCLA_BXDBXH_CARTILAGE_V2::ILM3140463"] + +def main(): + """entrypoint function""" + conn = database_connector()[0] + threshold = 0 + traits = [ + retrieve_trait_info(threshold, fullname, conn) + for fullname in trait_fullnames()] + traits_data_list = [retrieve_trait_data(t, conn) for t in traits] + genotype_filename = build_genotype_file(traits[0]["riset"]) + strains = load_genotype_samples(genotype_filename) + exported_traits_data_list = [ + export_trait_data(td, strains) for td in traits_data_list] + slinked = slink(cluster_traits(exported_traits_data_list)) + orders = compute_heatmap_order(slinked) + strains_and_values = retrieve_strains_and_values( + orders, strains, exported_traits_data_list) + strains_values = strains_and_values[0][1] + trait_values = [t[2] for t in strains_and_values] + traits_filename = "{}/traits_test_file_{}.txt".format( + TMPDIR, random_string(10)) + generate_traits_file(strains_values, trait_values, traits_filename) + print("Generated file: {}".format(traits_filename)) + + main_output, permutations_output = run_reaper( + genotype_filename, traits_filename, separate_nperm_output=True) + + print("Main output: {}, Permutation output: {}".format( + main_output, permutations_output)) + +if __name__ == "__main__": + main() diff --git a/tests/unit/computations/test_heatmap.py b/tests/unit/computations/test_heatmap.py index 650cb45..686288d 100644 --- a/tests/unit/computations/test_heatmap.py +++ b/tests/unit/computations/test_heatmap.py @@ -1,6 +1,10 @@ """Module contains tests for gn3.computations.heatmap""" from unittest import TestCase -from gn3.computations.heatmap import cluster_traits, export_trait_data +from gn3.computations.heatmap import ( + cluster_traits, + export_trait_data, + compute_heatmap_order, + retrieve_strains_and_values) strainlist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] trait_data = { @@ -34,6 +38,16 @@ trait_data = { "C57BL/6J": {"strain_name": "C57BL/6J", "value": 7.50606, "variance": None, "ndata": None}, "DBA/2J": {"strain_name": "DBA/2J", "value": 7.72588, "variance": None, "ndata": None}}} +slinked = ( + (((0, 2, 0.16381088984330505), + ((1, 7, 0.06024619831474998), 5, 0.19179284676938602), + 0.20337048635536847), + 9, + 0.23451785425383564), + ((3, (6, 8, 0.2140799896286565), 0.25879514152086425), + 4, 0.8968250491499363), + 0.9313185954797953) + class TestHeatmap(TestCase): """Class for testing heatmap computation functions""" @@ -141,3 +155,24 @@ class TestHeatmap(TestCase): 0.9313185954797953, 1.1683723389247052, 0.23451785425383564, 1.7413442197913358, 0.33370067057028485, 1.3256191648260216, 0.0))) + + def test_compute_heatmap_order(self): + """Test the orders.""" + for xoff, expected in [ + (40, ((60, 9), (60, 4))), + (30, ((50, 9), (50, 4))), + (20, ((40, 9), (40, 4)))]: + with self.subTest(xoffset=xoff): + self.assertEqual( + compute_heatmap_order(slinked, xoffset=xoff), expected) + + def test_retrieve_strains_and_values(self): + """Test retrieval of strains and values.""" + for slist, tdata, expected in [ + [["s1", "s2", "s3", "s4"], [9, None, 5, 4], + (("s1", "s3", "s4"), (9, 5, 4))], + [["s1", "s2", "s3", "s4", "s5"], [6, None, None, 4, None], + (("s1", "s4"), (6, 4))]]: + with self.subTest(strainlist=slist, traitdata=tdata): + self.assertEqual( + retrieve_strains_and_values(slist, tdata), expected) |