diff options
Diffstat (limited to 'gn3')
-rw-r--r-- | gn3/computations/heatmap.py | 68 | ||||
-rw-r--r-- | gn3/computations/qtlreaper.py | 92 | ||||
-rw-r--r-- | gn3/db/genotypes.py | 69 | ||||
-rw-r--r-- | gn3/settings.py | 7 |
4 files changed, 229 insertions, 7 deletions
diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py index 3c35029..e0ff05b 100644 --- a/gn3/computations/heatmap.py +++ b/gn3/computations/heatmap.py @@ -149,22 +149,22 @@ def heatmap_data(formd, search_result, conn: Any): def __retrieve_traitlist_and_datalist(threshold, fullname): trait = retrieve_trait_info(threshold, fullname, conn) - return ( - trait, - export_trait_data(retrieve_trait_data(trait, conn), strainlist)) + return (trait, retrieve_trait_data(trait, conn)) traits_details = [ __retrieve_traitlist_and_datalist(threshold, fullname) for fullname in search_result] - traits_list = map(lambda x: x[0], traits_details) - traits_data_list = map(lambda x: x[1], traits_details) + traits_list = tuple(x[0] for x in traits_details) + traits_data_list = [x[1] for x in traits_details] + exported_traits_data_list = tuple( + export_trait_data(td, strainlist) for td in traits_data_list) return { "target_description_checked": formd.formdata.getvalue( "targetDescriptionCheck", ""), "cluster_checked": cluster_checked, "slink_data": ( - slink(cluster_traits(traits_data_list)) + slink(cluster_traits(exported_traits_data_list)) if cluster_checked else False), "sessionfile": formd.formdata.getvalue("session"), "genotype": genotype, @@ -173,5 +173,59 @@ def heatmap_data(formd, search_result, conn: Any): "ppolar": formd.ppolar, "mpolar":formd.mpolar, "traits_list": traits_list, - "traits_data_list": traits_data_list + "traits_data_list": traits_data_list, + "exported_traits_data_list": exported_traits_data_list } + +def compute_heatmap_order( + slink_data, xoffset: int = 40, neworder: tuple = tuple()): + """ + Compute the data used for drawing the heatmap proper from `slink_data`. + + This function tries to reproduce the creation and update of the `neworder` + variable in + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L120 + and in the `web.webqtl.heatmap.Heatmap.draw` function in GN1 + """ + d_1 = (0, 0, 0) # returned from self.draw in lines 391 and 399. This is just a placeholder + + def __order_maker(norder, slnk_dt): + if isinstance(slnk_dt[0], int) and isinstance(slnk_dt[1], int): + return norder + ( + (xoffset+20, slnk_dt[0]), (xoffset + 40, slnk_dt[1])) + + if isinstance(slnk_dt[0], int): + return norder + ((xoffset + 20, slnk_dt[0]), ) + + if isinstance(slnk_dt[1], int): + return norder + ((xoffset + d_1[0] + 20, slnk_dt[1]), ) + + return __order_maker(__order_maker(norder, slnk_dt[0]), slnk_dt[1]) + + return __order_maker(neworder, slink_data) + +def retrieve_strains_and_values(orders, strainlist, traits_data_list): + """ + Get the strains and their corresponding values from `strainlist` and + `traits_data_list`. + + This migrates the code in + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L215-221 + """ + # This feels nasty! There's a lot of mutation of values here, that might + # indicate something untoward in the design of this function and its + # dependents ==> Review + strains = [] + values = [] + rets = [] + for order in orders: + temp_val = traits_data_list[order[1]] + for i, strain in enumerate(strainlist): + if temp_val[i] is not None: + strains.append(strain) + values.append(temp_val[i]) + rets.append([order, strains[:], values[:]]) + strains = [] + values = [] + + return rets diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py new file mode 100644 index 0000000..c058e14 --- /dev/null +++ b/gn3/computations/qtlreaper.py @@ -0,0 +1,92 @@ +""" +This module contains functions to interact with the `qtlreaper` utility for +computation of QTLs. +""" +import os +import random +import string +import subprocess +from gn3.settings import TMPDIR, REAPER_COMMAND + +def random_string(length): + """Generate a random string of length `length`.""" + return "".join( + random.choices( + string.ascii_letters + string.digits, k=length)) + +def generate_traits_file(strains, trait_values, traits_filename): + """ + Generate a traits file for use with `qtlreaper`. + + PARAMETERS: + strains: A list of strains to use as the headers for the various columns. + trait_values: A list of lists of values for each trait and strain. + traits_filename: The tab-separated value to put the values in for + computation of QTLs. + """ + header = "Trait\t{}\n".format("\t".join(strains)) + data = [header] + [ + "T{}\t{}\n".format(i+1, "\t".join([str(i) for i in t])) + for i, t in enumerate(trait_values[:-1])] + [ + "T{}\t{}".format(len(trait_values), "\t".join([str(i) for i in t])) + for t in trait_values[-1:]] + with open(traits_filename, "w") as outfile: + outfile.writelines(data) + +def create_output_directory(path: str): + """Create the output directory at `path` if it does not exist.""" + try: + os.mkdir(path) + except OSError: + pass + +def run_reaper( + genotype_filename: str, traits_filename: str, + other_options: tuple = ("--n_permutations", "1000"), + separate_nperm_output: bool = False, + output_dir: str = TMPDIR): + """ + Run the QTLReaper command to compute the QTLs. + + PARAMETERS: + genotype_filename: The complete path to a genotype file to use in the QTL + computation. + traits_filename: A path to a file previously generated with the + `generate_traits_file` function in this module, to be used in the QTL + computation. + other_options: Other options to pass to the `qtlreaper` command to modify + the QTL computations. + separate_nperm_output: A flag indicating whether or not to provide a + separate output for the permutations computation. The default is False, + which means by default, no separate output file is created. + output_dir: A path to the directory where the outputs are put + + RETURNS: + The function returns a tuple of the main output file, and the output file + for the permutation computations. If the `separate_nperm_output` is `False`, + the second value in the tuple returned is `None`. + + RAISES: + The function will raise a `subprocess.CalledProcessError` exception in case + of any errors running the `qtlreaper` command. + """ + create_output_directory("{}/qtlreaper".format(output_dir)) + output_filename = "{}/qtlreaper/main_output_{}.txt".format( + output_dir, random_string(10)) + output_list = ["--main_output", output_filename] + if separate_nperm_output: + permu_output_filename = "{}/qtlreaper/permu_output_{}.txt".format( + output_dir, random_string(10)) + output_list = output_list + ["--permu_output", permu_output_filename] + else: + permu_output_filename = None + + command_list = [ + REAPER_COMMAND, "--geno", genotype_filename, + *other_options, # this splices the `other_options` list here + "--traits", traits_filename, + *output_list # this splices the `output_list` list here + ] + + subprocess.run(command_list, check=True) + return (output_filename, permu_output_filename) diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py new file mode 100644 index 0000000..610ddde --- /dev/null +++ b/gn3/db/genotypes.py @@ -0,0 +1,69 @@ +"""Genotype utilities""" + +import os +import gzip +from gn3.settings import GENOTYPE_FILES + +def build_genotype_file( + geno_name: str, base_dir: str = GENOTYPE_FILES, + extension: str = "geno"): + """Build the absolute path for the genotype file.""" + return "{}/{}.{}".format(os.path.abspath(base_dir), geno_name, extension) + +def load_genotype_samples(genotype_filename: str, file_type: str = "geno"): + """ + Load sample of strains from genotype files. + + DESCRIPTION: + Traits can contain a varied number of strains, some of which do not exist in + certain genotypes. In order to compute QTLs, GEMMAs, etc, we need to ensure + to pick only those strains that exist in the genotype under consideration + for the traits used in the computation. + + This function loads a list of samples from the genotype files for use in + filtering out unusable strains. + + + PARAMETERS: + genotype_filename: The absolute path to the genotype file to load the + samples from. + file_type: The type of file. Currently supported values are 'geno' and + 'plink'. + """ + file_type_fns = { + "geno": __load_genotype_samples_from_geno, + "plink": __load_genotype_samples_from_plink + } + return file_type_fns[file_type](genotype_filename) + +def __load_genotype_samples_from_geno(genotype_filename: str): + """ + Helper function for `load_genotype_samples` function. + + Loads samples from '.geno' files. + """ + gzipped_filename = "{}.gz".format(genotype_filename) + if os.path.isfile(gzipped_filename): + genofile = gzip.open(gzipped_filename) + else: + genofile = open(genotype_filename) + + for row in genofile: + line = row.strip() + if (not line) or (line.startswith(("#", "@"))): + continue + break + + headers = line.split("\t") + if headers[3] == "Mb": + return headers[4:] + return headers[3:] + +def __load_genotype_samples_from_plink(genotype_filename: str): + """ + Helper function for `load_genotype_samples` function. + + Loads samples from '.plink' files. + """ + genofile = open(genotype_filename) + return [line.split(" ")[1] for line in genofile] diff --git a/gn3/settings.py b/gn3/settings.py index f4866d5..a08f846 100644 --- a/gn3/settings.py +++ b/gn3/settings.py @@ -24,3 +24,10 @@ GN2_BASE_URL = "http://www.genenetwork.org/" # biweight script BIWEIGHT_RSCRIPT = "~/genenetwork3/scripts/calculate_biweight.R" + +# qtlreaper command +REAPER_COMMAND = "{}/bin/qtlreaper".format(os.environ.get("GUIX_ENVIRONMENT")) + +# genotype files +GENOTYPE_FILES = os.environ.get( + "GENOTYPE_FILES", "{}/genotype_files/genotype".format(os.environ.get("HOME"))) |