diff options
author | zsloan | 2021-10-12 18:56:34 +0000 |
---|---|---|
committer | zsloan | 2021-10-12 18:56:34 +0000 |
commit | 0f396f4a1a753d449cf2975fc425d587d9350689 (patch) | |
tree | c9dac243dc05e5cb90ccb7f1d96fd599986bf60a /gn3/computations | |
parent | 976660ce9ef915426c7ce5ff9077b439e4102a2c (diff) | |
parent | 77c274b79c3ec01de60e90db3299763cb58f715b (diff) | |
download | genenetwork3-0f396f4a1a753d449cf2975fc425d587d9350689.tar.gz |
Merge branch 'main' of https://github.com/genenetwork/genenetwork3 into feature/add_rqtl_pairscan
Diffstat (limited to 'gn3/computations')
-rw-r--r-- | gn3/computations/heatmap.py | 177 | ||||
-rw-r--r-- | gn3/computations/parsers.py | 10 | ||||
-rw-r--r-- | gn3/computations/qtlreaper.py | 170 | ||||
-rw-r--r-- | gn3/computations/wgcna.py | 51 |
4 files changed, 226 insertions, 182 deletions
diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py deleted file mode 100644 index 3c35029..0000000 --- a/gn3/computations/heatmap.py +++ /dev/null @@ -1,177 +0,0 @@ -""" -This module will contain functions to be used in computation of the data used to -generate various kinds of heatmaps. -""" - -from functools import reduce -from typing import Any, Dict, Sequence -from gn3.computations.slink import slink -from gn3.db.traits import retrieve_trait_data, retrieve_trait_info -from gn3.computations.correlations2 import compute_correlation - -def export_trait_data( - trait_data: dict, strainlist: Sequence[str], dtype: str = "val", - var_exists: bool = False, n_exists: bool = False): - """ - Export data according to `strainlist`. Mostly used in calculating - correlations. - - DESCRIPTION: - Migrated from - https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L166-L211 - - PARAMETERS - trait: (dict) - The dictionary of key-value pairs representing a trait - strainlist: (list) - A list of strain names - type: (str) - ... verify what this is ... - var_exists: (bool) - A flag indicating existence of variance - n_exists: (bool) - A flag indicating existence of ndata - """ - def __export_all_types(tdata, strain): - sample_data = [] - if tdata[strain]["value"]: - sample_data.append(tdata[strain]["value"]) - if var_exists: - if tdata[strain]["variance"]: - sample_data.append(tdata[strain]["variance"]) - else: - sample_data.append(None) - if n_exists: - if tdata[strain]["ndata"]: - sample_data.append(tdata[strain]["ndata"]) - else: - sample_data.append(None) - else: - if var_exists and n_exists: - sample_data += [None, None, None] - elif var_exists or n_exists: - sample_data += [None, None] - else: - sample_data.append(None) - - return tuple(sample_data) - - def __exporter(accumulator, strain): - # pylint: disable=[R0911] - if strain in trait_data["data"]: - if dtype == "val": - return accumulator + (trait_data["data"][strain]["value"], ) - if dtype == "var": - return accumulator + (trait_data["data"][strain]["variance"], ) - if dtype == "N": - return accumulator + (trait_data["data"][strain]["ndata"], ) - if dtype == "all": - return accumulator + __export_all_types(trait_data["data"], strain) - raise KeyError("Type `%s` is incorrect" % dtype) - if var_exists and n_exists: - return accumulator + (None, None, None) - if var_exists or n_exists: - return accumulator + (None, None) - return accumulator + (None,) - - return reduce(__exporter, strainlist, tuple()) - -def trait_display_name(trait: Dict): - """ - Given a trait, return a name to use to display the trait on a heatmap. - - DESCRIPTION - Migrated from - https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L141-L157 - """ - if trait.get("db", None) and trait.get("trait_name", None): - if trait["db"]["dataset_type"] == "Temp": - desc = trait["description"] - if desc.find("PCA") >= 0: - return "%s::%s" % ( - trait["db"]["displayname"], - desc[desc.rindex(':')+1:].strip()) - return "%s::%s" % ( - trait["db"]["displayname"], - desc[:desc.index('entered')].strip()) - prefix = "%s::%s" % ( - trait["db"]["dataset_name"], trait["trait_name"]) - if trait["cellid"]: - return "%s::%s" % (prefix, trait["cellid"]) - return prefix - return trait["description"] - -def cluster_traits(traits_data_list: Sequence[Dict]): - """ - Clusters the trait values. - - DESCRIPTION - Attempts to replicate the clustering of the traits, as done at - https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L138-L162 - """ - def __compute_corr(tdata_i, tdata_j): - if tdata_i[0] == tdata_j[0]: - return 0.0 - corr_vals = compute_correlation(tdata_i[1], tdata_j[1]) - corr = corr_vals[0] - if (1 - corr) < 0: - return 0.0 - return 1 - corr - - def __cluster(tdata_i): - return tuple( - __compute_corr(tdata_i, tdata_j) - for tdata_j in enumerate(traits_data_list)) - - return tuple(__cluster(tdata_i) for tdata_i in enumerate(traits_data_list)) - -def heatmap_data(formd, search_result, conn: Any): - """ - heatmap function - - DESCRIPTION - This function is an attempt to reproduce the initialisation at - https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L46-L64 - and also the clustering and slink computations at - https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L138-L165 - with the help of the `gn3.computations.heatmap.cluster_traits` function. - - It does not try to actually draw the heatmap image. - - PARAMETERS: - TODO: Elaborate on the parameters here... - """ - threshold = 0 # webqtlConfig.PUBLICTHRESH - cluster_checked = formd.formdata.getvalue("clusterCheck", "") - strainlist = [ - strain for strain in formd.strainlist if strain not in formd.parlist] - genotype = formd.genotype - - def __retrieve_traitlist_and_datalist(threshold, fullname): - trait = retrieve_trait_info(threshold, fullname, conn) - return ( - trait, - export_trait_data(retrieve_trait_data(trait, conn), strainlist)) - - traits_details = [ - __retrieve_traitlist_and_datalist(threshold, fullname) - for fullname in search_result] - traits_list = map(lambda x: x[0], traits_details) - traits_data_list = map(lambda x: x[1], traits_details) - - return { - "target_description_checked": formd.formdata.getvalue( - "targetDescriptionCheck", ""), - "cluster_checked": cluster_checked, - "slink_data": ( - slink(cluster_traits(traits_data_list)) - if cluster_checked else False), - "sessionfile": formd.formdata.getvalue("session"), - "genotype": genotype, - "nLoci": sum(map(len, genotype)), - "strainlist": strainlist, - "ppolar": formd.ppolar, - "mpolar":formd.mpolar, - "traits_list": traits_list, - "traits_data_list": traits_data_list - } diff --git a/gn3/computations/parsers.py b/gn3/computations/parsers.py index 94387ff..1af35d6 100644 --- a/gn3/computations/parsers.py +++ b/gn3/computations/parsers.py @@ -14,7 +14,7 @@ def parse_genofile(file_path: str) -> Tuple[List[str], 'h': 0, 'u': None, } - genotypes, strains = [], [] + genotypes, samples = [], [] with open(file_path, "r") as _genofile: for line in _genofile: line = line.strip() @@ -22,8 +22,8 @@ def parse_genofile(file_path: str) -> Tuple[List[str], continue cells = line.split() if line.startswith("Chr"): - strains = cells[4:] - strains = [strain.lower() for strain in strains] + samples = cells[4:] + samples = [sample.lower() for sample in samples] continue values = [__map.get(value.lower(), None) for value in cells[4:]] genotype = { @@ -32,7 +32,7 @@ def parse_genofile(file_path: str) -> Tuple[List[str], "cm": cells[2], "mb": cells[3], "values": values, - "dicvalues": dict(zip(strains, values)), + "dicvalues": dict(zip(samples, values)), } genotypes.append(genotype) - return strains, genotypes + return samples, genotypes diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py new file mode 100644 index 0000000..d1ff4ac --- /dev/null +++ b/gn3/computations/qtlreaper.py @@ -0,0 +1,170 @@ +""" +This module contains functions to interact with the `qtlreaper` utility for +computation of QTLs. +""" +import os +import subprocess +from typing import Union + +from gn3.random import random_string +from gn3.settings import TMPDIR, REAPER_COMMAND + +def generate_traits_file(samples, trait_values, traits_filename): + """ + Generate a traits file for use with `qtlreaper`. + + PARAMETERS: + samples: A list of samples to use as the headers for the various columns. + trait_values: A list of lists of values for each trait and sample. + traits_filename: The tab-separated value to put the values in for + computation of QTLs. + """ + header = "Trait\t{}\n".format("\t".join(samples)) + data = ( + [header] + + ["{}\t{}\n".format(i+1, "\t".join([str(i) for i in t])) + for i, t in enumerate(trait_values[:-1])] + + ["{}\t{}".format( + len(trait_values), "\t".join([str(i) for i in t])) + for t in trait_values[-1:]]) + with open(traits_filename, "w") as outfile: + outfile.writelines(data) + +def create_output_directory(path: str): + """Create the output directory at `path` if it does not exist.""" + try: + os.mkdir(path) + except FileExistsError: + # If the directory already exists, do nothing. + pass + +def run_reaper( + genotype_filename: str, traits_filename: str, + other_options: tuple = ("--n_permutations", "1000"), + separate_nperm_output: bool = False, + output_dir: str = TMPDIR): + """ + Run the QTLReaper command to compute the QTLs. + + PARAMETERS: + genotype_filename: The complete path to a genotype file to use in the QTL + computation. + traits_filename: A path to a file previously generated with the + `generate_traits_file` function in this module, to be used in the QTL + computation. + other_options: Other options to pass to the `qtlreaper` command to modify + the QTL computations. + separate_nperm_output: A flag indicating whether or not to provide a + separate output for the permutations computation. The default is False, + which means by default, no separate output file is created. + output_dir: A path to the directory where the outputs are put + + RETURNS: + The function returns a tuple of the main output file, and the output file + for the permutation computations. If the `separate_nperm_output` is `False`, + the second value in the tuple returned is `None`. + + RAISES: + The function will raise a `subprocess.CalledProcessError` exception in case + of any errors running the `qtlreaper` command. + """ + create_output_directory("{}/qtlreaper".format(output_dir)) + output_filename = "{}/qtlreaper/main_output_{}.txt".format( + output_dir, random_string(10)) + output_list = ["--main_output", output_filename] + if separate_nperm_output: + permu_output_filename: Union[None, str] = "{}/qtlreaper/permu_output_{}.txt".format( + output_dir, random_string(10)) + output_list = output_list + [ + "--permu_output", permu_output_filename] # type: ignore[list-item] + else: + permu_output_filename = None + + command_list = [ + REAPER_COMMAND, "--geno", genotype_filename, + *other_options, # this splices the `other_options` list here + "--traits", traits_filename, + *output_list # this splices the `output_list` list here + ] + + subprocess.run(command_list, check=True) + return (output_filename, permu_output_filename) + +def chromosome_sorter_key_fn(val): + """ + Useful for sorting the chromosomes + """ + if isinstance(val, int): + return val + return ord(val) + +def organise_reaper_main_results(parsed_results): + """ + Provide the results of running reaper in a format that is easier to use. + """ + def __organise_by_chromosome(chr_name, items): + chr_items = [item for item in items if item["Chr"] == chr_name] + return { + "Chr": chr_name, + "loci": [{ + "Locus": locus["Locus"], + "cM": locus["cM"], + "Mb": locus["Mb"], + "LRS": locus["LRS"], + "Additive": locus["Additive"], + "pValue": locus["pValue"] + } for locus in chr_items]} + + def __organise_by_id(identifier, items): + id_items = [item for item in items if item["ID"] == identifier] + unique_chromosomes = {item["Chr"] for item in id_items} + return { + "ID": identifier, + "chromosomes": { + _chr["Chr"]: _chr for _chr in [ + __organise_by_chromosome(chromo, id_items) + for chromo in sorted( + unique_chromosomes, key=chromosome_sorter_key_fn)]}} + + unique_ids = {res["ID"] for res in parsed_results} + return { + trait["ID"]: trait for trait in + [__organise_by_id(_id, parsed_results) for _id in sorted(unique_ids)]} + +def parse_reaper_main_results(results_file): + """ + Parse the results file of running QTLReaper into a list of dicts. + """ + with open(results_file, "r") as infile: + lines = infile.readlines() + + def __parse_column_float_value(value): + # pylint: disable=W0702 + try: + return float(value) + except: + return value + + def __parse_column_int_value(value): + # pylint: disable=W0702 + try: + return int(value) + except: + return value + + def __parse_line(line): + items = line.strip().split("\t") + return items[0:2] + [__parse_column_int_value(items[2])] + [ + __parse_column_float_value(item) for item in items[3:]] + + header = lines[0].strip().split("\t") + return [dict(zip(header, __parse_line(line))) for line in lines[1:]] + +def parse_reaper_permutation_results(results_file): + """ + Parse the results QTLReaper permutations into a list of values. + """ + with open(results_file, "r") as infile: + lines = infile.readlines() + + return [float(line.strip()) for line in lines] diff --git a/gn3/computations/wgcna.py b/gn3/computations/wgcna.py new file mode 100644 index 0000000..fd508fa --- /dev/null +++ b/gn3/computations/wgcna.py @@ -0,0 +1,51 @@ +"""module contains code to preprocess and call wgcna script""" + +import os +import json +import uuid +from gn3.settings import TMPDIR + +from gn3.commands import run_cmd + + +def dump_wgcna_data(request_data: dict): + """function to dump request data to json file""" + filename = f"{str(uuid.uuid4())}.json" + + temp_file_path = os.path.join(TMPDIR, filename) + + with open(temp_file_path, "w") as output_file: + json.dump(request_data, output_file) + + return temp_file_path + + +def compose_wgcna_cmd(rscript_path: str, temp_file_path: str): + """function to componse wgcna cmd""" + # (todo):issue relative paths to abs paths + cmd = f"Rscript ./scripts/{rscript_path} {temp_file_path}" + return cmd + + +def call_wgcna_script(rscript_path: str, request_data: dict): + """function to call wgcna script""" + generated_file = dump_wgcna_data(request_data) + cmd = compose_wgcna_cmd(rscript_path, generated_file) + + try: + + run_cmd_results = run_cmd(cmd) + + with open(generated_file, "r") as outputfile: + + if run_cmd_results["code"] != 0: + return run_cmd_results + return { + "data": json.load(outputfile), + **run_cmd_results + } + except FileNotFoundError: + # relook at handling errors gn3 + return { + "output": "output file not found" + } |