diff options
Diffstat (limited to 'gn3/computations')
-rw-r--r-- | gn3/computations/correlations.py | 36 | ||||
-rw-r--r-- | gn3/computations/correlations2.py | 50 | ||||
-rw-r--r-- | gn3/computations/heatmap.py | 177 | ||||
-rw-r--r-- | gn3/computations/rqtl.py | 77 | ||||
-rw-r--r-- | gn3/computations/slink.py | 198 |
5 files changed, 510 insertions, 28 deletions
diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py index bc738a7..bb13ff1 100644 --- a/gn3/computations/correlations.py +++ b/gn3/computations/correlations.py @@ -1,4 +1,5 @@ """module contains code for correlations""" +import math import multiprocessing from typing import List @@ -90,7 +91,7 @@ def compute_sample_r_correlation(trait_name, corr_method, trait_vals, target_values=sanitized_target_vals, corr_method=corr_method) - if corr_coefficient is not None: + if corr_coefficient is not None and not math.isnan(corr_coefficient): return (trait_name, corr_coefficient, p_value, num_overlap) return None @@ -123,11 +124,12 @@ def filter_shared_sample_keys(this_samplelist, return (this_vals, target_vals) -def compute_all_sample_correlation(this_trait, - target_dataset, - corr_method="pearson") -> List: +def fast_compute_all_sample_correlation(this_trait, + target_dataset, + corr_method="pearson") -> List: """Given a trait data sample-list and target__datasets compute all sample correlation + this functions uses multiprocessing if not use the normal fun """ # xtodo fix trait_name currently returning single one @@ -159,9 +161,9 @@ def compute_all_sample_correlation(this_trait, key=lambda trait_name: -abs(list(trait_name.values())[0]["corr_coefficient"])) -def benchmark_compute_all_sample(this_trait, - target_dataset, - corr_method="pearson") -> List: +def compute_all_sample_correlation(this_trait, + target_dataset, + corr_method="pearson") -> List: """Temp function to benchmark with compute_all_sample_r alternative to compute_all_sample_r where we use multiprocessing @@ -173,6 +175,7 @@ def benchmark_compute_all_sample(this_trait, target_trait_data = target_trait["trait_sample_data"] this_vals, target_vals = filter_shared_sample_keys( this_trait_samples, target_trait_data) + sample_correlation = compute_sample_r_correlation( trait_name=trait_name, corr_method=corr_method, @@ -189,7 +192,9 @@ def benchmark_compute_all_sample(this_trait, "num_overlap": num_overlap } corr_results.append({trait_name: corr_result}) - return corr_results + return sorted( + corr_results, + key=lambda trait_name: -abs(list(trait_name.values())[0]["corr_coefficient"])) def tissue_correlation_for_trait( @@ -335,9 +340,9 @@ def compute_all_lit_correlation(conn, trait_lists: List, return sorted_lit_results -def compute_all_tissue_correlation(primary_tissue_dict: dict, - target_tissues_data: dict, - corr_method: str): +def compute_tissue_correlation(primary_tissue_dict: dict, + target_tissues_data: dict, + corr_method: str): """Function acts as an abstraction for tissue_correlation_for_trait\ required input are target tissue object and primary tissue trait\ target tissues data contains the trait_symbol_dict and symbol_tissue_vals @@ -357,8 +362,7 @@ def compute_all_tissue_correlation(primary_tissue_dict: dict, target_tissues_values=target_tissue_vals, trait_id=trait_id, corr_method=corr_method) - tissue_result_dict = {trait_id: tissue_result} - tissues_results.append(tissue_result_dict) + tissues_results.append(tissue_result) return sorted( tissues_results, key=lambda trait_name: -abs(list(trait_name.values())[0]["tissue_corr"])) @@ -381,9 +385,9 @@ def process_trait_symbol_dict(trait_symbol_dict, symbol_tissue_vals_dict) -> Lis return traits_tissue_vals -def compute_tissue_correlation(primary_tissue_dict: dict, - target_tissues_data: dict, - corr_method: str): +def fast_compute_tissue_correlation(primary_tissue_dict: dict, + target_tissues_data: dict, + corr_method: str): """Experimental function that uses multiprocessing for computing tissue correlation diff --git a/gn3/computations/correlations2.py b/gn3/computations/correlations2.py new file mode 100644 index 0000000..93db3fa --- /dev/null +++ b/gn3/computations/correlations2.py @@ -0,0 +1,50 @@ +""" +DESCRIPTION: + TODO: Add a description for the module + +FUNCTIONS: +compute_correlation: + TODO: Describe what the function does...""" + +from math import sqrt +from functools import reduce +## From GN1: mostly for clustering and heatmap generation + +def __items_with_values(dbdata, userdata): + """Retains only corresponding items in the data items that are not `None` values. + This should probably be renamed to something sensible""" + def both_not_none(item1, item2): + """Check that both items are not the value `None`.""" + if (item1 is not None) and (item2 is not None): + return (item1, item2) + return None + def split_lists(accumulator, item): + """Separate the 'x' and 'y' items.""" + return [accumulator[0] + [item[0]], accumulator[1] + [item[1]]] + return reduce( + split_lists, + filter(lambda x: x is not None, map(both_not_none, dbdata, userdata)), + [[], []]) + +def compute_correlation(dbdata, userdata): + """Compute some form of correlation. + + This is extracted from + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/utility/webqtlUtil.py#L622-L647 + """ + x_items, y_items = __items_with_values(dbdata, userdata) + if len(x_items) < 6: + return (0.0, len(x_items)) + meanx = sum(x_items)/len(x_items) + meany = sum(y_items)/len(y_items) + def cal_corr_vals(acc, item): + xitem, yitem = item + return [ + acc[0] + ((xitem - meanx) * (yitem - meany)), + acc[1] + ((xitem - meanx) * (xitem - meanx)), + acc[2] + ((yitem - meany) * (yitem - meany))] + xyd, sxd, syd = reduce(cal_corr_vals, zip(x_items, y_items), [0.0, 0.0, 0.0]) + try: + return ((xyd/(sqrt(sxd)*sqrt(syd))), len(x_items)) + except ZeroDivisionError: + return(0, len(x_items)) diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py new file mode 100644 index 0000000..3c35029 --- /dev/null +++ b/gn3/computations/heatmap.py @@ -0,0 +1,177 @@ +""" +This module will contain functions to be used in computation of the data used to +generate various kinds of heatmaps. +""" + +from functools import reduce +from typing import Any, Dict, Sequence +from gn3.computations.slink import slink +from gn3.db.traits import retrieve_trait_data, retrieve_trait_info +from gn3.computations.correlations2 import compute_correlation + +def export_trait_data( + trait_data: dict, strainlist: Sequence[str], dtype: str = "val", + var_exists: bool = False, n_exists: bool = False): + """ + Export data according to `strainlist`. Mostly used in calculating + correlations. + + DESCRIPTION: + Migrated from + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L166-L211 + + PARAMETERS + trait: (dict) + The dictionary of key-value pairs representing a trait + strainlist: (list) + A list of strain names + type: (str) + ... verify what this is ... + var_exists: (bool) + A flag indicating existence of variance + n_exists: (bool) + A flag indicating existence of ndata + """ + def __export_all_types(tdata, strain): + sample_data = [] + if tdata[strain]["value"]: + sample_data.append(tdata[strain]["value"]) + if var_exists: + if tdata[strain]["variance"]: + sample_data.append(tdata[strain]["variance"]) + else: + sample_data.append(None) + if n_exists: + if tdata[strain]["ndata"]: + sample_data.append(tdata[strain]["ndata"]) + else: + sample_data.append(None) + else: + if var_exists and n_exists: + sample_data += [None, None, None] + elif var_exists or n_exists: + sample_data += [None, None] + else: + sample_data.append(None) + + return tuple(sample_data) + + def __exporter(accumulator, strain): + # pylint: disable=[R0911] + if strain in trait_data["data"]: + if dtype == "val": + return accumulator + (trait_data["data"][strain]["value"], ) + if dtype == "var": + return accumulator + (trait_data["data"][strain]["variance"], ) + if dtype == "N": + return accumulator + (trait_data["data"][strain]["ndata"], ) + if dtype == "all": + return accumulator + __export_all_types(trait_data["data"], strain) + raise KeyError("Type `%s` is incorrect" % dtype) + if var_exists and n_exists: + return accumulator + (None, None, None) + if var_exists or n_exists: + return accumulator + (None, None) + return accumulator + (None,) + + return reduce(__exporter, strainlist, tuple()) + +def trait_display_name(trait: Dict): + """ + Given a trait, return a name to use to display the trait on a heatmap. + + DESCRIPTION + Migrated from + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L141-L157 + """ + if trait.get("db", None) and trait.get("trait_name", None): + if trait["db"]["dataset_type"] == "Temp": + desc = trait["description"] + if desc.find("PCA") >= 0: + return "%s::%s" % ( + trait["db"]["displayname"], + desc[desc.rindex(':')+1:].strip()) + return "%s::%s" % ( + trait["db"]["displayname"], + desc[:desc.index('entered')].strip()) + prefix = "%s::%s" % ( + trait["db"]["dataset_name"], trait["trait_name"]) + if trait["cellid"]: + return "%s::%s" % (prefix, trait["cellid"]) + return prefix + return trait["description"] + +def cluster_traits(traits_data_list: Sequence[Dict]): + """ + Clusters the trait values. + + DESCRIPTION + Attempts to replicate the clustering of the traits, as done at + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L138-L162 + """ + def __compute_corr(tdata_i, tdata_j): + if tdata_i[0] == tdata_j[0]: + return 0.0 + corr_vals = compute_correlation(tdata_i[1], tdata_j[1]) + corr = corr_vals[0] + if (1 - corr) < 0: + return 0.0 + return 1 - corr + + def __cluster(tdata_i): + return tuple( + __compute_corr(tdata_i, tdata_j) + for tdata_j in enumerate(traits_data_list)) + + return tuple(__cluster(tdata_i) for tdata_i in enumerate(traits_data_list)) + +def heatmap_data(formd, search_result, conn: Any): + """ + heatmap function + + DESCRIPTION + This function is an attempt to reproduce the initialisation at + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L46-L64 + and also the clustering and slink computations at + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L138-L165 + with the help of the `gn3.computations.heatmap.cluster_traits` function. + + It does not try to actually draw the heatmap image. + + PARAMETERS: + TODO: Elaborate on the parameters here... + """ + threshold = 0 # webqtlConfig.PUBLICTHRESH + cluster_checked = formd.formdata.getvalue("clusterCheck", "") + strainlist = [ + strain for strain in formd.strainlist if strain not in formd.parlist] + genotype = formd.genotype + + def __retrieve_traitlist_and_datalist(threshold, fullname): + trait = retrieve_trait_info(threshold, fullname, conn) + return ( + trait, + export_trait_data(retrieve_trait_data(trait, conn), strainlist)) + + traits_details = [ + __retrieve_traitlist_and_datalist(threshold, fullname) + for fullname in search_result] + traits_list = map(lambda x: x[0], traits_details) + traits_data_list = map(lambda x: x[1], traits_details) + + return { + "target_description_checked": formd.formdata.getvalue( + "targetDescriptionCheck", ""), + "cluster_checked": cluster_checked, + "slink_data": ( + slink(cluster_traits(traits_data_list)) + if cluster_checked else False), + "sessionfile": formd.formdata.getvalue("session"), + "genotype": genotype, + "nLoci": sum(map(len, genotype)), + "strainlist": strainlist, + "ppolar": formd.ppolar, + "mpolar":formd.mpolar, + "traits_list": traits_list, + "traits_data_list": traits_data_list + } diff --git a/gn3/computations/rqtl.py b/gn3/computations/rqtl.py index 00e1db9..38c5000 100644 --- a/gn3/computations/rqtl.py +++ b/gn3/computations/rqtl.py @@ -1,6 +1,7 @@ """Procedures related rqtl computations""" import os -from typing import Dict, List, Union +from bisect import bisect +from typing import Dict, List, Tuple, Union import numpy as np @@ -80,15 +81,15 @@ def process_rqtl_mapping(file_name: str) -> List: return marker_obs -def process_rqtl_pairscan(file_name: str) -> List: +def process_rqtl_pairscan(file_name: str, geno_file: str) -> List: """Given an output file name, read in R/qtl pair-scan results and return a list of both the JSON needed for the d3panels figure and a list of results to be used when generating the results table (which will include marker names) """ - figure_data = pairscan_results_for_figure(file_name) - table_data = pairscan_results_for_table(file_name) + figure_data = pairscan_for_figure(file_name) + table_data = pairscan_for_table(file_name, geno_file) return [figure_data, table_data] @@ -119,15 +120,14 @@ def pairscan_for_figure(file_name: str) -> Dict: if i == 0: # Skip first line continue line_items = [item.rstrip('\n') for item in line.split(",")] - chr_list.append(line_items[1]) + chr_list.append(line_items[1][1:-1]) pos_list.append(line_items[2]) figure_data['chr'] = chr_list figure_data['pos'] = pos_list return figure_data - -def pairscan_for_table(file_name: str) -> List: +def pairscan_for_table(file_name: str, geno_file: str) -> List: """Given an output file name, read in R/qtl pair-scan results and return a list of results to be used when generating the results table (which will include marker names) @@ -148,32 +148,85 @@ def pairscan_for_table(file_name: str) -> List: marker_list.append(this_marker) + # Get the list of original markers from the .geno file + original_markers = build_marker_pos_dict(geno_file) + # Open the file with the actual results and write the results as # they will be displayed in the results table with open(os.path.join(current_app.config.get("TMPDIR", "/tmp"), "output", file_name), "r") as the_file: for i, line in enumerate(the_file.readlines()[1:]): marker_1 = marker_list[i] + proximal1, distal1 = find_nearest_marker(marker_1['chr'], marker_1['pos'], original_markers) line_items = [item.rstrip('\n') for item in line.split(",")] for j, item in enumerate(line_items[1:]): marker_2 = marker_list[j] + proximal2, distal2 = find_nearest_marker(marker_2['chr'], marker_2['pos'], original_markers) try: lod_score = f"{float(item):.3f}" except: lod_score = f"{item}" this_line = { - 'marker1': f"{marker_1['name']}", + 'proximal1': proximal1, + 'distal1': distal1, 'pos1': f"Chr {marker_1['chr']} @ {float(marker_1['pos']):.1f} cM", 'lod': lod_score, - 'marker2': f"{marker_2['name']}", + 'proximal2': proximal2, + 'distal2': distal2, 'pos2': f"Chr {marker_2['chr']} @ {float(marker_2['pos']):.1f} cM" } - table_data.append(this_line) - return table_data + table_data.append(this_line) + + return sorted(table_data, key = lambda i: float(i['lod']), reverse=True)[:500] + +def build_marker_pos_dict(genotype_file: str) -> Dict: + """ + Gets list of markers and their positions from .geno file + + Basically a pared-down version of parse_genotype_file for R/qtl pair-scan + """ + + with open(genotype_file, "r") as infile: + contents = infile.readlines() + + # Get all lines after the metadata + lines = tuple(line for line in contents if + ((not line.strip().startswith("#")) and + (not line.strip().startswith("@")) and + (not line.strip() == ""))) + + header_items = lines[0].split("\t") + mb_exists = "Mb" in header_items + pos_column = header_items.index("Mb") if mb_exists else header_items.index("cM") + + the_markers = {} + for line in lines[1:]: # The lines with markers + line_items = line.split("\t") + this_chr = line_items[0] + if this_chr not in the_markers: + the_markers[this_chr] = {} + the_markers[this_chr][str(float(line_items[pos_column]))] = line_items[1] + + return the_markers + +def find_nearest_marker(the_chr: str, the_pos: str, marker_list: Dict) -> Tuple[str, str]: + """ + Given a chromosome and position of a pseudomarker (from R/qtl pair-scan results), + return the nearest real marker + """ + + pos_list = [float(pos) for pos in marker_list[the_chr]] + + # Get the position of the pseudomarker in the list of markers for the chr + the_pos_index = bisect(pos_list, float(the_pos)) + + proximal_marker = marker_list[the_chr][str(pos_list[the_pos_index-1])] + distal_marker = marker_list[the_chr][str(pos_list[the_pos_index])] + return proximal_marker, distal_marker -def process_perm_output(file_name: str): +def process_perm_output(file_name: str) -> Tuple[List, float, float]: """Given base filename, read in R/qtl permutation output and calculate suggestive and significant thresholds diff --git a/gn3/computations/slink.py b/gn3/computations/slink.py new file mode 100644 index 0000000..3d7a576 --- /dev/null +++ b/gn3/computations/slink.py @@ -0,0 +1,198 @@ +""" +DESCRIPTION: + TODO: Add a description for the module + +FUNCTIONS: +slink: + TODO: Describe what the function does... +""" +import logging +from typing import Union, Sequence + +NumType = Union[int, float] +SeqOfNums = Sequence[NumType] + +class LengthError(BaseException): + """Raised whenever child lists/tuples are not the same length as the parent + list of tuple.""" + +class MirrorError(BaseException): + """Raised if the distance from child A to child B is not the same as the + distance from child B to child A.""" + +def __is_list_or_tuple(item): + return type(item) in [list, tuple] + +def __raise_valueerror_if_data_is_not_lists_or_tuples(lists): + """Check that `lists` is a list of lists: If not, raise an exception.""" + + if (not __is_list_or_tuple(lists)) or (not all(map(__is_list_or_tuple, lists))): + raise ValueError("Expected list or tuple") + +def __raise_valueerror_if_lists_empty(lists): + """Check that the list and its direct children are not empty.""" + def empty(lst): + return len(lst) == 0 + if (empty(lists)) or not all(map(lambda x: not empty(x), lists)): + raise ValueError("List/Tuple should NOT be empty!") + +def __raise_lengtherror_if_child_lists_are_not_same_as_parent(lists): + def len_is_same_as_parent(lst): + return len(lst) == len(lists) + if not all(map(len_is_same_as_parent, lists)): + raise LengthError("All children lists should be same length as the parent.") + +def __raise_valueerror_if_child_list_distance_from_itself_is_not_zero(lists): + def get_child_distance(child): + idx = lists.index(child) + return lists[idx][idx] + def distance_is_zero(dist): + return dist == 0 + children_distances = map(get_child_distance, lists) + if not all(map(distance_is_zero, children_distances)): + raise ValueError("Distance of each child list/tuple from itself should be zero!") + +def __raise_mirrorerror_of_distances_one_way_are_not_same_other_way(lists): + """Check that the distance from A to B, is the same as the distance from B to A. +If the two distances are different, throw an exception.""" + inner_coords = range(len(lists)) + coords = ((i, j) for i in inner_coords for j in inner_coords) + def __is_same_reversed(coord): + return lists[coord[0]][coord[1]] == lists[coord[1]][coord[0]] + if not all(map(__is_same_reversed, coords)): + raise MirrorError(( + "Distance from one child to the other should be the same in both " + "directions.")) + +def __raise_valueerror_on_negative_distances(lists): + """Check that distances between 'somethings' are all positive, otherwise, +raise an exception.""" + def zero_or_positive(val): + return val >= 0 + # flatten lists + flattened = __flatten_list_of_lists(lists) + if not all(map(zero_or_positive, flattened)): + raise ValueError("Distances should be positive.") + +def __flatten_list_of_lists(parent): + return [item for child in parent for item in child] + +# i and j are Union[SeqOfNums, NumType], but that leads to errors where the +# values of i or j are indexed, since the NumType type is not indexable. +# I don't know how to type this so that it does not fail on running `mypy .` +def nearest(lists: Sequence[SeqOfNums], i, j) -> NumType: + """ + Computes shortest distance between member(s) in `i` and member(s) in `j`. + + Description: + This is 'copied' over from genenetwork1, from + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/slink.py#L42-L64. + + This description should be updated to better describe what 'member' means in + the context where the function is used. + + Parameters: + lists (list of lists of distances): Represents a list of members and their + distances from each other. + Each inner list represents the distances the member at that coordinate + is from other members in the list: for example, a member at index 0 with + the values [0, 9, 1, 7] indicates that the member is: + - 0 units of distance away from itself + - 9 units of distance away from member at coordinate 1 + - 1 unit of distance away from member at coordinate 2 + - 7 units of distance away from member at coordinate 3 + i (int or list of ints): Represents the coordinate of a member, or a list of + coordinates of members on the `lists` list. + j (int or list of ints): Represents the coordinate of a member, or a list of + coordinates of members on the `lists` list. + + Returns: + int: Represents the shortest distance between member(s) in `i` and member(s) + in `j`.""" + + #### Guard Functions: Should we do this a different way? #### + __raise_valueerror_if_data_is_not_lists_or_tuples(lists) + __raise_valueerror_if_lists_empty(lists) + __raise_lengtherror_if_child_lists_are_not_same_as_parent(lists) + __raise_valueerror_if_child_list_distance_from_itself_is_not_zero(lists) + __raise_mirrorerror_of_distances_one_way_are_not_same_other_way(lists) + __raise_valueerror_on_negative_distances(lists) + #### END: Guard Functions #### + if isinstance(i, int) and isinstance(j, int): # From member i to member j + return lists[i][j] + + if isinstance(i, int) and __is_list_or_tuple(j): + return min(map(lambda j_new: nearest(lists, i, j_new), j[:-1])) + if isinstance(j, int) and __is_list_or_tuple(i): + return min(map(lambda i_new: nearest(lists, i_new, j), i[:-1])) + + if __is_list_or_tuple(i) and __is_list_or_tuple(j): + coordinate_pairs = __flatten_list_of_lists( + [[(itemi, itemj) for itemj in j[:-1]] for itemi in i[:-1]]) + return min(map(lambda x: nearest(lists, x[0], x[1]), coordinate_pairs)) + + raise ValueError("member values (i or j) should be lists/tuples of integers or integers") + +# `lists` here could be Sequence[SeqOfNums], but that leads to errors I do not +# understand down the line +# Might have to re-implement the function especially since the errors are thrown +# where `listindexcopy` is involved +def slink(lists): + """ + DESCRIPTION: + TODO: Not quite sure what this function does. Work through the code with a + fine tooth comb, once we understand the context of its use, so as to + give a better description + + The name of the function does not clearly establish what the function + does either, meaning, once that is established, the function should be + renamed to give the user an idea of what it does without necessarily + reading through a ton of code. + + We should also look into refactoring the function to reduce/eliminate + the multiple levels of nested-loops and conditionals + + PARAMETERS: + lists (list of lists of numbers): Give this a better name. + Each item of this list is a list of coordinates of the members in the + group. + What 'member' and 'group' in this context means, is not yet established. + """ + try: + size = len(lists) + listindexcopy = list(range(size)) + listscopy = [list(child[:]) for child in lists] + init_size = size + candidate = [] + while init_size > 2: + mindist = 1e10 + for i in range(init_size): + for j in range(i+1, init_size): + if listscopy[i][j] < mindist: + mindist = listscopy[i][j] + candidate = [[i, j]] + elif listscopy[i][j] == mindist: + mindist = listscopy[i][j] + candidate.append([i, j]) + else: + pass + newmem = ( + listindexcopy[candidate[0][0]], listindexcopy[candidate[0][1]], + mindist) + listindexcopy.pop(candidate[0][1]) + listindexcopy[candidate[0][0]] = newmem + + init_size -= 1 + for i in range(init_size): + for j in range(i+1, init_size): + listscopy[i][j] = nearest( + lists, listindexcopy[i], listindexcopy[j]) + listscopy[j][i] = listscopy[i][j] + listindexcopy.append( + nearest(lists, listindexcopy[0], listindexcopy[1])) + return listindexcopy + except (LengthError, MirrorError, TypeError, IndexError) as exc: + # Look into making the logging log output to the system's + # configured logger(s) + logging.warning("Exception: %s, %s", type(exc), exc) + return [] |