diff options
Diffstat (limited to 'gn3')
-rw-r--r-- | gn3/api/correlation.py | 8 | ||||
-rw-r--r-- | gn3/api/general.py | 4 | ||||
-rw-r--r-- | gn3/api/rqtl.py | 4 | ||||
-rw-r--r-- | gn3/computations/correlations.py | 36 | ||||
-rw-r--r-- | gn3/computations/correlations2.py | 50 | ||||
-rw-r--r-- | gn3/computations/heatmap.py | 177 | ||||
-rw-r--r-- | gn3/computations/rqtl.py | 77 | ||||
-rw-r--r-- | gn3/computations/slink.py | 198 | ||||
-rw-r--r-- | gn3/db/__init__.py | 2 | ||||
-rw-r--r-- | gn3/db/datasets.py | 291 | ||||
-rw-r--r-- | gn3/db/traits.py | 736 | ||||
-rw-r--r-- | gn3/function_helpers.py | 36 | ||||
-rw-r--r-- | gn3/heatmaps/heatmaps.py | 54 |
13 files changed, 1558 insertions, 115 deletions
diff --git a/gn3/api/correlation.py b/gn3/api/correlation.py index e7e89cf..46121f8 100644 --- a/gn3/api/correlation.py +++ b/gn3/api/correlation.py @@ -5,7 +5,7 @@ from flask import request from gn3.computations.correlations import compute_all_sample_correlation from gn3.computations.correlations import compute_all_lit_correlation -from gn3.computations.correlations import compute_all_tissue_correlation +from gn3.computations.correlations import compute_tissue_correlation from gn3.computations.correlations import map_shared_keys_to_values from gn3.db_utils import database_connector @@ -78,8 +78,8 @@ def compute_tissue_corr(corr_method="pearson"): primary_tissue_dict = tissue_input_data["primary_tissue"] target_tissues_dict = tissue_input_data["target_tissues_dict"] - results = compute_all_tissue_correlation(primary_tissue_dict=primary_tissue_dict, - target_tissues_data=target_tissues_dict, - corr_method=corr_method) + results = compute_tissue_correlation(primary_tissue_dict=primary_tissue_dict, + target_tissues_data=target_tissues_dict, + corr_method=corr_method) return jsonify(results) diff --git a/gn3/api/general.py b/gn3/api/general.py index cebb2e3..69ec343 100644 --- a/gn3/api/general.py +++ b/gn3/api/general.py @@ -11,6 +11,10 @@ from gn3.commands import run_cmd general = Blueprint("general", __name__) +@general.route("/version") +def version(): + """Get API version.""" + return jsonify("1.0") @general.route("/metadata/upload/", methods=["POST"], strict_slashes=False) diff --git a/gn3/api/rqtl.py b/gn3/api/rqtl.py index b0405f4..6548e1a 100644 --- a/gn3/api/rqtl.py +++ b/gn3/api/rqtl.py @@ -48,8 +48,8 @@ run the rqtl_wrapper script and return the results as JSON "output", rqtl_cmd.get('output_file'))): os.system(rqtl_cmd.get('rqtl_cmd')) - if "pairscan" in boolean_kwargs: - rqtl_output['results'] = process_rqtl_pairscan(rqtl_cmd.get('output_file')) + if "pairscan" in rqtl_bool_kwargs: + rqtl_output['results'] = process_rqtl_pairscan(rqtl_cmd.get('output_file'), genofile) else: rqtl_output['results'] = process_rqtl_mapping(rqtl_cmd.get('output_file')) diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py index bc738a7..bb13ff1 100644 --- a/gn3/computations/correlations.py +++ b/gn3/computations/correlations.py @@ -1,4 +1,5 @@ """module contains code for correlations""" +import math import multiprocessing from typing import List @@ -90,7 +91,7 @@ def compute_sample_r_correlation(trait_name, corr_method, trait_vals, target_values=sanitized_target_vals, corr_method=corr_method) - if corr_coefficient is not None: + if corr_coefficient is not None and not math.isnan(corr_coefficient): return (trait_name, corr_coefficient, p_value, num_overlap) return None @@ -123,11 +124,12 @@ def filter_shared_sample_keys(this_samplelist, return (this_vals, target_vals) -def compute_all_sample_correlation(this_trait, - target_dataset, - corr_method="pearson") -> List: +def fast_compute_all_sample_correlation(this_trait, + target_dataset, + corr_method="pearson") -> List: """Given a trait data sample-list and target__datasets compute all sample correlation + this functions uses multiprocessing if not use the normal fun """ # xtodo fix trait_name currently returning single one @@ -159,9 +161,9 @@ def compute_all_sample_correlation(this_trait, key=lambda trait_name: -abs(list(trait_name.values())[0]["corr_coefficient"])) -def benchmark_compute_all_sample(this_trait, - target_dataset, - corr_method="pearson") -> List: +def compute_all_sample_correlation(this_trait, + target_dataset, + corr_method="pearson") -> List: """Temp function to benchmark with compute_all_sample_r alternative to compute_all_sample_r where we use multiprocessing @@ -173,6 +175,7 @@ def benchmark_compute_all_sample(this_trait, target_trait_data = target_trait["trait_sample_data"] this_vals, target_vals = filter_shared_sample_keys( this_trait_samples, target_trait_data) + sample_correlation = compute_sample_r_correlation( trait_name=trait_name, corr_method=corr_method, @@ -189,7 +192,9 @@ def benchmark_compute_all_sample(this_trait, "num_overlap": num_overlap } corr_results.append({trait_name: corr_result}) - return corr_results + return sorted( + corr_results, + key=lambda trait_name: -abs(list(trait_name.values())[0]["corr_coefficient"])) def tissue_correlation_for_trait( @@ -335,9 +340,9 @@ def compute_all_lit_correlation(conn, trait_lists: List, return sorted_lit_results -def compute_all_tissue_correlation(primary_tissue_dict: dict, - target_tissues_data: dict, - corr_method: str): +def compute_tissue_correlation(primary_tissue_dict: dict, + target_tissues_data: dict, + corr_method: str): """Function acts as an abstraction for tissue_correlation_for_trait\ required input are target tissue object and primary tissue trait\ target tissues data contains the trait_symbol_dict and symbol_tissue_vals @@ -357,8 +362,7 @@ def compute_all_tissue_correlation(primary_tissue_dict: dict, target_tissues_values=target_tissue_vals, trait_id=trait_id, corr_method=corr_method) - tissue_result_dict = {trait_id: tissue_result} - tissues_results.append(tissue_result_dict) + tissues_results.append(tissue_result) return sorted( tissues_results, key=lambda trait_name: -abs(list(trait_name.values())[0]["tissue_corr"])) @@ -381,9 +385,9 @@ def process_trait_symbol_dict(trait_symbol_dict, symbol_tissue_vals_dict) -> Lis return traits_tissue_vals -def compute_tissue_correlation(primary_tissue_dict: dict, - target_tissues_data: dict, - corr_method: str): +def fast_compute_tissue_correlation(primary_tissue_dict: dict, + target_tissues_data: dict, + corr_method: str): """Experimental function that uses multiprocessing for computing tissue correlation diff --git a/gn3/computations/correlations2.py b/gn3/computations/correlations2.py new file mode 100644 index 0000000..93db3fa --- /dev/null +++ b/gn3/computations/correlations2.py @@ -0,0 +1,50 @@ +""" +DESCRIPTION: + TODO: Add a description for the module + +FUNCTIONS: +compute_correlation: + TODO: Describe what the function does...""" + +from math import sqrt +from functools import reduce +## From GN1: mostly for clustering and heatmap generation + +def __items_with_values(dbdata, userdata): + """Retains only corresponding items in the data items that are not `None` values. + This should probably be renamed to something sensible""" + def both_not_none(item1, item2): + """Check that both items are not the value `None`.""" + if (item1 is not None) and (item2 is not None): + return (item1, item2) + return None + def split_lists(accumulator, item): + """Separate the 'x' and 'y' items.""" + return [accumulator[0] + [item[0]], accumulator[1] + [item[1]]] + return reduce( + split_lists, + filter(lambda x: x is not None, map(both_not_none, dbdata, userdata)), + [[], []]) + +def compute_correlation(dbdata, userdata): + """Compute some form of correlation. + + This is extracted from + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/utility/webqtlUtil.py#L622-L647 + """ + x_items, y_items = __items_with_values(dbdata, userdata) + if len(x_items) < 6: + return (0.0, len(x_items)) + meanx = sum(x_items)/len(x_items) + meany = sum(y_items)/len(y_items) + def cal_corr_vals(acc, item): + xitem, yitem = item + return [ + acc[0] + ((xitem - meanx) * (yitem - meany)), + acc[1] + ((xitem - meanx) * (xitem - meanx)), + acc[2] + ((yitem - meany) * (yitem - meany))] + xyd, sxd, syd = reduce(cal_corr_vals, zip(x_items, y_items), [0.0, 0.0, 0.0]) + try: + return ((xyd/(sqrt(sxd)*sqrt(syd))), len(x_items)) + except ZeroDivisionError: + return(0, len(x_items)) diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py new file mode 100644 index 0000000..3c35029 --- /dev/null +++ b/gn3/computations/heatmap.py @@ -0,0 +1,177 @@ +""" +This module will contain functions to be used in computation of the data used to +generate various kinds of heatmaps. +""" + +from functools import reduce +from typing import Any, Dict, Sequence +from gn3.computations.slink import slink +from gn3.db.traits import retrieve_trait_data, retrieve_trait_info +from gn3.computations.correlations2 import compute_correlation + +def export_trait_data( + trait_data: dict, strainlist: Sequence[str], dtype: str = "val", + var_exists: bool = False, n_exists: bool = False): + """ + Export data according to `strainlist`. Mostly used in calculating + correlations. + + DESCRIPTION: + Migrated from + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L166-L211 + + PARAMETERS + trait: (dict) + The dictionary of key-value pairs representing a trait + strainlist: (list) + A list of strain names + type: (str) + ... verify what this is ... + var_exists: (bool) + A flag indicating existence of variance + n_exists: (bool) + A flag indicating existence of ndata + """ + def __export_all_types(tdata, strain): + sample_data = [] + if tdata[strain]["value"]: + sample_data.append(tdata[strain]["value"]) + if var_exists: + if tdata[strain]["variance"]: + sample_data.append(tdata[strain]["variance"]) + else: + sample_data.append(None) + if n_exists: + if tdata[strain]["ndata"]: + sample_data.append(tdata[strain]["ndata"]) + else: + sample_data.append(None) + else: + if var_exists and n_exists: + sample_data += [None, None, None] + elif var_exists or n_exists: + sample_data += [None, None] + else: + sample_data.append(None) + + return tuple(sample_data) + + def __exporter(accumulator, strain): + # pylint: disable=[R0911] + if strain in trait_data["data"]: + if dtype == "val": + return accumulator + (trait_data["data"][strain]["value"], ) + if dtype == "var": + return accumulator + (trait_data["data"][strain]["variance"], ) + if dtype == "N": + return accumulator + (trait_data["data"][strain]["ndata"], ) + if dtype == "all": + return accumulator + __export_all_types(trait_data["data"], strain) + raise KeyError("Type `%s` is incorrect" % dtype) + if var_exists and n_exists: + return accumulator + (None, None, None) + if var_exists or n_exists: + return accumulator + (None, None) + return accumulator + (None,) + + return reduce(__exporter, strainlist, tuple()) + +def trait_display_name(trait: Dict): + """ + Given a trait, return a name to use to display the trait on a heatmap. + + DESCRIPTION + Migrated from + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L141-L157 + """ + if trait.get("db", None) and trait.get("trait_name", None): + if trait["db"]["dataset_type"] == "Temp": + desc = trait["description"] + if desc.find("PCA") >= 0: + return "%s::%s" % ( + trait["db"]["displayname"], + desc[desc.rindex(':')+1:].strip()) + return "%s::%s" % ( + trait["db"]["displayname"], + desc[:desc.index('entered')].strip()) + prefix = "%s::%s" % ( + trait["db"]["dataset_name"], trait["trait_name"]) + if trait["cellid"]: + return "%s::%s" % (prefix, trait["cellid"]) + return prefix + return trait["description"] + +def cluster_traits(traits_data_list: Sequence[Dict]): + """ + Clusters the trait values. + + DESCRIPTION + Attempts to replicate the clustering of the traits, as done at + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L138-L162 + """ + def __compute_corr(tdata_i, tdata_j): + if tdata_i[0] == tdata_j[0]: + return 0.0 + corr_vals = compute_correlation(tdata_i[1], tdata_j[1]) + corr = corr_vals[0] + if (1 - corr) < 0: + return 0.0 + return 1 - corr + + def __cluster(tdata_i): + return tuple( + __compute_corr(tdata_i, tdata_j) + for tdata_j in enumerate(traits_data_list)) + + return tuple(__cluster(tdata_i) for tdata_i in enumerate(traits_data_list)) + +def heatmap_data(formd, search_result, conn: Any): + """ + heatmap function + + DESCRIPTION + This function is an attempt to reproduce the initialisation at + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L46-L64 + and also the clustering and slink computations at + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L138-L165 + with the help of the `gn3.computations.heatmap.cluster_traits` function. + + It does not try to actually draw the heatmap image. + + PARAMETERS: + TODO: Elaborate on the parameters here... + """ + threshold = 0 # webqtlConfig.PUBLICTHRESH + cluster_checked = formd.formdata.getvalue("clusterCheck", "") + strainlist = [ + strain for strain in formd.strainlist if strain not in formd.parlist] + genotype = formd.genotype + + def __retrieve_traitlist_and_datalist(threshold, fullname): + trait = retrieve_trait_info(threshold, fullname, conn) + return ( + trait, + export_trait_data(retrieve_trait_data(trait, conn), strainlist)) + + traits_details = [ + __retrieve_traitlist_and_datalist(threshold, fullname) + for fullname in search_result] + traits_list = map(lambda x: x[0], traits_details) + traits_data_list = map(lambda x: x[1], traits_details) + + return { + "target_description_checked": formd.formdata.getvalue( + "targetDescriptionCheck", ""), + "cluster_checked": cluster_checked, + "slink_data": ( + slink(cluster_traits(traits_data_list)) + if cluster_checked else False), + "sessionfile": formd.formdata.getvalue("session"), + "genotype": genotype, + "nLoci": sum(map(len, genotype)), + "strainlist": strainlist, + "ppolar": formd.ppolar, + "mpolar":formd.mpolar, + "traits_list": traits_list, + "traits_data_list": traits_data_list + } diff --git a/gn3/computations/rqtl.py b/gn3/computations/rqtl.py index 00e1db9..38c5000 100644 --- a/gn3/computations/rqtl.py +++ b/gn3/computations/rqtl.py @@ -1,6 +1,7 @@ """Procedures related rqtl computations""" import os -from typing import Dict, List, Union +from bisect import bisect +from typing import Dict, List, Tuple, Union import numpy as np @@ -80,15 +81,15 @@ def process_rqtl_mapping(file_name: str) -> List: return marker_obs -def process_rqtl_pairscan(file_name: str) -> List: +def process_rqtl_pairscan(file_name: str, geno_file: str) -> List: """Given an output file name, read in R/qtl pair-scan results and return a list of both the JSON needed for the d3panels figure and a list of results to be used when generating the results table (which will include marker names) """ - figure_data = pairscan_results_for_figure(file_name) - table_data = pairscan_results_for_table(file_name) + figure_data = pairscan_for_figure(file_name) + table_data = pairscan_for_table(file_name, geno_file) return [figure_data, table_data] @@ -119,15 +120,14 @@ def pairscan_for_figure(file_name: str) -> Dict: if i == 0: # Skip first line continue line_items = [item.rstrip('\n') for item in line.split(",")] - chr_list.append(line_items[1]) + chr_list.append(line_items[1][1:-1]) pos_list.append(line_items[2]) figure_data['chr'] = chr_list figure_data['pos'] = pos_list return figure_data - -def pairscan_for_table(file_name: str) -> List: +def pairscan_for_table(file_name: str, geno_file: str) -> List: """Given an output file name, read in R/qtl pair-scan results and return a list of results to be used when generating the results table (which will include marker names) @@ -148,32 +148,85 @@ def pairscan_for_table(file_name: str) -> List: marker_list.append(this_marker) + # Get the list of original markers from the .geno file + original_markers = build_marker_pos_dict(geno_file) + # Open the file with the actual results and write the results as # they will be displayed in the results table with open(os.path.join(current_app.config.get("TMPDIR", "/tmp"), "output", file_name), "r") as the_file: for i, line in enumerate(the_file.readlines()[1:]): marker_1 = marker_list[i] + proximal1, distal1 = find_nearest_marker(marker_1['chr'], marker_1['pos'], original_markers) line_items = [item.rstrip('\n') for item in line.split(",")] for j, item in enumerate(line_items[1:]): marker_2 = marker_list[j] + proximal2, distal2 = find_nearest_marker(marker_2['chr'], marker_2['pos'], original_markers) try: lod_score = f"{float(item):.3f}" except: lod_score = f"{item}" this_line = { - 'marker1': f"{marker_1['name']}", + 'proximal1': proximal1, + 'distal1': distal1, 'pos1': f"Chr {marker_1['chr']} @ {float(marker_1['pos']):.1f} cM", 'lod': lod_score, - 'marker2': f"{marker_2['name']}", + 'proximal2': proximal2, + 'distal2': distal2, 'pos2': f"Chr {marker_2['chr']} @ {float(marker_2['pos']):.1f} cM" } - table_data.append(this_line) - return table_data + table_data.append(this_line) + + return sorted(table_data, key = lambda i: float(i['lod']), reverse=True)[:500] + +def build_marker_pos_dict(genotype_file: str) -> Dict: + """ + Gets list of markers and their positions from .geno file + + Basically a pared-down version of parse_genotype_file for R/qtl pair-scan + """ + + with open(genotype_file, "r") as infile: + contents = infile.readlines() + + # Get all lines after the metadata + lines = tuple(line for line in contents if + ((not line.strip().startswith("#")) and + (not line.strip().startswith("@")) and + (not line.strip() == ""))) + + header_items = lines[0].split("\t") + mb_exists = "Mb" in header_items + pos_column = header_items.index("Mb") if mb_exists else header_items.index("cM") + + the_markers = {} + for line in lines[1:]: # The lines with markers + line_items = line.split("\t") + this_chr = line_items[0] + if this_chr not in the_markers: + the_markers[this_chr] = {} + the_markers[this_chr][str(float(line_items[pos_column]))] = line_items[1] + + return the_markers + +def find_nearest_marker(the_chr: str, the_pos: str, marker_list: Dict) -> Tuple[str, str]: + """ + Given a chromosome and position of a pseudomarker (from R/qtl pair-scan results), + return the nearest real marker + """ + + pos_list = [float(pos) for pos in marker_list[the_chr]] + + # Get the position of the pseudomarker in the list of markers for the chr + the_pos_index = bisect(pos_list, float(the_pos)) + + proximal_marker = marker_list[the_chr][str(pos_list[the_pos_index-1])] + distal_marker = marker_list[the_chr][str(pos_list[the_pos_index])] + return proximal_marker, distal_marker -def process_perm_output(file_name: str): +def process_perm_output(file_name: str) -> Tuple[List, float, float]: """Given base filename, read in R/qtl permutation output and calculate suggestive and significant thresholds diff --git a/gn3/computations/slink.py b/gn3/computations/slink.py new file mode 100644 index 0000000..3d7a576 --- /dev/null +++ b/gn3/computations/slink.py @@ -0,0 +1,198 @@ +""" +DESCRIPTION: + TODO: Add a description for the module + +FUNCTIONS: +slink: + TODO: Describe what the function does... +""" +import logging +from typing import Union, Sequence + +NumType = Union[int, float] +SeqOfNums = Sequence[NumType] + +class LengthError(BaseException): + """Raised whenever child lists/tuples are not the same length as the parent + list of tuple.""" + +class MirrorError(BaseException): + """Raised if the distance from child A to child B is not the same as the + distance from child B to child A.""" + +def __is_list_or_tuple(item): + return type(item) in [list, tuple] + +def __raise_valueerror_if_data_is_not_lists_or_tuples(lists): + """Check that `lists` is a list of lists: If not, raise an exception.""" + + if (not __is_list_or_tuple(lists)) or (not all(map(__is_list_or_tuple, lists))): + raise ValueError("Expected list or tuple") + +def __raise_valueerror_if_lists_empty(lists): + """Check that the list and its direct children are not empty.""" + def empty(lst): + return len(lst) == 0 + if (empty(lists)) or not all(map(lambda x: not empty(x), lists)): + raise ValueError("List/Tuple should NOT be empty!") + +def __raise_lengtherror_if_child_lists_are_not_same_as_parent(lists): + def len_is_same_as_parent(lst): + return len(lst) == len(lists) + if not all(map(len_is_same_as_parent, lists)): + raise LengthError("All children lists should be same length as the parent.") + +def __raise_valueerror_if_child_list_distance_from_itself_is_not_zero(lists): + def get_child_distance(child): + idx = lists.index(child) + return lists[idx][idx] + def distance_is_zero(dist): + return dist == 0 + children_distances = map(get_child_distance, lists) + if not all(map(distance_is_zero, children_distances)): + raise ValueError("Distance of each child list/tuple from itself should be zero!") + +def __raise_mirrorerror_of_distances_one_way_are_not_same_other_way(lists): + """Check that the distance from A to B, is the same as the distance from B to A. +If the two distances are different, throw an exception.""" + inner_coords = range(len(lists)) + coords = ((i, j) for i in inner_coords for j in inner_coords) + def __is_same_reversed(coord): + return lists[coord[0]][coord[1]] == lists[coord[1]][coord[0]] + if not all(map(__is_same_reversed, coords)): + raise MirrorError(( + "Distance from one child to the other should be the same in both " + "directions.")) + +def __raise_valueerror_on_negative_distances(lists): + """Check that distances between 'somethings' are all positive, otherwise, +raise an exception.""" + def zero_or_positive(val): + return val >= 0 + # flatten lists + flattened = __flatten_list_of_lists(lists) + if not all(map(zero_or_positive, flattened)): + raise ValueError("Distances should be positive.") + +def __flatten_list_of_lists(parent): + return [item for child in parent for item in child] + +# i and j are Union[SeqOfNums, NumType], but that leads to errors where the +# values of i or j are indexed, since the NumType type is not indexable. +# I don't know how to type this so that it does not fail on running `mypy .` +def nearest(lists: Sequence[SeqOfNums], i, j) -> NumType: + """ + Computes shortest distance between member(s) in `i` and member(s) in `j`. + + Description: + This is 'copied' over from genenetwork1, from + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/slink.py#L42-L64. + + This description should be updated to better describe what 'member' means in + the context where the function is used. + + Parameters: + lists (list of lists of distances): Represents a list of members and their + distances from each other. + Each inner list represents the distances the member at that coordinate + is from other members in the list: for example, a member at index 0 with + the values [0, 9, 1, 7] indicates that the member is: + - 0 units of distance away from itself + - 9 units of distance away from member at coordinate 1 + - 1 unit of distance away from member at coordinate 2 + - 7 units of distance away from member at coordinate 3 + i (int or list of ints): Represents the coordinate of a member, or a list of + coordinates of members on the `lists` list. + j (int or list of ints): Represents the coordinate of a member, or a list of + coordinates of members on the `lists` list. + + Returns: + int: Represents the shortest distance between member(s) in `i` and member(s) + in `j`.""" + + #### Guard Functions: Should we do this a different way? #### + __raise_valueerror_if_data_is_not_lists_or_tuples(lists) + __raise_valueerror_if_lists_empty(lists) + __raise_lengtherror_if_child_lists_are_not_same_as_parent(lists) + __raise_valueerror_if_child_list_distance_from_itself_is_not_zero(lists) + __raise_mirrorerror_of_distances_one_way_are_not_same_other_way(lists) + __raise_valueerror_on_negative_distances(lists) + #### END: Guard Functions #### + if isinstance(i, int) and isinstance(j, int): # From member i to member j + return lists[i][j] + + if isinstance(i, int) and __is_list_or_tuple(j): + return min(map(lambda j_new: nearest(lists, i, j_new), j[:-1])) + if isinstance(j, int) and __is_list_or_tuple(i): + return min(map(lambda i_new: nearest(lists, i_new, j), i[:-1])) + + if __is_list_or_tuple(i) and __is_list_or_tuple(j): + coordinate_pairs = __flatten_list_of_lists( + [[(itemi, itemj) for itemj in j[:-1]] for itemi in i[:-1]]) + return min(map(lambda x: nearest(lists, x[0], x[1]), coordinate_pairs)) + + raise ValueError("member values (i or j) should be lists/tuples of integers or integers") + +# `lists` here could be Sequence[SeqOfNums], but that leads to errors I do not +# understand down the line +# Might have to re-implement the function especially since the errors are thrown +# where `listindexcopy` is involved +def slink(lists): + """ + DESCRIPTION: + TODO: Not quite sure what this function does. Work through the code with a + fine tooth comb, once we understand the context of its use, so as to + give a better description + + The name of the function does not clearly establish what the function + does either, meaning, once that is established, the function should be + renamed to give the user an idea of what it does without necessarily + reading through a ton of code. + + We should also look into refactoring the function to reduce/eliminate + the multiple levels of nested-loops and conditionals + + PARAMETERS: + lists (list of lists of numbers): Give this a better name. + Each item of this list is a list of coordinates of the members in the + group. + What 'member' and 'group' in this context means, is not yet established. + """ + try: + size = len(lists) + listindexcopy = list(range(size)) + listscopy = [list(child[:]) for child in lists] + init_size = size + candidate = [] + while init_size > 2: + mindist = 1e10 + for i in range(init_size): + for j in range(i+1, init_size): + if listscopy[i][j] < mindist: + mindist = listscopy[i][j] + candidate = [[i, j]] + elif listscopy[i][j] == mindist: + mindist = listscopy[i][j] + candidate.append([i, j]) + else: + pass + newmem = ( + listindexcopy[candidate[0][0]], listindexcopy[candidate[0][1]], + mindist) + listindexcopy.pop(candidate[0][1]) + listindexcopy[candidate[0][0]] = newmem + + init_size -= 1 + for i in range(init_size): + for j in range(i+1, init_size): + listscopy[i][j] = nearest( + lists, listindexcopy[i], listindexcopy[j]) + listscopy[j][i] = listscopy[i][j] + listindexcopy.append( + nearest(lists, listindexcopy[0], listindexcopy[1])) + return listindexcopy + except (LengthError, MirrorError, TypeError, IndexError) as exc: + # Look into making the logging log output to the system's + # configured logger(s) + logging.warning("Exception: %s, %s", type(exc), exc) + return [] diff --git a/gn3/db/__init__.py b/gn3/db/__init__.py index 5ab9f3c..149a344 100644 --- a/gn3/db/__init__.py +++ b/gn3/db/__init__.py @@ -1,7 +1,7 @@ # pylint: disable=[R0902, R0903] """Module that exposes common db operations""" from dataclasses import asdict, astuple -from typing import Any, Dict, List, Optional, Generator, Union +from typing import Any, Dict, List, Optional, Generator, Tuple, Union from typing_extensions import Protocol from gn3.db.metadata_audit import MetadataAudit diff --git a/gn3/db/datasets.py b/gn3/db/datasets.py new file mode 100644 index 0000000..4a05499 --- /dev/null +++ b/gn3/db/datasets.py @@ -0,0 +1,291 @@ +""" +This module contains functions relating to specific trait dataset manipulation +""" +from typing import Any + +def retrieve_probeset_trait_dataset_name( + threshold: int, name: str, connection: Any): + """ + Get the ID, DataScale and various name formats for a `ProbeSet` trait. + """ + query = ( + "SELECT Id, Name, FullName, ShortName, DataScale " + "FROM ProbeSetFreeze " + "WHERE " + "public > %(threshold)s " + "AND " + "(Name = %(name)s OR FullName = %(name)s OR ShortName = %(name)s)") + with connection.cursor() as cursor: + cursor.execute( + query, + { + "threshold": threshold, + "name": name + }) + return dict(zip( + ["dataset_id", "dataset_name", "dataset_fullname", + "dataset_shortname", "dataset_datascale"], + cursor.fetchone())) + +def retrieve_publish_trait_dataset_name( + threshold: int, name: str, connection: Any): + """ + Get the ID, DataScale and various name formats for a `Publish` trait. + """ + query = ( + "SELECT Id, Name, FullName, ShortName " + "FROM PublishFreeze " + "WHERE " + "public > %(threshold)s " + "AND " + "(Name = %(name)s OR FullName = %(name)s OR ShortName = %(name)s)") + with connection.cursor() as cursor: + cursor.execute( + query, + { + "threshold": threshold, + "name": name + }) + return dict(zip( + ["dataset_id", "dataset_name", "dataset_fullname", + "dataset_shortname"], + cursor.fetchone())) + +def retrieve_geno_trait_dataset_name( + threshold: int, name: str, connection: Any): + """ + Get the ID, DataScale and various name formats for a `Geno` trait. + """ + query = ( + "SELECT Id, Name, FullName, ShortName " + "FROM GenoFreeze " + "WHERE " + "public > %(threshold)s " + "AND " + "(Name = %(name)s OR FullName = %(name)s OR ShortName = %(name)s)") + with connection.cursor() as cursor: + cursor.execute( + query, + { + "threshold": threshold, + "name": name + }) + return dict(zip( + ["dataset_id", "dataset_name", "dataset_fullname", + "dataset_shortname"], + cursor.fetchone())) + +def retrieve_temp_trait_dataset_name( + threshold: int, name: str, connection: Any): + """ + Get the ID, DataScale and various name formats for a `Temp` trait. + """ + query = ( + "SELECT Id, Name, FullName, ShortName " + "FROM TempFreeze " + "WHERE " + "public > %(threshold)s " + "AND " + "(Name = %(name)s OR FullName = %(name)s OR ShortName = %(name)s)") + with connection.cursor() as cursor: + cursor.execute( + query, + { + "threshold": threshold, + "name": name + }) + return dict(zip( + ["dataset_id", "dataset_name", "dataset_fullname", + "dataset_shortname"], + cursor.fetchone())) + +def retrieve_dataset_name( + trait_type: str, threshold: int, trait_name: str, dataset_name: str, + conn: Any): + """ + Retrieve the name of a trait given the trait's name + + This is extracted from the `webqtlDataset.retrieveName` function as is + implemented at + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlDataset.py#L140-L169 + """ + fn_map = { + "ProbeSet": retrieve_probeset_trait_dataset_name, + "Publish": retrieve_publish_trait_dataset_name, + "Geno": retrieve_geno_trait_dataset_name, + "Temp": retrieve_temp_trait_dataset_name} + if trait_type == "Temp": + return retrieve_temp_trait_dataset_name(threshold, trait_name, conn) + return fn_map[trait_type](threshold, dataset_name, conn) + + +def retrieve_geno_riset_fields(name, conn): + """ + Retrieve the RISet, and RISetID values for various Geno trait types. + """ + query = ( + "SELECT InbredSet.Name, InbredSet.Id " + "FROM InbredSet, GenoFreeze " + "WHERE GenoFreeze.InbredSetId = InbredSet.Id " + "AND GenoFreeze.Name = %(name)s") + with conn.cursor() as cursor: + cursor.execute(query, {"name": name}) + return dict(zip(["riset", "risetid"], cursor.fetchone())) + return {} + +def retrieve_publish_riset_fields(name, conn): + """ + Retrieve the RISet, and RISetID values for various Publish trait types. + """ + query = ( + "SELECT InbredSet.Name, InbredSet.Id " + "FROM InbredSet, PublishFreeze " + "WHERE PublishFreeze.InbredSetId = InbredSet.Id " + "AND PublishFreeze.Name = %(name)s") + with conn.cursor() as cursor: + cursor.execute(query, {"name": name}) + return dict(zip(["riset", "risetid"], cursor.fetchone())) + return {} + +def retrieve_probeset_riset_fields(name, conn): + """ + Retrieve the RISet, and RISetID values for various ProbeSet trait types. + """ + query = ( + "SELECT InbredSet.Name, InbredSet.Id " + "FROM InbredSet, ProbeSetFreeze, ProbeFreeze " + "WHERE ProbeFreeze.InbredSetId = InbredSet.Id " + "AND ProbeFreeze.Id = ProbeSetFreeze.ProbeFreezeId " + "AND ProbeSetFreeze.Name = %(name)s") + with conn.cursor() as cursor: + cursor.execute(query, {"name": name}) + return dict(zip(["riset", "risetid"], cursor.fetchone())) + return {} + +def retrieve_temp_riset_fields(name, conn): + """ + Retrieve the RISet, and RISetID values for `Temp` trait types. + """ + query = ( + "SELECT InbredSet.Name, InbredSet.Id " + "FROM InbredSet, Temp " + "WHERE Temp.InbredSetId = InbredSet.Id " + "AND Temp.Name = %(name)s") + with conn.cursor() as cursor: + cursor.execute(query, {"name": name}) + return dict(zip(["riset", "risetid"], cursor.fetchone())) + return {} + +def retrieve_riset_fields(trait_type, trait_name, dataset_info, conn): + """ + Retrieve the RISet, and RISetID values for various trait types. + """ + riset_fns_map = { + "Geno": retrieve_geno_riset_fields, + "Publish": retrieve_publish_riset_fields, + "ProbeSet": retrieve_probeset_riset_fields + } + + if trait_type == "Temp": + riset_info = retrieve_temp_riset_fields(trait_name, conn) + else: + riset_info = riset_fns_map[trait_type](dataset_info["dataset_name"], conn) + + return { + **dataset_info, + **riset_info, + "riset": ( + "BXD" if riset_info.get("riset") == "BXD300" + else riset_info.get("riset", "")) + } + +def retrieve_temp_trait_dataset(): + """ + Retrieve the dataset that relates to `Temp` traits + """ + # pylint: disable=[C0330] + return { + "searchfield": ["name", "description"], + "disfield": ["name", "description"], + "type": "Temp", + "dataset_id": 1, + "fullname": "Temporary Storage", + "shortname": "Temp" + } + +def retrieve_geno_trait_dataset(): + """ + Retrieve the dataset that relates to `Geno` traits + """ + # pylint: disable=[C0330] + return { + "searchfield": ["name", "chr"], + "disfield": ["name", "chr", "mb", "source2", "sequence"], + "type": "Geno" + } + +def retrieve_publish_trait_dataset(): + """ + Retrieve the dataset that relates to `Publish` traits + """ + # pylint: disable=[C0330] + return { + "searchfield": [ + "name", "post_publication_description", "abstract", "title", + "authors"], + "disfield": [ + "name", "pubmed_id", "pre_publication_description", + "post_publication_description", "original_description", + "pre_publication_abbreviation", "post_publication_abbreviation", + "lab_code", "submitter", "owner", "authorized_users", + "authors", "title", "abstract", "journal", "volume", "pages", + "month", "year", "sequence", "units", "comments"], + "type": "Publish" + } + +def retrieve_probeset_trait_dataset(): + """ + Retrieve the dataset that relates to `ProbeSet` traits + """ + # pylint: disable=[C0330] + return { + "searchfield": [ + "name", "description", "probe_target_description", "symbol", + "alias", "genbankid", "unigeneid", "omim", "refseq_transcriptid", + "probe_set_specificity", "probe_set_blat_score"], + "disfield": [ + "name", "symbol", "description", "probe_target_description", "chr", + "mb", "alias", "geneid", "genbankid", "unigeneid", "omim", + "refseq_transcriptid", "blatseq", "targetseq", "chipid", "comments", + "strand_probe", "strand_gene", "probe_set_target_region", + "proteinid", "probe_set_specificity", "probe_set_blat_score", + "probe_set_blat_mb_start", "probe_set_blat_mb_end", + "probe_set_strand", "probe_set_note_by_rw", "flag"], + "type": "ProbeSet" + } + +def retrieve_trait_dataset(trait_type, trait, threshold, conn): + """ + Retrieve the dataset that relates to a specific trait. + """ + dataset_fns = { + "Temp": retrieve_temp_trait_dataset, + "Geno": retrieve_geno_trait_dataset, + "Publish": retrieve_publish_trait_dataset, + "ProbeSet": retrieve_probeset_trait_dataset + } + dataset_name_info = { + "dataset_id": None, + "dataset_name": trait["db"]["dataset_name"], + **retrieve_dataset_name( + trait_type, threshold, trait["trait_name"], + trait["db"]["dataset_name"], conn) + } + riset = retrieve_riset_fields( + trait_type, trait["trait_name"], dataset_name_info, conn) + return { + "display_name": dataset_name_info["dataset_name"], + **dataset_name_info, + **dataset_fns[trait_type](), + **riset + } diff --git a/gn3/db/traits.py b/gn3/db/traits.py index 4860a07..1031e44 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -1,92 +1,668 @@ -"""This contains all the necessary functions that are required to add traits -to the published database""" -from dataclasses import dataclass -from typing import Any, Dict, Optional +"""This class contains functions relating to trait data manipulation""" +from typing import Any, Dict, Union, Sequence +from gn3.function_helpers import compose +from gn3.db.datasets import retrieve_trait_dataset -@dataclass(frozen=True) -class Riset: - """Class for keeping track of riset. A riset is a group e.g. rat HSNIH-Palmer, -BXD +def get_trait_csv_sample_data(conn: Any, + trait_name: int, phenotype_id: int): + """Fetch a trait and return it as a csv string""" + sql = ("SELECT DISTINCT Strain.Id, PublishData.Id, Strain.Name, " + "PublishData.value, " + "PublishSE.error, NStrain.count FROM " + "(PublishData, Strain, PublishXRef, PublishFreeze) " + "LEFT JOIN PublishSE ON " + "(PublishSE.DataId = PublishData.Id AND " + "PublishSE.StrainId = PublishData.StrainId) " + "LEFT JOIN NStrain ON (NStrain.DataId = PublishData.Id AND " + "NStrain.StrainId = PublishData.StrainId) WHERE " + "PublishXRef.InbredSetId = PublishFreeze.InbredSetId AND " + "PublishData.Id = PublishXRef.DataId AND " + "PublishXRef.Id = %s AND PublishXRef.PhenotypeId = %s " + "AND PublishData.StrainId = Strain.Id Order BY Strain.Name") + csv_data = ["Strain Id,Strain Name,Value,SE,Count"] + publishdata_id = "" + with conn.cursor() as cursor: + cursor.execute(sql, (trait_name, phenotype_id,)) + for record in cursor.fetchall(): + (strain_id, publishdata_id, + strain_name, value, error, count) = record + csv_data.append( + ",".join([str(val) if val else "x" + for val in (strain_id, strain_name, + value, error, count)])) + return f"# Publish Data Id: {publishdata_id}\n\n" + "\n".join(csv_data) + + +def update_sample_data(conn: Any, + strain_name: str, + strain_id: int, + publish_data_id: int, + value: Union[int, float, str], + error: Union[int, float, str], + count: Union[int, str]): + """Given the right parameters, update sample-data from the relevant + table.""" + # pylint: disable=[R0913, R0914] + STRAIN_ID_SQL: str = "UPDATE Strain SET Name = %s WHERE Id = %s" + PUBLISH_DATA_SQL: str = ("UPDATE PublishData SET value = %s " + "WHERE StrainId = %s AND Id = %s") + PUBLISH_SE_SQL: str = ("UPDATE PublishSE SET error = %s " + "WHERE StrainId = %s AND DataId = %s") + N_STRAIN_SQL: str = ("UPDATE NStrain SET count = %s " + "WHERE StrainId = %s AND DataId = %s") + + updated_strains: int = 0 + updated_published_data: int = 0 + updated_se_data: int = 0 + updated_n_strains: int = 0 + + with conn.cursor() as cursor: + # Update the Strains table + cursor.execute(STRAIN_ID_SQL, (strain_name, strain_id)) + updated_strains: int = cursor.rowcount + # Update the PublishData table + cursor.execute(PUBLISH_DATA_SQL, + (None if value == "x" else value, + strain_id, publish_data_id)) + updated_published_data: int = cursor.rowcount + # Update the PublishSE table + cursor.execute(PUBLISH_SE_SQL, + (None if error == "x" else error, + strain_id, publish_data_id)) + updated_se_data: int = cursor.rowcount + # Update the NStrain table + cursor.execute(N_STRAIN_SQL, + (None if count == "x" else count, + strain_id, publish_data_id)) + updated_n_strains: int = cursor.rowcount + return (updated_strains, updated_published_data, + updated_se_data, updated_n_strains) + +def retrieve_publish_trait_info(trait_data_source: Dict[str, Any], conn: Any): + """Retrieve trait information for type `Publish` traits. + + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L399-L421""" + keys = ( + "Id", "PubMed_ID", "Pre_publication_description", + "Post_publication_description", "Original_description", + "Pre_publication_abbreviation", "Post_publication_abbreviation", + "Lab_code", "Submitter", "Owner", "Authorized_Users", "Authors", + "Title", "Abstract", "Journal", "Volume", "Pages", "Month", "Year", + "Sequence", "Units", "comments") + columns = ( + "PublishXRef.Id, Publication.PubMed_ID, " + "Phenotype.Pre_publication_description, " + "Phenotype.Post_publication_description, " + "Phenotype.Original_description, " + "Phenotype.Pre_publication_abbreviation, " + "Phenotype.Post_publication_abbreviation, " + "Phenotype.Lab_code, Phenotype.Submitter, Phenotype.Owner, " + "Phenotype.Authorized_Users, CAST(Publication.Authors AS BINARY), " + "Publication.Title, Publication.Abstract, Publication.Journal, " + "Publication.Volume, Publication.Pages, Publication.Month, " + "Publication.Year, PublishXRef.Sequence, Phenotype.Units, " + "PublishXRef.comments") + query = ( + "SELECT " + "{columns} " + "FROM " + "PublishXRef, Publication, Phenotype, PublishFreeze " + "WHERE " + "PublishXRef.Id = %(trait_name)s AND " + "Phenotype.Id = PublishXRef.PhenotypeId AND " + "Publication.Id = PublishXRef.PublicationId AND " + "PublishXRef.InbredSetId = PublishFreeze.InbredSetId AND " + "PublishFreeze.Id =%(trait_dataset_id)s").format(columns=columns) + with conn.cursor() as cursor: + cursor.execute( + query, + { + k:v for k, v in trait_data_source.items() + if k in ["trait_name", "trait_dataset_id"] + }) + return dict(zip([k.lower() for k in keys], cursor.fetchone())) + +def set_confidential_field(trait_type, trait_info): + """Post processing function for 'Publish' trait types. + + It sets the value for the 'confidential' key.""" + if trait_type == "Publish": + return { + **trait_info, + "confidential": 1 if ( + trait_info.get("pre_publication_description", None) + and not trait_info.get("pubmed_id", None)) else 0} + return trait_info + +def retrieve_probeset_trait_info(trait_data_source: Dict[str, Any], conn: Any): + """Retrieve trait information for type `ProbeSet` traits. + + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L424-L435""" + keys = ( + "name", "symbol", "description", "probe_target_description", "chr", + "mb", "alias", "geneid", "genbankid", "unigeneid", "omim", + "refseq_transcriptid", "blatseq", "targetseq", "chipid", "comments", + "strand_probe", "strand_gene", "probe_set_target_region", "proteinid", + "probe_set_specificity", "probe_set_blat_score", + "probe_set_blat_mb_start", "probe_set_blat_mb_end", "probe_set_strand", + "probe_set_note_by_rw", "flag") + query = ( + "SELECT " + "{columns} " + "FROM " + "ProbeSet, ProbeSetFreeze, ProbeSetXRef " + "WHERE " + "ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id AND " + "ProbeSetXRef.ProbeSetId = ProbeSet.Id AND " + "ProbeSetFreeze.Name = %(trait_dataset_name)s AND " + "ProbeSet.Name = %(trait_name)s").format( + columns=", ".join(["ProbeSet.{}".format(x) for x in keys])) + with conn.cursor() as cursor: + cursor.execute( + query, + { + k:v for k, v in trait_data_source.items() + if k in ["trait_name", "trait_dataset_name"] + }) + return dict(zip(keys, cursor.fetchone())) + +def retrieve_geno_trait_info(trait_data_source: Dict[str, Any], conn: Any): + """Retrieve trait information for type `Geno` traits. + + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L438-L449""" + keys = ("name", "chr", "mb", "source2", "sequence") + query = ( + "SELECT " + "{columns} " + "FROM " + "Geno, GenoFreeze, GenoXRef " + "WHERE " + "GenoXRef.GenoFreezeId = GenoFreeze.Id AND GenoXRef.GenoId = Geno.Id AND " + "GenoFreeze.Name = %(trait_dataset_name)s AND " + "Geno.Name = %(trait_name)s").format( + columns=", ".join(["Geno.{}".format(x) for x in keys])) + with conn.cursor() as cursor: + cursor.execute( + query, + { + k:v for k, v in trait_data_source.items() + if k in ["trait_name", "trait_dataset_name"] + }) + return dict(zip(keys, cursor.fetchone())) + +def retrieve_temp_trait_info(trait_data_source: Dict[str, Any], conn: Any): + """Retrieve trait information for type `Temp` traits. + + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L450-452""" + keys = ("name", "description") + query = ( + "SELECT {columns} FROM Temp " + "WHERE Name = %(trait_name)s").format(columns=", ".join(keys)) + with conn.cursor() as cursor: + cursor.execute( + query, + { + k:v for k, v in trait_data_source.items() + if k in ["trait_name"] + }) + return dict(zip(keys, cursor.fetchone())) + +def set_haveinfo_field(trait_info): + """ + Common postprocessing function for all trait types. + Sets the value for the 'haveinfo' field.""" + return {**trait_info, "haveinfo": 1 if trait_info else 0} + +def set_homologene_id_field_probeset(trait_info, conn): """ - name: Optional[str] - r_id: Optional[int] + Postprocessing function for 'ProbeSet' traits. + Sets the value for the 'homologene' key. + """ + query = ( + "SELECT HomologeneId FROM Homologene, Species, InbredSet" + " WHERE Homologene.GeneId = %(geneid)s AND InbredSet.Name = %(riset)s" + " AND InbredSet.SpeciesId = Species.Id AND" + " Species.TaxonomyId = Homologene.TaxonomyId") + with conn.cursor() as cursor: + cursor.execute( + query, + { + k:v for k, v in trait_info.items() + if k in ["geneid", "riset"] + }) + res = cursor.fetchone() + if res: + return {**trait_info, "homologeneid": res[0]} + return {**trait_info, "homologeneid": None} -@dataclass(frozen=True) -class WebqtlCaseData: - """Class for keeping track of one case data in one trait""" - value: Optional[float] = None - variance: Optional[float] = None - count: Optional[int] = None # Number of Individuals +def set_homologene_id_field(trait_type, trait_info, conn): + """ + Common postprocessing function for all trait types. - def __str__(self): - _str = "" - if self.value: - _str += f"value={self.value:.3f}" - if self.variance: - _str += f" variance={self.variance:.3f}" - if self.count: - _str += " n_data={self.count}" - return _str + Sets the value for the 'homologene' key.""" + set_to_null = lambda ti: {**ti, "homologeneid": None} + functions_table = { + "Temp": set_to_null, + "Geno": set_to_null, + "Publish": set_to_null, + "ProbeSet": lambda ti: set_homologene_id_field_probeset(ti, conn) + } + return functions_table[trait_type](trait_info) +def load_publish_qtl_info(trait_info, conn): + """ + Load extra QTL information for `Publish` traits + """ + query = ( + "SELECT PublishXRef.Locus, PublishXRef.LRS, PublishXRef.additive " + "FROM PublishXRef, PublishFreeze " + "WHERE PublishXRef.Id = %(trait_name)s " + "AND PublishXRef.InbredSetId = PublishFreeze.InbredSetId " + "AND PublishFreeze.Id = %(dataset_id)s") + with conn.cursor() as cursor: + cursor.execute( + query, + { + "trait_name": trait_info["trait_name"], + "dataset_id": trait_info["db"]["dataset_id"] + }) + return dict(zip(["locus", "lrs", "additive"], cursor.fetchone())) + return {"locus": "", "lrs": "", "additive": ""} -def lookup_webqtldataset_name(riset_name: str, conn: Any): - """Given a group name(riset), return it's name e.g. BXDPublish, -HLCPublish.""" +def load_probeset_qtl_info(trait_info, conn): + """ + Load extra QTL information for `ProbeSet` traits + """ + query = ( + "SELECT ProbeSetXRef.Locus, ProbeSetXRef.LRS, ProbeSetXRef.pValue, " + "ProbeSetXRef.mean, ProbeSetXRef.additive " + "FROM ProbeSetXRef, ProbeSet " + "WHERE ProbeSetXRef.ProbeSetId = ProbeSet.Id " + " AND ProbeSet.Name = %(trait_name)s " + "AND ProbeSetXRef.ProbeSetFreezeId = %(dataset_id)s") with conn.cursor() as cursor: cursor.execute( - "SELECT PublishFreeze.Name FROM " - "PublishFreeze, InbredSet WHERE " - "PublishFreeze.InbredSetId = InbredSet.Id " - "AND InbredSet.Name = '%s'" % riset_name) - _result, *_ = cursor.fetchone() - return _result - - -def get_riset(data_type: str, name: str, conn: Any): - """Get the groups given the data type and it's PublishFreeze or GenoFreeze -name - - """ - query, _name, _id = None, None, None - if data_type == "Publish": - query = ("SELECT InbredSet.Name, InbredSet.Id FROM InbredSet, " - "PublishFreeze WHERE PublishFreeze.InbredSetId = " - "InbredSet.Id AND PublishFreeze.Name = '%s'" % name) - elif data_type == "Geno": - query = ("SELECT InbredSet.Name, InbredSet.Id FROM InbredSet, " - "GenoFreeze WHERE GenoFreeze.InbredSetId = " - "InbredSet.Id AND GenoFreeze.Name = '%s'" % name) - elif data_type == "ProbeSet": - query = ("SELECT InbredSet.Name, InbredSet.Id FROM " - "InbredSet, ProbeSetFreeze, ProbeFreeze WHERE " - "ProbeFreeze.InbredSetId = InbredSet.Id AND " - "ProbeFreeze.Id = ProbeSetFreeze.ProbeFreezeId AND " - "ProbeSetFreeze.Name = '%s'" % name) - if query: - with conn.cursor() as cursor: - _name, _id = cursor.fetchone() - if _name == "BXD300": - _name = "BXD" - return Riset(_name, _id) - - -def insert_publication(pubmed_id: int, publication: Optional[Dict], - conn: Any): - """Creates a new publication record if it's not available""" - sql = ("SELECT Id FROM Publication where " - "PubMed_ID = %d" % pubmed_id) - _id = None + query, + { + "trait_name": trait_info["trait_name"], + "dataset_id": trait_info["db"]["dataset_id"] + }) + return dict(zip( + ["locus", "lrs", "pvalue", "mean", "additive"], cursor.fetchone())) + return {"locus": "", "lrs": "", "pvalue": "", "mean": "", "additive": ""} + +def load_qtl_info(qtl, trait_type, trait_info, conn): + """ + Load extra QTL information for traits + + DESCRIPTION: + Migrated from + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L500-L534 + + PARAMETERS: + qtl: boolean + trait_type: string + The type of the trait in consideration + trait_info: map/dictionary + A dictionary of the trait's key-value pairs + conn: + A database connection object + """ + if not qtl: + return trait_info + qtl_info_functions = { + "Publish": load_publish_qtl_info, + "ProbeSet": load_probeset_qtl_info + } + if trait_info["name"] not in qtl_info_functions.keys(): + return trait_info + + return qtl_info_functions[trait_type](trait_info, conn) + +def build_trait_name(trait_fullname): + """ + Initialises the trait's name, and other values from the search data provided + """ + def dataset_type(dset_name): + if dset_name.find('Temp') >= 0: + return "Temp" + if dset_name.find('Geno') >= 0: + return "Geno" + if dset_name.find('Publish') >= 0: + return "Publish" + return "ProbeSet" + + name_parts = trait_fullname.split("::") + assert len(name_parts) >= 2, "Name format error" + dataset_name = name_parts[0] + dataset_type = dataset_type(dataset_name) + return { + "db": { + "dataset_name": dataset_name, + "dataset_type": dataset_type}, + "trait_fullname": trait_fullname, + "trait_name": name_parts[1], + "cellid": name_parts[2] if len(name_parts) == 3 else "" + } + +def retrieve_probeset_sequence(trait, conn): + """ + Retrieve a 'ProbeSet' trait's sequence information + """ + query = ( + "SELECT ProbeSet.BlatSeq " + "FROM ProbeSet, ProbeSetFreeze, ProbeSetXRef " + "WHERE ProbeSet.Id=ProbeSetXRef.ProbeSetId " + "AND ProbeSetFreeze.Id = ProbeSetXRef.ProbeSetFreezeId " + "AND ProbeSet.Name = %(trait_name)s " + "AND ProbeSetFreeze.Name = %(dataset_name)s") with conn.cursor() as cursor: - cursor.execute(sql) - _id = cursor.fetchone() - if not _id and publication: - # The Publication contains the fields: 'authors', 'title', 'abstract', - # 'journal','volume','pages','month','year' - insert_query = ("INSERT into Publication (%s) Values (%s)" % - (", ".join(publication.keys()), - ", ".join(['%s'] * len(publication)))) - with conn.cursor() as cursor: - cursor.execute(insert_query, tuple(publication.values())) + cursor.execute( + query, + { + "trait_name": trait["trait_name"], + "dataset_name": trait["db"]["dataset_name"] + }) + seq = cursor.fetchone() + return {**trait, "sequence": seq[0] if seq else ""} + +def retrieve_trait_info( + threshold: int, trait_full_name: str, conn: Any, + qtl=None): + """Retrieves the trait information. + + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L397-L456 + + This function, or the dependent functions, might be incomplete as they are + currently.""" + trait = build_trait_name(trait_full_name) + trait_dataset_type = trait["db"]["dataset_type"] + trait_info_function_table = { + "Publish": retrieve_publish_trait_info, + "ProbeSet": retrieve_probeset_trait_info, + "Geno": retrieve_geno_trait_info, + "Temp": retrieve_temp_trait_info + } + + common_post_processing_fn = compose( + lambda ti: load_qtl_info(qtl, trait_dataset_type, ti, conn), + lambda ti: set_homologene_id_field(trait_dataset_type, ti, conn), + lambda ti: {"trait_type": trait_dataset_type, **ti}, + lambda ti: {**trait, **ti}) + + trait_post_processing_functions_table = { + "Publish": compose( + lambda ti: set_confidential_field(trait_dataset_type, ti), + common_post_processing_fn), + "ProbeSet": compose( + lambda ti: retrieve_probeset_sequence(ti, conn), + common_post_processing_fn), + "Geno": common_post_processing_fn, + "Temp": common_post_processing_fn + } + + retrieve_info = compose( + set_haveinfo_field, trait_info_function_table[trait_dataset_type]) + + trait_dataset = retrieve_trait_dataset( + trait_dataset_type, trait, threshold, conn) + trait_info = retrieve_info( + { + "trait_name": trait["trait_name"], + "trait_dataset_id": trait_dataset["dataset_id"], + "trait_dataset_name": trait_dataset["dataset_name"] + }, + conn) + if trait_info["haveinfo"]: + return { + **trait_post_processing_functions_table[trait_dataset_type]( + {**trait_info, "riset": trait_dataset["riset"]}), + "db": {**trait["db"], **trait_dataset} + } + return trait_info + +def retrieve_temp_trait_data(trait_info: dict, conn: Any): + """ + Retrieve trait data for `Temp` traits. + """ + query = ( + "SELECT " + "Strain.Name, TempData.value, TempData.SE, TempData.NStrain, " + "TempData.Id " + "FROM TempData, Temp, Strain " + "WHERE TempData.StrainId = Strain.Id " + "AND TempData.Id = Temp.DataId " + "AND Temp.name = %(trait_name)s " + "ORDER BY Strain.Name") + with conn.cursor() as cursor: + cursor.execute( + query, + {"trait_name": trait_info["trait_name"]}) + return [dict(zip( + ["strain_name", "value", "se_error", "nstrain", "id"], row)) + for row in cursor.fetchall()] + return [] + +def retrieve_species_id(riset, conn: Any): + """ + Retrieve a species id given the RISet value + """ + with conn.cursor as cursor: + cursor.execute( + "SELECT SpeciesId from InbredSet WHERE Name = %(riset)s", + {"riset": riset}) + return cursor.fetchone()[0] + return None + +def retrieve_geno_trait_data(trait_info: Dict, conn: Any): + """ + Retrieve trait data for `Geno` traits. + """ + query = ( + "SELECT Strain.Name, GenoData.value, GenoSE.error, GenoData.Id " + "FROM (GenoData, GenoFreeze, Strain, Geno, GenoXRef) " + "LEFT JOIN GenoSE ON " + "(GenoSE.DataId = GenoData.Id AND GenoSE.StrainId = GenoData.StrainId) " + "WHERE Geno.SpeciesId = %(species_id)s " + "AND Geno.Name = %(trait_name)s AND GenoXRef.GenoId = Geno.Id " + "AND GenoXRef.GenoFreezeId = GenoFreeze.Id " + "AND GenoFreeze.Name = %(dataset_name)s " + "AND GenoXRef.DataId = GenoData.Id " + "AND GenoData.StrainId = Strain.Id " + "ORDER BY Strain.Name") + with conn.cursor() as cursor: + cursor.execute( + query, + {"trait_name": trait_info["trait_name"], + "dataset_name": trait_info["db"]["dataset_name"], + "species_id": retrieve_species_id( + trait_info["db"]["riset"], conn)}) + return [dict(zip( + ["strain_name", "value", "se_error", "id"], row)) + for row in cursor.fetchall()] + return [] + +def retrieve_publish_trait_data(trait_info: Dict, conn: Any): + """ + Retrieve trait data for `Publish` traits. + """ + query = ( + "SELECT " + "Strain.Name, PublishData.value, PublishSE.error, NStrain.count, " + "PublishData.Id " + "FROM (PublishData, Strain, PublishXRef, PublishFreeze) " + "LEFT JOIN PublishSE ON " + "(PublishSE.DataId = PublishData.Id " + "AND PublishSE.StrainId = PublishData.StrainId) " + "LEFT JOIN NStrain ON " + "(NStrain.DataId = PublishData.Id " + "AND NStrain.StrainId = PublishData.StrainId) " + "WHERE PublishXRef.InbredSetId = PublishFreeze.InbredSetId " + "AND PublishData.Id = PublishXRef.DataId " + "AND PublishXRef.Id = %(trait_name)s " + "AND PublishFreeze.Id = %(dataset_id)s " + "AND PublishData.StrainId = Strain.Id " + "ORDER BY Strain.Name") + with conn.cursor() as cursor: + cursor.execute( + query, + {"trait_name": trait_info["trait_name"], + "dataset_id": trait_info["db"]["dataset_id"]}) + return [dict(zip( + ["strain_name", "value", "se_error", "nstrain", "id"], row)) + for row in cursor.fetchall()] + return [] + +def retrieve_cellid_trait_data(trait_info: Dict, conn: Any): + """ + Retrieve trait data for `Probe Data` types. + """ + query = ( + "SELECT " + "Strain.Name, ProbeData.value, ProbeSE.error, ProbeData.Id " + "FROM (ProbeData, ProbeFreeze, ProbeSetFreeze, ProbeXRef, Strain," + " Probe, ProbeSet) " + "LEFT JOIN ProbeSE ON " + "(ProbeSE.DataId = ProbeData.Id " + " AND ProbeSE.StrainId = ProbeData.StrainId) " + "WHERE Probe.Name = %(cellid)s " + "AND ProbeSet.Name = %(trait_name)s " + "AND Probe.ProbeSetId = ProbeSet.Id " + "AND ProbeXRef.ProbeId = Probe.Id " + "AND ProbeXRef.ProbeFreezeId = ProbeFreeze.Id " + "AND ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id " + "AND ProbeSetFreeze.Name = %(dataset_name)s " + "AND ProbeXRef.DataId = ProbeData.Id " + "AND ProbeData.StrainId = Strain.Id " + "ORDER BY Strain.Name") + with conn.cursor() as cursor: + cursor.execute( + query, + {"cellid": trait_info["cellid"], + "trait_name": trait_info["trait_name"], + "dataset_id": trait_info["db"]["dataset_id"]}) + return [dict(zip( + ["strain_name", "value", "se_error", "id"], row)) + for row in cursor.fetchall()] + return [] + +def retrieve_probeset_trait_data(trait_info: Dict, conn: Any): + """ + Retrieve trait data for `ProbeSet` traits. + """ + query = ( + "SELECT Strain.Name, ProbeSetData.value, ProbeSetSE.error, " + "ProbeSetData.Id " + "FROM (ProbeSetData, ProbeSetFreeze, Strain, ProbeSet, ProbeSetXRef) " + "LEFT JOIN ProbeSetSE ON " + "(ProbeSetSE.DataId = ProbeSetData.Id " + "AND ProbeSetSE.StrainId = ProbeSetData.StrainId) " + "WHERE ProbeSet.Name = %(trait_name)s " + "AND ProbeSetXRef.ProbeSetId = ProbeSet.Id " + "AND ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id " + "AND ProbeSetFreeze.Name = %(dataset_name)s " + "AND ProbeSetXRef.DataId = ProbeSetData.Id " + "AND ProbeSetData.StrainId = Strain.Id " + "ORDER BY Strain.Name") + + with conn.cursor() as cursor: + cursor.execute( + query, + {"trait_name": trait_info["trait_name"], + "dataset_name": trait_info["db"]["dataset_name"]}) + return [dict(zip( + ["strain_name", "value", "se_error", "id"], row)) + for row in cursor.fetchall()] + return [] + +def with_strainlist_data_setup(strainlist: Sequence[str]): + """ + Build function that computes the trait data from provided list of strains. + + PARAMETERS + strainlist: (list) + A list of strain names + + RETURNS: + Returns a function that given some data from the database, computes the + strain's value, variance and ndata values, only if the strain is present + in the provided `strainlist` variable. + """ + def setup_fn(tdata): + if tdata["strain_name"] in strainlist: + val = tdata["value"] + if val is not None: + return { + "strain_name": tdata["strain_name"], + "value": val, + "variance": tdata["se_error"], + "ndata": tdata.get("nstrain", None) + } + return None + return setup_fn + +def without_strainlist_data_setup(): + """ + Build function that computes the trait data. + + RETURNS: + Returns a function that given some data from the database, computes the + strain's value, variance and ndata values. + """ + def setup_fn(tdata): + val = tdata["value"] + if val is not None: + return { + "strain_name": tdata["strain_name"], + "value": val, + "variance": tdata["se_error"], + "ndata": tdata.get("nstrain", None) + } + return None + return setup_fn + +def retrieve_trait_data(trait: dict, conn: Any, strainlist: Sequence[str] = tuple()): + """ + Retrieve trait data + + DESCRIPTION + Retrieve trait data as is done in + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L258-L386 + """ + # I do not like this section, but it retains the flow in the old codebase + if trait["db"]["dataset_type"] == "Temp": + results = retrieve_temp_trait_data(trait, conn) + elif trait["db"]["dataset_type"] == "Publish": + results = retrieve_publish_trait_data(trait, conn) + elif trait["cellid"]: + results = retrieve_cellid_trait_data(trait, conn) + elif trait["db"]["dataset_type"] == "ProbeSet": + results = retrieve_probeset_trait_data(trait, conn) + else: + results = retrieve_geno_trait_data(trait, conn) + + if results: + # do something with mysqlid + mysqlid = results[0]["id"] + if strainlist: + data = [ + item for item in + map(with_strainlist_data_setup(strainlist), results) + if item is not None] + else: + data = [ + item for item in + map(without_strainlist_data_setup(), results) + if item is not None] + + return { + "mysqlid": mysqlid, + "data": dict(map( + lambda x: ( + x["strain_name"], + {k:v for k, v in x.items() if x != "strain_name"}), + data))} + return {} diff --git a/gn3/function_helpers.py b/gn3/function_helpers.py new file mode 100644 index 0000000..397b2da --- /dev/null +++ b/gn3/function_helpers.py @@ -0,0 +1,36 @@ +""" +This module will contain helper functions that should assist in maintaining a +mostly functional way of programming. + +It will also contain miscellaneous functions that can be used globally, and thus +do not fit well in any other module. + +FUNCTIONS: +compose: This function is used to compose multiple functions into a single + function. It passes the results of calling one function to the other until + all the functions to be composed are called. +""" +from functools import reduce + +def compose(*functions): + """Compose multiple functions into a single function. + + The utility in this function is not specific to this module, and as such, + this function can, and probably should, be moved to a more global module. + + DESCRIPTION: + Given `cfn = compose(f_1, f_2, ... f_(n-1), f_n )`, calling + `cfn(arg_1, arg_2, ..., arg_m)` should call `f_n` with the arguments passed + to `cfn` and the results of that should be passed as arguments to `f_(n-1)` + and so on until `f_1` is called with the results of the cumulative calls and + that is the result of the entire chain of calls. + + PARAMETERS: + functions: a variable argument list of function. + """ + def composed_function(*args, **kwargs): + return reduce( + lambda res, fn: fn(res), + reversed(functions[:-1]), + functions[-1](*args, **kwargs)) + return composed_function diff --git a/gn3/heatmaps/heatmaps.py b/gn3/heatmaps/heatmaps.py new file mode 100644 index 0000000..3bf7917 --- /dev/null +++ b/gn3/heatmaps/heatmaps.py @@ -0,0 +1,54 @@ +import random +import plotly.express as px + +#### Remove these #### + +heatmap_dir = "heatmap_images" + +def generate_random_data(data_stop: float = 2, width: int = 10, height: int = 30): + """ + This is mostly a utility function to be used to generate random data, useful + for development of the heatmap generation code, without access to the actual + database data. + """ + return [[random.uniform(0,data_stop) for i in range(0, width)] + for j in range(0, height)] + +def heatmap_x_axis_names(): + return [ + "UCLA_BXDBXH_CARTILAGE_V2::ILM103710672", + "UCLA_BXDBXH_CARTILAGE_V2::ILM2260338", + "UCLA_BXDBXH_CARTILAGE_V2::ILM3140576", + "UCLA_BXDBXH_CARTILAGE_V2::ILM5670577", + "UCLA_BXDBXH_CARTILAGE_V2::ILM2070121", + "UCLA_BXDBXH_CARTILAGE_V2::ILM103990541", + "UCLA_BXDBXH_CARTILAGE_V2::ILM1190722", + "UCLA_BXDBXH_CARTILAGE_V2::ILM6590722", + "UCLA_BXDBXH_CARTILAGE_V2::ILM4200064", + "UCLA_BXDBXH_CARTILAGE_V2::ILM3140463"] +#### END: Remove these #### + +# Grey + Blue + Red +def generate_heatmap(): + rows = 20 + data = generate_random_data(height=rows) + y = (["%s"%x for x in range(1, rows+1)][:-1] + ["X"]) #replace last item with x for now + fig = px.imshow( + data, + x=heatmap_x_axis_names(), + y=y, + width=500) + fig.update_traces(xtype="array") + fig.update_traces(ytype="array") + # fig.update_traces(xgap=10) + fig.update_xaxes( + visible=True, + title_text="Traits", + title_font_size=16) + fig.update_layout( + coloraxis_colorscale=[ + [0.0, '#3B3B3B'], [0.4999999999999999, '#ABABAB'], + [0.5, '#F5DE11'], [1.0, '#FF0D00']]) + + fig.write_html("%s/%s"%(heatmap_dir, "test_image.html")) + return fig |