diff options
Diffstat (limited to 'gn3')
-rw-r--r-- | gn3/computations/correlations.py | 3 | ||||
-rw-r--r-- | gn3/computations/correlations2.py | 37 | ||||
-rw-r--r-- | gn3/computations/slink.py | 97 | ||||
-rw-r--r-- | gn3/db/traits.py | 159 |
4 files changed, 244 insertions, 52 deletions
diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py index bc738a7..56f483c 100644 --- a/gn3/computations/correlations.py +++ b/gn3/computations/correlations.py @@ -1,4 +1,5 @@ """module contains code for correlations""" +import math import multiprocessing from typing import List @@ -90,7 +91,7 @@ def compute_sample_r_correlation(trait_name, corr_method, trait_vals, target_values=sanitized_target_vals, corr_method=corr_method) - if corr_coefficient is not None: + if corr_coefficient is not None and not math.isnan(corr_coefficient): return (trait_name, corr_coefficient, p_value, num_overlap) return None diff --git a/gn3/computations/correlations2.py b/gn3/computations/correlations2.py index 6c456db..93db3fa 100644 --- a/gn3/computations/correlations2.py +++ b/gn3/computations/correlations2.py @@ -1,15 +1,25 @@ +""" +DESCRIPTION: + TODO: Add a description for the module + +FUNCTIONS: +compute_correlation: + TODO: Describe what the function does...""" + from math import sqrt from functools import reduce ## From GN1: mostly for clustering and heatmap generation -def items_with_values(dbdata, userdata): +def __items_with_values(dbdata, userdata): """Retains only corresponding items in the data items that are not `None` values. -This should probably be renamed to something sensible""" + This should probably be renamed to something sensible""" def both_not_none(item1, item2): + """Check that both items are not the value `None`.""" if (item1 is not None) and (item2 is not None): return (item1, item2) return None def split_lists(accumulator, item): + """Separate the 'x' and 'y' items.""" return [accumulator[0] + [item[0]], accumulator[1] + [item[1]]] return reduce( split_lists, @@ -17,19 +27,24 @@ This should probably be renamed to something sensible""" [[], []]) def compute_correlation(dbdata, userdata): - x, y = items_with_values(dbdata, userdata) - if len(x) < 6: - return (0.0, len(x)) - meanx = sum(x)/len(x) - meany = sum(y)/len(y) + """Compute some form of correlation. + + This is extracted from + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/utility/webqtlUtil.py#L622-L647 + """ + x_items, y_items = __items_with_values(dbdata, userdata) + if len(x_items) < 6: + return (0.0, len(x_items)) + meanx = sum(x_items)/len(x_items) + meany = sum(y_items)/len(y_items) def cal_corr_vals(acc, item): xitem, yitem = item return [ acc[0] + ((xitem - meanx) * (yitem - meany)), acc[1] + ((xitem - meanx) * (xitem - meanx)), acc[2] + ((yitem - meany) * (yitem - meany))] - xyd, sxd, syd = reduce(cal_corr_vals, zip(x, y), [0.0, 0.0, 0.0]) + xyd, sxd, syd = reduce(cal_corr_vals, zip(x_items, y_items), [0.0, 0.0, 0.0]) try: - return ((xyd/(sqrt(sxd)*sqrt(syd))), len(x)) - except ZeroDivisionError as zde: - return(0, len(x)) + return ((xyd/(sqrt(sxd)*sqrt(syd))), len(x_items)) + except ZeroDivisionError: + return(0, len(x_items)) diff --git a/gn3/computations/slink.py b/gn3/computations/slink.py index 8d51f29..23d3d88 100644 --- a/gn3/computations/slink.py +++ b/gn3/computations/slink.py @@ -7,13 +7,18 @@ slink: TODO: Describe what the function does... """ import logging -from functools import partial +from typing import List, Tuple, Union, Sequence + +NumType = Union[int, float] +SeqOfNums = Sequence[NumType] class LengthError(BaseException): - pass + """Raised whenever child lists/tuples are not the same length as the parent + list of tuple.""" class MirrorError(BaseException): - pass + """Raised if the distance from child A to child B is not the same as the + distance from child B to child A.""" def __is_list_or_tuple(item): return type(item) in [list, tuple] @@ -50,19 +55,20 @@ def __raise_valueerror_if_child_list_distance_from_itself_is_not_zero(lists): def __raise_mirrorerror_of_distances_one_way_are_not_same_other_way(lists): """Check that the distance from A to B, is the same as the distance from B to A. If the two distances are different, throw an exception.""" - for i in range(len(lists)): - for j in range(len(lists)): - if lists[i][j] != lists[j][i]: - raise MirrorError( - ("Distance from one child({}) to the other ({}) " - "should be the same in both directions.").format( - lists[i][j], lists[j][i])) + inner_coords = range(len(lists)) + coords = ((i, j) for i in inner_coords for j in inner_coords) + def __is_same_reversed(coord): + return lists[coord[0]][coord[1]] == lists[coord[1]][coord[0]] + if not all(map(__is_same_reversed, coords)): + raise MirrorError(( + "Distance from one child to the other should be the same in both " + "directions.")) def __raise_valueerror_on_negative_distances(lists): """Check that distances between 'somethings' are all positive, otherwise, raise an exception.""" def zero_or_positive(val): - return val >= 0; + return val >= 0 # flatten lists flattened = __flatten_list_of_lists(lists) if not all(map(zero_or_positive, flattened)): @@ -71,12 +77,16 @@ raise an exception.""" def __flatten_list_of_lists(parent): return [item for child in parent for item in child] -def nearest(lists, i, j): +# i and j are Union[SeqOfNums, NumType], but that leads to errors where the +# values of i or j are indexed, since the NumType type is not indexable. +# I don't know how to type this so that it does not fail on running `mypy .` +def nearest(lists: Sequence[SeqOfNums], i, j) -> NumType: """ Computes shortest distance between member(s) in `i` and member(s) in `j`. Description: - This is 'copied' over from genenetwork1, from https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/slink.py#L42-L64. + This is 'copied' over from genenetwork1, from + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/slink.py#L42-L64. This description should be updated to better describe what 'member' means in the context where the function is used. @@ -108,19 +118,25 @@ def nearest(lists, i, j): __raise_mirrorerror_of_distances_one_way_are_not_same_other_way(lists) __raise_valueerror_on_negative_distances(lists) #### END: Guard Functions #### - if type(i) == int and type(j) == int: # From member i to member j + if isinstance(i, int) and isinstance(j, int): # From member i to member j return lists[i][j] - elif type(i) == int and __is_list_or_tuple(j): + + if isinstance(i, int) and __is_list_or_tuple(j): return min(map(lambda j_new: nearest(lists, i, j_new), j[:-1])) - elif type(j) == int and __is_list_or_tuple(i): + if isinstance(j, int) and __is_list_or_tuple(i): return min(map(lambda i_new: nearest(lists, i_new, j), i[:-1])) - elif __is_list_or_tuple(i) and __is_list_or_tuple(j): + + if __is_list_or_tuple(i) and __is_list_or_tuple(j): coordinate_pairs = __flatten_list_of_lists( [[(itemi, itemj) for itemj in j[:-1]] for itemi in i[:-1]]) return min(map(lambda x: nearest(lists, x[0], x[1]), coordinate_pairs)) - else: - raise ValueError("member values (i or j) should be lists/tuples of integers or integers") + raise ValueError("member values (i or j) should be lists/tuples of integers or integers") + +# `lists` here could be Sequence[SeqOfNums], but that leads to errors I do not +# understand down the line +# Might have to re-implement the function especially since the errors are thrown +# where `listindexcopy` is involved def slink(lists): """ DESCRIPTION: @@ -144,36 +160,39 @@ def slink(lists): """ try: size = len(lists) - listindex = range(size) listindexcopy = list(range(size)) - listscopy = [[item for item in child] for child in lists] - initSize = size + listscopy = [child[:] for child in lists] + init_size = size candidate = [] - while initSize >2: + while init_size > 2: mindist = 1e10 - for i in range(initSize): - for j in range(i+1,initSize): + for i in range(init_size): + for j in range(i+1, init_size): if listscopy[i][j] < mindist: - mindist = listscopy[i][j] - candidate=[[i,j]] + mindist = listscopy[i][j] + candidate = [[i, j]] elif listscopy[i][j] == mindist: - mindist = listscopy[i][j] - candidate.append([i,j]) + mindist = listscopy[i][j] + candidate.append([i, j]) else: pass - newmem = (listindexcopy[candidate[0][0]],listindexcopy[candidate[0][1]],mindist) + newmem = ( + listindexcopy[candidate[0][0]], listindexcopy[candidate[0][1]], + mindist) listindexcopy.pop(candidate[0][1]) listindexcopy[candidate[0][0]] = newmem - initSize -= 1 - for i in range(initSize): - for j in range(i+1,initSize): - listscopy[i][j] = nearest(lists,listindexcopy[i],listindexcopy[j]) + init_size -= 1 + for i in range(init_size): + for j in range(i+1, init_size): + listscopy[i][j] = nearest( + lists, listindexcopy[i], listindexcopy[j]) listscopy[j][i] = listscopy[i][j] - listindexcopy.append(nearest(lists,listindexcopy[0],listindexcopy[1])) + listindexcopy.append( + nearest(lists, listindexcopy[0], listindexcopy[1])) return listindexcopy - except Exception as e: - # TODO: Look into making the logging log output to the system's - # configured logger(s) - logging.warning("Exception: {}, {}".format(type(e), e)) + except (LengthError, MirrorError, TypeError, IndexError) as exc: + # Look into making the logging log output to the system's + # configured logger(s) + logging.warning("Exception: %s, %s", type(exc), exc) return [] diff --git a/gn3/db/traits.py b/gn3/db/traits.py index a77e6a1..ae1939a 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -1,5 +1,5 @@ """This class contains functions relating to trait data manipulation""" -from typing import Any, Union +from typing import Any, Dict, Union def get_trait_csv_sample_data(conn: Any, @@ -75,3 +75,160 @@ def update_sample_data(conn: Any, updated_n_strains: int = cursor.rowcount return (updated_strains, updated_published_data, updated_se_data, updated_n_strains) + + +def retrieve_trait_dataset_name( + trait_type: str, threshold: int, name: str, connection: Any): + """ + Retrieve the name of a trait given the trait's name + + This is extracted from the `webqtlDataset.retrieveName` function as is + implemented at + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlDataset.py#L140-L169 + """ + columns = "Id, Name, FullName, ShortName{}".format( + ", DataScale" if trait_type == "ProbeSet" else "") + query = ( + "SELECT {columns} " + "FROM {trait_type}Freeze " + "WHERE " + "public > %(threshold)s " + "AND " + "(Name = %(name)s OR FullName = %(name)s OR ShortName = %(name)s)").format( + columns=columns, trait_type=trait_type) + with connection.cursor() as cursor: + cursor.execute(query, {"threshold": threshold, "name": name}) + return cursor.fetchone() + +PUBLISH_TRAIT_INFO_QUERY = ( + "SELECT " + "PublishXRef.Id, Publication.PubMed_ID, " + "Phenotype.Pre_publication_description, " + "Phenotype.Post_publication_description, " + "Phenotype.Original_description, " + "Phenotype.Pre_publication_abbreviation, " + "Phenotype.Post_publication_abbreviation, " + "Phenotype.Lab_code, Phenotype.Submitter, Phenotype.Owner, " + "Phenotype.Authorized_Users, CAST(Publication.Authors AS BINARY), " + "Publication.Title, Publication.Abstract, Publication.Journal, " + "Publication.Volume, Publication.Pages, Publication.Month, " + "Publication.Year, PublishXRef.Sequence, Phenotype.Units, " + "PublishXRef.comments " + "FROM " + "PublishXRef, Publication, Phenotype, PublishFreeze " + "WHERE " + "PublishXRef.Id = %(trait_name)s AND " + "Phenotype.Id = PublishXRef.PhenotypeId AND " + "Publication.Id = PublishXRef.PublicationId AND " + "PublishXRef.InbredSetId = PublishFreeze.InbredSetId AND " + "PublishFreeze.Id =%(trait_dataset_id)s") + + +def retrieve_publish_trait_info(trait_data_source: Dict[str, Any], conn: Any): + """Retrieve trait information for type `Publish` traits. + + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L399-L421""" + with conn.cursor() as cursor: + cursor.execute( + PUBLISH_TRAIT_INFO_QUERY, + { + k:v for k, v in trait_data_source.items() + if k in ["trait_name", "trait_dataset_id"] + }) + return cursor.fetchone() + +PROBESET_TRAIT_INFO_QUERY = ( + "SELECT " + "ProbeSet.name, ProbeSet.symbol, ProbeSet.description, " + "ProbeSet.probe_target_description, ProbeSet.chr, ProbeSet.mb, " + "ProbeSet.alias, ProbeSet.geneid, ProbeSet.genbankid, ProbeSet.unigeneid, " + "ProbeSet.omim, ProbeSet.refseq_transcriptid, ProbeSet.blatseq, " + "ProbeSet.targetseq, ProbeSet.chipid, ProbeSet.comments, " + "ProbeSet.strand_probe, ProbeSet.strand_gene, " + "ProbeSet.probe_set_target_region, ProbeSet.proteinid, " + "ProbeSet.probe_set_specificity, ProbeSet.probe_set_blat_score, " + "ProbeSet.probe_set_blat_mb_start, ProbeSet.probe_set_blat_mb_end, " + "ProbeSet.probe_set_strand, ProbeSet.probe_set_note_by_rw, " + "ProbeSet.flag " + "FROM " + "ProbeSet, ProbeSetFreeze, ProbeSetXRef " + "WHERE " + "ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id AND " + "ProbeSetXRef.ProbeSetId = ProbeSet.Id AND " + "ProbeSetFreeze.Name = %(trait_dataset_name)s AND " + "ProbeSet.Name = %(trait_name)s") + +def retrieve_probeset_trait_info(trait_data_source: Dict[str, Any], conn: Any): + """Retrieve trait information for type `ProbeSet` traits. + + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L424-L435""" + with conn.cursor() as cursor: + cursor.execute( + PROBESET_TRAIT_INFO_QUERY, + { + k:v for k, v in trait_data_source.items() + if k in ["trait_name", "trait_dataset_name"] + }) + return cursor.fetchone() + +GENO_TRAIT_INFO_QUERY = ( + "SELECT " + "Geno.name, Geno.chr, Geno.mb, Geno.source2, Geno.sequence " + "FROM " + "Geno, GenoFreeze, GenoXRef " + "WHERE " + "GenoXRef.GenoFreezeId = GenoFreeze.Id AND GenoXRef.GenoId = Geno.Id AND " + "GenoFreeze.Name = %(trait_dataset_name)s AND Geno.Name = %(trait_name)s") + +def retrieve_geno_trait_info(trait_data_source: Dict[str, Any], conn: Any): + """Retrieve trait information for type `Geno` traits. + + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L438-L449""" + with conn.cursor() as cursor: + cursor.execute( + GENO_TRAIT_INFO_QUERY, + { + k:v for k, v in trait_data_source.items() + if k in ["trait_name", "trait_dataset_name"] + }) + return cursor.fetchone() + +TEMP_TRAIT_INFO_QUERY = ( + "SELECT name, description FROM Temp " + "WHERE Name = %(trait_name)s") + +def retrieve_temp_trait_info(trait_data_source: Dict[str, Any], conn: Any): + """Retrieve trait information for type `Temp` traits. + + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L450-452""" + with conn.cursor() as cursor: + cursor.execute( + TEMP_TRAIT_INFO_QUERY, + { + k:v for k, v in trait_data_source.items() + if k in ["trait_name"] + }) + return cursor.fetchone() + +def retrieve_trait_info( + trait_type: str, trait_name: str, trait_dataset_id: int, + trait_dataset_name: str, conn: Any): + """Retrieves the trait information. + + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L397-L456 + + This function, or the dependent functions, might be incomplete as they are + currently.""" + trait_info_function_table = { + "Publish": retrieve_publish_trait_info, + "ProbeSet": retrieve_probeset_trait_info, + "Geno": retrieve_geno_trait_info, + "Temp": retrieve_temp_trait_info + } + return trait_info_function_table[trait_type]( + { + "trait_name": trait_name, + "trait_dataset_id": trait_dataset_id, + "trait_dataset_name":trait_dataset_name + }, + conn) |