about summary refs log tree commit diff
path: root/gn3
diff options
context:
space:
mode:
Diffstat (limited to 'gn3')
-rw-r--r-- gn3/computations/correlations.py 3
-rw-r--r-- gn3/computations/correlations2.py 37
-rw-r--r-- gn3/computations/slink.py 97
-rw-r--r-- gn3/db/traits.py 159
4 files changed, 244 insertions, 52 deletions
diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py
index bc738a7..56f483c 100644
--- a/gn3/computations/correlations.py
+++ b/gn3/computations/correlations.py
@@ -1,4 +1,5 @@
 """module contains code for correlations"""
+import math
 import multiprocessing
 
 from typing import List
@@ -90,7 +91,7 @@ def compute_sample_r_correlation(trait_name, corr_method, trait_vals,
                                        target_values=sanitized_target_vals,
                                        corr_method=corr_method)
 
-        if corr_coefficient is not None:
+        if corr_coefficient is not None and not math.isnan(corr_coefficient):
             return (trait_name, corr_coefficient, p_value, num_overlap)
     return None
 
diff --git a/gn3/computations/correlations2.py b/gn3/computations/correlations2.py
index 6c456db..93db3fa 100644
--- a/gn3/computations/correlations2.py
+++ b/gn3/computations/correlations2.py
@@ -1,15 +1,25 @@
+"""
+DESCRIPTION:
+    TODO: Add a description for the module
+
+FUNCTIONS:
+compute_correlation:
+    TODO: Describe what the function does..."""
+
 from math import sqrt
 from functools import reduce
 ## From GN1: mostly for clustering and heatmap generation
 
-def items_with_values(dbdata, userdata):
+def __items_with_values(dbdata, userdata):
     """Retains only corresponding items in the data items that are not `None` values.
-This should probably be renamed to something sensible"""
+    This should probably be renamed to something sensible"""
     def both_not_none(item1, item2):
+        """Check that both items are not the value `None`."""
         if (item1 is not None) and (item2 is not None):
             return (item1, item2)
         return None
     def split_lists(accumulator, item):
+        """Separate the 'x' and 'y' items."""
         return [accumulator[0] + [item[0]], accumulator[1] + [item[1]]]
     return reduce(
         split_lists,
@@ -17,19 +27,24 @@ This should probably be renamed to something sensible"""
         [[], []])
 
 def compute_correlation(dbdata, userdata):
-    x, y = items_with_values(dbdata, userdata)
-    if len(x) < 6:
-        return (0.0, len(x))
-    meanx = sum(x)/len(x)
-    meany = sum(y)/len(y)
+    """Compute the Pearson correlation coefficient of `dbdata` and `userdata`.
+
+    This is extracted from
+    https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/utility/webqtlUtil.py#L622-L647
+    """
+    x_items, y_items = __items_with_values(dbdata, userdata)
+    if len(x_items) < 6:
+        return (0.0, len(x_items))
+    meanx = sum(x_items)/len(x_items)
+    meany = sum(y_items)/len(y_items)
     def cal_corr_vals(acc, item):
         xitem, yitem = item
         return [
             acc[0] + ((xitem - meanx) * (yitem - meany)),
             acc[1] + ((xitem - meanx) * (xitem - meanx)),
             acc[2] + ((yitem - meany) * (yitem - meany))]
-    xyd, sxd, syd = reduce(cal_corr_vals, zip(x, y), [0.0, 0.0, 0.0])
+    xyd, sxd, syd = reduce(cal_corr_vals, zip(x_items, y_items), [0.0, 0.0, 0.0])
     try:
-        return ((xyd/(sqrt(sxd)*sqrt(syd))), len(x))
-    except ZeroDivisionError as zde:
-        return(0, len(x))
+        return ((xyd/(sqrt(sxd)*sqrt(syd))), len(x_items))
+    except ZeroDivisionError:
+        return(0, len(x_items))
diff --git a/gn3/computations/slink.py b/gn3/computations/slink.py
index 8d51f29..23d3d88 100644
--- a/gn3/computations/slink.py
+++ b/gn3/computations/slink.py
@@ -7,13 +7,18 @@ slink:
     TODO: Describe what the function does...
 """
 import logging
-from functools import partial
+from typing import List, Tuple, Union, Sequence
+
+NumType = Union[int, float]
+SeqOfNums = Sequence[NumType]
 
 class LengthError(BaseException):
-    pass
+    """Raised whenever child lists/tuples are not the same length as the parent
+    list or tuple."""
 
 class MirrorError(BaseException):
-    pass
+    """Raised if the distance from child A to child B is not the same as the
+    distance from child B to child A."""
 
 def __is_list_or_tuple(item):
     return type(item) in [list, tuple]
@@ -50,19 +55,20 @@ def __raise_valueerror_if_child_list_distance_from_itself_is_not_zero(lists):
 def __raise_mirrorerror_of_distances_one_way_are_not_same_other_way(lists):
     """Check that the distance from A to B, is the same as the distance from B to A.
 If the two distances are different, throw an exception."""
-    for i in range(len(lists)):
-        for j in range(len(lists)):
-            if lists[i][j] != lists[j][i]:
-                raise MirrorError(
-                    ("Distance from one child({}) to the other ({}) "
-                     "should be the same in both directions.").format(
-                         lists[i][j], lists[j][i]))
+    inner_coords = range(len(lists))
+    coords = ((i, j) for i in inner_coords for j in inner_coords)
+    def __is_same_reversed(coord):
+        return lists[coord[0]][coord[1]] == lists[coord[1]][coord[0]]
+    if not all(map(__is_same_reversed, coords)):
+        raise MirrorError((
+            "Distance from one child to the other should be the same in both "
+            "directions."))
 
 def __raise_valueerror_on_negative_distances(lists):
     """Check that distances between 'somethings' are all positive, otherwise,
 raise an exception."""
     def zero_or_positive(val):
-        return val >= 0;
+        return val >= 0
     # flatten lists
     flattened = __flatten_list_of_lists(lists)
     if not all(map(zero_or_positive, flattened)):
@@ -71,12 +77,16 @@ raise an exception."""
 def __flatten_list_of_lists(parent):
     return [item for child in parent for item in child]
 
-def nearest(lists, i, j):
+# i and j are Union[SeqOfNums, NumType], but that leads to errors where the
+# values of i or j are indexed, since the NumType type is not indexable.
+# I don't know how to type this so that it does not fail on running `mypy .`
+def nearest(lists: Sequence[SeqOfNums], i, j) -> NumType:
     """
     Computes shortest distance between member(s) in `i` and member(s) in `j`.
 
     Description:
-    This is 'copied' over from genenetwork1, from https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/slink.py#L42-L64.
+    This is 'copied' over from genenetwork1, from
+    https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/slink.py#L42-L64.
 
     This description should be updated to better describe what 'member' means in
     the context where the function is used.
@@ -108,19 +118,25 @@ def nearest(lists, i, j):
     __raise_mirrorerror_of_distances_one_way_are_not_same_other_way(lists)
     __raise_valueerror_on_negative_distances(lists)
     #### END: Guard Functions ####
-    if type(i) == int and type(j) == int: # From member i to member j
+    if isinstance(i, int) and isinstance(j, int): # From member i to member j
         return lists[i][j]
-    elif type(i) == int and __is_list_or_tuple(j):
+
+    if isinstance(i, int) and __is_list_or_tuple(j):
         return min(map(lambda j_new: nearest(lists, i, j_new), j[:-1]))
-    elif type(j) == int and __is_list_or_tuple(i):
+    if isinstance(j, int) and __is_list_or_tuple(i):
         return min(map(lambda i_new: nearest(lists, i_new, j), i[:-1]))
-    elif __is_list_or_tuple(i) and __is_list_or_tuple(j):
+
+    if __is_list_or_tuple(i) and __is_list_or_tuple(j):
         coordinate_pairs = __flatten_list_of_lists(
             [[(itemi, itemj) for itemj in j[:-1]] for itemi in i[:-1]])
         return min(map(lambda x: nearest(lists, x[0], x[1]), coordinate_pairs))
-    else:
-        raise ValueError("member values (i or j) should be lists/tuples of integers or integers")
 
+    raise ValueError("member values (i or j) should be lists/tuples of integers or integers")
+
+# `lists` here could be Sequence[SeqOfNums], but that leads to errors I do not
+# understand down the line
+# Might have to re-implement the function especially since the errors are thrown
+# where `listindexcopy` is involved
 def slink(lists):
     """
     DESCRIPTION:
@@ -144,36 +160,39 @@ def slink(lists):
     """
     try:
         size = len(lists)
-        listindex = range(size)
         listindexcopy = list(range(size))
-        listscopy = [[item for item in child] for child in lists]
-        initSize = size
+        listscopy = [child[:] for child in lists]
+        init_size = size
         candidate = []
-        while initSize >2:
+        while init_size > 2:
             mindist = 1e10
-            for i in range(initSize):
-                for j in range(i+1,initSize):
+            for i in range(init_size):
+                for j in range(i+1, init_size):
                     if listscopy[i][j] < mindist:
-                        mindist =  listscopy[i][j]
-                        candidate=[[i,j]]
+                        mindist = listscopy[i][j]
+                        candidate = [[i, j]]
                     elif listscopy[i][j] == mindist:
-                        mindist =  listscopy[i][j]
-                        candidate.append([i,j])
+                        mindist = listscopy[i][j]
+                        candidate.append([i, j])
                     else:
                         pass
-            newmem = (listindexcopy[candidate[0][0]],listindexcopy[candidate[0][1]],mindist)
+            newmem = (
+                listindexcopy[candidate[0][0]], listindexcopy[candidate[0][1]],
+                mindist)
             listindexcopy.pop(candidate[0][1])
             listindexcopy[candidate[0][0]] = newmem
 
-            initSize -= 1
-            for i in range(initSize):
-                for j in range(i+1,initSize):
-                    listscopy[i][j] = nearest(lists,listindexcopy[i],listindexcopy[j])
+            init_size -= 1
+            for i in range(init_size):
+                for j in range(i+1, init_size):
+                    listscopy[i][j] = nearest(
+                        lists, listindexcopy[i], listindexcopy[j])
                     listscopy[j][i] = listscopy[i][j]
-        listindexcopy.append(nearest(lists,listindexcopy[0],listindexcopy[1]))
+        listindexcopy.append(
+            nearest(lists, listindexcopy[0], listindexcopy[1]))
         return listindexcopy
-    except Exception as e:
-        # TODO: Look into making the logging log output to the system's
-        #    configured logger(s)
-        logging.warning("Exception: {}, {}".format(type(e), e))
+    except (LengthError, MirrorError, TypeError, IndexError) as exc:
+        # Look into making the logging log output to the system's
+        #   configured logger(s)
+        logging.warning("Exception: %s, %s", type(exc), exc)
         return []
diff --git a/gn3/db/traits.py b/gn3/db/traits.py
index a77e6a1..ae1939a 100644
--- a/gn3/db/traits.py
+++ b/gn3/db/traits.py
@@ -1,5 +1,5 @@
 """This class contains functions relating to trait data manipulation"""
-from typing import Any, Union
+from typing import Any, Dict, Union
 
 
 def get_trait_csv_sample_data(conn: Any,
@@ -75,3 +75,160 @@ def update_sample_data(conn: Any,
         updated_n_strains: int = cursor.rowcount
     return (updated_strains, updated_published_data,
             updated_se_data, updated_n_strains)
+
+
+def retrieve_trait_dataset_name(
+        trait_type: str, threshold: int, name: str, connection: Any):
+    """
+    Retrieve a trait dataset's id and names, given any one of its names
+
+    This is extracted from the `webqtlDataset.retrieveName` function as is
+    implemented at
+    https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlDataset.py#L140-L169
+    """
+    columns = "Id, Name, FullName, ShortName{}".format(
+        ", DataScale" if trait_type == "ProbeSet" else "")
+    query = (
+        "SELECT {columns} "
+        "FROM {trait_type}Freeze "
+        "WHERE "
+        "public > %(threshold)s "
+        "AND "
+        "(Name = %(name)s OR FullName = %(name)s OR ShortName = %(name)s)").format(
+            columns=columns, trait_type=trait_type)
+    with connection.cursor() as cursor:
+        cursor.execute(query, {"threshold": threshold, "name": name})
+        return cursor.fetchone()
+
+PUBLISH_TRAIT_INFO_QUERY = (
+    "SELECT "
+    "PublishXRef.Id, Publication.PubMed_ID, "
+    "Phenotype.Pre_publication_description, "
+    "Phenotype.Post_publication_description, "
+    "Phenotype.Original_description, "
+    "Phenotype.Pre_publication_abbreviation, "
+    "Phenotype.Post_publication_abbreviation, "
+    "Phenotype.Lab_code, Phenotype.Submitter, Phenotype.Owner, "
+    "Phenotype.Authorized_Users, CAST(Publication.Authors AS BINARY), "
+    "Publication.Title, Publication.Abstract, Publication.Journal, "
+    "Publication.Volume, Publication.Pages, Publication.Month, "
+    "Publication.Year, PublishXRef.Sequence, Phenotype.Units, "
+    "PublishXRef.comments "
+    "FROM "
+    "PublishXRef, Publication, Phenotype, PublishFreeze "
+    "WHERE "
+    "PublishXRef.Id = %(trait_name)s AND "
+    "Phenotype.Id = PublishXRef.PhenotypeId AND "
+    "Publication.Id = PublishXRef.PublicationId AND "
+    "PublishXRef.InbredSetId = PublishFreeze.InbredSetId AND "
+    "PublishFreeze.Id =%(trait_dataset_id)s")
+
+
+def retrieve_publish_trait_info(trait_data_source: Dict[str, Any], conn: Any):
+    """Retrieve trait information for type `Publish` traits.
+
+    https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L399-L421"""
+    with conn.cursor() as cursor:
+        cursor.execute(
+            PUBLISH_TRAIT_INFO_QUERY,
+            {
+                k:v for k, v in trait_data_source.items()
+                if k in ["trait_name", "trait_dataset_id"]
+            })
+        return cursor.fetchone()
+
+PROBESET_TRAIT_INFO_QUERY = (
+    "SELECT "
+    "ProbeSet.name, ProbeSet.symbol, ProbeSet.description, "
+    "ProbeSet.probe_target_description, ProbeSet.chr, ProbeSet.mb, "
+    "ProbeSet.alias, ProbeSet.geneid, ProbeSet.genbankid, ProbeSet.unigeneid, "
+    "ProbeSet.omim, ProbeSet.refseq_transcriptid, ProbeSet.blatseq, "
+    "ProbeSet.targetseq, ProbeSet.chipid, ProbeSet.comments, "
+    "ProbeSet.strand_probe, ProbeSet.strand_gene, "
+    "ProbeSet.probe_set_target_region, ProbeSet.proteinid, "
+    "ProbeSet.probe_set_specificity, ProbeSet.probe_set_blat_score, "
+    "ProbeSet.probe_set_blat_mb_start, ProbeSet.probe_set_blat_mb_end, "
+    "ProbeSet.probe_set_strand, ProbeSet.probe_set_note_by_rw, "
+    "ProbeSet.flag "
+    "FROM "
+    "ProbeSet, ProbeSetFreeze, ProbeSetXRef "
+    "WHERE "
+    "ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id AND "
+    "ProbeSetXRef.ProbeSetId = ProbeSet.Id AND "
+    "ProbeSetFreeze.Name = %(trait_dataset_name)s AND "
+    "ProbeSet.Name = %(trait_name)s")
+
+def retrieve_probeset_trait_info(trait_data_source: Dict[str, Any], conn: Any):
+    """Retrieve trait information for type `ProbeSet` traits.
+
+    https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L424-L435"""
+    with conn.cursor() as cursor:
+        cursor.execute(
+            PROBESET_TRAIT_INFO_QUERY,
+            {
+                k:v for k, v in trait_data_source.items()
+                if k in ["trait_name", "trait_dataset_name"]
+            })
+        return cursor.fetchone()
+
+GENO_TRAIT_INFO_QUERY = (
+    "SELECT "
+    "Geno.name, Geno.chr, Geno.mb, Geno.source2, Geno.sequence "
+    "FROM "
+    "Geno, GenoFreeze, GenoXRef "
+    "WHERE "
+    "GenoXRef.GenoFreezeId = GenoFreeze.Id AND GenoXRef.GenoId = Geno.Id AND "
+    "GenoFreeze.Name = %(trait_dataset_name)s AND Geno.Name = %(trait_name)s")
+
+def retrieve_geno_trait_info(trait_data_source: Dict[str, Any], conn: Any):
+    """Retrieve trait information for type `Geno` traits.
+
+    https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L438-L449"""
+    with conn.cursor() as cursor:
+        cursor.execute(
+            GENO_TRAIT_INFO_QUERY,
+            {
+                k:v for k, v in trait_data_source.items()
+                if k in ["trait_name", "trait_dataset_name"]
+            })
+        return cursor.fetchone()
+
+TEMP_TRAIT_INFO_QUERY = (
+    "SELECT name, description FROM Temp "
+    "WHERE Name = %(trait_name)s")
+
+def retrieve_temp_trait_info(trait_data_source: Dict[str, Any], conn: Any):
+    """Retrieve trait information for type `Temp` traits.
+
+    https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L450-L452"""
+    with conn.cursor() as cursor:
+        cursor.execute(
+            TEMP_TRAIT_INFO_QUERY,
+            {
+                k:v for k, v in trait_data_source.items()
+                if k in ["trait_name"]
+            })
+        return cursor.fetchone()
+
+def retrieve_trait_info(
+        trait_type: str, trait_name: str, trait_dataset_id: int,
+        trait_dataset_name: str, conn: Any):
+    """Retrieves the trait information.
+
+    https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L397-L456
+
+    This function, or the dependent functions, might be incomplete as they are
+    currently."""
+    trait_info_function_table = {
+        "Publish": retrieve_publish_trait_info,
+        "ProbeSet": retrieve_probeset_trait_info,
+        "Geno": retrieve_geno_trait_info,
+        "Temp": retrieve_temp_trait_info
+    }
+    return trait_info_function_table[trait_type](
+        {
+            "trait_name": trait_name,
+            "trait_dataset_id": trait_dataset_id,
+            "trait_dataset_name":trait_dataset_name
+        },
+        conn)