Namespace all modules under gn2.

We move all modules under a gn2 directory. This is important for "correct" packaging and deployment as a Guix service.
author: Arun Isaac 2023-12-29 18:55:37 +0000
committer: Arun Isaac 2023-12-29 19:01:46 +0000
commit: 204a308be0f741726b9a620d88fbc22b22124c81 (patch)
tree: b3cf66906674020b530c844c2bb4982c8a0e2d39 /gn2/wqflask/show_trait
parent: 83062c75442160427b50420161bfcae2c5c34c84 (diff)
download: genenetwork2-204a308be0f741726b9a620d88fbc22b22124c81.tar.gz
4 files changed, 1195 insertions, 0 deletions
diff --git a/gn2/wqflask/show_trait/SampleList.py b/gn2/wqflask/show_trait/SampleList.py
new file mode 100644
index 00000000..64fc8fe6
--- /dev/null
+++ b/gn2/wqflask/show_trait/SampleList.py
@@ -0,0 +1,223 @@
+import re
+import itertools
+
+from gn2.wqflask.database import database_connection
+from gn2.base import webqtlCaseData, webqtlConfig
+from pprint import pformat as pf
+
+from gn2.utility import Plot
+from gn2.utility import Bunch
+from gn2.utility.tools import get_setting
+
+class SampleList:
+    def __init__(self,
+                 dataset,
+                 sample_names,
+                 this_trait,
+                 sample_group_type="primary",
+                 header="Samples"):
+
+        self.dataset = dataset
+        self.this_trait = this_trait
+        self.sample_group_type = sample_group_type    # primary or other
+        self.header = header
+
+        self.sample_list = []  # The actual list
+        self.sample_attribute_values = {}
+
+        self.get_attributes()
+
+        if self.this_trait and self.dataset:
+            self.get_extra_attribute_values()
+
+        for counter, sample_name in enumerate(sample_names, 1):
+            sample_name = sample_name.replace("_2nd_", "")
+
+            # self.this_trait will be a list if it is a Temp trait
+            if isinstance(self.this_trait, list):
+                sample = webqtlCaseData.webqtlCaseData(name=sample_name)
+                if counter <= len(self.this_trait):
+                    if isinstance(self.this_trait[counter - 1], (bytes, bytearray)):
+                        if (self.this_trait[counter - 1].decode("utf-8").lower() != 'x'):
+                            sample = webqtlCaseData.webqtlCaseData(
+                                name=sample_name,
+                                value=float(self.this_trait[counter - 1]))
+                    else:
+                        if (self.this_trait[counter - 1].lower() != 'x'):
+                            sample = webqtlCaseData.webqtlCaseData(
+                                name=sample_name,
+                                value=float(self.this_trait[counter - 1]))
+            else:
+                # If there's no value for the sample/strain,
+                # create the sample object (so samples with no value
+                # are still displayed in the table)
+                try:
+                    sample = self.this_trait.data[sample_name]
+                except KeyError:
+                    sample = webqtlCaseData.webqtlCaseData(name=sample_name)
+
+            sample.extra_info = {}
+            if (self.dataset.group.name == 'AXBXA'
+                    and sample_name in ('AXB18/19/20', 'AXB13/14', 'BXA8/17')):
+                sample.extra_info['url'] = "/mouseCross.html#AXB/BXA"
+                sample.extra_info['css_class'] = "fs12"
+
+            sample.this_id = str(counter)
+
+            # For extra attribute columns; currently only used by
+            # several datasets
+            if self.sample_attribute_values:
+                sample.extra_attributes = self.sample_attribute_values.get(
+                    sample_name, {})
+
+                # Add a url so RRID case attributes can be displayed as links
+                if '36' in sample.extra_attributes:
+                    rrid_string = str(sample.extra_attributes['36'])
+                    if self.dataset.group.species == "mouse":
+                        if len(rrid_string.split(":")) > 1:
+                            the_rrid = rrid_string.split(":")[1]
+                            sample.extra_attributes['36'] = [
+                                rrid_string]
+                            sample.extra_attributes['36'].append(
+                                webqtlConfig.RRID_MOUSE_URL % the_rrid)
+                    elif self.dataset.group.species == "rat":
+                        if len(rrid_string):
+                            the_rrid = rrid_string.split("_")[1]
+                            sample.extra_attributes['36'] = [
+                                rrid_string]
+                            sample.extra_attributes['36'].append(
+                                webqtlConfig.RRID_RAT_URL % the_rrid)
+
+            self.sample_list.append(sample)
+
+        self.se_exists = any(sample.variance for sample in self.sample_list)
+        self.num_cases_exists = False
+        if (any(sample.num_cases for sample in self.sample_list) and
+            any((sample.num_cases and sample.num_cases != "1") for sample in self.sample_list)):
+            self.num_cases_exists = True
+
+        first_attr_col = self.get_first_attr_col()
+        for sample in self.sample_list:
+            sample.first_attr_col = first_attr_col
+
+        self.do_outliers()
+
+    def __repr__(self):
+        return "<SampleList> --> %s" % (pf(self.__dict__))
+
+    def do_outliers(self):
+        values = [sample.value for sample in self.sample_list
+                  if sample.value is not None]
+        upper_bound, lower_bound = Plot.find_outliers(values)
+
+        for sample in self.sample_list:
+            if sample.value:
+                if upper_bound and sample.value > upper_bound:
+                    sample.outlier = True
+                elif lower_bound and sample.value < lower_bound:
+                    sample.outlier = True
+                else:
+                    sample.outlier = False
+
+    def get_attributes(self):
+        """Finds which extra attributes apply to this dataset"""
+
+        # Get attribute names and distinct values for each attribute
+        with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor:
+            cursor.execute(
+                "SELECT DISTINCT CaseAttribute.CaseAttributeId, "
+                "CaseAttribute.Name, CaseAttribute.Description, "
+                "CaseAttributeXRefNew.Value FROM "
+                "CaseAttribute, CaseAttributeXRefNew WHERE "
+                "CaseAttributeXRefNew.CaseAttributeId = CaseAttribute.CaseAttributeId "
+                "AND CaseAttributeXRefNew.InbredSetId = %s "
+                "ORDER BY CaseAttribute.CaseAttributeId", (str(self.dataset.group.id),)
+            )
+
+            self.attributes = {}
+            for attr, values in itertools.groupby(
+                    cursor.fetchall(), lambda row: (row[0], row[1], row[2])
+            ):
+                key, name, description = attr
+                self.attributes[key] = Bunch()
+                self.attributes[key].id = key
+                self.attributes[key].name = name
+                self.attributes[key].description = description
+                self.attributes[key].distinct_values = [
+                    item[3] for item in values]
+                self.attributes[key].distinct_values = natural_sort(
+                    self.attributes[key].distinct_values)
+                all_numbers = True
+                for value in self.attributes[key].distinct_values:
+                    try:
+                        val_as_float = float(value)
+                    except:
+                        all_numbers = False
+                        break
+
+                if all_numbers:
+                    self.attributes[key].alignment = "right"
+                else:
+                    self.attributes[key].alignment = "left"
+
+    def get_extra_attribute_values(self):
+        if self.attributes:
+            with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor:
+                cursor.execute(
+                    "SELECT Strain.Name AS SampleName, "
+                    "CaseAttributeId AS Id, "
+                    "CaseAttributeXRefNew.Value FROM Strain, "
+                    "StrainXRef, InbredSet, CaseAttributeXRefNew "
+                    "WHERE StrainXRef.StrainId = Strain.Id "
+                    "AND InbredSet.Id = StrainXRef.InbredSetId "
+                    "AND CaseAttributeXRefNew.StrainId = Strain.Id "
+                    "AND InbredSet.Id = CaseAttributeXRefNew.InbredSetId "
+                    "AND CaseAttributeXRefNew.InbredSetId = %s "
+                    "ORDER BY SampleName", (self.dataset.group.id,)
+                )
+
+                for sample_name, items in itertools.groupby(
+                        cursor.fetchall(), lambda row: row[0]
+                ):
+                    attribute_values = {}
+                    # Make a list of attr IDs without values (that have values for other samples)
+                    valueless_attr_ids = [self.attributes[key].id for key in self.attributes.keys()]
+                    for item in items:
+                        sample_name, _id, value = item
+                        valueless_attr_ids.remove(_id)
+                        attribute_value = value
+
+                        # If it's an int, turn it into one for sorting
+                        # (for example, 101 would be lower than 80 if
+                        # they're strings instead of ints)
+                        try:
+                            attribute_value = int(attribute_value)
+                        except ValueError:
+                            pass
+
+                        attribute_values[str(_id)] = attribute_value
+                    for attr_id in valueless_attr_ids:
+                        attribute_values[str(attr_id)] = ""
+
+                    self.sample_attribute_values[sample_name] = attribute_values
+
+    def get_first_attr_col(self):
+        first_attr_col = 4
+        if self.se_exists:
+            first_attr_col += 2
+        if self.num_cases_exists:
+            first_attr_col += 1
+
+        return first_attr_col
+
+
+def natural_sort(a_list, key=lambda s: s):
+    """
+    Sort the list into natural alphanumeric order.
+    """
+    def get_alphanum_key_func(key):
+        def convert(text): return int(text) if text.isdigit() else text
+        return lambda s: [convert(c) for c in re.split('([0-9]+)', key(s))]
+    sort_key = get_alphanum_key_func(key)
+    sorted_list = sorted(a_list, key=sort_key)
+    return sorted_list
diff --git a/gn2/wqflask/show_trait/__init__.py b/gn2/wqflask/show_trait/__init__.py
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/gn2/wqflask/show_trait/__init__.py
diff --git a/gn2/wqflask/show_trait/export_trait_data.py b/gn2/wqflask/show_trait/export_trait_data.py
new file mode 100644
index 00000000..06c1c502
--- /dev/null
+++ b/gn2/wqflask/show_trait/export_trait_data.py
@@ -0,0 +1,113 @@
+import datetime
+import simplejson as json
+
+from pprint import pformat as pf
+from functools import cmp_to_key
+from gn2.base.trait import create_trait
+from gn2.base import data_set
+
+
+def export_sample_table(targs):
+
+    sample_data = json.loads(targs['export_data'])
+    trait_name = targs['trait_display_name']
+
+    meta_data = get_export_metadata(targs)
+
+    final_sample_data = meta_data
+
+    column_headers = ["Index", "Name", "Value"]
+    attr_pos = 2
+    if any(sample["se"] for sample in sample_data['primary_samples']):
+        column_headers.append("SE")
+        attr_pos = 3
+    if any(sample["num_cases"] for sample in sample_data['primary_samples']):
+        column_headers.append("N")
+        attr_pos = 4
+
+    for key in sample_data["primary_samples"][0].keys():
+        if key not in ["name", "value", "se", "num_cases"]:
+            column_headers.append(key)
+
+    final_sample_data.append(column_headers)
+    for sample_group in ['primary_samples', 'other_samples']:
+        for i, row in enumerate(sample_data[sample_group]):
+            sorted_row = [i + 1] + dict_to_sorted_list(row)[:attr_pos]
+            for attr in sample_data['attributes']:
+                sorted_row.append(row[attr])
+            final_sample_data.append(sorted_row)
+
+    return trait_name, final_sample_data
+
+
+def get_export_metadata(trait_metadata):
+
+    trait_id, display_name, dataset_name, group_name = trait_metadata['trait_id'], trait_metadata['trait_display_name'], trait_metadata['dataset'], trait_metadata['group']
+
+    dataset = data_set.create_dataset(dataset_name, group_name=group_name)
+    this_trait = create_trait(dataset=dataset,
+                              name=trait_id,
+                              cellid=None,
+                              get_qtl_info=False)
+
+    metadata = []
+    if dataset.type == "Publish":
+        metadata.append(["Phenotype ID:", display_name])
+        metadata.append(["Phenotype URL: ", "http://genenetwork.org/show_trait?trait_id=" + \
+                         trait_id + "&dataset=" + dataset_name])
+        metadata.append(["Group: ", dataset.group.name])
+        metadata.append(
+            ["Phenotype: ", this_trait.description_display.replace(",", "\",\"")])
+        metadata.append(
+            ["Authors: ", (this_trait.authors if this_trait.authors else "N/A")])
+        metadata.append(
+            ["Title: ", (this_trait.title if this_trait.title else "N/A")])
+        metadata.append(
+            ["Journal: ", (this_trait.journal if this_trait.journal else "N/A")])
+
+        metadata.append(
+            ["Dataset Link: ", "http://gn1.genenetwork.org/webqtl/main.py?FormID=sharinginfo&InfoPageName=" + dataset.name])
+    else:
+        metadata.append(["Record ID: ", trait_id])
+        metadata.append(["Trait URL: ", "http://genenetwork.org/show_trait?trait_id=" + \
+                         trait_id + "&dataset=" + dataset_name])
+        if this_trait.symbol:
+            metadata.append(["Symbol: ", this_trait.symbol])
+        metadata.append(["Dataset: ", dataset.name])
+        metadata.append(["Group: ", dataset.group.name])
+    metadata.append(
+        ["Export Date: ", datetime.datetime.now().strftime("%B %d, %Y")])
+    metadata.append(
+        ["Export Time: ", datetime.datetime.now().strftime("%H:%M GMT")])
+
+
+    return metadata
+
+
+def dict_to_sorted_list(dictionary):
+    sorted_list = [item for item in list(dictionary.items())]
+    sorted_list = sorted(sorted_list, key=cmp_to_key(cmp_samples))
+    sorted_values = [item[1] for item in sorted_list]
+    return sorted_values
+
+
+def cmp_samples(a, b):
+    if b[0] == 'name':
+        return 1
+    elif b[0] == 'value':
+        if a[0] == 'name':
+            return -1
+        else:
+            return 1
+    elif b[0] == 'se':
+        if a[0] == 'name' or a[0] == 'value':
+            return -1
+        else:
+            return 1
+    elif b[0] == 'num_cases':
+        if a[0] == 'name' or a[0] == 'value' or a[0] == 'se':
+            return -1
+        else:
+            return 1
+    else:
+        return -1
diff --git a/gn2/wqflask/show_trait/show_trait.py b/gn2/wqflask/show_trait/show_trait.py
new file mode 100644
index 00000000..64d9be3b
--- /dev/null
+++ b/gn2/wqflask/show_trait/show_trait.py
@@ -0,0 +1,859 @@
+from typing import Dict
+
+import string
+import datetime
+import uuid
+import requests
+import json as json
+
+from collections import OrderedDict
+
+import numpy as np
+import scipy.stats as ss
+
+from gn2.wqflask.database import database_connection
+
+from gn2.base import webqtlConfig
+from gn2.wqflask.show_trait.SampleList import SampleList
+from gn2.base.trait import create_trait
+from gn2.base import data_set
+from gn2.utility import helper_functions
+from gn2.utility.tools import get_setting, locate_ignore_error
+from gn2.utility.tools import GN_PROXY_URL
+from gn2.utility.redis_tools import get_redis_conn, get_resource_id
+
+from gn3.authentication import get_highest_user_access_role
+
+Redis = get_redis_conn()
+ONE_YEAR = 60 * 60 * 24 * 365
+
+
+###############################################
+#
+# Todo: Put in security to ensure that user has permission to access
+# confidential data sets And add i.p.limiting as necessary
+#
+##############################################
+
+
+class ShowTrait:
+    def __init__(self, db_cursor, user_id, kw):
+        if 'trait_id' in kw and kw['dataset'] != "Temp":
+            self.temp_trait = False
+            self.trait_id = kw['trait_id']
+            helper_functions.get_species_dataset_trait(self, kw)
+            self.resource_id = get_resource_id(self.dataset,
+                                           self.trait_id)
+        elif 'group' in kw:
+            self.temp_trait = True
+            self.trait_id = "Temp_" + kw['species'] + "_" + kw['group'] + \
+                "_" + datetime.datetime.now().strftime("%m%d%H%M%S")
+            self.temp_species = kw['species']
+            self.temp_group = kw['group']
+            self.dataset = data_set.create_dataset(
+                dataset_name="Temp", dataset_type="Temp", group_name=self.temp_group)
+
+            # Put values in Redis so they can be looked up later if
+            # added to a collection
+            Redis.set(self.trait_id, kw['trait_paste'], ex=ONE_YEAR)
+            self.trait_vals = kw['trait_paste'].split()
+            self.this_trait = create_trait(dataset=self.dataset,
+                                           name=self.trait_id,
+                                           cellid=None)
+        else:
+            self.temp_trait = True
+            self.trait_id = kw['trait_id']
+            self.temp_species = self.trait_id.split("_")[1]
+            self.temp_group = self.trait_id.split("_")[2]
+            self.dataset = data_set.create_dataset(
+                dataset_name="Temp", dataset_type="Temp", group_name=self.temp_group)
+            self.this_trait = create_trait(dataset=self.dataset,
+                                           name=self.trait_id,
+                                           cellid=None)
+            self.trait_vals = Redis.get(self.trait_id).split()
+
+        # Get verify/rna-seq link URLs
+        try:
+            blatsequence = self.this_trait.blatseq if self.dataset.type == "ProbeSet" else self.this_trait.sequence
+            if not blatsequence:
+                # XZ, 06/03/2009: ProbeSet name is not unique among platforms. We should use ProbeSet Id instead.
+                seqs = ()
+                if self.dataset.type == "ProbeSet":
+                    db_cursor.execute(
+                        "SELECT Probe.Sequence, Probe.Name "
+                        "FROM Probe, ProbeSet, ProbeSetFreeze, "
+                        "ProbeSetXRef WHERE "
+                        "ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id "
+                        "AND ProbeSetXRef.ProbeSetId = ProbeSet.Id AND "
+                        "ProbeSetFreeze.Name = %s AND "
+                        "ProbeSet.Name = %s AND "
+                        "Probe.ProbeSetId = ProbeSet.Id ORDER "
+                        "BY Probe.SerialOrder",
+                        (self.this_trait.dataset.name, self.this_trait.name,)
+                    )
+                else:
+                    db_cursor.execute(
+                        "SELECT Geno.Sequence "
+                        "FROM Geno, GenoXRef, GenoFreeze "
+                        "WHERE Geno.Name = %s AND "
+                        "Geno.Id = GenoXRef.GenoId AND "
+                        "GenoXRef.GenoFreezeId = GenoFreeze.Id AND "
+                        "GenoFreeze.Name = %s",
+                        (self.this_trait.name, self.this_trait.dataset.name)
+                    )
+                seqs = db_cursor.fetchall()
+                if not seqs:
+                    raise ValueError
+                else:
+                    blatsequence = ''
+                    for seqt in seqs:
+                        if int(seqt[1][-1]) % 2 == 1:
+                            blatsequence += string.strip(seqt[0])
+
+            # --------Hongqiang add this part in order to not only blat ProbeSet, but also blat Probe
+            blatsequence = '%3E' + self.this_trait.name + '%0A' + blatsequence + '%0A'
+
+            # XZ, 06/03/2009: ProbeSet name is not unique among platforms. We should use ProbeSet Id instead.
+            seqs = ()
+            db_cursor.execute(
+                "SELECT Probe.Sequence, Probe.Name "
+                "FROM Probe, ProbeSet, ProbeSetFreeze, "
+                "ProbeSetXRef WHERE "
+                "ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id "
+                "AND ProbeSetXRef.ProbeSetId = ProbeSet.Id AND "
+                "ProbeSetFreeze.Name = %s AND ProbeSet.Name = %s "
+                "AND Probe.ProbeSetId = ProbeSet.Id "
+                "ORDER BY Probe.SerialOrder",
+                (self.this_trait.dataset.name, self.this_trait.name,)
+            )
+            seqs = db_cursor.fetchall()
+            for seqt in seqs:
+                if int(seqt[1][-1]) % 2 == 1:
+                    blatsequence += '%3EProbe_' + \
+                        seqt[1].strip() + '%0A' + seqt[0].strip() + '%0A'
+
+            if self.dataset.group.species == "rat":
+                self.UCSC_BLAT_URL = webqtlConfig.UCSC_BLAT % (
+                    'rat', 'rn7', blatsequence)
+                self.UTHSC_BLAT_URL = ""
+            elif self.dataset.group.species == "mouse":
+                self.UCSC_BLAT_URL = webqtlConfig.UCSC_BLAT % (
+                    'mouse', 'mm10', blatsequence)
+                self.UTHSC_BLAT_URL = webqtlConfig.UTHSC_BLAT % (
+                    'mouse', 'mm10', blatsequence)
+            elif self.dataset.group.species == "human":
+                self.UCSC_BLAT_URL = webqtlConfig.UCSC_BLAT % (
+                    'human', 'hg38', blatsequence)
+                self.UTHSC_BLAT_URL = ""
+            else:
+                self.UCSC_BLAT_URL = ""
+                self.UTHSC_BLAT_URL = ""
+        except:
+            self.UCSC_BLAT_URL = ""
+            self.UTHSC_BLAT_URL = ""
+
+        if self.dataset.type == "ProbeSet":
+            self.show_probes = "True"
+
+        trait_units = get_trait_units(self.this_trait)
+        self.get_external_links()
+        self.build_correlation_tools()
+
+        self.ncbi_summary = get_ncbi_summary(self.this_trait)
+
+        # Get nearest marker for composite mapping
+        if not self.temp_trait:
+            if check_if_attr_exists(self.this_trait, 'locus_chr') and self.dataset.type != "Geno" and self.dataset.type != "Publish":
+                self.nearest_marker = get_nearest_marker(
+                    self.this_trait, self.dataset)
+            else:
+                self.nearest_marker = ""
+
+        self.make_sample_lists()
+
+        trait_vals_by_group = []
+        for sample_type in self.sample_groups:
+            trait_vals_by_group.append(get_trait_vals(sample_type.sample_list))
+
+        self.max_digits_by_group, self.no_decimal_place = get_max_digits(trait_vals_by_group)
+
+        self.qnorm_vals = quantile_normalize_vals(self.sample_groups, trait_vals_by_group)
+        self.z_scores = get_z_scores(self.sample_groups, trait_vals_by_group)
+
+        self.temp_uuid = uuid.uuid4()
+
+        self.sample_group_types = OrderedDict()
+        if len(self.sample_groups) > 1:
+            self.sample_group_types['samples_primary'] = self.dataset.group.name
+            self.sample_group_types['samples_other'] = "Other"
+            self.sample_group_types['samples_all'] = "All"
+        else:
+            self.sample_group_types['samples_primary'] = self.dataset.group.name
+        sample_lists = [group.sample_list for group in self.sample_groups]
+
+        self.categorical_var_list = []
+        self.numerical_var_list = []
+        if not self.temp_trait:
+            # ZS: Only using first samplelist, since I think mapping only uses those samples
+            self.categorical_var_list = get_categorical_variables(
+                self.this_trait, self.sample_groups[0])
+            self.numerical_var_list = get_numerical_variables(
+                self.this_trait, self.sample_groups[0])
+
+        # ZS: Get list of chromosomes to select for mapping
+        self.chr_list = [["All", -1]]
+        for i, this_chr in enumerate(self.dataset.species.chromosomes.chromosomes(db_cursor)):
+            self.chr_list.append(
+                [self.dataset.species.chromosomes.chromosomes(db_cursor)[this_chr].name, i])
+
+        self.genofiles = self.dataset.group.get_genofiles()
+        study_samplelist_json = self.dataset.group.get_study_samplelists()
+        self.study_samplelists = [study["title"] for study in study_samplelist_json]
+
+        # ZS: No need to grab scales from .geno file unless it's using
+        # a mapping method that reads .geno files
+        if "QTLReaper" or "R/qtl" in dataset.group.mapping_names:
+            if self.genofiles:
+                self.scales_in_geno = get_genotype_scales(self.genofiles)
+            else:
+                self.scales_in_geno = get_genotype_scales(
+                    self.dataset.group.name + ".geno")
+        else:
+            self.scales_in_geno = {}
+
+        self.has_num_cases = has_num_cases(self.this_trait)
+
+        # ZS: Needed to know whether to display bar chart + get max
+        # sample name length in order to set table column width
+        self.num_values = 0
+        # ZS: So it knows whether to display the Binary R/qtl mapping
+        # method, which doesn't work unless all values are 0 or 1
+        self.binary = "true"
+        # ZS: Since we don't want to show log2 transform option for
+        # situations where it doesn't make sense
+        self.negative_vals_exist = "false"
+        max_samplename_width = 1
+        for group in self.sample_groups:
+            for sample in group.sample_list:
+                if len(sample.name) > max_samplename_width:
+                    max_samplename_width = len(sample.name)
+                if sample.display_value != "x":
+                    self.num_values += 1
+                    if sample.display_value != 0 or sample.display_value != 1:
+                        self.binary = "false"
+                    if sample.value < 0:
+                        self.negative_vals_exist = "true"
+
+        # ZS: Check whether any attributes have few enough distinct
+        # values to show the "Block samples by group" option
+        self.categorical_attr_exists = "false"
+        for attribute in self.sample_groups[0].attributes:
+            if len(self.sample_groups[0].attributes[attribute].distinct_values) <= 10:
+                self.categorical_attr_exists = "true"
+                break
+
+        sample_column_width = max_samplename_width * 8
+
+        self.stats_table_width, self.trait_table_width = get_table_widths(
+            self.sample_groups, sample_column_width, self.has_num_cases)
+
+        if self.num_values >= 5000:
+            self.maf = 0.01
+        else:
+            self.maf = 0.05
+
+        trait_symbol = None
+        short_description = None
+        if not self.temp_trait:
+            if self.this_trait.symbol:
+                trait_symbol = self.this_trait.symbol
+                short_description = trait_symbol
+
+            elif hasattr(self.this_trait, 'post_publication_abbreviation'):
+                short_description = self.this_trait.post_publication_abbreviation
+
+            elif hasattr(self.this_trait, 'pre_publication_abbreviation'):
+                short_description = self.this_trait.pre_publication_abbreviation
+
+        # Todo: Add back in the ones we actually need from below, as we discover we need them
+        hddn = OrderedDict()
+
+        if self.dataset.group.allsamples:
+            hddn['allsamples'] = ','.join(self.dataset.group.allsamples)
+        hddn['primary_samples'] = ','.join(self.primary_sample_names)
+        hddn['trait_id'] = self.trait_id
+        hddn['trait_display_name'] = self.this_trait.display_name
+        hddn['dataset'] = self.dataset.name
+        hddn['temp_trait'] = False
+        if self.temp_trait:
+            hddn['temp_trait'] = True
+            hddn['group'] = self.temp_group
+            hddn['species'] = self.temp_species
+        else:
+            hddn['group'] = self.dataset.group.name
+            hddn['species'] = self.dataset.group.species
+        hddn['use_outliers'] = False
+        hddn['method'] = "gemma"
+        hddn['mapmethod_rqtl'] = "hk"
+        hddn['mapmodel_rqtl'] = "normal"
+        hddn['pair_scan'] = ""
+        hddn['selected_chr'] = -1
+        hddn['mapping_display_all'] = True
+        hddn['suggestive'] = 0
+        hddn['study_samplelists'] = json.dumps(study_samplelist_json)
+        hddn['num_perm'] = 0
+        hddn['categorical_vars'] = ""
+        if self.categorical_var_list:
+            hddn['categorical_vars'] = ",".join(self.categorical_var_list)
+        hddn['manhattan_plot'] = ""
+        hddn['control_marker'] = ""
+        if not self.temp_trait:
+            if hasattr(self.this_trait, 'locus_chr') and self.this_trait.locus_chr != "" and self.dataset.type != "Geno" and self.dataset.type != "Publish":
+                hddn['control_marker'] = self.nearest_marker
+        hddn['do_control'] = False
+        hddn['maf'] = 0.05
+        hddn['mapping_scale'] = "physic"
+        hddn['compare_traits'] = []
+        hddn['export_data'] = ""
+        hddn['export_format'] = "excel"
+        if len(self.scales_in_geno) < 2 and bool(self.scales_in_geno):
+            hddn['mapping_scale'] = self.scales_in_geno[list(
+                self.scales_in_geno.keys())[0]][0][0]
+
+        # We'll need access to this_trait and hddn in the Jinja2
+        # Template, so we put it inside self
+        self.hddn = hddn
+
+        js_data = dict(trait_id=self.trait_id,
+                       trait_symbol=trait_symbol,
+                       max_digits = self.max_digits_by_group,
+                       no_decimal_place = self.no_decimal_place,
+                       short_description=short_description,
+                       unit_type=trait_units,
+                       dataset_type=self.dataset.type,
+                       species=self.dataset.group.species,
+                       scales_in_geno=self.scales_in_geno,
+                       data_scale=self.dataset.data_scale,
+                       sample_group_types=self.sample_group_types,
+                       sample_lists=sample_lists,
+                       se_exists=self.sample_groups[0].se_exists,
+                       has_num_cases=self.has_num_cases,
+                       attributes=self.sample_groups[0].attributes,
+                       categorical_attr_exists=self.categorical_attr_exists,
+                       categorical_vars=",".join(self.categorical_var_list),
+                       num_values=self.num_values,
+                       qnorm_values=self.qnorm_vals,
+                       zscore_values=self.z_scores,
+                       sample_column_width=sample_column_width,
+                       temp_uuid=self.temp_uuid)
+        self.js_data = js_data
+
+    def get_external_links(self):
+        # ZS: There's some weirdness here because some fields don't
+        # exist while others are empty strings
+        self.pubmed_link = webqtlConfig.PUBMEDLINK_URL % self.this_trait.pubmed_id if check_if_attr_exists(
+            self.this_trait, 'pubmed_id') else None
+        self.ncbi_gene_link = webqtlConfig.NCBI_LOCUSID % self.this_trait.geneid if check_if_attr_exists(
+            self.this_trait, 'geneid') else None
+        self.omim_link = webqtlConfig.OMIM_ID % self.this_trait.omim if check_if_attr_exists(
+            self.this_trait, 'omim') else None
+        self.homologene_link = webqtlConfig.HOMOLOGENE_ID % self.this_trait.homologeneid if check_if_attr_exists(
+            self.this_trait, 'homologeneid') else None
+
+        self.genbank_link = None
+        if check_if_attr_exists(self.this_trait, 'genbankid'):
+            genbank_id = '|'.join(self.this_trait.genbankid.split('|')[0:10])
+            if genbank_id[-1] == '|':
+                genbank_id = genbank_id[0:-1]
+            self.genbank_link = webqtlConfig.GENBANK_ID % genbank_id
+
+        self.uniprot_link = None
+        if check_if_attr_exists(self.this_trait, 'uniprotid'):
+            self.uniprot_link = webqtlConfig.UNIPROT_URL % self.this_trait.uniprotid
+
+        self.genotation_link = self.rgd_link = self.phenogen_link = self.gtex_link = self.genebridge_link = self.ucsc_blat_link = self.biogps_link = self.protein_atlas_link = None
+        self.string_link = self.panther_link = self.aba_link = self.ebi_gwas_link = self.wiki_pi_link = self.genemania_link = self.ensembl_link = None
+        if self.this_trait.symbol:
+            self.genotation_link = webqtlConfig.GENOTATION_URL % self.this_trait.symbol
+            self.gtex_link = webqtlConfig.GTEX_URL % self.this_trait.symbol
+            self.string_link = webqtlConfig.STRING_URL % self.this_trait.symbol
+            self.panther_link = webqtlConfig.PANTHER_URL % self.this_trait.symbol
+            self.ebi_gwas_link = webqtlConfig.EBIGWAS_URL % self.this_trait.symbol
+            self.protein_atlas_link = webqtlConfig.PROTEIN_ATLAS_URL % self.this_trait.symbol
+
+            if self.dataset.group.species == "mouse" or self.dataset.group.species == "human":
+                self.rgd_link = webqtlConfig.RGD_URL % (
+                    self.this_trait.symbol, self.dataset.group.species.capitalize())
+                if self.dataset.group.species == "mouse":
+                    self.genemania_link = webqtlConfig.GENEMANIA_URL % (
+                        "mus-musculus", self.this_trait.symbol)
+                else:
+                    self.genemania_link = webqtlConfig.GENEMANIA_URL % (
+                        "homo-sapiens", self.this_trait.symbol)
+
+                if self.dataset.group.species == "mouse":
+                    self.aba_link = webqtlConfig.ABA_URL % self.this_trait.symbol
+                    results = ()
+                    with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor:
+                        cursor.execute(
+                            "SELECT chromosome, txStart, txEnd FROM "
+                            "GeneList WHERE geneSymbol = %s",
+                            (self.this_trait.symbol,)
+                        )
+                        results = cursor.fetchone()
+                    if results:
+                        chr, transcript_start, transcript_end = results
+                    else:
+                        chr = transcript_start = transcript_end = None
+
+                    if chr and transcript_start and transcript_end and self.this_trait.refseq_transcriptid:
+                        transcript_start = int(transcript_start * 1000000)
+                        transcript_end = int(transcript_end * 1000000)
+                        self.ucsc_blat_link = webqtlConfig.UCSC_REFSEQ % (
+                            'mm10', self.this_trait.refseq_transcriptid, chr, transcript_start, transcript_end)
+
+            if self.dataset.group.species == "rat":
+                self.rgd_link = webqtlConfig.RGD_URL % (
+                    self.this_trait.symbol, self.dataset.group.species.capitalize())
+                self.phenogen_link = webqtlConfig.PHENOGEN_URL % (
+                    self.this_trait.symbol)
+                self.genemania_link = webqtlConfig.GENEMANIA_URL % (
+                    "rattus-norvegicus", self.this_trait.symbol)
+
+                results = ()
+                with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor:
+                    cursor.execute(
+                        "SELECT kgID, chromosome, txStart, txEnd "
+                        "FROM GeneList_rn33 WHERE geneSymbol = %s",
+                        (self.this_trait.symbol,)
+                    )
+                    if results := cursor.fetchone():
+                        kgId, chr, transcript_start, transcript_end = results
+                    else:
+                        kgId = chr = transcript_start = transcript_end = None
+
+                if chr and transcript_start and transcript_end and kgId:
+                    # Convert to bases from megabases
+                    transcript_start = int(transcript_start * 1000000)
+                    transcript_end = int(transcript_end * 1000000)
+                    self.ucsc_blat_link = webqtlConfig.UCSC_REFSEQ % (
+                        'rn7', kgId, chr, transcript_start, transcript_end)
+
+            if self.this_trait.geneid and (self.dataset.group.species == "mouse" or self.dataset.group.species == "rat" or self.dataset.group.species == "human"):
+                self.biogps_link = webqtlConfig.BIOGPS_URL % (
+                    self.dataset.group.species, self.this_trait.geneid)
+                self.gemma_link = webqtlConfig.GEMMA_URL % self.this_trait.geneid
+
+                if self.dataset.group.species == "human":
+                    self.aba_link = webqtlConfig.ABA_URL % self.this_trait.geneid
+
+    def build_correlation_tools(self):
+        if self.temp_trait == True:
+            this_group = self.temp_group
+        else:
+            this_group = self.dataset.group.name
+
+        # We're checking a string here!
+        assert isinstance(this_group, str), "We need a string type thing here"
+        if this_group[:3] == 'BXD' and this_group not in webqtlConfig.BXD_GROUP_EXCEPTIONS:
+            this_group = 'BXD'
+
+        if this_group:
+            if self.temp_trait == True:
+                dataset_menu = data_set.datasets(this_group)
+            else:
+                dataset_menu = data_set.datasets(
+                    this_group, self.dataset.group)
+            dataset_menu_selected = None
+            if len(dataset_menu):
+                if self.dataset:
+                    dataset_menu_selected = self.dataset.name
+
+            return_results_menu = (100, 200, 500, 1000,
+                                   2000, 5000, 10000, 15000, 20000)
+            return_results_menu_selected = 500
+
+            self.corr_tools = dict(dataset_menu=dataset_menu,
+                                   dataset_menu_selected=dataset_menu_selected,
+                                   return_results_menu=return_results_menu,
+                                   return_results_menu_selected=return_results_menu_selected,)
+
+    def make_sample_lists(self):
+
+        all_samples_ordered = self.dataset.group.all_samples_ordered()
+
+        parent_f1_samples = []
+        if self.dataset.group.parlist and self.dataset.group.f1list:
+            parent_f1_samples = self.dataset.group.parlist + self.dataset.group.f1list
+
+        primary_sample_names = list(all_samples_ordered)
+
+        if not self.temp_trait:
+            other_sample_names = []
+
+            for sample in list(self.this_trait.data.keys()):
+                if (self.this_trait.data[sample].name2 != self.this_trait.data[sample].name):
+                    if ((self.this_trait.data[sample].name2 in primary_sample_names)
+                            and (self.this_trait.data[sample].name not in primary_sample_names)):
+                        primary_sample_names.append(
+                            self.this_trait.data[sample].name)
+                        primary_sample_names.remove(
+                            self.this_trait.data[sample].name2)
+
+                all_samples_set = set(all_samples_ordered)
+                if sample not in all_samples_set:
+                    all_samples_ordered.append(sample)
+                    other_sample_names.append(sample)
+
+            # CFW is here because the .geno file doesn't properly
+            # contain its full list of samples. This should probably
+            # be fixed.
+            if self.dataset.group.species == "human" or (set(primary_sample_names) == set(parent_f1_samples)) or self.dataset.group.name == "CFW":
+                primary_sample_names += other_sample_names
+                other_sample_names = []
+
+            if other_sample_names:
+                primary_header = "%s Only" % (self.dataset.group.name)
+            else:
+                primary_header = "Samples"
+
+            primary_samples = SampleList(dataset=self.dataset,
+                                         sample_names=primary_sample_names,
+                                         this_trait=self.this_trait,
+                                         sample_group_type='primary',
+                                         header=primary_header)
+
+            # if other_sample_names and self.dataset.group.species !=
+            # "human" and self.dataset.group.name != "CFW":
+            if len(other_sample_names) > 0:
+                other_sample_names.sort()  # Sort other samples
+                other_samples = SampleList(dataset=self.dataset,
+                                           sample_names=other_sample_names,
+                                           this_trait=self.this_trait,
+                                           sample_group_type='other',
+                                           header="Other")
+
+                self.sample_groups = (primary_samples, other_samples)
+            else:
+                self.sample_groups = (primary_samples,)
+        else:
+            primary_samples = SampleList(dataset=self.dataset,
+                                         sample_names=primary_sample_names,
+                                         this_trait=self.trait_vals,
+                                         sample_group_type='primary',
+                                         header="%s Only" % (self.dataset.group.name))
+            self.sample_groups = (primary_samples,)
+        self.primary_sample_names = primary_sample_names
+        self.dataset.group.allsamples = all_samples_ordered
+
+
+def get_trait_vals(sample_list):
+    trait_vals = []
+    for sample in sample_list:
+        try:
+            trait_vals.append(float(sample.value))
+        except:
+            continue
+
+    return trait_vals
+
+def get_max_digits(trait_vals):
+    def __max_digits__(these_vals):
+        if not bool(these_vals):
+            return None
+        return len(str(max(these_vals))) - 1
+
+    return [__max_digits__(val) for val in trait_vals], all([val.is_integer() for val in sum(trait_vals, [])])
+
+def normf(trait_vals):
+    ranked_vals = ss.rankdata(trait_vals)
+    p_list = []
+    for i, val in enumerate(trait_vals):
+        p_list.append(((i + 1) - 0.5) / len(trait_vals))
+
+    z = ss.norm.ppf(p_list)
+
+    normed_vals = []
+    for rank in ranked_vals:
+        normed_vals.append("%0.3f" % z[int(rank) - 1])
+
+    return normed_vals
+
+def quantile_normalize_vals(sample_groups, trait_vals):
+    qnorm_by_group = []
+    for i, sample_type in enumerate(sample_groups):
+        qnorm_vals = normf(trait_vals[i])
+        qnorm_vals_with_x = []
+        counter = 0
+        for sample in sample_type.sample_list:
+            if sample.display_value == "x":
+                qnorm_vals_with_x.append("x")
+            else:
+                qnorm_vals_with_x.append(qnorm_vals[counter])
+                counter += 1
+
+        qnorm_by_group.append(qnorm_vals_with_x)
+
+    return qnorm_by_group
+
+
+def get_z_scores(sample_groups, trait_vals):
+    zscore_by_group = []
+    for i, sample_type in enumerate(sample_groups):
+        zscores = ss.mstats.zscore(np.array(trait_vals[i])).tolist()
+        zscores_with_x = []
+        counter = 0
+        for sample in sample_type.sample_list:
+            if sample.display_value == "x":
+                zscores_with_x.append("x")
+            else:
+                zscores_with_x.append("%0.3f" % zscores[counter])
+                counter += 1
+
+        zscore_by_group.append(zscores_with_x)
+
+    return zscore_by_group
+
+
+def get_nearest_marker(this_trait, this_db):
+    this_chr = this_trait.locus_chr
+    this_mb = this_trait.locus_mb
+    # One option is to take flanking markers, another is to take the
+    # two (or one) closest
+    with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor:
+        cursor.execute(
+            "SELECT Geno.Name FROM Geno, GenoXRef, "
+            "GenoFreeze WHERE Geno.Chr = %s AND "
+            "GenoXRef.GenoId = Geno.Id AND "
+            "GenoFreeze.Id = GenoXRef.GenoFreezeId "
+            "AND GenoFreeze.Name = %s ORDER BY "
+            "ABS( Geno.Mb - %s) LIMIT 1",
+            (this_chr, f"{this_db.group.name}Geno", this_mb,)
+        )
+        if result := cursor.fetchall():
+            return result[0][0]
+        return ""
+
+
+def get_table_widths(sample_groups, sample_column_width, has_num_cases=False):
+    stats_table_width = 250
+    if len(sample_groups) > 1:
+        stats_table_width = 450
+
+    trait_table_width = 300 + sample_column_width
+    if sample_groups[0].se_exists:
+        trait_table_width += 80
+    if has_num_cases:
+        trait_table_width += 80
+    trait_table_width += len(sample_groups[0].attributes) * 88
+
+    return stats_table_width, trait_table_width
+
+
+def has_num_cases(this_trait):
+    has_n = False
+    if this_trait.dataset.type != "ProbeSet" and this_trait.dataset.type != "Geno":
+        for name, sample in list(this_trait.data.items()):
+            if sample.num_cases and sample.num_cases != "1":
+                has_n = True
+                break
+
+    return has_n
+
+
+def get_trait_units(this_trait):
+    unit_type = ""
+    inside_brackets = False
+    if this_trait.description_fmt:
+        if ("[" in this_trait.description_fmt) and ("]" in this_trait.description_fmt):
+            for i in this_trait.description_fmt:
+                if inside_brackets:
+                    if i != "]":
+                        unit_type += i
+                    else:
+                        inside_brackets = False
+                if i == "[":
+                    inside_brackets = True
+
+    if unit_type == "":
+        unit_type = "value"
+
+    return unit_type
+
+
+def check_if_attr_exists(the_trait, id_type):
+    if hasattr(the_trait, id_type):
+        if getattr(the_trait, id_type) == None or getattr(the_trait, id_type) == "":
+            return False
+        else:
+            return True
+    else:
+        return False
+
+
+def get_ncbi_summary(this_trait):
+    if check_if_attr_exists(this_trait, 'geneid'):
+        # ZS: Need to switch this try/except to something that checks
+        # the output later
+        try:
+            response = requests.get(
+                "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=gene&id=%s&retmode=json" % this_trait.geneid)
+            summary = json.loads(response.content)[
+                'result'][this_trait.geneid]['summary']
+            return summary
+        except:
+            return None
+    else:
+        return None
+
+
+def get_categorical_variables(this_trait, sample_list) -> list:
+    categorical_var_list = []
+
+    if len(sample_list.attributes) > 0:
+        for attribute in sample_list.attributes:
+            if len(sample_list.attributes[attribute].distinct_values) < 10:
+                categorical_var_list.append(str(sample_list.attributes[attribute].id))
+
+    return categorical_var_list
+
+def get_numerical_variables(this_trait, sample_list) -> list:
+    numerical_var_list = []
+
+    if len(sample_list.attributes) > 0:
+        for attribute in sample_list.attributes:
+            all_numeric = True
+            all_none = True
+            for attr_val in sample_list.attributes[attribute].distinct_values:
+                if not attr_val:
+                    continue
+                try:
+                    val_as_float = float(attr_val)
+                    all_none = False
+                except:
+                    all_numeric = False
+                    break
+            if all_numeric and not all_none:
+                numerical_var_list.append(sample_list.attributes[attribute].name)
+
+    return numerical_var_list
+
+def get_genotype_scales(genofiles):
+    geno_scales = {}
+    if isinstance(genofiles, list):
+        for the_file in genofiles:
+            file_location = the_file['location']
+            geno_scales[file_location] = get_scales_from_genofile(
+                file_location)
+    else:
+        geno_scales[genofiles] = get_scales_from_genofile(genofiles)
+
+    return geno_scales
+
+
+def get_scales_from_genofile(file_location):
+    geno_path = locate_ignore_error(file_location, 'genotype')
+    # ZS: This is just to allow the code to run when
+    if not geno_path:
+        return [["physic", "Mb"]]
+    cm_and_mb_cols_exist = True
+    cm_column = None
+    mb_column = None
+    with open(geno_path, "r") as geno_fh:
+        for i, line in enumerate(geno_fh):
+            if line[0] == "#" or line[0] == "@":
+                # ZS: If the scale is made explicit in the metadata,
+                # use that
+                if "@scale" in line:
+                    scale = line.split(":")[1].strip()
+                    if scale == "morgan":
+                        return [["morgan", "cM"]]
+                    else:
+                        return [["physic", "Mb"]]
+                else:
+                    continue
+            if line[:3] == "Chr":
+                first_marker_line = i + 1
+                if line.split("\t")[2].strip() == "cM":
+                    cm_column = 2
+                elif line.split("\t")[3].strip() == "cM":
+                    cm_column = 3
+                if line.split("\t")[2].strip() == "Mb":
+                    mb_column = 2
+                elif line.split("\t")[3].strip() == "Mb":
+                    mb_column = 3
+                break
+
+        # ZS: This attempts to check whether the cM and Mb columns are
+        # 'real', since some .geno files have one column be a copy of
+        # the other column, or have one column that is all 0s
+        cm_all_zero = True
+        mb_all_zero = True
+        cm_mb_all_equal = True
+        for i, line in enumerate(geno_fh):
+            # ZS: I'm assuming there won't be more than 10 markers
+            # where the position is listed as 0
+            if first_marker_line <= i < first_marker_line + 10:
+                if cm_column:
+                    cm_val = line.split("\t")[cm_column].strip()
+                    if cm_val != "0":
+                        cm_all_zero = False
+                if mb_column:
+                    mb_val = line.split("\t")[mb_column].strip()
+                    if mb_val != "0":
+                        mb_all_zero = False
+                if cm_column and mb_column:
+                    if cm_val != mb_val:
+                        cm_mb_all_equal = False
+            else:
+                if i > first_marker_line + 10:
+                    break
+
+    # ZS: This assumes that both won't be all zero, since if that's
+    # the case mapping shouldn't be an option to begin with
+    if mb_all_zero:
+        return [["morgan", "cM"]]
+    elif cm_mb_all_equal:
+        return [["physic", "Mb"]]
+    elif cm_and_mb_cols_exist:
+        return [["physic", "Mb"], ["morgan", "cM"]]
+    else:
+        return [["physic", "Mb"]]
+
+
+
+def get_diff_of_vals(new_vals: Dict, trait_id: str) -> Dict:
+    """ Get the diff between current sample values and the values in the DB
+
+    Given a dict of the changed values and the trait/dataset ID, return a Dict
+    with keys corresponding to each sample with a changed value and a value
+    that is a dict with keys for the old_value and new_value
+
+    """
+
+    trait_name = trait_id.split(":")[0]
+    dataset_name = trait_id.split(":")[1]
+    trait_ob = create_trait(name=trait_name, dataset_name=dataset_name)
+
+    old_vals = {sample : trait_ob.data[sample].value for sample in trait_ob.data}
+
+    shared_samples = set.union(set(new_vals.keys()), set(old_vals.keys()))
+
+    diff_dict = {}
+    for sample in shared_samples:
+        try:
+            new_val = round(float(new_vals[sample]), 3)
+        except:
+            new_val = "x"
+        try:
+            old_val = round(float(old_vals[sample]), 3)
+        except:
+            old_val = "x"
+
+        if new_val != old_val:
+            diff_dict[sample] = {
+                "new_val": new_val,
+                "old_val": old_val
+            }
+
+    return diff_dict
author	Arun Isaac	2023-12-29 18:55:37 +0000
committer	Arun Isaac	2023-12-29 19:01:46 +0000
commit	204a308be0f741726b9a620d88fbc22b22124c81 (patch)
tree	b3cf66906674020b530c844c2bb4982c8a0e2d39 /gn2/wqflask/show_trait
parent	83062c75442160427b50420161bfcae2c5c34c84 (diff)
download	genenetwork2-204a308be0f741726b9a620d88fbc22b22124c81.tar.gz