Diffstat (limited to 'gn2/wqflask/show_trait')
-rw-r--r--  gn2/wqflask/show_trait/SampleList.py         | 223
-rw-r--r--  gn2/wqflask/show_trait/__init__.py           |   0
-rw-r--r--  gn2/wqflask/show_trait/export_trait_data.py  | 113
-rw-r--r--  gn2/wqflask/show_trait/show_trait.py         | 859
4 files changed, 1195 insertions, 0 deletions
diff --git a/gn2/wqflask/show_trait/SampleList.py b/gn2/wqflask/show_trait/SampleList.py new file mode 100644 index 00000000..64fc8fe6 --- /dev/null +++ b/gn2/wqflask/show_trait/SampleList.py @@ -0,0 +1,223 @@ +import re +import itertools + +from gn2.wqflask.database import database_connection +from gn2.base import webqtlCaseData, webqtlConfig +from pprint import pformat as pf + +from gn2.utility import Plot +from gn2.utility import Bunch +from gn2.utility.tools import get_setting + +class SampleList: + def __init__(self, + dataset, + sample_names, + this_trait, + sample_group_type="primary", + header="Samples"): + + self.dataset = dataset + self.this_trait = this_trait + self.sample_group_type = sample_group_type # primary or other + self.header = header + + self.sample_list = [] # The actual list + self.sample_attribute_values = {} + + self.get_attributes() + + if self.this_trait and self.dataset: + self.get_extra_attribute_values() + + for counter, sample_name in enumerate(sample_names, 1): + sample_name = sample_name.replace("_2nd_", "") + + # self.this_trait will be a list if it is a Temp trait + if isinstance(self.this_trait, list): + sample = webqtlCaseData.webqtlCaseData(name=sample_name) + if counter <= len(self.this_trait): + if isinstance(self.this_trait[counter - 1], (bytes, bytearray)): + if (self.this_trait[counter - 1].decode("utf-8").lower() != 'x'): + sample = webqtlCaseData.webqtlCaseData( + name=sample_name, + value=float(self.this_trait[counter - 1])) + else: + if (self.this_trait[counter - 1].lower() != 'x'): + sample = webqtlCaseData.webqtlCaseData( + name=sample_name, + value=float(self.this_trait[counter - 1])) + else: + # If there's no value for the sample/strain, + # create the sample object (so samples with no value + # are still displayed in the table) + try: + sample = self.this_trait.data[sample_name] + except KeyError: + sample = webqtlCaseData.webqtlCaseData(name=sample_name) + + sample.extra_info = {} + if (self.dataset.group.name == 'AXBXA' + and sample_name in ('AXB18/19/20', 'AXB13/14', 'BXA8/17')): + sample.extra_info['url'] = "/mouseCross.html#AXB/BXA" + sample.extra_info['css_class'] = "fs12" + + sample.this_id = str(counter) + + # For extra attribute columns; currently only used by + # several datasets + if self.sample_attribute_values: + sample.extra_attributes = self.sample_attribute_values.get( + sample_name, {}) + + # Add a url so RRID case attributes can be displayed as links + if '36' in sample.extra_attributes: + rrid_string = str(sample.extra_attributes['36']) + if self.dataset.group.species == "mouse": + if len(rrid_string.split(":")) > 1: + the_rrid = rrid_string.split(":")[1] + sample.extra_attributes['36'] = [ + rrid_string] + sample.extra_attributes['36'].append( + webqtlConfig.RRID_MOUSE_URL % the_rrid) + elif self.dataset.group.species == "rat": + if len(rrid_string): + the_rrid = rrid_string.split("_")[1] + sample.extra_attributes['36'] = [ + rrid_string] + sample.extra_attributes['36'].append( + webqtlConfig.RRID_RAT_URL % the_rrid) + + self.sample_list.append(sample) + + self.se_exists = any(sample.variance for sample in self.sample_list) + self.num_cases_exists = False + if (any(sample.num_cases for sample in self.sample_list) and + any((sample.num_cases and sample.num_cases != "1") for sample in self.sample_list)): + self.num_cases_exists = True + + first_attr_col = self.get_first_attr_col() + for sample in self.sample_list: + sample.first_attr_col = first_attr_col + + self.do_outliers() + + def __repr__(self): + return 
"<SampleList> --> %s" % (pf(self.__dict__)) + + def do_outliers(self): + values = [sample.value for sample in self.sample_list + if sample.value is not None] + upper_bound, lower_bound = Plot.find_outliers(values) + + for sample in self.sample_list: + if sample.value: + if upper_bound and sample.value > upper_bound: + sample.outlier = True + elif lower_bound and sample.value < lower_bound: + sample.outlier = True + else: + sample.outlier = False + + def get_attributes(self): + """Finds which extra attributes apply to this dataset""" + + # Get attribute names and distinct values for each attribute + with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor: + cursor.execute( + "SELECT DISTINCT CaseAttribute.CaseAttributeId, " + "CaseAttribute.Name, CaseAttribute.Description, " + "CaseAttributeXRefNew.Value FROM " + "CaseAttribute, CaseAttributeXRefNew WHERE " + "CaseAttributeXRefNew.CaseAttributeId = CaseAttribute.CaseAttributeId " + "AND CaseAttributeXRefNew.InbredSetId = %s " + "ORDER BY CaseAttribute.CaseAttributeId", (str(self.dataset.group.id),) + ) + + self.attributes = {} + for attr, values in itertools.groupby( + cursor.fetchall(), lambda row: (row[0], row[1], row[2]) + ): + key, name, description = attr + self.attributes[key] = Bunch() + self.attributes[key].id = key + self.attributes[key].name = name + self.attributes[key].description = description + self.attributes[key].distinct_values = [ + item[3] for item in values] + self.attributes[key].distinct_values = natural_sort( + self.attributes[key].distinct_values) + all_numbers = True + for value in self.attributes[key].distinct_values: + try: + val_as_float = float(value) + except: + all_numbers = False + break + + if all_numbers: + self.attributes[key].alignment = "right" + else: + self.attributes[key].alignment = "left" + + def get_extra_attribute_values(self): + if self.attributes: + with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor: + cursor.execute( + "SELECT Strain.Name AS SampleName, " + "CaseAttributeId AS Id, " + "CaseAttributeXRefNew.Value FROM Strain, " + "StrainXRef, InbredSet, CaseAttributeXRefNew " + "WHERE StrainXRef.StrainId = Strain.Id " + "AND InbredSet.Id = StrainXRef.InbredSetId " + "AND CaseAttributeXRefNew.StrainId = Strain.Id " + "AND InbredSet.Id = CaseAttributeXRefNew.InbredSetId " + "AND CaseAttributeXRefNew.InbredSetId = %s " + "ORDER BY SampleName", (self.dataset.group.id,) + ) + + for sample_name, items in itertools.groupby( + cursor.fetchall(), lambda row: row[0] + ): + attribute_values = {} + # Make a list of attr IDs without values (that have values for other samples) + valueless_attr_ids = [self.attributes[key].id for key in self.attributes.keys()] + for item in items: + sample_name, _id, value = item + valueless_attr_ids.remove(_id) + attribute_value = value + + # If it's an int, turn it into one for sorting + # (for example, 101 would be lower than 80 if + # they're strings instead of ints) + try: + attribute_value = int(attribute_value) + except ValueError: + pass + + attribute_values[str(_id)] = attribute_value + for attr_id in valueless_attr_ids: + attribute_values[str(attr_id)] = "" + + self.sample_attribute_values[sample_name] = attribute_values + + def get_first_attr_col(self): + first_attr_col = 4 + if self.se_exists: + first_attr_col += 2 + if self.num_cases_exists: + first_attr_col += 1 + + return first_attr_col + + +def natural_sort(a_list, key=lambda s: s): + """ + Sort the list into natural alphanumeric order. 
+ """ + def get_alphanum_key_func(key): + def convert(text): return int(text) if text.isdigit() else text + return lambda s: [convert(c) for c in re.split('([0-9]+)', key(s))] + sort_key = get_alphanum_key_func(key) + sorted_list = sorted(a_list, key=sort_key) + return sorted_list diff --git a/gn2/wqflask/show_trait/__init__.py b/gn2/wqflask/show_trait/__init__.py new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/gn2/wqflask/show_trait/__init__.py diff --git a/gn2/wqflask/show_trait/export_trait_data.py b/gn2/wqflask/show_trait/export_trait_data.py new file mode 100644 index 00000000..06c1c502 --- /dev/null +++ b/gn2/wqflask/show_trait/export_trait_data.py @@ -0,0 +1,113 @@ +import datetime +import simplejson as json + +from pprint import pformat as pf +from functools import cmp_to_key +from gn2.base.trait import create_trait +from gn2.base import data_set + + +def export_sample_table(targs): + + sample_data = json.loads(targs['export_data']) + trait_name = targs['trait_display_name'] + + meta_data = get_export_metadata(targs) + + final_sample_data = meta_data + + column_headers = ["Index", "Name", "Value"] + attr_pos = 2 + if any(sample["se"] for sample in sample_data['primary_samples']): + column_headers.append("SE") + attr_pos = 3 + if any(sample["num_cases"] for sample in sample_data['primary_samples']): + column_headers.append("N") + attr_pos = 4 + + for key in sample_data["primary_samples"][0].keys(): + if key not in ["name", "value", "se", "num_cases"]: + column_headers.append(key) + + final_sample_data.append(column_headers) + for sample_group in ['primary_samples', 'other_samples']: + for i, row in enumerate(sample_data[sample_group]): + sorted_row = [i + 1] + dict_to_sorted_list(row)[:attr_pos] + for attr in sample_data['attributes']: + sorted_row.append(row[attr]) + final_sample_data.append(sorted_row) + + return trait_name, final_sample_data + + +def get_export_metadata(trait_metadata): + + trait_id, display_name, dataset_name, group_name = trait_metadata['trait_id'], trait_metadata['trait_display_name'], trait_metadata['dataset'], trait_metadata['group'] + + dataset = data_set.create_dataset(dataset_name, group_name=group_name) + this_trait = create_trait(dataset=dataset, + name=trait_id, + cellid=None, + get_qtl_info=False) + + metadata = [] + if dataset.type == "Publish": + metadata.append(["Phenotype ID:", display_name]) + metadata.append(["Phenotype URL: ", "http://genenetwork.org/show_trait?trait_id=" + \ + trait_id + "&dataset=" + dataset_name]) + metadata.append(["Group: ", dataset.group.name]) + metadata.append( + ["Phenotype: ", this_trait.description_display.replace(",", "\",\"")]) + metadata.append( + ["Authors: ", (this_trait.authors if this_trait.authors else "N/A")]) + metadata.append( + ["Title: ", (this_trait.title if this_trait.title else "N/A")]) + metadata.append( + ["Journal: ", (this_trait.journal if this_trait.journal else "N/A")]) + + metadata.append( + ["Dataset Link: ", "http://gn1.genenetwork.org/webqtl/main.py?FormID=sharinginfo&InfoPageName=" + dataset.name]) + else: + metadata.append(["Record ID: ", trait_id]) + metadata.append(["Trait URL: ", "http://genenetwork.org/show_trait?trait_id=" + \ + trait_id + "&dataset=" + dataset_name]) + if this_trait.symbol: + metadata.append(["Symbol: ", this_trait.symbol]) + metadata.append(["Dataset: ", dataset.name]) + metadata.append(["Group: ", dataset.group.name]) + metadata.append( + ["Export Date: ", datetime.datetime.now().strftime("%B %d, %Y")]) + metadata.append( + ["Export Time: ", 
datetime.datetime.now().strftime("%H:%M GMT")]) + + + return metadata + + +def dict_to_sorted_list(dictionary): + sorted_list = [item for item in list(dictionary.items())] + sorted_list = sorted(sorted_list, key=cmp_to_key(cmp_samples)) + sorted_values = [item[1] for item in sorted_list] + return sorted_values + + +def cmp_samples(a, b): + if b[0] == 'name': + return 1 + elif b[0] == 'value': + if a[0] == 'name': + return -1 + else: + return 1 + elif b[0] == 'se': + if a[0] == 'name' or a[0] == 'value': + return -1 + else: + return 1 + elif b[0] == 'num_cases': + if a[0] == 'name' or a[0] == 'value' or a[0] == 'se': + return -1 + else: + return 1 + else: + return -1 diff --git a/gn2/wqflask/show_trait/show_trait.py b/gn2/wqflask/show_trait/show_trait.py new file mode 100644 index 00000000..64d9be3b --- /dev/null +++ b/gn2/wqflask/show_trait/show_trait.py @@ -0,0 +1,859 @@ +from typing import Dict + +import string +import datetime +import uuid +import requests +import json as json + +from collections import OrderedDict + +import numpy as np +import scipy.stats as ss + +from gn2.wqflask.database import database_connection + +from gn2.base import webqtlConfig +from gn2.wqflask.show_trait.SampleList import SampleList +from gn2.base.trait import create_trait +from gn2.base import data_set +from gn2.utility import helper_functions +from gn2.utility.tools import get_setting, locate_ignore_error +from gn2.utility.tools import GN_PROXY_URL +from gn2.utility.redis_tools import get_redis_conn, get_resource_id + +from gn3.authentication import get_highest_user_access_role + +Redis = get_redis_conn() +ONE_YEAR = 60 * 60 * 24 * 365 + + +############################################### +# +# Todo: Put in security to ensure that user has permission to access +# confidential data sets And add i.p.limiting as necessary +# +############################################## + + +class ShowTrait: + def __init__(self, db_cursor, user_id, kw): + if 'trait_id' in kw and kw['dataset'] != "Temp": + self.temp_trait = False + self.trait_id = kw['trait_id'] + helper_functions.get_species_dataset_trait(self, kw) + self.resource_id = get_resource_id(self.dataset, + self.trait_id) + elif 'group' in kw: + self.temp_trait = True + self.trait_id = "Temp_" + kw['species'] + "_" + kw['group'] + \ + "_" + datetime.datetime.now().strftime("%m%d%H%M%S") + self.temp_species = kw['species'] + self.temp_group = kw['group'] + self.dataset = data_set.create_dataset( + dataset_name="Temp", dataset_type="Temp", group_name=self.temp_group) + + # Put values in Redis so they can be looked up later if + # added to a collection + Redis.set(self.trait_id, kw['trait_paste'], ex=ONE_YEAR) + self.trait_vals = kw['trait_paste'].split() + self.this_trait = create_trait(dataset=self.dataset, + name=self.trait_id, + cellid=None) + else: + self.temp_trait = True + self.trait_id = kw['trait_id'] + self.temp_species = self.trait_id.split("_")[1] + self.temp_group = self.trait_id.split("_")[2] + self.dataset = data_set.create_dataset( + dataset_name="Temp", dataset_type="Temp", group_name=self.temp_group) + self.this_trait = create_trait(dataset=self.dataset, + name=self.trait_id, + cellid=None) + self.trait_vals = Redis.get(self.trait_id).split() + + # Get verify/rna-seq link URLs + try: + blatsequence = self.this_trait.blatseq if self.dataset.type == "ProbeSet" else self.this_trait.sequence + if not blatsequence: + # XZ, 06/03/2009: ProbeSet name is not unique among platforms. We should use ProbeSet Id instead. 
+ seqs = () + if self.dataset.type == "ProbeSet": + db_cursor.execute( + "SELECT Probe.Sequence, Probe.Name " + "FROM Probe, ProbeSet, ProbeSetFreeze, " + "ProbeSetXRef WHERE " + "ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id " + "AND ProbeSetXRef.ProbeSetId = ProbeSet.Id AND " + "ProbeSetFreeze.Name = %s AND " + "ProbeSet.Name = %s AND " + "Probe.ProbeSetId = ProbeSet.Id ORDER " + "BY Probe.SerialOrder", + (self.this_trait.dataset.name, self.this_trait.name,) + ) + else: + db_cursor.execute( + "SELECT Geno.Sequence " + "FROM Geno, GenoXRef, GenoFreeze " + "WHERE Geno.Name = %s AND " + "Geno.Id = GenoXRef.GenoId AND " + "GenoXRef.GenoFreezeId = GenoFreeze.Id AND " + "GenoFreeze.Name = %s", + (self.this_trait.name, self.this_trait.dataset.name) + ) + seqs = db_cursor.fetchall() + if not seqs: + raise ValueError + else: + blatsequence = '' + for seqt in seqs: + if int(seqt[1][-1]) % 2 == 1: + blatsequence += string.strip(seqt[0]) + + # --------Hongqiang add this part in order to not only blat ProbeSet, but also blat Probe + blatsequence = '%3E' + self.this_trait.name + '%0A' + blatsequence + '%0A' + + # XZ, 06/03/2009: ProbeSet name is not unique among platforms. We should use ProbeSet Id instead. + seqs = () + db_cursor.execute( + "SELECT Probe.Sequence, Probe.Name " + "FROM Probe, ProbeSet, ProbeSetFreeze, " + "ProbeSetXRef WHERE " + "ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id " + "AND ProbeSetXRef.ProbeSetId = ProbeSet.Id AND " + "ProbeSetFreeze.Name = %s AND ProbeSet.Name = %s " + "AND Probe.ProbeSetId = ProbeSet.Id " + "ORDER BY Probe.SerialOrder", + (self.this_trait.dataset.name, self.this_trait.name,) + ) + seqs = db_cursor.fetchall() + for seqt in seqs: + if int(seqt[1][-1]) % 2 == 1: + blatsequence += '%3EProbe_' + \ + seqt[1].strip() + '%0A' + seqt[0].strip() + '%0A' + + if self.dataset.group.species == "rat": + self.UCSC_BLAT_URL = webqtlConfig.UCSC_BLAT % ( + 'rat', 'rn7', blatsequence) + self.UTHSC_BLAT_URL = "" + elif self.dataset.group.species == "mouse": + self.UCSC_BLAT_URL = webqtlConfig.UCSC_BLAT % ( + 'mouse', 'mm10', blatsequence) + self.UTHSC_BLAT_URL = webqtlConfig.UTHSC_BLAT % ( + 'mouse', 'mm10', blatsequence) + elif self.dataset.group.species == "human": + self.UCSC_BLAT_URL = webqtlConfig.UCSC_BLAT % ( + 'human', 'hg38', blatsequence) + self.UTHSC_BLAT_URL = "" + else: + self.UCSC_BLAT_URL = "" + self.UTHSC_BLAT_URL = "" + except: + self.UCSC_BLAT_URL = "" + self.UTHSC_BLAT_URL = "" + + if self.dataset.type == "ProbeSet": + self.show_probes = "True" + + trait_units = get_trait_units(self.this_trait) + self.get_external_links() + self.build_correlation_tools() + + self.ncbi_summary = get_ncbi_summary(self.this_trait) + + # Get nearest marker for composite mapping + if not self.temp_trait: + if check_if_attr_exists(self.this_trait, 'locus_chr') and self.dataset.type != "Geno" and self.dataset.type != "Publish": + self.nearest_marker = get_nearest_marker( + self.this_trait, self.dataset) + else: + self.nearest_marker = "" + + self.make_sample_lists() + + trait_vals_by_group = [] + for sample_type in self.sample_groups: + trait_vals_by_group.append(get_trait_vals(sample_type.sample_list)) + + self.max_digits_by_group, self.no_decimal_place = get_max_digits(trait_vals_by_group) + + self.qnorm_vals = quantile_normalize_vals(self.sample_groups, trait_vals_by_group) + self.z_scores = get_z_scores(self.sample_groups, trait_vals_by_group) + + self.temp_uuid = uuid.uuid4() + + self.sample_group_types = OrderedDict() + if len(self.sample_groups) > 1: + 
self.sample_group_types['samples_primary'] = self.dataset.group.name + self.sample_group_types['samples_other'] = "Other" + self.sample_group_types['samples_all'] = "All" + else: + self.sample_group_types['samples_primary'] = self.dataset.group.name + sample_lists = [group.sample_list for group in self.sample_groups] + + self.categorical_var_list = [] + self.numerical_var_list = [] + if not self.temp_trait: + # ZS: Only using first samplelist, since I think mapping only uses those samples + self.categorical_var_list = get_categorical_variables( + self.this_trait, self.sample_groups[0]) + self.numerical_var_list = get_numerical_variables( + self.this_trait, self.sample_groups[0]) + + # ZS: Get list of chromosomes to select for mapping + self.chr_list = [["All", -1]] + for i, this_chr in enumerate(self.dataset.species.chromosomes.chromosomes(db_cursor)): + self.chr_list.append( + [self.dataset.species.chromosomes.chromosomes(db_cursor)[this_chr].name, i]) + + self.genofiles = self.dataset.group.get_genofiles() + study_samplelist_json = self.dataset.group.get_study_samplelists() + self.study_samplelists = [study["title"] for study in study_samplelist_json] + + # ZS: No need to grab scales from .geno file unless it's using + # a mapping method that reads .geno files + if "QTLReaper" or "R/qtl" in dataset.group.mapping_names: + if self.genofiles: + self.scales_in_geno = get_genotype_scales(self.genofiles) + else: + self.scales_in_geno = get_genotype_scales( + self.dataset.group.name + ".geno") + else: + self.scales_in_geno = {} + + self.has_num_cases = has_num_cases(self.this_trait) + + # ZS: Needed to know whether to display bar chart + get max + # sample name length in order to set table column width + self.num_values = 0 + # ZS: So it knows whether to display the Binary R/qtl mapping + # method, which doesn't work unless all values are 0 or 1 + self.binary = "true" + # ZS: Since we don't want to show log2 transform option for + # situations where it doesn't make sense + self.negative_vals_exist = "false" + max_samplename_width = 1 + for group in self.sample_groups: + for sample in group.sample_list: + if len(sample.name) > max_samplename_width: + max_samplename_width = len(sample.name) + if sample.display_value != "x": + self.num_values += 1 + if sample.display_value != 0 or sample.display_value != 1: + self.binary = "false" + if sample.value < 0: + self.negative_vals_exist = "true" + + # ZS: Check whether any attributes have few enough distinct + # values to show the "Block samples by group" option + self.categorical_attr_exists = "false" + for attribute in self.sample_groups[0].attributes: + if len(self.sample_groups[0].attributes[attribute].distinct_values) <= 10: + self.categorical_attr_exists = "true" + break + + sample_column_width = max_samplename_width * 8 + + self.stats_table_width, self.trait_table_width = get_table_widths( + self.sample_groups, sample_column_width, self.has_num_cases) + + if self.num_values >= 5000: + self.maf = 0.01 + else: + self.maf = 0.05 + + trait_symbol = None + short_description = None + if not self.temp_trait: + if self.this_trait.symbol: + trait_symbol = self.this_trait.symbol + short_description = trait_symbol + + elif hasattr(self.this_trait, 'post_publication_abbreviation'): + short_description = self.this_trait.post_publication_abbreviation + + elif hasattr(self.this_trait, 'pre_publication_abbreviation'): + short_description = self.this_trait.pre_publication_abbreviation + + # Todo: Add back in the ones we actually need from below, as we discover we 
need them + hddn = OrderedDict() + + if self.dataset.group.allsamples: + hddn['allsamples'] = ','.join(self.dataset.group.allsamples) + hddn['primary_samples'] = ','.join(self.primary_sample_names) + hddn['trait_id'] = self.trait_id + hddn['trait_display_name'] = self.this_trait.display_name + hddn['dataset'] = self.dataset.name + hddn['temp_trait'] = False + if self.temp_trait: + hddn['temp_trait'] = True + hddn['group'] = self.temp_group + hddn['species'] = self.temp_species + else: + hddn['group'] = self.dataset.group.name + hddn['species'] = self.dataset.group.species + hddn['use_outliers'] = False + hddn['method'] = "gemma" + hddn['mapmethod_rqtl'] = "hk" + hddn['mapmodel_rqtl'] = "normal" + hddn['pair_scan'] = "" + hddn['selected_chr'] = -1 + hddn['mapping_display_all'] = True + hddn['suggestive'] = 0 + hddn['study_samplelists'] = json.dumps(study_samplelist_json) + hddn['num_perm'] = 0 + hddn['categorical_vars'] = "" + if self.categorical_var_list: + hddn['categorical_vars'] = ",".join(self.categorical_var_list) + hddn['manhattan_plot'] = "" + hddn['control_marker'] = "" + if not self.temp_trait: + if hasattr(self.this_trait, 'locus_chr') and self.this_trait.locus_chr != "" and self.dataset.type != "Geno" and self.dataset.type != "Publish": + hddn['control_marker'] = self.nearest_marker + hddn['do_control'] = False + hddn['maf'] = 0.05 + hddn['mapping_scale'] = "physic" + hddn['compare_traits'] = [] + hddn['export_data'] = "" + hddn['export_format'] = "excel" + if len(self.scales_in_geno) < 2 and bool(self.scales_in_geno): + hddn['mapping_scale'] = self.scales_in_geno[list( + self.scales_in_geno.keys())[0]][0][0] + + # We'll need access to this_trait and hddn in the Jinja2 + # Template, so we put it inside self + self.hddn = hddn + + js_data = dict(trait_id=self.trait_id, + trait_symbol=trait_symbol, + max_digits = self.max_digits_by_group, + no_decimal_place = self.no_decimal_place, + short_description=short_description, + unit_type=trait_units, + dataset_type=self.dataset.type, + species=self.dataset.group.species, + scales_in_geno=self.scales_in_geno, + data_scale=self.dataset.data_scale, + sample_group_types=self.sample_group_types, + sample_lists=sample_lists, + se_exists=self.sample_groups[0].se_exists, + has_num_cases=self.has_num_cases, + attributes=self.sample_groups[0].attributes, + categorical_attr_exists=self.categorical_attr_exists, + categorical_vars=",".join(self.categorical_var_list), + num_values=self.num_values, + qnorm_values=self.qnorm_vals, + zscore_values=self.z_scores, + sample_column_width=sample_column_width, + temp_uuid=self.temp_uuid) + self.js_data = js_data + + def get_external_links(self): + # ZS: There's some weirdness here because some fields don't + # exist while others are empty strings + self.pubmed_link = webqtlConfig.PUBMEDLINK_URL % self.this_trait.pubmed_id if check_if_attr_exists( + self.this_trait, 'pubmed_id') else None + self.ncbi_gene_link = webqtlConfig.NCBI_LOCUSID % self.this_trait.geneid if check_if_attr_exists( + self.this_trait, 'geneid') else None + self.omim_link = webqtlConfig.OMIM_ID % self.this_trait.omim if check_if_attr_exists( + self.this_trait, 'omim') else None + self.homologene_link = webqtlConfig.HOMOLOGENE_ID % self.this_trait.homologeneid if check_if_attr_exists( + self.this_trait, 'homologeneid') else None + + self.genbank_link = None + if check_if_attr_exists(self.this_trait, 'genbankid'): + genbank_id = '|'.join(self.this_trait.genbankid.split('|')[0:10]) + if genbank_id[-1] == '|': + genbank_id = genbank_id[0:-1] + 
self.genbank_link = webqtlConfig.GENBANK_ID % genbank_id + + self.uniprot_link = None + if check_if_attr_exists(self.this_trait, 'uniprotid'): + self.uniprot_link = webqtlConfig.UNIPROT_URL % self.this_trait.uniprotid + + self.genotation_link = self.rgd_link = self.phenogen_link = self.gtex_link = self.genebridge_link = self.ucsc_blat_link = self.biogps_link = self.protein_atlas_link = None + self.string_link = self.panther_link = self.aba_link = self.ebi_gwas_link = self.wiki_pi_link = self.genemania_link = self.ensembl_link = None + if self.this_trait.symbol: + self.genotation_link = webqtlConfig.GENOTATION_URL % self.this_trait.symbol + self.gtex_link = webqtlConfig.GTEX_URL % self.this_trait.symbol + self.string_link = webqtlConfig.STRING_URL % self.this_trait.symbol + self.panther_link = webqtlConfig.PANTHER_URL % self.this_trait.symbol + self.ebi_gwas_link = webqtlConfig.EBIGWAS_URL % self.this_trait.symbol + self.protein_atlas_link = webqtlConfig.PROTEIN_ATLAS_URL % self.this_trait.symbol + + if self.dataset.group.species == "mouse" or self.dataset.group.species == "human": + self.rgd_link = webqtlConfig.RGD_URL % ( + self.this_trait.symbol, self.dataset.group.species.capitalize()) + if self.dataset.group.species == "mouse": + self.genemania_link = webqtlConfig.GENEMANIA_URL % ( + "mus-musculus", self.this_trait.symbol) + else: + self.genemania_link = webqtlConfig.GENEMANIA_URL % ( + "homo-sapiens", self.this_trait.symbol) + + if self.dataset.group.species == "mouse": + self.aba_link = webqtlConfig.ABA_URL % self.this_trait.symbol + results = () + with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor: + cursor.execute( + "SELECT chromosome, txStart, txEnd FROM " + "GeneList WHERE geneSymbol = %s", + (self.this_trait.symbol,) + ) + results = cursor.fetchone() + if results: + chr, transcript_start, transcript_end = results + else: + chr = transcript_start = transcript_end = None + + if chr and transcript_start and transcript_end and self.this_trait.refseq_transcriptid: + transcript_start = int(transcript_start * 1000000) + transcript_end = int(transcript_end * 1000000) + self.ucsc_blat_link = webqtlConfig.UCSC_REFSEQ % ( + 'mm10', self.this_trait.refseq_transcriptid, chr, transcript_start, transcript_end) + + if self.dataset.group.species == "rat": + self.rgd_link = webqtlConfig.RGD_URL % ( + self.this_trait.symbol, self.dataset.group.species.capitalize()) + self.phenogen_link = webqtlConfig.PHENOGEN_URL % ( + self.this_trait.symbol) + self.genemania_link = webqtlConfig.GENEMANIA_URL % ( + "rattus-norvegicus", self.this_trait.symbol) + + results = () + with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor: + cursor.execute( + "SELECT kgID, chromosome, txStart, txEnd " + "FROM GeneList_rn33 WHERE geneSymbol = %s", + (self.this_trait.symbol,) + ) + if results := cursor.fetchone(): + kgId, chr, transcript_start, transcript_end = results + else: + kgId = chr = transcript_start = transcript_end = None + + if chr and transcript_start and transcript_end and kgId: + # Convert to bases from megabases + transcript_start = int(transcript_start * 1000000) + transcript_end = int(transcript_end * 1000000) + self.ucsc_blat_link = webqtlConfig.UCSC_REFSEQ % ( + 'rn7', kgId, chr, transcript_start, transcript_end) + + if self.this_trait.geneid and (self.dataset.group.species == "mouse" or self.dataset.group.species == "rat" or self.dataset.group.species == "human"): + self.biogps_link = webqtlConfig.BIOGPS_URL % ( + self.dataset.group.species, 
self.this_trait.geneid) + self.gemma_link = webqtlConfig.GEMMA_URL % self.this_trait.geneid + + if self.dataset.group.species == "human": + self.aba_link = webqtlConfig.ABA_URL % self.this_trait.geneid + + def build_correlation_tools(self): + if self.temp_trait == True: + this_group = self.temp_group + else: + this_group = self.dataset.group.name + + # We're checking a string here! + assert isinstance(this_group, str), "We need a string type thing here" + if this_group[:3] == 'BXD' and this_group not in webqtlConfig.BXD_GROUP_EXCEPTIONS: + this_group = 'BXD' + + if this_group: + if self.temp_trait == True: + dataset_menu = data_set.datasets(this_group) + else: + dataset_menu = data_set.datasets( + this_group, self.dataset.group) + dataset_menu_selected = None + if len(dataset_menu): + if self.dataset: + dataset_menu_selected = self.dataset.name + + return_results_menu = (100, 200, 500, 1000, + 2000, 5000, 10000, 15000, 20000) + return_results_menu_selected = 500 + + self.corr_tools = dict(dataset_menu=dataset_menu, + dataset_menu_selected=dataset_menu_selected, + return_results_menu=return_results_menu, + return_results_menu_selected=return_results_menu_selected,) + + def make_sample_lists(self): + + all_samples_ordered = self.dataset.group.all_samples_ordered() + + parent_f1_samples = [] + if self.dataset.group.parlist and self.dataset.group.f1list: + parent_f1_samples = self.dataset.group.parlist + self.dataset.group.f1list + + primary_sample_names = list(all_samples_ordered) + + if not self.temp_trait: + other_sample_names = [] + + for sample in list(self.this_trait.data.keys()): + if (self.this_trait.data[sample].name2 != self.this_trait.data[sample].name): + if ((self.this_trait.data[sample].name2 in primary_sample_names) + and (self.this_trait.data[sample].name not in primary_sample_names)): + primary_sample_names.append( + self.this_trait.data[sample].name) + primary_sample_names.remove( + self.this_trait.data[sample].name2) + + all_samples_set = set(all_samples_ordered) + if sample not in all_samples_set: + all_samples_ordered.append(sample) + other_sample_names.append(sample) + + # CFW is here because the .geno file doesn't properly + # contain its full list of samples. This should probably + # be fixed. 
+ if self.dataset.group.species == "human" or (set(primary_sample_names) == set(parent_f1_samples)) or self.dataset.group.name == "CFW": + primary_sample_names += other_sample_names + other_sample_names = [] + + if other_sample_names: + primary_header = "%s Only" % (self.dataset.group.name) + else: + primary_header = "Samples" + + primary_samples = SampleList(dataset=self.dataset, + sample_names=primary_sample_names, + this_trait=self.this_trait, + sample_group_type='primary', + header=primary_header) + + # if other_sample_names and self.dataset.group.species != + # "human" and self.dataset.group.name != "CFW": + if len(other_sample_names) > 0: + other_sample_names.sort() # Sort other samples + other_samples = SampleList(dataset=self.dataset, + sample_names=other_sample_names, + this_trait=self.this_trait, + sample_group_type='other', + header="Other") + + self.sample_groups = (primary_samples, other_samples) + else: + self.sample_groups = (primary_samples,) + else: + primary_samples = SampleList(dataset=self.dataset, + sample_names=primary_sample_names, + this_trait=self.trait_vals, + sample_group_type='primary', + header="%s Only" % (self.dataset.group.name)) + self.sample_groups = (primary_samples,) + self.primary_sample_names = primary_sample_names + self.dataset.group.allsamples = all_samples_ordered + + +def get_trait_vals(sample_list): + trait_vals = [] + for sample in sample_list: + try: + trait_vals.append(float(sample.value)) + except: + continue + + return trait_vals + +def get_max_digits(trait_vals): + def __max_digits__(these_vals): + if not bool(these_vals): + return None + return len(str(max(these_vals))) - 1 + + return [__max_digits__(val) for val in trait_vals], all([val.is_integer() for val in sum(trait_vals, [])]) + +def normf(trait_vals): + ranked_vals = ss.rankdata(trait_vals) + p_list = [] + for i, val in enumerate(trait_vals): + p_list.append(((i + 1) - 0.5) / len(trait_vals)) + + z = ss.norm.ppf(p_list) + + normed_vals = [] + for rank in ranked_vals: + normed_vals.append("%0.3f" % z[int(rank) - 1]) + + return normed_vals + +def quantile_normalize_vals(sample_groups, trait_vals): + qnorm_by_group = [] + for i, sample_type in enumerate(sample_groups): + qnorm_vals = normf(trait_vals[i]) + qnorm_vals_with_x = [] + counter = 0 + for sample in sample_type.sample_list: + if sample.display_value == "x": + qnorm_vals_with_x.append("x") + else: + qnorm_vals_with_x.append(qnorm_vals[counter]) + counter += 1 + + qnorm_by_group.append(qnorm_vals_with_x) + + return qnorm_by_group + + +def get_z_scores(sample_groups, trait_vals): + zscore_by_group = [] + for i, sample_type in enumerate(sample_groups): + zscores = ss.mstats.zscore(np.array(trait_vals[i])).tolist() + zscores_with_x = [] + counter = 0 + for sample in sample_type.sample_list: + if sample.display_value == "x": + zscores_with_x.append("x") + else: + zscores_with_x.append("%0.3f" % zscores[counter]) + counter += 1 + + zscore_by_group.append(zscores_with_x) + + return zscore_by_group + + +def get_nearest_marker(this_trait, this_db): + this_chr = this_trait.locus_chr + this_mb = this_trait.locus_mb + # One option is to take flanking markers, another is to take the + # two (or one) closest + with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor: + cursor.execute( + "SELECT Geno.Name FROM Geno, GenoXRef, " + "GenoFreeze WHERE Geno.Chr = %s AND " + "GenoXRef.GenoId = Geno.Id AND " + "GenoFreeze.Id = GenoXRef.GenoFreezeId " + "AND GenoFreeze.Name = %s ORDER BY " + "ABS( Geno.Mb - %s) LIMIT 1", + 
(this_chr, f"{this_db.group.name}Geno", this_mb,) + ) + if result := cursor.fetchall(): + return result[0][0] + return "" + + +def get_table_widths(sample_groups, sample_column_width, has_num_cases=False): + stats_table_width = 250 + if len(sample_groups) > 1: + stats_table_width = 450 + + trait_table_width = 300 + sample_column_width + if sample_groups[0].se_exists: + trait_table_width += 80 + if has_num_cases: + trait_table_width += 80 + trait_table_width += len(sample_groups[0].attributes) * 88 + + return stats_table_width, trait_table_width + + +def has_num_cases(this_trait): + has_n = False + if this_trait.dataset.type != "ProbeSet" and this_trait.dataset.type != "Geno": + for name, sample in list(this_trait.data.items()): + if sample.num_cases and sample.num_cases != "1": + has_n = True + break + + return has_n + + +def get_trait_units(this_trait): + unit_type = "" + inside_brackets = False + if this_trait.description_fmt: + if ("[" in this_trait.description_fmt) and ("]" in this_trait.description_fmt): + for i in this_trait.description_fmt: + if inside_brackets: + if i != "]": + unit_type += i + else: + inside_brackets = False + if i == "[": + inside_brackets = True + + if unit_type == "": + unit_type = "value" + + return unit_type + + +def check_if_attr_exists(the_trait, id_type): + if hasattr(the_trait, id_type): + if getattr(the_trait, id_type) == None or getattr(the_trait, id_type) == "": + return False + else: + return True + else: + return False + + +def get_ncbi_summary(this_trait): + if check_if_attr_exists(this_trait, 'geneid'): + # ZS: Need to switch this try/except to something that checks + # the output later + try: + response = requests.get( + "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=gene&id=%s&retmode=json" % this_trait.geneid) + summary = json.loads(response.content)[ + 'result'][this_trait.geneid]['summary'] + return summary + except: + return None + else: + return None + + +def get_categorical_variables(this_trait, sample_list) -> list: + categorical_var_list = [] + + if len(sample_list.attributes) > 0: + for attribute in sample_list.attributes: + if len(sample_list.attributes[attribute].distinct_values) < 10: + categorical_var_list.append(str(sample_list.attributes[attribute].id)) + + return categorical_var_list + +def get_numerical_variables(this_trait, sample_list) -> list: + numerical_var_list = [] + + if len(sample_list.attributes) > 0: + for attribute in sample_list.attributes: + all_numeric = True + all_none = True + for attr_val in sample_list.attributes[attribute].distinct_values: + if not attr_val: + continue + try: + val_as_float = float(attr_val) + all_none = False + except: + all_numeric = False + break + if all_numeric and not all_none: + numerical_var_list.append(sample_list.attributes[attribute].name) + + return numerical_var_list + +def get_genotype_scales(genofiles): + geno_scales = {} + if isinstance(genofiles, list): + for the_file in genofiles: + file_location = the_file['location'] + geno_scales[file_location] = get_scales_from_genofile( + file_location) + else: + geno_scales[genofiles] = get_scales_from_genofile(genofiles) + + return geno_scales + + +def get_scales_from_genofile(file_location): + geno_path = locate_ignore_error(file_location, 'genotype') + # ZS: This is just to allow the code to run when + if not geno_path: + return [["physic", "Mb"]] + cm_and_mb_cols_exist = True + cm_column = None + mb_column = None + with open(geno_path, "r") as geno_fh: + for i, line in enumerate(geno_fh): + if line[0] == "#" or 
line[0] == "@": + # ZS: If the scale is made explicit in the metadata, + # use that + if "@scale" in line: + scale = line.split(":")[1].strip() + if scale == "morgan": + return [["morgan", "cM"]] + else: + return [["physic", "Mb"]] + else: + continue + if line[:3] == "Chr": + first_marker_line = i + 1 + if line.split("\t")[2].strip() == "cM": + cm_column = 2 + elif line.split("\t")[3].strip() == "cM": + cm_column = 3 + if line.split("\t")[2].strip() == "Mb": + mb_column = 2 + elif line.split("\t")[3].strip() == "Mb": + mb_column = 3 + break + + # ZS: This attempts to check whether the cM and Mb columns are + # 'real', since some .geno files have one column be a copy of + # the other column, or have one column that is all 0s + cm_all_zero = True + mb_all_zero = True + cm_mb_all_equal = True + for i, line in enumerate(geno_fh): + # ZS: I'm assuming there won't be more than 10 markers + # where the position is listed as 0 + if first_marker_line <= i < first_marker_line + 10: + if cm_column: + cm_val = line.split("\t")[cm_column].strip() + if cm_val != "0": + cm_all_zero = False + if mb_column: + mb_val = line.split("\t")[mb_column].strip() + if mb_val != "0": + mb_all_zero = False + if cm_column and mb_column: + if cm_val != mb_val: + cm_mb_all_equal = False + else: + if i > first_marker_line + 10: + break + + # ZS: This assumes that both won't be all zero, since if that's + # the case mapping shouldn't be an option to begin with + if mb_all_zero: + return [["morgan", "cM"]] + elif cm_mb_all_equal: + return [["physic", "Mb"]] + elif cm_and_mb_cols_exist: + return [["physic", "Mb"], ["morgan", "cM"]] + else: + return [["physic", "Mb"]] + + + +def get_diff_of_vals(new_vals: Dict, trait_id: str) -> Dict: + """ Get the diff between current sample values and the values in the DB + + Given a dict of the changed values and the trait/dataset ID, return a Dict + with keys corresponding to each sample with a changed value and a value + that is a dict with keys for the old_value and new_value + + """ + + trait_name = trait_id.split(":")[0] + dataset_name = trait_id.split(":")[1] + trait_ob = create_trait(name=trait_name, dataset_name=dataset_name) + + old_vals = {sample : trait_ob.data[sample].value for sample in trait_ob.data} + + shared_samples = set.union(set(new_vals.keys()), set(old_vals.keys())) + + diff_dict = {} + for sample in shared_samples: + try: + new_val = round(float(new_vals[sample]), 3) + except: + new_val = "x" + try: + old_val = round(float(old_vals[sample]), 3) + except: + old_val = "x" + + if new_val != old_val: + diff_dict[sample] = { + "new_val": new_val, + "old_val": old_val + } + + return diff_dict |