import re
import itertools
from gn2.wqflask.database import database_connection
from gn2.base import webqtlCaseData, webqtlConfig
from pprint import pformat as pf
from gn2.utility import Plot
from gn2.utility import Bunch
from gn2.utility.tools import get_setting
class SampleList:
    """Samples of one group for a trait, together with their case attributes.

    On construction this loads the group's case attributes from the database,
    builds one sample object per entry of ``sample_names`` (samples without a
    value still get an object so they are displayed in the table), attaches
    the extra attribute values to each sample and flags outliers.
    """

    def __init__(self,
                 dataset,
                 sample_names,
                 this_trait,
                 sample_group_type="primary",
                 header="Samples"):
        """Build the sample list.

        dataset -- dataset object; ``dataset.group`` supplies ``id``,
                   ``name`` and ``species``
        sample_names -- iterable of sample names, in display order
        this_trait -- trait object exposing ``data`` keyed by sample name, or
                      a list of raw values if it is a Temp trait
        sample_group_type -- "primary" or "other"
        header -- header text stored on the instance
        """
        self.dataset = dataset
        self.this_trait = this_trait
        self.sample_group_type = sample_group_type  # primary or other
        self.header = header

        self.sample_list = []  # The actual list
        self.sample_attribute_values = {}

        self.get_attributes()

        if self.this_trait and self.dataset:
            self.get_extra_attribute_values()

        for counter, sample_name in enumerate(sample_names, 1):
            sample_name = sample_name.replace("_2nd_", "")
            sample = self._make_sample(counter, sample_name)

            sample.extra_info = {}
            if (self.dataset.group.name == 'AXBXA'
                    and sample_name in ('AXB18/19/20', 'AXB13/14', 'BXA8/17')):
                sample.extra_info['url'] = "/mouseCross.html#AXB/BXA"
                sample.extra_info['css_class'] = "fs12"

            sample.this_id = str(counter)

            # For extra attribute columns; currently only used by
            # several datasets
            if self.sample_attribute_values:
                sample.extra_attributes = self.sample_attribute_values.get(
                    sample_name, {})
                self._add_rrid_link(sample)

            self.sample_list.append(sample)

        self.se_exists = any(sample.variance for sample in self.sample_list)
        # Only show the column when some sample has a case count other
        # than "1".
        self.num_cases_exists = any(
            (sample.num_cases and sample.num_cases != "1")
            for sample in self.sample_list)

        first_attr_col = self.get_first_attr_col()
        for sample in self.sample_list:
            sample.first_attr_col = first_attr_col

        self.do_outliers()

    def __repr__(self):
        return "<SampleList> --> %s" % (pf(self.__dict__))

    def _make_sample(self, counter, sample_name):
        """Return the sample object for the sample at 1-based position *counter*."""
        # self.this_trait will be a list if it is a Temp trait
        if isinstance(self.this_trait, list):
            if counter <= len(self.this_trait):
                raw_value = self.this_trait[counter - 1]
                if isinstance(raw_value, (bytes, bytearray)):
                    marker = raw_value.decode("utf-8").lower()
                else:
                    marker = raw_value.lower()
                # 'x' marks a sample without a value
                if marker != 'x':
                    return webqtlCaseData.webqtlCaseData(
                        name=sample_name,
                        value=float(raw_value))
            return webqtlCaseData.webqtlCaseData(name=sample_name)

        # If there's no value for the sample/strain,
        # create the sample object (so samples with no value
        # are still displayed in the table)
        try:
            return self.this_trait.data[sample_name]
        except KeyError:
            return webqtlCaseData.webqtlCaseData(name=sample_name)

    def _add_rrid_link(self, sample):
        """Turn the RRID case attribute ('36') into ``[rrid, url]`` so it can be shown as a link."""
        extra_attributes = sample.extra_attributes
        if '36' not in extra_attributes:
            return

        # The attribute dict is shared between samples that resolve to the
        # same name (e.g. a parent/f1 strain that also shows up in the .geno
        # file), so the value may already have been expanded; don't add the
        # url twice.
        if isinstance(extra_attributes['36'], list):
            return

        rrid_string = str(extra_attributes['36'])
        species = self.dataset.group.species
        if species == "mouse":
            pieces = rrid_string.split(":")
            if len(pieces) > 1:
                extra_attributes['36'] = [
                    rrid_string, webqtlConfig.RRID_MOUSE_URL % pieces[1]]
        elif species == "rat":
            pieces = rrid_string.split("_")
            # Values without an underscore carry no id to link to.
            if len(pieces) > 1:
                extra_attributes['36'] = [
                    rrid_string, webqtlConfig.RRID_RAT_URL % pieces[1]]

    def do_outliers(self):
        """Set ``outlier`` on every sample that has a value."""
        values = [sample.value for sample in self.sample_list
                  if sample.value is not None]
        upper_bound, lower_bound = Plot.find_outliers(values)
        self._flag_outliers(self.sample_list, upper_bound, lower_bound)

    @staticmethod
    def _flag_outliers(samples, upper_bound, lower_bound):
        """Mark each valued sample as outlier or not; a bound of None is not applied."""
        for sample in samples:
            # Compare against None explicitly: a value (or bound) of 0 is
            # legitimate data and must not be skipped.
            if sample.value is None:
                continue
            above = upper_bound is not None and sample.value > upper_bound
            below = lower_bound is not None and sample.value < lower_bound
            sample.outlier = above or below

    def get_attributes(self):
        """Finds which extra attributes apply to this dataset"""
        group_id = str(self.dataset.group.id)
        # Get attribute names and distinct values for each attribute
        with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor:
            cursor.execute(
                "SELECT DISTINCT CaseAttribute.CaseAttributeId, "
                "CaseAttribute.Name, CaseAttribute.Description, "
                "CaseAttributeXRefNew.Value FROM "
                "CaseAttribute, CaseAttributeXRefNew WHERE "
                "CaseAttributeXRefNew.CaseAttributeId = CaseAttribute.CaseAttributeId "
                "AND CaseAttributeXRefNew.InbredSetId = %s "
                "AND CaseAttribute.InbredSetId = %s "
                "ORDER BY CaseAttribute.CaseAttributeId", (group_id, group_id)
            )

            self.attributes = {}
            # Rows arrive ordered by attribute id, so consecutive rows of the
            # same attribute form one group.
            for attr, rows in itertools.groupby(
                    cursor.fetchall(), lambda row: (row[0], row[1], row[2])
            ):
                key, name, description = attr
                attribute = Bunch()
                attribute.id = key
                attribute.name = name
                attribute.description = description
                attribute.distinct_values = natural_sort(
                    [row[3] for row in rows])
                # Numeric columns are right-aligned, everything else left.
                if self._all_numeric(attribute.distinct_values):
                    attribute.alignment = "right"
                else:
                    attribute.alignment = "left"
                self.attributes[key] = attribute

    @staticmethod
    def _all_numeric(values):
        """Return True if every value can be converted with float() (True when empty)."""
        for value in values:
            try:
                float(value)
            except (TypeError, ValueError, OverflowError):
                return False
        return True

    def get_extra_attribute_values(self):
        """Fill ``sample_attribute_values`` with each sample's case-attribute values."""
        if not self.attributes:
            return

        with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor:
            cursor.execute(
                "SELECT Strain.Name AS SampleName, "
                "CaseAttributeId AS Id, "
                "CaseAttributeXRefNew.Value FROM Strain, "
                "StrainXRef, InbredSet, CaseAttributeXRefNew "
                "WHERE StrainXRef.StrainId = Strain.Id "
                "AND InbredSet.Id = StrainXRef.InbredSetId "
                "AND CaseAttributeXRefNew.StrainId = Strain.Id "
                "AND InbredSet.Id = CaseAttributeXRefNew.InbredSetId "
                "AND CaseAttributeXRefNew.InbredSetId = %s "
                "ORDER BY SampleName", (self.dataset.group.id,)
            )
            attribute_ids = [self.attributes[key].id for key in self.attributes]
            self.sample_attribute_values.update(
                self._collect_attribute_values(
                    cursor.fetchall(), attribute_ids))

    @staticmethod
    def _collect_attribute_values(rows, attribute_ids):
        """Group ``(sample_name, attribute_id, value)`` rows into per-sample dicts.

        rows -- rows ordered by sample name
        attribute_ids -- ids of all known attributes; a sample lacking a row
                         for one of them gets "" for it

        Returns ``{sample_name: {str(attribute_id): value}}``.
        """
        collected = {}
        for sample_name, items in itertools.groupby(rows, lambda row: row[0]):
            attribute_values = {}
            for _name, attr_id, value in items:
                # If it's an int, turn it into one for sorting
                # (for example, 101 would be lower than 80 if
                # they're strings instead of ints)
                try:
                    value = int(value)
                except (TypeError, ValueError):
                    pass
                attribute_values[str(attr_id)] = value
            # Attributes without a value for this sample (that have values
            # for other samples) are shown as empty.  setdefault tolerates
            # duplicate rows and ids that are not in attribute_ids.
            for attr_id in attribute_ids:
                attribute_values.setdefault(str(attr_id), "")
            collected[sample_name] = attribute_values
        return collected

    def get_first_attr_col(self):
        """Return the index of the first attribute column.

        Starts at 4, plus 2 when ``se_exists`` and plus 1 when
        ``num_cases_exists``.
        """
        first_attr_col = 4
        if self.se_exists:
            first_attr_col += 2
        if self.num_cases_exists:
            first_attr_col += 1
        return first_attr_col
def natural_sort(a_list, key=lambda s: s):
    """
    Sort the list into natural alphanumeric order.

    Runs of ASCII digits compare as integers and everything else compares as
    text, so "a2" sorts before "a10".

    a_list -- iterable of items to sort; it is not modified
    key -- function returning the string each item is sorted by
           (defaults to the item itself)

    Returns a new sorted list.
    """
    def alphanum_key(item):
        # Splitting on a capturing group always alternates text, digits,
        # text, ..., so the odd positions are exactly the [0-9]+ runs.
        # Converting by position (instead of str.isdigit()) leaves non-ASCII
        # digit characters as text, so int() cannot fail and every key has
        # the same type at the same position.
        chunks = re.split('([0-9]+)', key(item))
        return [int(chunk) if index % 2 else chunk
                for index, chunk in enumerate(chunks)]

    return sorted(a_list, key=alphanum_key)