From d6c18c29ed9ccfadc1798c8a7c460faa428d1d5b Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Tue, 3 Nov 2020 17:27:15 +0300 Subject: Apply PEP-8 to file * wqflask/wqflask/__init__.py: Remove unused import and variables. Also add module docstring and pylint errors to ignore. (before_request): Use correct spacing. --- wqflask/wqflask/__init__.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/wqflask/wqflask/__init__.py b/wqflask/wqflask/__init__.py index 274c3d82..eeece241 100644 --- a/wqflask/wqflask/__init__.py +++ b/wqflask/wqflask/__init__.py @@ -1,4 +1,5 @@ -import sys +"""Entry point for flask app""" +# pylint: disable=C0413,E0611 import time import jinja2 @@ -6,10 +7,6 @@ from flask import g from flask import Flask from utility import formatting -import logging -logger = logging.getLogger(__name__ ) -logging.basicConfig(level=logging.INFO) - app = Flask(__name__) # See http://flask.pocoo.org/docs/config/#configuring-from-files @@ -22,8 +19,8 @@ app.jinja_env.globals.update( @app.before_request def before_request(): - g.request_start_time = time.time() - g.request_time = lambda: "%.5fs" % (time.time() - g.request_start_time) + g.request_start_time = time.time() + g.request_time = lambda: "%.5fs" % (time.time() - g.request_start_time) from wqflask.api import router -- cgit v1.2.3 From 52b2b9b8a68572207d9165e57b50fd90c63b1d1e Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Tue, 3 Nov 2020 17:29:04 +0300 Subject: Add and register glossary blueprint --- wqflask/wqflask/__init__.py | 3 +++ wqflask/wqflask/glossary.py | 9 +++++++++ 2 files changed, 12 insertions(+) create mode 100644 wqflask/wqflask/glossary.py diff --git a/wqflask/wqflask/__init__.py b/wqflask/wqflask/__init__.py index eeece241..a3870ce6 100644 --- a/wqflask/wqflask/__init__.py +++ b/wqflask/wqflask/__init__.py @@ -6,6 +6,7 @@ import jinja2 from flask import g from flask import Flask from utility import formatting +from wqflask.glossary import glossary_blueprint app 
= Flask(__name__) @@ -16,6 +17,8 @@ app.jinja_env.globals.update( undefined=jinja2.StrictUndefined, numify=formatting.numify) +# Registering blueprints +app.register_blueprint(glossary_blueprint, url_prefix="/glossary") @app.before_request def before_request(): diff --git a/wqflask/wqflask/glossary.py b/wqflask/wqflask/glossary.py new file mode 100644 index 00000000..0918744a --- /dev/null +++ b/wqflask/wqflask/glossary.py @@ -0,0 +1,9 @@ +from flask import Blueprint + + +glossary_blueprint = Blueprint('glossary_blueprint', __name__) + + +@glossary_blueprint.route('/') +def glossary(): + return "This is a test", 200 -- cgit v1.2.3 From 682763de47dbf15048faf302bb7f8ed524ecb27b Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Tue, 3 Nov 2020 17:33:58 +0300 Subject: Add glossary template and use it --- wqflask/wqflask/glossary.py | 4 ++-- wqflask/wqflask/templates/glossary.html | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-) create mode 100644 wqflask/wqflask/templates/glossary.html diff --git a/wqflask/wqflask/glossary.py b/wqflask/wqflask/glossary.py index 0918744a..a44e7c45 100644 --- a/wqflask/wqflask/glossary.py +++ b/wqflask/wqflask/glossary.py @@ -1,9 +1,9 @@ from flask import Blueprint - +from flask import render_template glossary_blueprint = Blueprint('glossary_blueprint', __name__) @glossary_blueprint.route('/') def glossary(): - return "This is a test", 200 + return render_template("glossary.html"), 200 diff --git a/wqflask/wqflask/templates/glossary.html b/wqflask/wqflask/templates/glossary.html new file mode 100644 index 00000000..988297d3 --- /dev/null +++ b/wqflask/wqflask/templates/glossary.html @@ -0,0 +1,7 @@ +{% extends "base.html" %} + +{% block title %}Glossary{% endblock %} + +{% block content %} +Test +{% endblock %} -- cgit v1.2.3 From 5d9b1f5d6380beaf8a2d713b5c33baa5a163b2bc Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Tue, 3 Nov 2020 18:11:06 +0300 Subject: Add test for "/glossary" route --- 
wqflask/tests/integration/__init__.py | 0 wqflask/tests/integration/test_glossary.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 28 insertions(+) create mode 100644 wqflask/tests/integration/__init__.py create mode 100644 wqflask/tests/integration/test_glossary.py diff --git a/wqflask/tests/integration/__init__.py b/wqflask/tests/integration/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/wqflask/tests/integration/test_glossary.py b/wqflask/tests/integration/test_glossary.py new file mode 100644 index 00000000..c9f1e62a --- /dev/null +++ b/wqflask/tests/integration/test_glossary.py @@ -0,0 +1,28 @@ +"Integration tests for glossary" +import unittest + +from bs4 import BeautifulSoup + +from wqflask import app + + +class TestGenMenu(unittest.TestCase): + """Tests for glossary""" + + def setUp(self): + self.app = app.test_client() + + def tearDown(self): + pass + + def test_glossary_page(self): + """Test that the glossary page is rendered properly""" + response = self.app.get('/glossary', follow_redirects=True) + html_content = BeautifulSoup(response.data, "lxml") + self.assertEqual(html_content.find("title").get_text(), + "Glossary GeneNetwork 2") + self.assertEqual( + html_content.find( + 'p', + attrs={'id': 'mytest'}).get_text(), + "Test") -- cgit v1.2.3 From f4a3652ee5b8087f551553df9498d5f00e169a86 Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Tue, 3 Nov 2020 18:13:57 +0300 Subject: Separate unittests from integration tests --- wqflask/tests/base/__init__.py | 0 wqflask/tests/base/data.py | 110 ------ wqflask/tests/base/test_data_set.py | 181 --------- wqflask/tests/base/test_general_object.py | 40 -- wqflask/tests/base/test_trait.py | 241 ------------ wqflask/tests/base/test_webqtl_case_data.py | 39 -- wqflask/tests/unit/__init__.py | 0 wqflask/tests/unit/base/__init__.py | 0 wqflask/tests/unit/base/data.py | 110 ++++++ wqflask/tests/unit/base/test_data_set.py | 181 +++++++++ wqflask/tests/unit/base/test_general_object.py | 40 ++ 
wqflask/tests/unit/base/test_trait.py | 241 ++++++++++++ wqflask/tests/unit/base/test_webqtl_case_data.py | 39 ++ wqflask/tests/unit/utility/__init__.py | 0 .../unit/utility/test_authentication_tools.py | 189 ++++++++++ wqflask/tests/unit/utility/test_chunks.py | 19 + wqflask/tests/unit/utility/test_corestats.py | 55 +++ .../tests/unit/utility/test_corr_result_helpers.py | 32 ++ wqflask/tests/unit/utility/test_formatting.py | 33 ++ wqflask/tests/unit/utility/test_hmac.py | 52 +++ wqflask/tests/unit/wqflask/__init__.py | 0 wqflask/tests/unit/wqflask/api/__init__.py | 0 wqflask/tests/unit/wqflask/api/test_gen_menu.py | 413 +++++++++++++++++++++ .../unit/wqflask/marker_regression/__init__.py | 0 .../test_display_mapping_results.py | 156 ++++++++ wqflask/tests/unit/wqflask/show_trait/__init__.py | 0 .../wqflask/show_trait/test_export_trait_data.py | 212 +++++++++++ wqflask/tests/unit/wqflask/test_collect.py | 73 ++++ wqflask/tests/unit/wqflask/test_pbkdf2.py | 61 +++ wqflask/tests/unit/wqflask/test_user_login.py | 21 ++ wqflask/tests/unit/wqflask/test_user_session.py | 15 + wqflask/tests/utility/__init__.py | 0 wqflask/tests/utility/test_authentication_tools.py | 189 ---------- wqflask/tests/utility/test_chunks.py | 19 - wqflask/tests/utility/test_corestats.py | 55 --- wqflask/tests/utility/test_corr_result_helpers.py | 32 -- wqflask/tests/utility/test_formatting.py | 33 -- wqflask/tests/utility/test_hmac.py | 52 --- wqflask/tests/wqflask/__init__.py | 0 wqflask/tests/wqflask/api/__init__.py | 0 wqflask/tests/wqflask/api/test_gen_menu.py | 413 --------------------- .../tests/wqflask/marker_regression/__init__.py | 0 .../test_display_mapping_results.py | 156 -------- wqflask/tests/wqflask/show_trait/__init__.py | 0 .../wqflask/show_trait/test_export_trait_data.py | 212 ----------- wqflask/tests/wqflask/test_collect.py | 73 ---- wqflask/tests/wqflask/test_pbkdf2.py | 61 --- wqflask/tests/wqflask/test_user_login.py | 21 -- wqflask/tests/wqflask/test_user_session.py | 15 - 
wqflask/wqflask/templates/glossary.html | 2 +- 50 files changed, 1943 insertions(+), 1943 deletions(-) delete mode 100644 wqflask/tests/base/__init__.py delete mode 100644 wqflask/tests/base/data.py delete mode 100644 wqflask/tests/base/test_data_set.py delete mode 100644 wqflask/tests/base/test_general_object.py delete mode 100644 wqflask/tests/base/test_trait.py delete mode 100644 wqflask/tests/base/test_webqtl_case_data.py create mode 100644 wqflask/tests/unit/__init__.py create mode 100644 wqflask/tests/unit/base/__init__.py create mode 100644 wqflask/tests/unit/base/data.py create mode 100644 wqflask/tests/unit/base/test_data_set.py create mode 100644 wqflask/tests/unit/base/test_general_object.py create mode 100644 wqflask/tests/unit/base/test_trait.py create mode 100644 wqflask/tests/unit/base/test_webqtl_case_data.py create mode 100644 wqflask/tests/unit/utility/__init__.py create mode 100644 wqflask/tests/unit/utility/test_authentication_tools.py create mode 100644 wqflask/tests/unit/utility/test_chunks.py create mode 100644 wqflask/tests/unit/utility/test_corestats.py create mode 100644 wqflask/tests/unit/utility/test_corr_result_helpers.py create mode 100644 wqflask/tests/unit/utility/test_formatting.py create mode 100644 wqflask/tests/unit/utility/test_hmac.py create mode 100644 wqflask/tests/unit/wqflask/__init__.py create mode 100644 wqflask/tests/unit/wqflask/api/__init__.py create mode 100644 wqflask/tests/unit/wqflask/api/test_gen_menu.py create mode 100644 wqflask/tests/unit/wqflask/marker_regression/__init__.py create mode 100644 wqflask/tests/unit/wqflask/marker_regression/test_display_mapping_results.py create mode 100644 wqflask/tests/unit/wqflask/show_trait/__init__.py create mode 100644 wqflask/tests/unit/wqflask/show_trait/test_export_trait_data.py create mode 100644 wqflask/tests/unit/wqflask/test_collect.py create mode 100644 wqflask/tests/unit/wqflask/test_pbkdf2.py create mode 100644 wqflask/tests/unit/wqflask/test_user_login.py create 
mode 100644 wqflask/tests/unit/wqflask/test_user_session.py delete mode 100644 wqflask/tests/utility/__init__.py delete mode 100644 wqflask/tests/utility/test_authentication_tools.py delete mode 100644 wqflask/tests/utility/test_chunks.py delete mode 100644 wqflask/tests/utility/test_corestats.py delete mode 100644 wqflask/tests/utility/test_corr_result_helpers.py delete mode 100644 wqflask/tests/utility/test_formatting.py delete mode 100644 wqflask/tests/utility/test_hmac.py delete mode 100644 wqflask/tests/wqflask/__init__.py delete mode 100644 wqflask/tests/wqflask/api/__init__.py delete mode 100644 wqflask/tests/wqflask/api/test_gen_menu.py delete mode 100644 wqflask/tests/wqflask/marker_regression/__init__.py delete mode 100644 wqflask/tests/wqflask/marker_regression/test_display_mapping_results.py delete mode 100644 wqflask/tests/wqflask/show_trait/__init__.py delete mode 100644 wqflask/tests/wqflask/show_trait/test_export_trait_data.py delete mode 100644 wqflask/tests/wqflask/test_collect.py delete mode 100644 wqflask/tests/wqflask/test_pbkdf2.py delete mode 100644 wqflask/tests/wqflask/test_user_login.py delete mode 100644 wqflask/tests/wqflask/test_user_session.py diff --git a/wqflask/tests/base/__init__.py b/wqflask/tests/base/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/wqflask/tests/base/data.py b/wqflask/tests/base/data.py deleted file mode 100644 index 06a5a989..00000000 --- a/wqflask/tests/base/data.py +++ /dev/null @@ -1,110 +0,0 @@ -gen_menu_json = """ -{ - "datasets": { - "human": { - "HLC": { - "Liver mRNA": [ - [ - "320", - "HLC_0311", - "GSE9588 Human Liver Normal (Mar11) Both Sexes" - ] - ], - "Phenotypes": [ - [ - "635", - "HLCPublish", - "HLC Published Phenotypes" - ] - ] - } - }, - "mouse": { - "BXD": { - "Genotypes": [ - [ - "600", - "BXDGeno", - "BXD Genotypes" - ] - ], - "Hippocampus mRNA": [ - [ - "112", - "HC_M2_0606_P", - "Hippocampus Consortium M430v2 (Jun06) PDNN" - ] - ], - "Phenotypes": [ - [ - "602", 
- "BXDPublish", - "BXD Published Phenotypes" - ] - ] - } - } - }, - "groups": { - "human": [ - [ - "HLC", - "Liver: Normal Gene Expression with Genotypes (Merck)", - "Family:None" - ] - ], - "mouse": [ - [ - "BXD", - "BXD", - "Family:None" - ] - ] - }, - "species": [ - [ - "human", - "Human" - ], - [ - "mouse", - "Mouse" - ] - ], - "types": { - "human": { - "HLC": [ - [ - "Phenotypes", - "Traits and Cofactors", - "Phenotypes" - ], - [ - "Liver mRNA", - "Liver mRNA", - "Molecular Trait Datasets" - ] - ] - }, - "mouse": { - "BXD": [ - [ - "Phenotypes", - "Traits and Cofactors", - "Phenotypes" - ], - [ - "Genotypes", - "DNA Markers and SNPs", - "Genotypes" - ], - [ - "Hippocampus mRNA", - "Hippocampus mRNA", - "Molecular Trait Datasets" - ] - ] - } - } -} -""" diff --git a/wqflask/tests/base/test_data_set.py b/wqflask/tests/base/test_data_set.py deleted file mode 100644 index 96563a16..00000000 --- a/wqflask/tests/base/test_data_set.py +++ /dev/null @@ -1,181 +0,0 @@ -"""Tests for wqflask/base/data_set.py""" - -import unittest -from unittest import mock - -from wqflask import app -from .data import gen_menu_json -from base.data_set import DatasetType - - -class TestDataSetTypes(unittest.TestCase): - """Tests for the DataSetType class""" - - def setUp(self): - self.test_dataset = """ - { - "AD-cases-controls-MyersGeno": "Geno", - "AD-cases-controls-MyersPublish": "Publish", - "AKXDGeno": "Geno", - "AXBXAGeno": "Geno", - "AXBXAPublish": "Publish", - "Aging-Brain-UCIPublish": "Publish", - "All Phenotypes": "Publish", - "B139_K_1206_M": "ProbeSet", - "B139_K_1206_R": "ProbeSet" - } - """ - self.app_context = app.app_context() - self.app_context.push() - - def tearDown(self): - self.app_context.pop() - - @mock.patch('base.data_set.g') - def test_data_set_type(self, db_mock): - """Test that DatasetType returns correctly if the Redis Instance is not empty - and the name variable exists in the dictionary - - """ - with app.app_context(): - db_mock.get = mock.Mock() - 
redis_mock = mock.Mock() - redis_mock.get.return_value = self.test_dataset - self.assertEqual(DatasetType(redis_mock) - ("All Phenotypes"), "Publish") - redis_mock.get.assert_called_once_with("dataset_structure") - - @mock.patch('base.data_set.requests.get') - def test_data_set_type_with_empty_redis(self, request_mock): - """Test that DatasetType returns correctly if the Redis Instance is empty and - the name variable exists in the dictionary - - """ - with app.app_context(): - request_mock.return_value.content = gen_menu_json - redis_mock = mock.Mock() - redis_mock.get.return_value = None - data_set = DatasetType(redis_mock) - self.assertEqual(data_set("BXDGeno"), "Geno") - self.assertEqual(data_set("BXDPublish"), "Publish") - self.assertEqual(data_set("HLC_0311"), "ProbeSet") - - redis_mock.set.assert_called_once_with( - "dataset_structure", - ('{"HLC_0311": "ProbeSet", ' - '"HLCPublish": "Publish", ' - '"BXDGeno": "Geno", ' - '"HC_M2_0606_P": "ProbeSet", ' - '"BXDPublish": "Publish"}')) - - @mock.patch('base.data_set.g') - def test_set_dataset_key_mrna(self, db_mock): - with app.app_context(): - db_mock.db.execute.return_value.fetchone.return_value = [1, 2, 3] - redis_mock = mock.Mock() - redis_mock.get.return_value = self.test_dataset - data_set = DatasetType(redis_mock) - data_set.set_dataset_key("mrna_expr", "Test") - self.assertEqual(data_set("Test"), "ProbeSet") - redis_mock.set.assert_called_once_with( - "dataset_structure", - ('{"AD-cases-controls-MyersGeno": "Geno", ' - '"AD-cases-controls-MyersPublish": "Publish", ' - '"AKXDGeno": "Geno", ' - '"AXBXAGeno": "Geno", ' - '"AXBXAPublish": "Publish", ' - '"Aging-Brain-UCIPublish": "Publish", ' - '"All Phenotypes": "Publish", ' - '"B139_K_1206_M": "ProbeSet", ' - '"B139_K_1206_R": "ProbeSet", ' - '"Test": "ProbeSet"}')) - - db_mock.db.execute.assert_called_with( - ("SELECT ProbeSetFreeze.Id FROM ProbeSetFreeze " + - "WHERE ProbeSetFreeze.Name = \"Test\" ") - ) - - @mock.patch('base.data_set.g') - def 
test_set_dataset_key_pheno(self, db_mock): - with app.app_context(): - db_mock.db.execute.return_value.fetchone.return_value = [1, 2, 3] - redis_mock = mock.Mock() - redis_mock.get.return_value = self.test_dataset - data_set = DatasetType(redis_mock) - data_set.set_dataset_key("pheno", "Test") - self.assertEqual(data_set("Test"), "Publish") - redis_mock.set.assert_called_once_with( - "dataset_structure", - ('{"AD-cases-controls-MyersGeno": "Geno", ' - '"AD-cases-controls-MyersPublish": "Publish", ' - '"AKXDGeno": "Geno", ' - '"AXBXAGeno": "Geno", ' - '"AXBXAPublish": "Publish", ' - '"Aging-Brain-UCIPublish": "Publish", ' - '"All Phenotypes": "Publish", ' - '"B139_K_1206_M": "ProbeSet", ' - '"B139_K_1206_R": "ProbeSet", ' - '"Test": "Publish"}')) - db_mock.db.execute.assert_called_with( - ("SELECT InfoFiles.GN_AccesionId " - "FROM InfoFiles, PublishFreeze, InbredSet " - "WHERE InbredSet.Name = 'Test' AND " - "PublishFreeze.InbredSetId = InbredSet.Id AND " - "InfoFiles.InfoPageName = PublishFreeze.Name") - ) - - @mock.patch('base.data_set.g') - def test_set_dataset_other_pheno(self, db_mock): - with app.app_context(): - db_mock.db.execute.return_value.fetchone.return_value = [1, 2, 3] - redis_mock = mock.Mock() - redis_mock.get.return_value = self.test_dataset - data_set = DatasetType(redis_mock) - data_set.set_dataset_key("other_pheno", "Test") - self.assertEqual(data_set("Test"), "Publish") - - redis_mock.set.assert_called_once_with( - "dataset_structure", - ('{"AD-cases-controls-MyersGeno": "Geno", ' - '"AD-cases-controls-MyersPublish": "Publish", ' - '"AKXDGeno": "Geno", ' - '"AXBXAGeno": "Geno", ' - '"AXBXAPublish": "Publish", ' - '"Aging-Brain-UCIPublish": "Publish", ' - '"All Phenotypes": "Publish", ' - '"B139_K_1206_M": "ProbeSet", ' - '"B139_K_1206_R": "ProbeSet", ' - '"Test": "Publish"}')) - - db_mock.db.execute.assert_called_with( - ("SELECT PublishFreeze.Name " + - "FROM PublishFreeze, InbredSet " + - "WHERE InbredSet.Name = 'Test' AND " - 
"PublishFreeze.InbredSetId = InbredSet.Id") - ) - - @mock.patch('base.data_set.g') - def test_set_dataset_geno(self, db_mock): - with app.app_context(): - db_mock.db.execute.return_value.fetchone.return_value = [1, 2, 3] - redis_mock = mock.Mock() - redis_mock.get.return_value = self.test_dataset - data_set = DatasetType(redis_mock) - data_set.set_dataset_key("geno", "Test") - self.assertEqual(data_set("Test"), "Geno") - redis_mock.set.assert_called_once_with( - "dataset_structure", - ('{"AD-cases-controls-MyersGeno": "Geno", ' - '"AD-cases-controls-MyersPublish": "Publish", ' - '"AKXDGeno": "Geno", ' - '"AXBXAGeno": "Geno", ' - '"AXBXAPublish": "Publish", ' - '"Aging-Brain-UCIPublish": "Publish", ' - '"All Phenotypes": "Publish", ' - '"B139_K_1206_M": "ProbeSet", ' - '"B139_K_1206_R": "ProbeSet", ' - '"Test": "Geno"}')) - - db_mock.db.execute.assert_called_with( - ("SELECT GenoFreeze.Id FROM " - "GenoFreeze WHERE GenoFreeze.Name = \"Test\" ")) diff --git a/wqflask/tests/base/test_general_object.py b/wqflask/tests/base/test_general_object.py deleted file mode 100644 index 00fd3c72..00000000 --- a/wqflask/tests/base/test_general_object.py +++ /dev/null @@ -1,40 +0,0 @@ -import unittest - -from base.GeneralObject import GeneralObject - - -class TestGeneralObjectTests(unittest.TestCase): - """ - Test the GeneralObject base class - """ - - def test_object_contents(self): - """Test whether base contents are stored properly""" - test_obj = GeneralObject("a", "b", "c") - self.assertEqual("abc", ''.join(test_obj.contents)) - self.assertEqual(len(test_obj), 0) - - def test_object_dict(self): - """Test whether the base class is printed properly""" - test_obj = GeneralObject("a", name="test", value=1) - self.assertEqual(str(test_obj), "name = test\nvalue = 1\n") - self.assertEqual( - repr(test_obj), "contents = ['a']\nname = test\nvalue = 1\n") - self.assertEqual(len(test_obj), 2) - self.assertEqual(test_obj["value"], 1) - test_obj["test"] = 1 - 
self.assertEqual(test_obj["test"], 1) - - def test_get_attribute(self): - "Test that getattr works" - test_obj = GeneralObject("a", name="test", value=1) - self.assertEqual(getattr(test_obj, "value", None), 1) - self.assertEqual(getattr(test_obj, "non-existent", None), None) - - def test_object_comparisons(self): - "Test that 2 objects of the same length are equal" - test_obj1 = GeneralObject("a", name="test", value=1) - test_obj2 = GeneralObject("b", name="test2", value=2) - test_obj3 = GeneralObject("a", name="test", x=1, y=2) - self.assertTrue(test_obj1 == test_obj2) - self.assertFalse(test_obj1 == test_obj3) diff --git a/wqflask/tests/base/test_trait.py b/wqflask/tests/base/test_trait.py deleted file mode 100644 index 826ccefd..00000000 --- a/wqflask/tests/base/test_trait.py +++ /dev/null @@ -1,241 +0,0 @@ -# -*- coding: utf-8 -*- -"""Tests wqflask/base/trait.py""" -import unittest -from unittest import mock - -from wqflask import app -from base.trait import GeneralTrait -from base.trait import retrieve_trait_info - - -class TestResponse: - """Mock Test Response after a request""" - @property - def content(self): - """Mock the content from Requests.get(params).content""" - return "[1, 2, 3, 4]" - - -class TestNilResponse: - """Mock Test Response after a request""" - @property - def content(self): - """Mock the content from Requests.get(params).content""" - return "{}" - - -class MockTrait(GeneralTrait): - @property - def wikidata_alias_fmt(self): - return "Mock alias" - - -class TestRetrieveTraitInfo(unittest.TestCase): - """Tests for 'retrieve_trait_info'""" - - def setUp(self): - self.app_context = app.app_context() - self.app_context.push() - - def tearDown(self): - self.app_context.pop() - - def test_retrieve_trait_info_with_empty_dataset(self): - """Test that an exception is raised when dataset is empty""" - with self.assertRaises(AssertionError): - retrieve_trait_info(trait=mock.MagicMock(), - dataset={}) - - @mock.patch('base.trait.requests.get') - 
@mock.patch('base.trait.g', mock.Mock()) - def test_retrieve_trait_info_with_empty_trait_info(self, - requests_mock): - """Empty trait info""" - requests_mock.return_value = TestNilResponse() - with self.assertRaises(KeyError): - retrieve_trait_info(trait=mock.MagicMock(), - dataset=mock.MagicMock()) - - @mock.patch('base.trait.requests.get') - @mock.patch('base.trait.g', mock.Mock()) - def test_retrieve_trait_info_with_non_empty_trait_info(self, - requests_mock): - """Test that attributes are set""" - mock_dataset = mock.MagicMock() - requests_mock.return_value = TestResponse() - type(mock_dataset).display_fields = mock.PropertyMock( - return_value=["a", "b", "c", "d"]) - test_trait = retrieve_trait_info(trait=MockTrait(dataset=mock_dataset), - dataset=mock_dataset) - self.assertEqual(test_trait.a, 1) - self.assertEqual(test_trait.b, 2) - self.assertEqual(test_trait.c, 3) - self.assertEqual(test_trait.d, 4) - - @mock.patch('base.trait.requests.get') - @mock.patch('base.trait.g', mock.Mock()) - def test_retrieve_trait_info_utf8_parsing(self, - requests_mock): - """Test that utf-8 strings are parsed correctly""" - utf_8_string = "test_string" - mock_dataset = mock.MagicMock() - requests_mock.return_value = TestResponse() - type(mock_dataset).display_fields = mock.PropertyMock( - return_value=["a", "b", "c", "d"]) - type(mock_dataset).type = 'Publish' - - mock_trait = MockTrait( - dataset=mock_dataset, - pre_publication_description=utf_8_string - ) - trait_attrs = { - "group_code": "test_code", - "pre_publication_description": "test_pre_pub", - "pre_publication_abbreviation": "ファイルを画面毎に見て行くには、次のコマンドを使います。", - "post_publication_description": None, - "pubmed_id": None, - 'year': "2020", - "authors": "Jane Doe かいと", - } - for key, val in list(trait_attrs.items()): - setattr(mock_trait, key, val) - test_trait = retrieve_trait_info(trait=mock_trait, - dataset=mock_dataset) - self.assertEqual(test_trait.abbreviation, - "ファイルを画面毎に見て行くには、次のコマンドを使います。") - 
self.assertEqual(test_trait.authors, - "Jane Doe かいと") - - @mock.patch('base.trait.requests.get') - @mock.patch('base.trait.g') - @mock.patch('base.trait.get_resource_id') - def test_retrieve_trait_info_with_non_empty_lrs(self, - resource_id_mock, - g_mock, - requests_mock): - """Test retrieve trait info when lrs has a value""" - resource_id_mock.return_value = 1 - g_mock.db.execute.return_value.fetchone = mock.Mock() - g_mock.db.execute.return_value.fetchone.side_effect = [ - [1, 2, 3, 4], # trait_info = g.db.execute(query).fetchone() - [1, 2.37, 3, 4, 5], # trait_qtl = g.db.execute(query).fetchone() - [2.7333, 2.1204] # trait_info = g.db.execute(query).fetchone() - ] - requests_mock.return_value = None - - mock_dataset = mock.MagicMock() - type(mock_dataset).display_fields = mock.PropertyMock( - return_value=["a", "b", "c", "d"]) - type(mock_dataset).type = "ProbeSet" - type(mock_dataset).name = "RandomName" - - mock_trait = MockTrait( - dataset=mock_dataset, - pre_publication_description="test_string" - ) - trait_attrs = { - "description": "some description", - "probe_target_description": "some description", - "cellid": False, - "chr": 2.733, - "mb": 2.1204 - } - - for key, val in list(trait_attrs.items()): - setattr(mock_trait, key, val) - test_trait = retrieve_trait_info(trait=mock_trait, - dataset=mock_dataset, - get_qtl_info=True) - self.assertEqual(test_trait.LRS_score_repr, - "2.4") - - @mock.patch('base.trait.requests.get') - @mock.patch('base.trait.g') - @mock.patch('base.trait.get_resource_id') - def test_retrieve_trait_info_with_empty_lrs_field(self, - resource_id_mock, - g_mock, - requests_mock): - """Test retrieve trait info with empty lrs field""" - resource_id_mock.return_value = 1 - g_mock.db.execute.return_value.fetchone = mock.Mock() - g_mock.db.execute.return_value.fetchone.side_effect = [ - [1, 2, 3, 4], # trait_info = g.db.execute(query).fetchone() - [1, None, 3, 4, 5], # trait_qtl = g.db.execute(query).fetchone() - [2, 3] # trait_info = 
g.db.execute(query).fetchone() - ] - requests_mock.return_value = None - - mock_dataset = mock.MagicMock() - type(mock_dataset).display_fields = mock.PropertyMock( - return_value=["a", "b", "c", "d"]) - type(mock_dataset).type = "ProbeSet" - type(mock_dataset).name = "RandomName" - - mock_trait = MockTrait( - dataset=mock_dataset, - pre_publication_description="test_string" - ) - trait_attrs = { - "description": "some description", - "probe_target_description": "some description", - "cellid": False, - "chr": 2.733, - "mb": 2.1204 - } - - for key, val in list(trait_attrs.items()): - setattr(mock_trait, key, val) - test_trait = retrieve_trait_info(trait=mock_trait, - dataset=mock_dataset, - get_qtl_info=True) - self.assertEqual(test_trait.LRS_score_repr, - "N/A") - self.assertEqual(test_trait.LRS_location_repr, - "Chr2: 3.000000") - - @mock.patch('base.trait.requests.get') - @mock.patch('base.trait.g') - @mock.patch('base.trait.get_resource_id') - def test_retrieve_trait_info_with_empty_chr_field(self, - resource_id_mock, - g_mock, - requests_mock): - """Test retrieve trait info with empty chr field""" - resource_id_mock.return_value = 1 - g_mock.db.execute.return_value.fetchone = mock.Mock() - g_mock.db.execute.return_value.fetchone.side_effect = [ - [1, 2, 3, 4], # trait_info = g.db.execute(query).fetchone() - [1, 2, 3, 4, 5], # trait_qtl = g.db.execute(query).fetchone() - [None, 3] # trait_info = g.db.execute(query).fetchone() - ] - - requests_mock.return_value = None - - mock_dataset = mock.MagicMock() - type(mock_dataset).display_fields = mock.PropertyMock( - return_value=["a", "b", "c", "d"]) - type(mock_dataset).type = "ProbeSet" - type(mock_dataset).name = "RandomName" - - mock_trait = MockTrait( - dataset=mock_dataset, - pre_publication_description="test_string" - ) - trait_attrs = { - "description": "some description", - "probe_target_description": "some description", - "cellid": False, - "chr": 2.733, - "mb": 2.1204 - } - - for key, val in 
list(trait_attrs.items()): - setattr(mock_trait, key, val) - test_trait = retrieve_trait_info(trait=mock_trait, - dataset=mock_dataset, - get_qtl_info=True) - self.assertEqual(test_trait.LRS_score_repr, - "N/A") - self.assertEqual(test_trait.LRS_location_repr, - "N/A") diff --git a/wqflask/tests/base/test_webqtl_case_data.py b/wqflask/tests/base/test_webqtl_case_data.py deleted file mode 100644 index 8e8ba482..00000000 --- a/wqflask/tests/base/test_webqtl_case_data.py +++ /dev/null @@ -1,39 +0,0 @@ -"""Tests for wqflask/base/webqtlCaseData.py""" -import unittest - -from wqflask import app # Required because of utility.tools in webqtlCaseData.py -from base.webqtlCaseData import webqtlCaseData - -class TestWebqtlCaseData(unittest.TestCase): - """Tests for WebqtlCaseData class""" - - def setUp(self): - self.w = webqtlCaseData(name="Test", - value=0, - variance=0.0, - num_cases=10, - name2="Test2") - - def test_webqtl_case_data_repr(self): - self.assertEqual( - repr(self.w), - " value=0.000 variance=0.000 ndata=10 name=Test name2=Test2" - ) - - def test_class_outlier(self): - self.assertEqual(self.w.class_outlier, "") - - def test_display_value(self): - self.assertEqual(self.w.display_value, "0.000") - self.w.value = None - self.assertEqual(self.w.display_value, "x") - - def test_display_variance(self): - self.assertEqual(self.w.display_variance, "0.000") - self.w.variance = None - self.assertEqual(self.w.display_variance, "x") - - def test_display_num_cases(self): - self.assertEqual(self.w.display_num_cases, "10") - self.w.num_cases = None - self.assertEqual(self.w.display_num_cases, "x") diff --git a/wqflask/tests/unit/__init__.py b/wqflask/tests/unit/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/wqflask/tests/unit/base/__init__.py b/wqflask/tests/unit/base/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/wqflask/tests/unit/base/data.py b/wqflask/tests/unit/base/data.py new file mode 100644 index 00000000..06a5a989 --- 
/dev/null +++ b/wqflask/tests/unit/base/data.py @@ -0,0 +1,110 @@ +gen_menu_json = """ +{ + "datasets": { + "human": { + "HLC": { + "Liver mRNA": [ + [ + "320", + "HLC_0311", + "GSE9588 Human Liver Normal (Mar11) Both Sexes" + ] + ], + "Phenotypes": [ + [ + "635", + "HLCPublish", + "HLC Published Phenotypes" + ] + ] + } + }, + "mouse": { + "BXD": { + "Genotypes": [ + [ + "600", + "BXDGeno", + "BXD Genotypes" + ] + ], + "Hippocampus mRNA": [ + [ + "112", + "HC_M2_0606_P", + "Hippocampus Consortium M430v2 (Jun06) PDNN" + ] + ], + "Phenotypes": [ + [ + "602", + "BXDPublish", + "BXD Published Phenotypes" + ] + ] + } + } + }, + "groups": { + "human": [ + [ + "HLC", + "Liver: Normal Gene Expression with Genotypes (Merck)", + "Family:None" + ] + ], + "mouse": [ + [ + "BXD", + "BXD", + "Family:None" + ] + ] + }, + "species": [ + [ + "human", + "Human" + ], + [ + "mouse", + "Mouse" + ] + ], + "types": { + "human": { + "HLC": [ + [ + "Phenotypes", + "Traits and Cofactors", + "Phenotypes" + ], + [ + "Liver mRNA", + "Liver mRNA", + "Molecular Trait Datasets" + ] + ] + }, + "mouse": { + "BXD": [ + [ + "Phenotypes", + "Traits and Cofactors", + "Phenotypes" + ], + [ + "Genotypes", + "DNA Markers and SNPs", + "Genotypes" + ], + [ + "Hippocampus mRNA", + "Hippocampus mRNA", + "Molecular Trait Datasets" + ] + ] + } + } +} +""" diff --git a/wqflask/tests/unit/base/test_data_set.py b/wqflask/tests/unit/base/test_data_set.py new file mode 100644 index 00000000..96563a16 --- /dev/null +++ b/wqflask/tests/unit/base/test_data_set.py @@ -0,0 +1,181 @@ +"""Tests for wqflask/base/data_set.py""" + +import unittest +from unittest import mock + +from wqflask import app +from .data import gen_menu_json +from base.data_set import DatasetType + + +class TestDataSetTypes(unittest.TestCase): + """Tests for the DataSetType class""" + + def setUp(self): + self.test_dataset = """ + { + "AD-cases-controls-MyersGeno": "Geno", + "AD-cases-controls-MyersPublish": "Publish", + "AKXDGeno": "Geno", + 
"AXBXAGeno": "Geno", + "AXBXAPublish": "Publish", + "Aging-Brain-UCIPublish": "Publish", + "All Phenotypes": "Publish", + "B139_K_1206_M": "ProbeSet", + "B139_K_1206_R": "ProbeSet" + } + """ + self.app_context = app.app_context() + self.app_context.push() + + def tearDown(self): + self.app_context.pop() + + @mock.patch('base.data_set.g') + def test_data_set_type(self, db_mock): + """Test that DatasetType returns correctly if the Redis Instance is not empty + and the name variable exists in the dictionary + + """ + with app.app_context(): + db_mock.get = mock.Mock() + redis_mock = mock.Mock() + redis_mock.get.return_value = self.test_dataset + self.assertEqual(DatasetType(redis_mock) + ("All Phenotypes"), "Publish") + redis_mock.get.assert_called_once_with("dataset_structure") + + @mock.patch('base.data_set.requests.get') + def test_data_set_type_with_empty_redis(self, request_mock): + """Test that DatasetType returns correctly if the Redis Instance is empty and + the name variable exists in the dictionary + + """ + with app.app_context(): + request_mock.return_value.content = gen_menu_json + redis_mock = mock.Mock() + redis_mock.get.return_value = None + data_set = DatasetType(redis_mock) + self.assertEqual(data_set("BXDGeno"), "Geno") + self.assertEqual(data_set("BXDPublish"), "Publish") + self.assertEqual(data_set("HLC_0311"), "ProbeSet") + + redis_mock.set.assert_called_once_with( + "dataset_structure", + ('{"HLC_0311": "ProbeSet", ' + '"HLCPublish": "Publish", ' + '"BXDGeno": "Geno", ' + '"HC_M2_0606_P": "ProbeSet", ' + '"BXDPublish": "Publish"}')) + + @mock.patch('base.data_set.g') + def test_set_dataset_key_mrna(self, db_mock): + with app.app_context(): + db_mock.db.execute.return_value.fetchone.return_value = [1, 2, 3] + redis_mock = mock.Mock() + redis_mock.get.return_value = self.test_dataset + data_set = DatasetType(redis_mock) + data_set.set_dataset_key("mrna_expr", "Test") + self.assertEqual(data_set("Test"), "ProbeSet") + 
redis_mock.set.assert_called_once_with( + "dataset_structure", + ('{"AD-cases-controls-MyersGeno": "Geno", ' + '"AD-cases-controls-MyersPublish": "Publish", ' + '"AKXDGeno": "Geno", ' + '"AXBXAGeno": "Geno", ' + '"AXBXAPublish": "Publish", ' + '"Aging-Brain-UCIPublish": "Publish", ' + '"All Phenotypes": "Publish", ' + '"B139_K_1206_M": "ProbeSet", ' + '"B139_K_1206_R": "ProbeSet", ' + '"Test": "ProbeSet"}')) + + db_mock.db.execute.assert_called_with( + ("SELECT ProbeSetFreeze.Id FROM ProbeSetFreeze " + + "WHERE ProbeSetFreeze.Name = \"Test\" ") + ) + + @mock.patch('base.data_set.g') + def test_set_dataset_key_pheno(self, db_mock): + with app.app_context(): + db_mock.db.execute.return_value.fetchone.return_value = [1, 2, 3] + redis_mock = mock.Mock() + redis_mock.get.return_value = self.test_dataset + data_set = DatasetType(redis_mock) + data_set.set_dataset_key("pheno", "Test") + self.assertEqual(data_set("Test"), "Publish") + redis_mock.set.assert_called_once_with( + "dataset_structure", + ('{"AD-cases-controls-MyersGeno": "Geno", ' + '"AD-cases-controls-MyersPublish": "Publish", ' + '"AKXDGeno": "Geno", ' + '"AXBXAGeno": "Geno", ' + '"AXBXAPublish": "Publish", ' + '"Aging-Brain-UCIPublish": "Publish", ' + '"All Phenotypes": "Publish", ' + '"B139_K_1206_M": "ProbeSet", ' + '"B139_K_1206_R": "ProbeSet", ' + '"Test": "Publish"}')) + db_mock.db.execute.assert_called_with( + ("SELECT InfoFiles.GN_AccesionId " + "FROM InfoFiles, PublishFreeze, InbredSet " + "WHERE InbredSet.Name = 'Test' AND " + "PublishFreeze.InbredSetId = InbredSet.Id AND " + "InfoFiles.InfoPageName = PublishFreeze.Name") + ) + + @mock.patch('base.data_set.g') + def test_set_dataset_other_pheno(self, db_mock): + with app.app_context(): + db_mock.db.execute.return_value.fetchone.return_value = [1, 2, 3] + redis_mock = mock.Mock() + redis_mock.get.return_value = self.test_dataset + data_set = DatasetType(redis_mock) + data_set.set_dataset_key("other_pheno", "Test") + self.assertEqual(data_set("Test"), 
"Publish") + + redis_mock.set.assert_called_once_with( + "dataset_structure", + ('{"AD-cases-controls-MyersGeno": "Geno", ' + '"AD-cases-controls-MyersPublish": "Publish", ' + '"AKXDGeno": "Geno", ' + '"AXBXAGeno": "Geno", ' + '"AXBXAPublish": "Publish", ' + '"Aging-Brain-UCIPublish": "Publish", ' + '"All Phenotypes": "Publish", ' + '"B139_K_1206_M": "ProbeSet", ' + '"B139_K_1206_R": "ProbeSet", ' + '"Test": "Publish"}')) + + db_mock.db.execute.assert_called_with( + ("SELECT PublishFreeze.Name " + + "FROM PublishFreeze, InbredSet " + + "WHERE InbredSet.Name = 'Test' AND " + "PublishFreeze.InbredSetId = InbredSet.Id") + ) + + @mock.patch('base.data_set.g') + def test_set_dataset_geno(self, db_mock): + with app.app_context(): + db_mock.db.execute.return_value.fetchone.return_value = [1, 2, 3] + redis_mock = mock.Mock() + redis_mock.get.return_value = self.test_dataset + data_set = DatasetType(redis_mock) + data_set.set_dataset_key("geno", "Test") + self.assertEqual(data_set("Test"), "Geno") + redis_mock.set.assert_called_once_with( + "dataset_structure", + ('{"AD-cases-controls-MyersGeno": "Geno", ' + '"AD-cases-controls-MyersPublish": "Publish", ' + '"AKXDGeno": "Geno", ' + '"AXBXAGeno": "Geno", ' + '"AXBXAPublish": "Publish", ' + '"Aging-Brain-UCIPublish": "Publish", ' + '"All Phenotypes": "Publish", ' + '"B139_K_1206_M": "ProbeSet", ' + '"B139_K_1206_R": "ProbeSet", ' + '"Test": "Geno"}')) + + db_mock.db.execute.assert_called_with( + ("SELECT GenoFreeze.Id FROM " + "GenoFreeze WHERE GenoFreeze.Name = \"Test\" ")) diff --git a/wqflask/tests/unit/base/test_general_object.py b/wqflask/tests/unit/base/test_general_object.py new file mode 100644 index 00000000..00fd3c72 --- /dev/null +++ b/wqflask/tests/unit/base/test_general_object.py @@ -0,0 +1,40 @@ +import unittest + +from base.GeneralObject import GeneralObject + + +class TestGeneralObjectTests(unittest.TestCase): + """ + Test the GeneralObject base class + """ + + def test_object_contents(self): + """Test whether 
base contents are stored properly""" + test_obj = GeneralObject("a", "b", "c") + self.assertEqual("abc", ''.join(test_obj.contents)) + self.assertEqual(len(test_obj), 0) + + def test_object_dict(self): + """Test whether the base class is printed properly""" + test_obj = GeneralObject("a", name="test", value=1) + self.assertEqual(str(test_obj), "name = test\nvalue = 1\n") + self.assertEqual( + repr(test_obj), "contents = ['a']\nname = test\nvalue = 1\n") + self.assertEqual(len(test_obj), 2) + self.assertEqual(test_obj["value"], 1) + test_obj["test"] = 1 + self.assertEqual(test_obj["test"], 1) + + def test_get_attribute(self): + "Test that getattr works" + test_obj = GeneralObject("a", name="test", value=1) + self.assertEqual(getattr(test_obj, "value", None), 1) + self.assertEqual(getattr(test_obj, "non-existent", None), None) + + def test_object_comparisons(self): + "Test that 2 objects of the same length are equal" + test_obj1 = GeneralObject("a", name="test", value=1) + test_obj2 = GeneralObject("b", name="test2", value=2) + test_obj3 = GeneralObject("a", name="test", x=1, y=2) + self.assertTrue(test_obj1 == test_obj2) + self.assertFalse(test_obj1 == test_obj3) diff --git a/wqflask/tests/unit/base/test_trait.py b/wqflask/tests/unit/base/test_trait.py new file mode 100644 index 00000000..826ccefd --- /dev/null +++ b/wqflask/tests/unit/base/test_trait.py @@ -0,0 +1,241 @@ +# -*- coding: utf-8 -*- +"""Tests wqflask/base/trait.py""" +import unittest +from unittest import mock + +from wqflask import app +from base.trait import GeneralTrait +from base.trait import retrieve_trait_info + + +class TestResponse: + """Mock Test Response after a request""" + @property + def content(self): + """Mock the content from Requests.get(params).content""" + return "[1, 2, 3, 4]" + + +class TestNilResponse: + """Mock Test Response after a request""" + @property + def content(self): + """Mock the content from Requests.get(params).content""" + return "{}" + + +class 
MockTrait(GeneralTrait): + @property + def wikidata_alias_fmt(self): + return "Mock alias" + + +class TestRetrieveTraitInfo(unittest.TestCase): + """Tests for 'retrieve_trait_info'""" + + def setUp(self): + self.app_context = app.app_context() + self.app_context.push() + + def tearDown(self): + self.app_context.pop() + + def test_retrieve_trait_info_with_empty_dataset(self): + """Test that an exception is raised when dataset is empty""" + with self.assertRaises(AssertionError): + retrieve_trait_info(trait=mock.MagicMock(), + dataset={}) + + @mock.patch('base.trait.requests.get') + @mock.patch('base.trait.g', mock.Mock()) + def test_retrieve_trait_info_with_empty_trait_info(self, + requests_mock): + """Empty trait info""" + requests_mock.return_value = TestNilResponse() + with self.assertRaises(KeyError): + retrieve_trait_info(trait=mock.MagicMock(), + dataset=mock.MagicMock()) + + @mock.patch('base.trait.requests.get') + @mock.patch('base.trait.g', mock.Mock()) + def test_retrieve_trait_info_with_non_empty_trait_info(self, + requests_mock): + """Test that attributes are set""" + mock_dataset = mock.MagicMock() + requests_mock.return_value = TestResponse() + type(mock_dataset).display_fields = mock.PropertyMock( + return_value=["a", "b", "c", "d"]) + test_trait = retrieve_trait_info(trait=MockTrait(dataset=mock_dataset), + dataset=mock_dataset) + self.assertEqual(test_trait.a, 1) + self.assertEqual(test_trait.b, 2) + self.assertEqual(test_trait.c, 3) + self.assertEqual(test_trait.d, 4) + + @mock.patch('base.trait.requests.get') + @mock.patch('base.trait.g', mock.Mock()) + def test_retrieve_trait_info_utf8_parsing(self, + requests_mock): + """Test that utf-8 strings are parsed correctly""" + utf_8_string = "test_string" + mock_dataset = mock.MagicMock() + requests_mock.return_value = TestResponse() + type(mock_dataset).display_fields = mock.PropertyMock( + return_value=["a", "b", "c", "d"]) + type(mock_dataset).type = 'Publish' + + mock_trait = MockTrait( + 
dataset=mock_dataset, + pre_publication_description=utf_8_string + ) + trait_attrs = { + "group_code": "test_code", + "pre_publication_description": "test_pre_pub", + "pre_publication_abbreviation": "ファイルを画面毎に見て行くには、次のコマンドを使います。", + "post_publication_description": None, + "pubmed_id": None, + 'year': "2020", + "authors": "Jane Doe かいと", + } + for key, val in list(trait_attrs.items()): + setattr(mock_trait, key, val) + test_trait = retrieve_trait_info(trait=mock_trait, + dataset=mock_dataset) + self.assertEqual(test_trait.abbreviation, + "ファイルを画面毎に見て行くには、次のコマンドを使います。") + self.assertEqual(test_trait.authors, + "Jane Doe かいと") + + @mock.patch('base.trait.requests.get') + @mock.patch('base.trait.g') + @mock.patch('base.trait.get_resource_id') + def test_retrieve_trait_info_with_non_empty_lrs(self, + resource_id_mock, + g_mock, + requests_mock): + """Test retrieve trait info when lrs has a value""" + resource_id_mock.return_value = 1 + g_mock.db.execute.return_value.fetchone = mock.Mock() + g_mock.db.execute.return_value.fetchone.side_effect = [ + [1, 2, 3, 4], # trait_info = g.db.execute(query).fetchone() + [1, 2.37, 3, 4, 5], # trait_qtl = g.db.execute(query).fetchone() + [2.7333, 2.1204] # trait_info = g.db.execute(query).fetchone() + ] + requests_mock.return_value = None + + mock_dataset = mock.MagicMock() + type(mock_dataset).display_fields = mock.PropertyMock( + return_value=["a", "b", "c", "d"]) + type(mock_dataset).type = "ProbeSet" + type(mock_dataset).name = "RandomName" + + mock_trait = MockTrait( + dataset=mock_dataset, + pre_publication_description="test_string" + ) + trait_attrs = { + "description": "some description", + "probe_target_description": "some description", + "cellid": False, + "chr": 2.733, + "mb": 2.1204 + } + + for key, val in list(trait_attrs.items()): + setattr(mock_trait, key, val) + test_trait = retrieve_trait_info(trait=mock_trait, + dataset=mock_dataset, + get_qtl_info=True) + self.assertEqual(test_trait.LRS_score_repr, + "2.4") + + 
@mock.patch('base.trait.requests.get') + @mock.patch('base.trait.g') + @mock.patch('base.trait.get_resource_id') + def test_retrieve_trait_info_with_empty_lrs_field(self, + resource_id_mock, + g_mock, + requests_mock): + """Test retrieve trait info with empty lrs field""" + resource_id_mock.return_value = 1 + g_mock.db.execute.return_value.fetchone = mock.Mock() + g_mock.db.execute.return_value.fetchone.side_effect = [ + [1, 2, 3, 4], # trait_info = g.db.execute(query).fetchone() + [1, None, 3, 4, 5], # trait_qtl = g.db.execute(query).fetchone() + [2, 3] # trait_info = g.db.execute(query).fetchone() + ] + requests_mock.return_value = None + + mock_dataset = mock.MagicMock() + type(mock_dataset).display_fields = mock.PropertyMock( + return_value=["a", "b", "c", "d"]) + type(mock_dataset).type = "ProbeSet" + type(mock_dataset).name = "RandomName" + + mock_trait = MockTrait( + dataset=mock_dataset, + pre_publication_description="test_string" + ) + trait_attrs = { + "description": "some description", + "probe_target_description": "some description", + "cellid": False, + "chr": 2.733, + "mb": 2.1204 + } + + for key, val in list(trait_attrs.items()): + setattr(mock_trait, key, val) + test_trait = retrieve_trait_info(trait=mock_trait, + dataset=mock_dataset, + get_qtl_info=True) + self.assertEqual(test_trait.LRS_score_repr, + "N/A") + self.assertEqual(test_trait.LRS_location_repr, + "Chr2: 3.000000") + + @mock.patch('base.trait.requests.get') + @mock.patch('base.trait.g') + @mock.patch('base.trait.get_resource_id') + def test_retrieve_trait_info_with_empty_chr_field(self, + resource_id_mock, + g_mock, + requests_mock): + """Test retrieve trait info with empty chr field""" + resource_id_mock.return_value = 1 + g_mock.db.execute.return_value.fetchone = mock.Mock() + g_mock.db.execute.return_value.fetchone.side_effect = [ + [1, 2, 3, 4], # trait_info = g.db.execute(query).fetchone() + [1, 2, 3, 4, 5], # trait_qtl = g.db.execute(query).fetchone() + [None, 3] # trait_info = 
g.db.execute(query).fetchone() + ] + + requests_mock.return_value = None + + mock_dataset = mock.MagicMock() + type(mock_dataset).display_fields = mock.PropertyMock( + return_value=["a", "b", "c", "d"]) + type(mock_dataset).type = "ProbeSet" + type(mock_dataset).name = "RandomName" + + mock_trait = MockTrait( + dataset=mock_dataset, + pre_publication_description="test_string" + ) + trait_attrs = { + "description": "some description", + "probe_target_description": "some description", + "cellid": False, + "chr": 2.733, + "mb": 2.1204 + } + + for key, val in list(trait_attrs.items()): + setattr(mock_trait, key, val) + test_trait = retrieve_trait_info(trait=mock_trait, + dataset=mock_dataset, + get_qtl_info=True) + self.assertEqual(test_trait.LRS_score_repr, + "N/A") + self.assertEqual(test_trait.LRS_location_repr, + "N/A") diff --git a/wqflask/tests/unit/base/test_webqtl_case_data.py b/wqflask/tests/unit/base/test_webqtl_case_data.py new file mode 100644 index 00000000..8e8ba482 --- /dev/null +++ b/wqflask/tests/unit/base/test_webqtl_case_data.py @@ -0,0 +1,39 @@ +"""Tests for wqflask/base/webqtlCaseData.py""" +import unittest + +from wqflask import app # Required because of utility.tools in webqtlCaseData.py +from base.webqtlCaseData import webqtlCaseData + +class TestWebqtlCaseData(unittest.TestCase): + """Tests for WebqtlCaseData class""" + + def setUp(self): + self.w = webqtlCaseData(name="Test", + value=0, + variance=0.0, + num_cases=10, + name2="Test2") + + def test_webqtl_case_data_repr(self): + self.assertEqual( + repr(self.w), + " value=0.000 variance=0.000 ndata=10 name=Test name2=Test2" + ) + + def test_class_outlier(self): + self.assertEqual(self.w.class_outlier, "") + + def test_display_value(self): + self.assertEqual(self.w.display_value, "0.000") + self.w.value = None + self.assertEqual(self.w.display_value, "x") + + def test_display_variance(self): + self.assertEqual(self.w.display_variance, "0.000") + self.w.variance = None + 
self.assertEqual(self.w.display_variance, "x") + + def test_display_num_cases(self): + self.assertEqual(self.w.display_num_cases, "10") + self.w.num_cases = None + self.assertEqual(self.w.display_num_cases, "x") diff --git a/wqflask/tests/unit/utility/__init__.py b/wqflask/tests/unit/utility/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/wqflask/tests/unit/utility/test_authentication_tools.py b/wqflask/tests/unit/utility/test_authentication_tools.py new file mode 100644 index 00000000..5c391be5 --- /dev/null +++ b/wqflask/tests/unit/utility/test_authentication_tools.py @@ -0,0 +1,189 @@ +"""Tests for authentication tools""" +import unittest +from unittest import mock + +from utility.authentication_tools import check_resource_availability +from utility.authentication_tools import add_new_resource + + +class TestResponse: + """Mock Test Response after a request""" + @property + def content(self): + """Mock the content from Requests.get(params).content""" + return '["foo"]' + + +class TestUser: + """Mock user""" + @property + def user_id(self): + """Mockes user id. Used in Flask.g.user_session.user_id""" + return "Jane" + + +class TestUserSession: + """Mock user session""" + @property + def user_session(self): + """Mock user session. 
Mocks Flask.g.user_session object""" + return TestUser() + + +def mock_add_resource(resource_ob, update=False): + return resource_ob + + +class TestCheckResourceAvailability(unittest.TestCase): + """Test methods related to checking the resource availability""" + @mock.patch('utility.authentication_tools.add_new_resource') + @mock.patch('utility.authentication_tools.Redis') + @mock.patch('utility.authentication_tools.g', mock.Mock()) + @mock.patch('utility.authentication_tools.get_resource_id') + def test_check_resource_availability_default_mask( + self, + resource_id_mock, + redis_mock, + add_new_resource_mock): + """Test the resource availability with default mask""" + resource_id_mock.return_value = 1 + redis_mock.smembers.return_value = [] + test_dataset = mock.MagicMock() + type(test_dataset).type = mock.PropertyMock(return_value="Test") + add_new_resource_mock.return_value = {"default_mask": 2} + self.assertEqual(check_resource_availability(test_dataset), 2) + + @mock.patch('utility.authentication_tools.requests.get') + @mock.patch('utility.authentication_tools.add_new_resource') + @mock.patch('utility.authentication_tools.Redis') + @mock.patch('utility.authentication_tools.g', TestUserSession()) + @mock.patch('utility.authentication_tools.get_resource_id') + def test_check_resource_availability_non_default_mask( + self, + resource_id_mock, + redis_mock, + add_new_resource_mock, + requests_mock): + """Test the resource availability with default mask""" + resource_id_mock.return_value = 1 + redis_mock.smembers.return_value = [] + add_new_resource_mock.return_value = {"default_mask": 2} + requests_mock.return_value = TestResponse() + test_dataset = mock.MagicMock() + type(test_dataset).type = mock.PropertyMock(return_value="Test") + self.assertEqual(check_resource_availability(test_dataset), + ['foo']) + + @mock.patch('utility.authentication_tools.webqtlConfig.SUPER_PRIVILEGES', + "SUPERUSER") + @mock.patch('utility.authentication_tools.requests.get') + 
@mock.patch('utility.authentication_tools.add_new_resource') + @mock.patch('utility.authentication_tools.Redis') + @mock.patch('utility.authentication_tools.g', TestUserSession()) + @mock.patch('utility.authentication_tools.get_resource_id') + def test_check_resource_availability_of_super_user( + self, + resource_id_mock, + redis_mock, + add_new_resource_mock, + requests_mock): + """Test the resource availability if the user is the super user""" + resource_id_mock.return_value = 1 + redis_mock.smembers.return_value = ["Jane"] + add_new_resource_mock.return_value = {"default_mask": 2} + requests_mock.return_value = TestResponse() + test_dataset = mock.MagicMock() + type(test_dataset).type = mock.PropertyMock(return_value="Test") + self.assertEqual(check_resource_availability(test_dataset), + "SUPERUSER") + + @mock.patch('utility.authentication_tools.webqtlConfig.DEFAULT_PRIVILEGES', + "John Doe") + def test_check_resource_availability_string_dataset(self): + """Test the resource availability if the dataset is a string""" + self.assertEqual(check_resource_availability("Test"), + "John Doe") + + @mock.patch('utility.authentication_tools.webqtlConfig.DEFAULT_PRIVILEGES', + "John Doe") + def test_check_resource_availability_temp(self): + """Test the resource availability if the dataset is a string""" + test_dataset = mock.MagicMock() + type(test_dataset).type = mock.PropertyMock(return_value="Temp") + self.assertEqual(check_resource_availability(test_dataset), + "John Doe") + + +class TestAddNewResource(unittest.TestCase): + """Test cases for add_new_resource method""" + @mock.patch('utility.authentication_tools.webqtlConfig.DEFAULT_PRIVILEGES', + "John Doe") + @mock.patch('utility.authentication_tools.add_resource', mock_add_resource) + @mock.patch('utility.authentication_tools.get_group_code') + def test_add_new_resource_if_publish_datatype(self, group_code_mock): + """Test add_new_resource if dataset type is 'publish'""" + group_code_mock.return_value = "Test" + 
test_dataset = mock.MagicMock() + type(test_dataset).type = mock.PropertyMock(return_value="Publish") + type(test_dataset).id = mock.PropertyMock(return_value=10) + expected_value = { + "owner_id": "none", + "default_mask": "John Doe", + "group_masks": {}, + "name": "Test_None", + "data": { + "dataset": 10, + "trait": None + }, + "type": "dataset-publish" + } + self.assertEqual(add_new_resource(test_dataset), + expected_value) + + @mock.patch('utility.authentication_tools.webqtlConfig.DEFAULT_PRIVILEGES', + "John Doe") + @mock.patch('utility.authentication_tools.add_resource', mock_add_resource) + @mock.patch('utility.authentication_tools.get_group_code') + def test_add_new_resource_if_geno_datatype(self, group_code_mock): + """Test add_new_resource if dataset type is 'geno'""" + group_code_mock.return_value = "Test" + test_dataset = mock.MagicMock() + type(test_dataset).name = mock.PropertyMock(return_value="Geno") + type(test_dataset).type = mock.PropertyMock(return_value="Geno") + type(test_dataset).id = mock.PropertyMock(return_value=20) + expected_value = { + "owner_id": "none", + "default_mask": "John Doe", + "group_masks": {}, + "name": "Geno", + "data": { + "dataset": 20, + }, + "type": "dataset-geno" + } + self.assertEqual(add_new_resource(test_dataset), + expected_value) + + @mock.patch('utility.authentication_tools.webqtlConfig.DEFAULT_PRIVILEGES', + "John Doe") + @mock.patch('utility.authentication_tools.add_resource', mock_add_resource) + @mock.patch('utility.authentication_tools.get_group_code') + def test_add_new_resource_if_other_datatype(self, group_code_mock): + """Test add_new_resource if dataset type is not 'geno' or 'publish'""" + group_code_mock.return_value = "Test" + test_dataset = mock.MagicMock() + type(test_dataset).name = mock.PropertyMock(return_value="Geno") + type(test_dataset).type = mock.PropertyMock(return_value="other") + type(test_dataset).id = mock.PropertyMock(return_value=20) + expected_value = { + "owner_id": "none", + 
"default_mask": "John Doe", + "group_masks": {}, + "name": "Geno", + "data": { + "dataset": 20, + }, + "type": "dataset-probeset" + } + self.assertEqual(add_new_resource(test_dataset), + expected_value) diff --git a/wqflask/tests/unit/utility/test_chunks.py b/wqflask/tests/unit/utility/test_chunks.py new file mode 100644 index 00000000..8d90a1ec --- /dev/null +++ b/wqflask/tests/unit/utility/test_chunks.py @@ -0,0 +1,19 @@ +"""Test chunking""" + +import unittest + +from utility.chunks import divide_into_chunks + + +class TestChunks(unittest.TestCase): + "Test Utility method for chunking" + def test_divide_into_chunks(self): + "Check that a list is chunked correctly" + self.assertEqual(divide_into_chunks([1, 2, 7, 3, 22, 8, 5, 22, 333], 3), + [[1, 2, 7], [3, 22, 8], [5, 22, 333]]) + self.assertEqual(divide_into_chunks([1, 2, 7, 3, 22, 8, 5, 22, 333], 4), + [[1, 2, 7], [3, 22, 8], [5, 22, 333]]) + self.assertEqual(divide_into_chunks([1, 2, 7, 3, 22, 8, 5, 22, 333], 5), + [[1, 2], [7, 3], [22, 8], [5, 22], [333]]) + self.assertEqual(divide_into_chunks([], 5), + [[]]) diff --git a/wqflask/tests/unit/utility/test_corestats.py b/wqflask/tests/unit/utility/test_corestats.py new file mode 100644 index 00000000..cf91a248 --- /dev/null +++ b/wqflask/tests/unit/utility/test_corestats.py @@ -0,0 +1,55 @@ +"""Test Core Stats""" + +import unittest + +from utility.corestats import Stats + + +class TestChunks(unittest.TestCase): + "Test Utility method for chunking" + + def setUp(self): + self.stat_test = Stats((x for x in range(1, 11))) + + def test_stats_sum(self): + """ Test sequence sum """ + self.assertEqual(self.stat_test.sum(), 55) + self.stat_test = Stats([]) + self.assertEqual(self.stat_test.sum(), None) + + def test_stats_count(self): + """ Test sequence count """ + self.assertEqual(self.stat_test.count(), 10) + self.stat_test = Stats([]) + self.assertEqual(self.stat_test.count(), 0) + + def test_stats_min(self): + """ Test min value in sequence""" + 
self.assertEqual(self.stat_test.min(), 1) + self.stat_test = Stats([]) + self.assertEqual(self.stat_test.min(), None) + + def test_stats_max(self): + """ Test max value in sequence """ + self.assertEqual(self.stat_test.max(), 10) + self.stat_test = Stats([]) + self.assertEqual(self.stat_test.max(), None) + + def test_stats_avg(self): + """ Test avg of sequence """ + self.assertEqual(self.stat_test.avg(), 5.5) + self.stat_test = Stats([]) + self.assertEqual(self.stat_test.avg(), None) + + def test_stats_stdev(self): + """ Test standard deviation of sequence """ + self.assertEqual(self.stat_test.stdev(), 3.0276503540974917) + self.stat_test = Stats([]) + self.assertEqual(self.stat_test.stdev(), None) + + def test_stats_percentile(self): + """ Test percentile of sequence """ + self.assertEqual(self.stat_test.percentile(20), 3.0) + self.assertEqual(self.stat_test.percentile(101), None) + self.stat_test = Stats([]) + self.assertEqual(self.stat_test.percentile(20), None) diff --git a/wqflask/tests/unit/utility/test_corr_result_helpers.py b/wqflask/tests/unit/utility/test_corr_result_helpers.py new file mode 100644 index 00000000..e196fbdf --- /dev/null +++ b/wqflask/tests/unit/utility/test_corr_result_helpers.py @@ -0,0 +1,32 @@ +""" Test correlation helper methods """ + +import unittest +from utility.corr_result_helpers import normalize_values, common_keys, normalize_values_with_samples + + +class TestCorrelationHelpers(unittest.TestCase): + """Test methods for normalising lists""" + + def test_normalize_values(self): + """Test that a list is normalised correctly""" + self.assertEqual( + normalize_values([2.3, None, None, 3.2, 4.1, 5], [ + 3.4, 7.2, 1.3, None, 6.2, 4.1]), + ([2.3, 4.1, 5], [3.4, 6.2, 4.1], 3) + ) + + def test_common_keys(self): + """Test that common keys are returned as a list""" + a = dict(BXD1=9.113, BXD2=9.825, BXD14=8.985, BXD15=9.300) + b = dict(BXD1=9.723, BXD3=9.825, BXD14=9.124, BXD16=9.300) + self.assertEqual(sorted(common_keys(a, b)), ['BXD1', 
'BXD14']) + + def test_normalize_values_with_samples(self): + """Test that a sample(dict) is normalised correctly""" + self.assertEqual( + normalize_values_with_samples( + dict(BXD1=9.113, BXD2=9.825, BXD14=8.985, + BXD15=9.300, BXD20=9.300), + dict(BXD1=9.723, BXD3=9.825, BXD14=9.124, BXD16=9.300)), + (({'BXD1': 9.113, 'BXD14': 8.985}, {'BXD1': 9.723, 'BXD14': 9.124}, 2)) + ) diff --git a/wqflask/tests/unit/utility/test_formatting.py b/wqflask/tests/unit/utility/test_formatting.py new file mode 100644 index 00000000..9d3033d1 --- /dev/null +++ b/wqflask/tests/unit/utility/test_formatting.py @@ -0,0 +1,33 @@ +import unittest +from utility.formatting import numify, commify + + +class TestFormatting(unittest.TestCase): + """Test formatting numbers by numifying or commifying""" + + def test_numify(self): + "Test that a number is correctly converted to a English readable string" + self.assertEqual(numify(1, 'item', 'items'), + 'one item') + self.assertEqual(numify(2, 'book'), 'two') + self.assertEqual(numify(2, 'book', 'books'), 'two books') + self.assertEqual(numify(0, 'book', 'books'), 'zero books') + self.assertEqual(numify(0), 'zero') + self.assertEqual(numify(5), 'five') + self.assertEqual(numify(14, 'book', 'books'), '14 books') + self.assertEqual(numify(999, 'book', 'books'), '999 books') + self.assertEqual(numify(1000000, 'book', 'books'), '1,000,000 books') + self.assertEqual(numify(1956), '1956') + + def test_commify(self): + "Test that commas are added correctly" + self.assertEqual(commify(1), '1') + self.assertEqual(commify(123), '123') + self.assertEqual(commify(1234), '1234') + self.assertEqual(commify(12345), '12,345') + self.assertEqual(commify(1234567890), '1,234,567,890') + self.assertEqual(commify(123.0), '123.0') + self.assertEqual(commify(1234.5), '1234.5') + self.assertEqual(commify(1234.56789), '1234.56789') + self.assertEqual(commify(123456.789), '123,456.789') + self.assertEqual(commify(None), None) diff --git 
a/wqflask/tests/unit/utility/test_hmac.py b/wqflask/tests/unit/utility/test_hmac.py new file mode 100644 index 00000000..4e3652f8 --- /dev/null +++ b/wqflask/tests/unit/utility/test_hmac.py @@ -0,0 +1,52 @@ +# -*- coding: utf-8 -*- +"""Test hmac utility functions""" + +import unittest +from unittest import mock + +from utility.hmac import data_hmac +from utility.hmac import url_for_hmac +from utility.hmac import hmac_creation + + +class TestHmacUtil(unittest.TestCase): + """Test Utility method for hmac creation""" + + @mock.patch("utility.hmac.app.config", {'SECRET_HMAC_CODE': "secret"}) + def test_hmac_creation(self): + """Test hmac creation with a utf-8 string""" + self.assertEqual(hmac_creation("ファイ"), "7410466338cfe109e946") + + @mock.patch("utility.hmac.app.config", + {'SECRET_HMAC_CODE': ('\x08\xdf\xfa\x93N\x80' + '\xd9\\H@\\\x9f`\x98d^' + '\xb4a;\xc6OM\x946a\xbc' + '\xfc\x80:*\xebc')}) + def test_hmac_creation_with_cookie(self): + """Test hmac creation with a cookie""" + cookie = "3f4c1dbf-5b56-4260-87d6-f35445bda37e:af4fcf5eace9e7c864ce" + uuid_, _, signature = cookie.partition(":") + self.assertEqual( + hmac_creation(uuid_), + "af4fcf5eace9e7c864ce") + + @mock.patch("utility.hmac.app.config", {'SECRET_HMAC_CODE': "secret"}) + def test_data_hmac(self): + """Test data_hmac fn with a utf-8 string""" + self.assertEqual(data_hmac("ファイ"), "ファイ:7410466338cfe109e946") + + @mock.patch("utility.hmac.app.config", {'SECRET_HMAC_CODE': "secret"}) + @mock.patch("utility.hmac.url_for") + def test_url_for_hmac_with_plain_url(self, mock_url): + """Test url_for_hmac without params""" + mock_url.return_value = "https://mock_url.com/ファイ/" + self.assertEqual(url_for_hmac("ファイ"), + "https://mock_url.com/ファイ/?hm=05bc39e659b1948f41e7") + + @mock.patch("utility.hmac.app.config", {'SECRET_HMAC_CODE': "secret"}) + @mock.patch("utility.hmac.url_for") + def test_url_for_hmac_with_param_in_url(self, mock_url): + """Test url_for_hmac with params""" + mock_url.return_value = 
"https://mock_url.com/?ファイ=1" + self.assertEqual(url_for_hmac("ファイ"), + "https://mock_url.com/?ファイ=1&hm=4709c1708270644aed79") diff --git a/wqflask/tests/unit/wqflask/__init__.py b/wqflask/tests/unit/wqflask/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/wqflask/tests/unit/wqflask/api/__init__.py b/wqflask/tests/unit/wqflask/api/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/wqflask/tests/unit/wqflask/api/test_gen_menu.py b/wqflask/tests/unit/wqflask/api/test_gen_menu.py new file mode 100644 index 00000000..84898bd1 --- /dev/null +++ b/wqflask/tests/unit/wqflask/api/test_gen_menu.py @@ -0,0 +1,413 @@ +"""Test cases for wqflask.api.gen_menu""" +import unittest +from unittest import mock + +from wqflask import app +from wqflask.api.gen_menu import gen_dropdown_json +from wqflask.api.gen_menu import get_species +from wqflask.api.gen_menu import get_groups +from wqflask.api.gen_menu import get_types +from wqflask.api.gen_menu import get_datasets +from wqflask.api.gen_menu import phenotypes_exist +from wqflask.api.gen_menu import genotypes_exist +from wqflask.api.gen_menu import build_datasets +from wqflask.api.gen_menu import build_types + + +class TestGenMenu(unittest.TestCase): + """Tests for the gen_menu module""" + + def setUp(self): + self.app_context = app.app_context() + self.app_context.push() + self.test_group = { + 'mouse': [ + ['H_T1', + 'H_T', + 'Family:DescriptionA' + ], + ['H_T2', "H_T'", 'Family:None'] + ], + 'human': [ + ['BXD', 'BXD', 'Family:None'], + ['HLC', 'Liver: Normal Gene Expression with Genotypes (Merck)', + 'Family:Test'] + ] + } + + self.test_type = { + 'mouse': { + 'H_T2': [('Phenotypes', + 'Traits and Cofactors', + 'Phenotypes'), + ('Genotypes', + 'DNA Markers and SNPs', + 'Genotypes'), + ['M', 'M', 'Molecular Trait Datasets']], + 'H_T1': [('Phenotypes', + 'Traits and Cofactors', + 'Phenotypes'), + ('Genotypes', + 'DNA Markers and SNPs', + 'Genotypes'), + ['M', 'M', 'Molecular Trait 
Datasets']] + }, + 'human': { + 'HLC': [('Phenotypes', + 'Traits and Cofactors', + 'Phenotypes'), + ('Genotypes', + 'DNA Markers and SNPs', + 'Genotypes'), + ['M', 'M', 'Molecular Trait Datasets']], + 'BXD': [('Phenotypes', + 'Traits and Cofactors', + 'Phenotypes'), + ('Genotypes', + 'DNA Markers and SNPs', + 'Genotypes'), + ['M', 'M', 'Molecular Trait Datasets']] + } + } + + def tearDown(self): + self.app_context.pop() + + @mock.patch('wqflask.api.gen_menu.g') + def test_get_species(self, db_mock): + """Test that assertion is raised when dataset and dataset_name + are defined""" + db_mock.db.execute.return_value.fetchall.return_value = ( + ('human', 'Human'), + ('mouse', 'Mouse')) + self.assertEqual(get_species(), + [['human', 'Human'], ['mouse', 'Mouse']]) + db_mock.db.execute.assert_called_once_with( + "SELECT Name, MenuName FROM Species ORDER BY OrderId" + ) + + @mock.patch('wqflask.api.gen_menu.g') + def test_get_groups(self, db_mock): + """Test that species groups are grouped correctly""" + db_mock.db.execute.return_value.fetchall.side_effect = [ + # Mouse + (('BXD', 'BXD', None), + ('HLC', 'Liver: Normal Gene Expression with Genotypes (Merck)', + 'Test')), + # Human + (('H_T1', "H_T", "DescriptionA"), + ('H_T2', "H_T'", None)) + ] + + self.assertEqual(get_groups([["human", "Human"], ["mouse", "Mouse"]]), + self.test_group) + + for name in ["mouse", "human"]: + db_mock.db.execute.assert_any_call( + ("SELECT InbredSet.Name, InbredSet.FullName, " + + "IFNULL(InbredSet.Family, 'None') " + + "FROM InbredSet, Species WHERE Species.Name " + + "= '{}' AND InbredSet.SpeciesId = Species.Id GROUP by " + + "InbredSet.Name ORDER BY IFNULL(InbredSet.FamilyOrder, " + + "InbredSet.FullName) ASC, IFNULL(InbredSet.Family, " + + "InbredSet.FullName) ASC, InbredSet.FullName ASC, " + + "InbredSet.MenuOrderId ASC").format(name) + ) + + @mock.patch('wqflask.api.gen_menu.g') + def test_phenotypes_exist_called_with_correct_query(self, db_mock): + """Test that phenotypes_exist is 
called with the correct query""" + db_mock.db.execute.return_value.fetchone.return_value = None + phenotypes_exist("test") + db_mock.db.execute.assert_called_with( + "SELECT Name FROM PublishFreeze " + "WHERE PublishFreeze.Name = 'testPublish'" + ) + + @mock.patch('wqflask.api.gen_menu.g') + def test_phenotypes_exist_with_falsy_values(self, db_mock): + """Test that phenotype check returns correctly when given + a None value""" + for x in [None, False, (), [], ""]: + db_mock.db.execute.return_value.fetchone.return_value = x + self.assertFalse(phenotypes_exist("test")) + + @mock.patch('wqflask.api.gen_menu.g') + def test_phenotypes_exist_with_truthy_value(self, db_mock): + """Test that phenotype check returns correctly when given Truthy """ + for x in ["x", ("result"), ["result"], [1]]: + db_mock.db.execute.return_value.fetchone.return_value = (x) + self.assertTrue(phenotypes_exist("test")) + + @mock.patch('wqflask.api.gen_menu.g') + def test_genotypes_exist_called_with_correct_query(self, db_mock): + """Test that genotypes_exist is called with the correct query""" + db_mock.db.execute.return_value.fetchone.return_value = None + genotypes_exist("test") + db_mock.db.execute.assert_called_with( + "SELECT Name FROM GenoFreeze WHERE GenoFreeze.Name = 'testGeno'" + ) + + @mock.patch('wqflask.api.gen_menu.g') + def test_genotypes_exist_with_falsy_values(self, db_mock): + """Test that genotype check returns correctly when given + a None value""" + for x in [None, False, (), [], ""]: + db_mock.db.execute.return_value.fetchone.return_value = x + self.assertFalse(genotypes_exist("test")) + + @mock.patch('wqflask.api.gen_menu.g') + def test_genotypes_exist_with_truthy_value(self, db_mock): + """Test that genotype check returns correctly when given Truthy """ + for x in ["x", ("result"), ["result"], [1]]: + db_mock.db.execute.return_value.fetchone.return_value = (x) + self.assertTrue(phenotypes_exist("test")) + + @mock.patch('wqflask.api.gen_menu.g') + def 
test_build_datasets_with_type_phenotypes(self, db_mock): + """Test that correct dataset is returned for a phenotype type""" + db_mock.db.execute.return_value.fetchall.return_value = ( + (602, "BXDPublish", "BXD Published Phenotypes"), + ) + self.assertEqual(build_datasets("Mouse", "BXD", "Phenotypes"), + [['602', "BXDPublish", "BXD Published Phenotypes"]]) + db_mock.db.execute.assert_called_with( + "SELECT InfoFiles.GN_AccesionId, PublishFreeze.Name, " + + "PublishFreeze.FullName FROM InfoFiles, PublishFreeze, " + + "InbredSet WHERE InbredSet.Name = 'BXD' AND " + + "PublishFreeze.InbredSetId = InbredSet.Id AND " + + "InfoFiles.InfoPageName = PublishFreeze.Name " + + "ORDER BY PublishFreeze.CreateTime ASC" + ) + self.assertEqual(build_datasets("Mouse", "MDP", "Phenotypes"), + [['602', "BXDPublish", "Mouse Phenome Database"]]) + + db_mock.db.execute.return_value.fetchall.return_value = () + db_mock.db.execute.return_value.fetchone.return_value = ( + "BXDPublish", "Mouse Phenome Database" + ) + self.assertEqual(build_datasets("Mouse", "MDP", "Phenotypes"), + [["None", "BXDPublish", "Mouse Phenome Database"]]) + + @mock.patch('wqflask.api.gen_menu.g') + def test_build_datasets_with_type_phenotypes_and_no_results(self, db_mock): + """Test that correct dataset is returned for a phenotype type with no + results + + """ + db_mock.db.execute.return_value.fetchall.return_value = None + db_mock.db.execute.return_value.fetchone.return_value = (121, + "text value") + self.assertEqual(build_datasets("Mouse", "BXD", "Phenotypes"), + [["None", "121", "text value"]]) + db_mock.db.execute.assert_called_with( + "SELECT PublishFreeze.Name, PublishFreeze.FullName " + "FROM PublishFreeze, InbredSet " + "WHERE InbredSet.Name = 'BXD' AND " + "PublishFreeze.InbredSetId = InbredSet.Id " + "ORDER BY PublishFreeze.CreateTime ASC" + ) + + @mock.patch('wqflask.api.gen_menu.g') + def test_build_datasets_with_type_genotypes(self, db_mock): + """Test that correct dataset is returned for a 
phenotype type""" + db_mock.db.execute.return_value.fetchone.return_value = ( + 635, "HLCPublish", "HLC Published Genotypes" + ) + + self.assertEqual(build_datasets("Mouse", "HLC", "Genotypes"), + [["635", "HLCGeno", "HLC Genotypes"]]) + db_mock.db.execute.assert_called_with( + "SELECT InfoFiles.GN_AccesionId FROM InfoFiles, " + "GenoFreeze, InbredSet WHERE InbredSet.Name = 'HLC' AND " + "GenoFreeze.InbredSetId = InbredSet.Id AND " + "InfoFiles.InfoPageName = GenoFreeze.ShortName " + + "ORDER BY GenoFreeze.CreateTime DESC" + ) + db_mock.db.execute.return_value.fetchone.return_value = () + self.assertEqual(build_datasets("Mouse", "HLC", "Genotypes"), + [["None", "HLCGeno", "HLC Genotypes"]]) + + @mock.patch('wqflask.api.gen_menu.g') + def test_build_datasets_with_type_mrna(self, db_mock): + """Test that correct dataset is returned for a mRNA + expression/ Probeset""" + db_mock.db.execute.return_value.fetchall.return_value = ( + (112, "HC_M2_0606_P", + "Hippocampus Consortium M430v2 (Jun06) PDNN"), ) + self.assertEqual(build_datasets("Mouse", "HLC", "mRNA"), [[ + "112", 'HC_M2_0606_P', "Hippocampus Consortium M430v2 (Jun06) PDNN" + ]]) + db_mock.db.execute.assert_called_once_with( + "SELECT ProbeSetFreeze.Id, ProbeSetFreeze.Name, " + + "ProbeSetFreeze.FullName FROM ProbeSetFreeze, " + + "ProbeFreeze, InbredSet, Tissue, Species WHERE " + + "Species.Name = 'Mouse' AND Species.Id = " + + "InbredSet.SpeciesId AND InbredSet.Name = 'HLC' AND " + + "ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id and " + + "Tissue.Name = 'mRNA' AND ProbeFreeze.TissueId = " + + "Tissue.Id and ProbeFreeze.InbredSetId = InbredSet.Id " + + "ORDER BY ProbeSetFreeze.CreateTime DESC") + + @mock.patch('wqflask.api.gen_menu.build_datasets') + @mock.patch('wqflask.api.gen_menu.g') + def test_build_types(self, db_mock, datasets_mock): + """Test that correct tissue metadata is returned""" + datasets_mock.return_value = [ + ["112", 'HC_M2_0606_P', + "Hippocampus Consortium M430v2 (Jun06) PDNN"] + ] + 
db_mock.db.execute.return_value.fetchall.return_value = ( + ('Mouse Tissue'), ('Human Tissue'), ('Rat Tissue') + ) + self.assertEqual(build_types('mouse', 'random group'), + [['M', 'M', 'Molecular Traits'], + ['H', 'H', 'Molecular Traits'], + ['R', 'R', 'Molecular Traits']]) + db_mock.db.execute.assert_called_once_with( + "SELECT DISTINCT Tissue.Name " + + "FROM ProbeFreeze, ProbeSetFreeze, InbredSet, " + + "Tissue, Species WHERE Species.Name = 'mouse' " + + "AND Species.Id = InbredSet.SpeciesId AND " + + "InbredSet.Name = 'random group' AND " + + "ProbeFreeze.TissueId = Tissue.Id AND " + + "ProbeFreeze.InbredSetId = InbredSet.Id AND " + + "ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id " + + "ORDER BY Tissue.Name" + ) + + @mock.patch('wqflask.api.gen_menu.build_types') + @mock.patch('wqflask.api.gen_menu.genotypes_exist') + @mock.patch('wqflask.api.gen_menu.phenotypes_exist') + def test_get_types_with_existing_genotype_and_phenotypes( + self, + phenotypes_exist_mock, + genotypes_exist_mock, + build_types_mock): + """Test that build types are constructed correctly if phenotypes and genotypes + exist + + """ + phenotypes_exist_mock.return_value = True + genotypes_exist_mock.return_value = True + + expected_result = self.test_type + + build_types_mock.return_value = [ + ['M', 'M', 'Molecular Trait Datasets'] + ] + self.assertEqual(get_types(self.test_group), expected_result) + + @mock.patch('wqflask.api.gen_menu.build_types') + @mock.patch('wqflask.api.gen_menu.genotypes_exist') + @mock.patch('wqflask.api.gen_menu.phenotypes_exist') + def test_get_types_with_buildtype_and_non_existent_genotype_and_phenotypes( + self, + phenotypes_exist_mock, + genotypes_exist_mock, + build_types_mock): + """Test that build types are constructed correctly if phenotypes_exist and + genotypes_exist are false but build_type is falsy + + """ + phenotypes_exist_mock.return_value = False + genotypes_exist_mock.return_value = False + + build_types_mock.return_value = [] + 
self.assertEqual(get_types(self.test_group), { + 'mouse': {}, + 'human': {} + }) + + @mock.patch('wqflask.api.gen_menu.build_types') + @mock.patch('wqflask.api.gen_menu.genotypes_exist') + @mock.patch('wqflask.api.gen_menu.phenotypes_exist') + def test_get_types_with_non_existent_genotype_phenotypes_and_buildtype( + self, + phenotypes_exist_mock, + genotypes_exist_mock, + build_types_mock): + """Test that build types are constructed correctly if phenotypes_exist, + genotypes_exist and build_types are truthy + + """ + phenotypes_exist_mock.return_value = False + genotypes_exist_mock.return_value = False + + build_types_mock.return_value = [ + ['M', 'M', 'Molecular Trait Datasets'] + ] + expected_result = { + 'mouse': { + 'H_T2': [['M', 'M', 'Molecular Trait Datasets']], + 'H_T1': [['M', 'M', 'Molecular Trait Datasets']]}, + 'human': { + 'HLC': [['M', 'M', 'Molecular Trait Datasets']], + 'BXD': [['M', 'M', 'Molecular Trait Datasets']]}} + self.assertEqual(get_types(self.test_group), + expected_result) + + @mock.patch('wqflask.api.gen_menu.build_datasets') + def test_get_datasets_with_existent_datasets(self, + build_datasets_mock): + """Test correct dataset is returned with existent build_datasets""" + build_datasets_mock.return_value = "Test" + expected_result = { + 'mouse': { + 'H_T2': {'Genotypes': 'Test', + 'M': 'Test', + 'Phenotypes': 'Test'}, + 'H_T1': {'Genotypes': 'Test', + 'M': 'Test', + 'Phenotypes': 'Test'}}, + 'human': {'HLC': {'Genotypes': 'Test', + 'M': 'Test', + 'Phenotypes': 'Test'}, + 'BXD': {'Genotypes': 'Test', + 'M': 'Test', + 'Phenotypes': 'Test'}}} + self.assertEqual(get_datasets(self.test_type), + expected_result) + + @mock.patch('wqflask.api.gen_menu.build_datasets') + def test_get_datasets_with_non_existent_datasets(self, + build_datasets_mock): + """Test correct dataset is returned with non-existent build_datasets""" + build_datasets_mock.return_value = None + expected_result = { + 'mouse': { + 'H_T2': {}, + 'H_T1': {}}, + 'human': {'HLC': 
{}, + 'BXD': {}}} + self.assertEqual(get_datasets(self.test_type), + expected_result) + + @mock.patch('wqflask.api.gen_menu.get_datasets') + @mock.patch('wqflask.api.gen_menu.get_types') + @mock.patch('wqflask.api.gen_menu.get_groups') + @mock.patch('wqflask.api.gen_menu.get_species') + def test_gen_dropdown_json(self, + species_mock, + groups_mock, + types_mock, + datasets_mock): + "Test that the correct dictionary is constructed properly" + species_mock.return_value = ("speciesA speciesB speciesC speciesD" + .split(" ")) + datasets_mock.return_value = ("datasetA datasetB datasetC datasetD" + .split(" ")) + groups_mock.return_value = ("groupA groupB groupC groupD" + .split(" ")) + types_mock.return_value = ("typeA typeB typeC typeD" + .split(" ")) + datasets_mock.return_value = ("datasetA datasetB datasetC datasetD" + .split(" ")) + + expected_result = { + 'datasets': ['datasetA', 'datasetB', 'datasetC', 'datasetD'], + 'types': ['typeA', 'typeB', 'typeC', 'typeD'], + 'groups': ['groupA', 'groupB', 'groupC', 'groupD'], + 'species': ['speciesA', 'speciesB', 'speciesC', 'speciesD']} + + self.assertEqual(gen_dropdown_json(), expected_result) diff --git a/wqflask/tests/unit/wqflask/marker_regression/__init__.py b/wqflask/tests/unit/wqflask/marker_regression/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/wqflask/tests/unit/wqflask/marker_regression/test_display_mapping_results.py b/wqflask/tests/unit/wqflask/marker_regression/test_display_mapping_results.py new file mode 100644 index 00000000..8ae0f09f --- /dev/null +++ b/wqflask/tests/unit/wqflask/marker_regression/test_display_mapping_results.py @@ -0,0 +1,156 @@ +import unittest + +import htmlgen as HT +from wqflask.marker_regression.display_mapping_results import ( + DisplayMappingResults, + HtmlGenWrapper +) + + +class TestDisplayMappingResults(unittest.TestCase): + """Basic Methods to test Mapping Results""" + def test_pil_colors(self): + """Test that colors use PILLOW color format""" + 
self.assertEqual(DisplayMappingResults.CLICKABLE_WEBQTL_REGION_COLOR, + (245, 211, 211)) + + +class TestHtmlGenWrapper(unittest.TestCase): + """Test Wrapper around HTMLGen""" + def test_create_image(self): + """Test HT.Image method""" + self.assertEqual( + str(HtmlGenWrapper.create_image_tag(src="test.png", + alt="random", + border="0", + width="10", + height="13", + usemap="#webqtlmap")), + ("""random""") + ) + + def test_create_form(self): + """Test HT.Form method""" + test_form = HtmlGenWrapper.create_form_tag( + cgi="/testing/", + enctype='multipart/form-data', + name="formName", + submit=HtmlGenWrapper.create_input_tag(type_='hidden', name='Default_Name') + ) + test_image = HtmlGenWrapper.create_image_tag( + src="test.png", + alt="random", + border="0", + width="10", + height="13", + usemap="#webqtlmap" + ) + self.assertEqual( + str(test_form).replace("\n", ""), + ("""
""")) + hddn = { + 'FormID': 'showDatabase', + 'ProbeSetID': '_', + 'database': "TestGeno", + 'CellID': '_', + 'RISet': "Test", + 'incparentsf1': 'ON' + } + for key in hddn.keys(): + test_form.append( + HtmlGenWrapper.create_input_tag( + name=key, + value=hddn[key], + type_='hidden')) + test_form.append(test_image) + + self.assertEqual(str(test_form).replace("\n", ""), ( + """
""" + """""" + """""" + """""" + """""" + """""" + """""" + """""" + """random""" + """
""")) + + def test_create_paragraph(self): + """Test HT.Paragraph method""" + test_p_element = HtmlGenWrapper.create_p_tag(id="smallSize") + par_text = ( + "Mapping using genotype data as " + "a trait will result in infinity LRS at one locus. " + "In order to display the result properly, all LRSs " + "higher than 100 are capped at 100." + ) + self.assertEqual( + str(test_p_element), + """

""" + ) + test_p_element.append(HtmlGenWrapper.create_br_tag()) + test_p_element.append(par_text) + self.assertEqual( + str(test_p_element), + """


{}

""".format(par_text) + ) + + def test_create_br_tag(self): + """Test HT.BR() method""" + self.assertEqual(str(HtmlGenWrapper.create_br_tag()), + "
") + + def test_create_input_tag(self): + """Test HT.Input method""" + self.assertEqual( + str(HtmlGenWrapper.create_input_tag( + type_="hidden", + name="name", + value="key", + Class="trait trait_")).replace("\n", ""), + ("""""")) + + def test_create_map_tag(self): + """Test HT.Map method""" + self.assertEqual(str(HtmlGenWrapper.create_map_tag( + name="WebqTLImageMap")).replace("\n", ""), + """""") + gifmap = HtmlGenWrapper.create_map_tag(name="test") + gifmap.append(HtmlGenWrapper.create_area_tag(shape="rect", + coords='1 2 3', href='#area1')) + gifmap.append(HtmlGenWrapper.create_area_tag(shape="rect", + coords='1 2 3', href='#area2')) + self.assertEqual( + str(gifmap).replace("\n", ""), + ("""""" + """""" + """""" + """""")) + + def test_create_area_tag(self): + """Test HT.Area method""" + self.assertEqual( + str(HtmlGenWrapper.create_area_tag( + shape="rect", + coords="1 2", + href="http://test.com", + title="Some Title")).replace("\n", ""), + ("""""")) + + def test_create_link_tag(self): + """Test HT.HREF method""" + self.assertEqual( + str(HtmlGenWrapper.create_link_tag( + "www.test.com", "test", target="_blank")).replace("\n", ""), + """test""") diff --git a/wqflask/tests/unit/wqflask/show_trait/__init__.py b/wqflask/tests/unit/wqflask/show_trait/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/wqflask/tests/unit/wqflask/show_trait/test_export_trait_data.py b/wqflask/tests/unit/wqflask/show_trait/test_export_trait_data.py new file mode 100644 index 00000000..41761944 --- /dev/null +++ b/wqflask/tests/unit/wqflask/show_trait/test_export_trait_data.py @@ -0,0 +1,212 @@ +import unittest +from unittest import mock +from wqflask.show_trait.export_trait_data import dict_to_sorted_list +from wqflask.show_trait.export_trait_data import cmp_samples +from wqflask.show_trait.export_trait_data import export_sample_table +from wqflask.show_trait.export_trait_data import get_export_metadata + + +class AttributesSetter: + def __init__(self, obj): + 
for key, value in obj.items(): + setattr(self, key, value) + + +class TestExportTraits(unittest.TestCase): + """Test methods related to converting dict to sortedlist""" + @mock.patch("wqflask.show_trait.export_trait_data.create_trait") + @mock.patch("wqflask.show_trait.export_trait_data.data_set") + def test_get_export_metadata_no_publish(self, mock_dataset, mock_trait): + """test for exporting metadata with no publish""" + mock_dataset_attributes = AttributesSetter( + {"type": "no_publish", "dataset_name": "Temp", "name": "Temp"}) + + mock_nested_attributes = AttributesSetter({"name": "name"}) + mock_dataset_attributes.group = mock_nested_attributes + mock_dataset.create_dataset.return_value = mock_dataset_attributes + mock_trait.return_value = AttributesSetter({"symbol": "", "description_display": "Description", + "title": "research1", "journal": "", "authors": ""}) + + results = get_export_metadata("random_id", "Temp") + expected = [["Record ID: random_id"], + ["Trait URL: http://genenetwork.org/show_trait?trait_id=random_id&dataset=Temp"], + ["Dataset: Temp"], + ["Group: name"], []] + + mock_dataset.create_dataset.assert_called_with("Temp") + mock_trait.assert_called_with( + dataset=mock_dataset_attributes, name="random_id", cellid=None, get_qtl_info=False) + self.assertEqual(results, expected) + + @mock.patch("wqflask.show_trait.export_trait_data.create_trait") + @mock.patch("wqflask.show_trait.export_trait_data.data_set") + def test_get_export_metadata_with_publish(self, data_mock, trait_mock): + """test for exporting metadata with dataset.type=Publish""" + mock_dataset_attributes = AttributesSetter({"type": "Publish", "dataset_name": "Temp", + "name": "Temp", "description_display": "Description goes here"}) + + mock_nested_attributes = AttributesSetter({"name": "name"}) + mock_dataset_attributes.group = mock_nested_attributes + data_mock.create_dataset.return_value = mock_dataset_attributes + trait_instance = AttributesSetter({"symbol": "", 
"description_display": "Description", + "title": "research1", "journal": "", "authors": ""}) + trait_mock.return_value = trait_instance + + results = get_export_metadata( + "29ae0615-0d77-4814-97c7-c9e91f6bfd7b", "Temp") + + expected = [['Phenotype ID: 29ae0615-0d77-4814-97c7-c9e91f6bfd7b'], + ['Phenotype URL: http://genenetwork.org/show_trait?trait_id=29ae0615-0d77-4814-97c7-c9e91f6bfd7b&dataset=Temp'], [ + 'Group: name'], ['Phenotype: Description'], + ['Authors: N/A'], ['Title: research1'], + ['Journal: N/A'], ['Dataset Link: http://gn1.genenetwork.org/webqtl/main.py?FormID=sharinginfo&InfoPageName=Temp'], []] + + self.assertEqual(results, expected) + + @mock.patch("wqflask.show_trait.export_trait_data.dict_to_sorted_list") + @mock.patch("wqflask.show_trait.export_trait_data.get_export_metadata") + def test_export_sample_table(self, exp_metadata, dict_list): + """test for exporting sample table""" + targs_obj = { + "export_data": """{ + "primary_samples": [ + { + "other": "germanotta", + "name": "Sauroniops", + "se":{ + "name":"S2" + }, + "num_cases":{ + "k1":"value" + + } + } + ], + "other_samples": [ + { + "se": 1, + "num_cases": 4, + "value": 6, + "name": 3 + } + ] + }""", + "trait_display_name": "Hair_color", + "trait_id": "23177fdc-312e-4084-ad0c-f3eae785fff5", + "dataset": { + } + } + exp_metadata.return_value = [ + ["Phenotype ID:0a2be192-57f5-400b-bbbd-0cf50135995f"], ['Group:gp1'], + ["Phenotype:p1"], [ + "Authors:N/A"], + ["Title:research1"], + ["Journal:N/A"], + ["Dataset Link: http://gn1.genenetwork.org/webqtl/main.py?FormID=sharinginfo&InfoPageName=name1"], []] + expected = ('Hair_color', + [['Phenotype ID:0a2be192-57f5-400b-bbbd-0cf50135995f'], + ['Group:gp1'], + ['Phenotype:p1'], + ['Authors:N/A'], + ['Title:research1'], + ['Journal:N/A'], + ['Dataset Link: ' + 'http://gn1.genenetwork.org/webqtl/main.py?FormID=sharinginfo&InfoPageName=name1'], + [], + ['Name', 'Value', 'SE', 'N'], + ['Sauroniops', 'germanotta'], + [3, 6, 1, 4]]) + + 
dict_list.side_effect = [['Sauroniops', 'germanotta'], [3, 6, 1, 4]] + + self.assertEqual(export_sample_table(targs_obj), expected) + exp_metadata.assert_called_with( + "23177fdc-312e-4084-ad0c-f3eae785fff5", {}) + self.assertEqual(dict_list.call_count, 2) + + def test_dict_to_sortedlist(self): + """test for conversion of dict to sorted list""" + sample1 = { + "other": "exp1", + "name": "exp2" + } + sample2 = { + "se": 1, + "num_cases": 4, + "value": 6, + "name": 3 + + } + rever = { + "name": 3, + "value": 6, + "num_cases": 4, + "se": 1 + } + oneItem = { + "item1": "one" + } + + self.assertEqual(["exp2", "exp1"], dict_to_sorted_list(sample1)) + self.assertEqual([3, 6, 1, 4], dict_to_sorted_list(sample2)) + self.assertEqual([3, 6, 1, 4], dict_to_sorted_list(rever)) + self.assertEqual(["one"], dict_to_sorted_list(oneItem)) + """test that the func returns the values not the keys""" + self.assertFalse(["other", "name"] == dict_to_sorted_list(sample1)) + + def test_cmp_samples(self): + """test for comparing samples function""" + sampleA = [ + [ + ("value", "other"), + ("name", "test_name") + ] + ] + sampleB = [ + [ + ("value", "other"), + ("unknown", "test_name") + ] + ] + sampleC = [ + [("other", "value"), + ("name", "value") + ], + [ + ("name", "value"), + ("value", "name") + ], + [ + ("other", "value"), + ("name", "value" + )], + [ + ("name", "name1"), + ("se", "valuex") + ], + [( + "value", "name1"), + ("se", "valuex") + ], + [( + "other", "name1"), + ("se", "valuex" + ) + ], + [( + "name", "name_val"), + ("num_cases", "num_val") + ], + [( + "other_a", "val_a"), + ("other_b", "val" + ) + ] + ] + results = [cmp_samples(val[0], val[1]) for val in sampleA] + resultB = [cmp_samples(val[0], val[1]) for val in sampleB] + resultC = [cmp_samples(val[0], val[1]) for val in sampleC] + + self.assertEqual(1, *results) + self.assertEqual(-1, *resultB) + self.assertEqual([1, -1, 1, -1, -1, 1, -1, -1], resultC) diff --git a/wqflask/tests/unit/wqflask/test_collect.py 
b/wqflask/tests/unit/wqflask/test_collect.py new file mode 100644 index 00000000..9a36132d --- /dev/null +++ b/wqflask/tests/unit/wqflask/test_collect.py @@ -0,0 +1,73 @@ +"""Test cases for some methods in collect.py""" + +import unittest +from unittest import mock + +from flask import Flask +from wqflask.collect import process_traits + +app = Flask(__name__) + + +class MockSession: + """Helper class for mocking wqflask.collect.g.user_session.logged_in""" + def __init__(self, is_logged_in=False): + self.is_logged_in = is_logged_in + + @property + def logged_in(self): + return self.is_logged_in + + +class MockFlaskG: + """Helper class for mocking wqflask.collect.g.user_session""" + def __init__(self, is_logged_in=False): + self.is_logged_in = is_logged_in + + @property + def user_session(self): + if self.is_logged_in: + return MockSession(is_logged_in=True) + return MockSession() + + +class TestCollect(unittest.TestCase): + + def setUp(self): + self.app_context = app.app_context() + self.app_context.push() + + def tearDown(self): + self.app_context.pop() + + @mock.patch("wqflask.collect.g", MockFlaskG()) + def test_process_traits_with_bytestring(self): + """ + Test that the correct traits are returned when the user is logged + out and bytes are used. + """ + self.assertEqual(process_traits( + b'1452452_at:HC_M2_0606_P:163d04f7db7c9e110de6,' + b'1452447_at:HC_M2_0606_P:eeece8fceb67072debea,' + b'1451401_a_at:HC_M2_0606_P:a043d23b3b3906d8318e,' + b'1429252_at:HC_M2_0606_P:6fa378b349bc9180e8f5'), + set(['1429252_at:HC_M2_0606_P', + '1451401_a_at:HC_M2_0606_P', + '1452447_at:HC_M2_0606_P', + '1452452_at:HC_M2_0606_P'])) + + @mock.patch("wqflask.collect.g", MockFlaskG()) + def test_process_traits_with_normal_string(self): + """ + Test that the correct traits are returned when the user is logged + out and a normal string is used. 
+ """ + self.assertEqual(process_traits( + '1452452_at:HC_M2_0606_P:163d04f7db7c9e110de6,' + '1452447_at:HC_M2_0606_P:eeece8fceb67072debea,' + '1451401_a_at:HC_M2_0606_P:a043d23b3b3906d8318e,' + '1429252_at:HC_M2_0606_P:6fa378b349bc9180e8f5'), + set(['1429252_at:HC_M2_0606_P', + '1451401_a_at:HC_M2_0606_P', + '1452447_at:HC_M2_0606_P', + '1452452_at:HC_M2_0606_P'])) diff --git a/wqflask/tests/unit/wqflask/test_pbkdf2.py b/wqflask/tests/unit/wqflask/test_pbkdf2.py new file mode 100644 index 00000000..a33fbd4f --- /dev/null +++ b/wqflask/tests/unit/wqflask/test_pbkdf2.py @@ -0,0 +1,61 @@ +"""Test cases pbkdf2""" + +import unittest +from wqflask.pbkdf2 import pbkdf2_hex + + +class TestPbkdf2(unittest.TestCase): + def test_pbkdf2_hex(self): + """ + Test pbkdf2_hex function + """ + + for password, salt, iterations, keylen, expected_value in [ + ('password', 'salt', 1, 20, + '0c60c80f961f0e71f3a9b524af6012062fe037a6'), + ('password', 'salt', 2, 20, + 'ea6c014dc72d6f8ccd1ed92ace1d41f0d8de8957'), + ('password', 'salt', 4096, 20, + '4b007901b765489abead49d926f721d065a429c1'), + ('passwordPASSWORDpassword', + 'saltSALTsaltSALTsaltSALTsaltSALTsalt', + 4096, 25, + '3d2eec4fe41c849b80c8d83662c0e44a8b291a964cf2f07038'), + ('pass\x00word', 'sa\x00lt', 4096, 16, + '56fa6aa75548099dcc37d7f03425e0c3'), + ('password', 'ATHENA.MIT.EDUraeburn', 1, 16, + 'cdedb5281bb2f801565a1122b2563515'), + ('password', 'ATHENA.MIT.EDUraeburn', 1, 32, + ('cdedb5281bb2f80' + '1565a1122b256351' + '50ad1f7a04bb9f3a33' + '3ecc0e2e1f70837')), + ('password', 'ATHENA.MIT.EDUraeburn', 2, 16, + '01dbee7f4a9e243e988b62c73cda935d'), + ('password', 'ATHENA.MIT.EDUraeburn', 2, 32, + ('01dbee7f4a9e243e9' + '88b62c73cda935da05' + '378b93244ec8f48a99' + 'e61ad799d86')), + ('password', 'ATHENA.MIT.EDUraeburn', 1200, 32, + ('5c08eb61fdf71e' + '4e4ec3cf6ba1f55' + '12ba7e52ddbc5e51' + '42f708a31e2e62b1e13')), + ('X' * 64, 'pass phrase equals block size', 1200, 32, + ('139c30c0966bc32ba' + '55fdbf212530ac9c5' + 
'ec59f1a452f5cc9ad' + '940fea0598ed1')), + ('X' * 65, 'pass phrase exceeds block size', 1200, 32, + ('9ccad6d468770cd' + '51b10e6a68721be6' + '11a8b4d282601db3' + 'b36be9246915ec82a')) + ]: + self.assertEqual( + pbkdf2_hex(data=password, + salt=salt, + iterations=iterations, + keylen=keylen), + expected_value) diff --git a/wqflask/tests/unit/wqflask/test_user_login.py b/wqflask/tests/unit/wqflask/test_user_login.py new file mode 100644 index 00000000..61cd9ab9 --- /dev/null +++ b/wqflask/tests/unit/wqflask/test_user_login.py @@ -0,0 +1,21 @@ +"""Test cases for some methods in login.py""" + +import unittest +from wqflask.user_login import encode_password + + +class TestUserLogin(unittest.TestCase): + def test_encode_password(self): + """ + Test encode password + """ + pass_gen_fields = { + "salt": "salt", + "hashfunc": "sha1", + "iterations": 4096, + "keylength": 20, + } + self.assertEqual( + encode_password(pass_gen_fields, + "password").get("password"), + '4b007901b765489abead49d926f721d065a429c1') diff --git a/wqflask/tests/unit/wqflask/test_user_session.py b/wqflask/tests/unit/wqflask/test_user_session.py new file mode 100644 index 00000000..ebb0334a --- /dev/null +++ b/wqflask/tests/unit/wqflask/test_user_session.py @@ -0,0 +1,15 @@ +"""Test cases for some methods in user_session.py""" + +import unittest +from wqflask.user_session import verify_cookie + + +class TestUserSession(unittest.TestCase): + def test_verify_cookie(self): + """ + Test cookie verification + """ + self.assertEqual( + "3f4c1dbf-5b56-4260-87d6-f35445bda37e", + verify_cookie(("3f4c1dbf-5b56-4260-87d6-" + "f35445bda37e:af4fcf5eace9e7c864ce"))) diff --git a/wqflask/tests/utility/__init__.py b/wqflask/tests/utility/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/wqflask/tests/utility/test_authentication_tools.py b/wqflask/tests/utility/test_authentication_tools.py deleted file mode 100644 index 5c391be5..00000000 --- a/wqflask/tests/utility/test_authentication_tools.py 
+++ /dev/null @@ -1,189 +0,0 @@ -"""Tests for authentication tools""" -import unittest -from unittest import mock - -from utility.authentication_tools import check_resource_availability -from utility.authentication_tools import add_new_resource - - -class TestResponse: - """Mock Test Response after a request""" - @property - def content(self): - """Mock the content from Requests.get(params).content""" - return '["foo"]' - - -class TestUser: - """Mock user""" - @property - def user_id(self): - """Mockes user id. Used in Flask.g.user_session.user_id""" - return "Jane" - - -class TestUserSession: - """Mock user session""" - @property - def user_session(self): - """Mock user session. Mocks Flask.g.user_session object""" - return TestUser() - - -def mock_add_resource(resource_ob, update=False): - return resource_ob - - -class TestCheckResourceAvailability(unittest.TestCase): - """Test methods related to checking the resource availability""" - @mock.patch('utility.authentication_tools.add_new_resource') - @mock.patch('utility.authentication_tools.Redis') - @mock.patch('utility.authentication_tools.g', mock.Mock()) - @mock.patch('utility.authentication_tools.get_resource_id') - def test_check_resource_availability_default_mask( - self, - resource_id_mock, - redis_mock, - add_new_resource_mock): - """Test the resource availability with default mask""" - resource_id_mock.return_value = 1 - redis_mock.smembers.return_value = [] - test_dataset = mock.MagicMock() - type(test_dataset).type = mock.PropertyMock(return_value="Test") - add_new_resource_mock.return_value = {"default_mask": 2} - self.assertEqual(check_resource_availability(test_dataset), 2) - - @mock.patch('utility.authentication_tools.requests.get') - @mock.patch('utility.authentication_tools.add_new_resource') - @mock.patch('utility.authentication_tools.Redis') - @mock.patch('utility.authentication_tools.g', TestUserSession()) - @mock.patch('utility.authentication_tools.get_resource_id') - def 
test_check_resource_availability_non_default_mask( - self, - resource_id_mock, - redis_mock, - add_new_resource_mock, - requests_mock): - """Test the resource availability with default mask""" - resource_id_mock.return_value = 1 - redis_mock.smembers.return_value = [] - add_new_resource_mock.return_value = {"default_mask": 2} - requests_mock.return_value = TestResponse() - test_dataset = mock.MagicMock() - type(test_dataset).type = mock.PropertyMock(return_value="Test") - self.assertEqual(check_resource_availability(test_dataset), - ['foo']) - - @mock.patch('utility.authentication_tools.webqtlConfig.SUPER_PRIVILEGES', - "SUPERUSER") - @mock.patch('utility.authentication_tools.requests.get') - @mock.patch('utility.authentication_tools.add_new_resource') - @mock.patch('utility.authentication_tools.Redis') - @mock.patch('utility.authentication_tools.g', TestUserSession()) - @mock.patch('utility.authentication_tools.get_resource_id') - def test_check_resource_availability_of_super_user( - self, - resource_id_mock, - redis_mock, - add_new_resource_mock, - requests_mock): - """Test the resource availability if the user is the super user""" - resource_id_mock.return_value = 1 - redis_mock.smembers.return_value = ["Jane"] - add_new_resource_mock.return_value = {"default_mask": 2} - requests_mock.return_value = TestResponse() - test_dataset = mock.MagicMock() - type(test_dataset).type = mock.PropertyMock(return_value="Test") - self.assertEqual(check_resource_availability(test_dataset), - "SUPERUSER") - - @mock.patch('utility.authentication_tools.webqtlConfig.DEFAULT_PRIVILEGES', - "John Doe") - def test_check_resource_availability_string_dataset(self): - """Test the resource availability if the dataset is a string""" - self.assertEqual(check_resource_availability("Test"), - "John Doe") - - @mock.patch('utility.authentication_tools.webqtlConfig.DEFAULT_PRIVILEGES', - "John Doe") - def test_check_resource_availability_temp(self): - """Test the resource availability if the 
dataset is a string""" - test_dataset = mock.MagicMock() - type(test_dataset).type = mock.PropertyMock(return_value="Temp") - self.assertEqual(check_resource_availability(test_dataset), - "John Doe") - - -class TestAddNewResource(unittest.TestCase): - """Test cases for add_new_resource method""" - @mock.patch('utility.authentication_tools.webqtlConfig.DEFAULT_PRIVILEGES', - "John Doe") - @mock.patch('utility.authentication_tools.add_resource', mock_add_resource) - @mock.patch('utility.authentication_tools.get_group_code') - def test_add_new_resource_if_publish_datatype(self, group_code_mock): - """Test add_new_resource if dataset type is 'publish'""" - group_code_mock.return_value = "Test" - test_dataset = mock.MagicMock() - type(test_dataset).type = mock.PropertyMock(return_value="Publish") - type(test_dataset).id = mock.PropertyMock(return_value=10) - expected_value = { - "owner_id": "none", - "default_mask": "John Doe", - "group_masks": {}, - "name": "Test_None", - "data": { - "dataset": 10, - "trait": None - }, - "type": "dataset-publish" - } - self.assertEqual(add_new_resource(test_dataset), - expected_value) - - @mock.patch('utility.authentication_tools.webqtlConfig.DEFAULT_PRIVILEGES', - "John Doe") - @mock.patch('utility.authentication_tools.add_resource', mock_add_resource) - @mock.patch('utility.authentication_tools.get_group_code') - def test_add_new_resource_if_geno_datatype(self, group_code_mock): - """Test add_new_resource if dataset type is 'geno'""" - group_code_mock.return_value = "Test" - test_dataset = mock.MagicMock() - type(test_dataset).name = mock.PropertyMock(return_value="Geno") - type(test_dataset).type = mock.PropertyMock(return_value="Geno") - type(test_dataset).id = mock.PropertyMock(return_value=20) - expected_value = { - "owner_id": "none", - "default_mask": "John Doe", - "group_masks": {}, - "name": "Geno", - "data": { - "dataset": 20, - }, - "type": "dataset-geno" - } - self.assertEqual(add_new_resource(test_dataset), - 
expected_value) - - @mock.patch('utility.authentication_tools.webqtlConfig.DEFAULT_PRIVILEGES', - "John Doe") - @mock.patch('utility.authentication_tools.add_resource', mock_add_resource) - @mock.patch('utility.authentication_tools.get_group_code') - def test_add_new_resource_if_other_datatype(self, group_code_mock): - """Test add_new_resource if dataset type is not 'geno' or 'publish'""" - group_code_mock.return_value = "Test" - test_dataset = mock.MagicMock() - type(test_dataset).name = mock.PropertyMock(return_value="Geno") - type(test_dataset).type = mock.PropertyMock(return_value="other") - type(test_dataset).id = mock.PropertyMock(return_value=20) - expected_value = { - "owner_id": "none", - "default_mask": "John Doe", - "group_masks": {}, - "name": "Geno", - "data": { - "dataset": 20, - }, - "type": "dataset-probeset" - } - self.assertEqual(add_new_resource(test_dataset), - expected_value) diff --git a/wqflask/tests/utility/test_chunks.py b/wqflask/tests/utility/test_chunks.py deleted file mode 100644 index 8d90a1ec..00000000 --- a/wqflask/tests/utility/test_chunks.py +++ /dev/null @@ -1,19 +0,0 @@ -"""Test chunking""" - -import unittest - -from utility.chunks import divide_into_chunks - - -class TestChunks(unittest.TestCase): - "Test Utility method for chunking" - def test_divide_into_chunks(self): - "Check that a list is chunked correctly" - self.assertEqual(divide_into_chunks([1, 2, 7, 3, 22, 8, 5, 22, 333], 3), - [[1, 2, 7], [3, 22, 8], [5, 22, 333]]) - self.assertEqual(divide_into_chunks([1, 2, 7, 3, 22, 8, 5, 22, 333], 4), - [[1, 2, 7], [3, 22, 8], [5, 22, 333]]) - self.assertEqual(divide_into_chunks([1, 2, 7, 3, 22, 8, 5, 22, 333], 5), - [[1, 2], [7, 3], [22, 8], [5, 22], [333]]) - self.assertEqual(divide_into_chunks([], 5), - [[]]) diff --git a/wqflask/tests/utility/test_corestats.py b/wqflask/tests/utility/test_corestats.py deleted file mode 100644 index cf91a248..00000000 --- a/wqflask/tests/utility/test_corestats.py +++ /dev/null @@ -1,55 +0,0 @@ 
-"""Test Core Stats""" - -import unittest - -from utility.corestats import Stats - - -class TestChunks(unittest.TestCase): - "Test Utility method for chunking" - - def setUp(self): - self.stat_test = Stats((x for x in range(1, 11))) - - def test_stats_sum(self): - """ Test sequence sum """ - self.assertEqual(self.stat_test.sum(), 55) - self.stat_test = Stats([]) - self.assertEqual(self.stat_test.sum(), None) - - def test_stats_count(self): - """ Test sequence count """ - self.assertEqual(self.stat_test.count(), 10) - self.stat_test = Stats([]) - self.assertEqual(self.stat_test.count(), 0) - - def test_stats_min(self): - """ Test min value in sequence""" - self.assertEqual(self.stat_test.min(), 1) - self.stat_test = Stats([]) - self.assertEqual(self.stat_test.min(), None) - - def test_stats_max(self): - """ Test max value in sequence """ - self.assertEqual(self.stat_test.max(), 10) - self.stat_test = Stats([]) - self.assertEqual(self.stat_test.max(), None) - - def test_stats_avg(self): - """ Test avg of sequence """ - self.assertEqual(self.stat_test.avg(), 5.5) - self.stat_test = Stats([]) - self.assertEqual(self.stat_test.avg(), None) - - def test_stats_stdev(self): - """ Test standard deviation of sequence """ - self.assertEqual(self.stat_test.stdev(), 3.0276503540974917) - self.stat_test = Stats([]) - self.assertEqual(self.stat_test.stdev(), None) - - def test_stats_percentile(self): - """ Test percentile of sequence """ - self.assertEqual(self.stat_test.percentile(20), 3.0) - self.assertEqual(self.stat_test.percentile(101), None) - self.stat_test = Stats([]) - self.assertEqual(self.stat_test.percentile(20), None) diff --git a/wqflask/tests/utility/test_corr_result_helpers.py b/wqflask/tests/utility/test_corr_result_helpers.py deleted file mode 100644 index e196fbdf..00000000 --- a/wqflask/tests/utility/test_corr_result_helpers.py +++ /dev/null @@ -1,32 +0,0 @@ -""" Test correlation helper methods """ - -import unittest -from utility.corr_result_helpers import 
normalize_values, common_keys, normalize_values_with_samples - - -class TestCorrelationHelpers(unittest.TestCase): - """Test methods for normalising lists""" - - def test_normalize_values(self): - """Test that a list is normalised correctly""" - self.assertEqual( - normalize_values([2.3, None, None, 3.2, 4.1, 5], [ - 3.4, 7.2, 1.3, None, 6.2, 4.1]), - ([2.3, 4.1, 5], [3.4, 6.2, 4.1], 3) - ) - - def test_common_keys(self): - """Test that common keys are returned as a list""" - a = dict(BXD1=9.113, BXD2=9.825, BXD14=8.985, BXD15=9.300) - b = dict(BXD1=9.723, BXD3=9.825, BXD14=9.124, BXD16=9.300) - self.assertEqual(sorted(common_keys(a, b)), ['BXD1', 'BXD14']) - - def test_normalize_values_with_samples(self): - """Test that a sample(dict) is normalised correctly""" - self.assertEqual( - normalize_values_with_samples( - dict(BXD1=9.113, BXD2=9.825, BXD14=8.985, - BXD15=9.300, BXD20=9.300), - dict(BXD1=9.723, BXD3=9.825, BXD14=9.124, BXD16=9.300)), - (({'BXD1': 9.113, 'BXD14': 8.985}, {'BXD1': 9.723, 'BXD14': 9.124}, 2)) - ) diff --git a/wqflask/tests/utility/test_formatting.py b/wqflask/tests/utility/test_formatting.py deleted file mode 100644 index 9d3033d1..00000000 --- a/wqflask/tests/utility/test_formatting.py +++ /dev/null @@ -1,33 +0,0 @@ -import unittest -from utility.formatting import numify, commify - - -class TestFormatting(unittest.TestCase): - """Test formatting numbers by numifying or commifying""" - - def test_numify(self): - "Test that a number is correctly converted to a English readable string" - self.assertEqual(numify(1, 'item', 'items'), - 'one item') - self.assertEqual(numify(2, 'book'), 'two') - self.assertEqual(numify(2, 'book', 'books'), 'two books') - self.assertEqual(numify(0, 'book', 'books'), 'zero books') - self.assertEqual(numify(0), 'zero') - self.assertEqual(numify(5), 'five') - self.assertEqual(numify(14, 'book', 'books'), '14 books') - self.assertEqual(numify(999, 'book', 'books'), '999 books') - self.assertEqual(numify(1000000, 
'book', 'books'), '1,000,000 books') - self.assertEqual(numify(1956), '1956') - - def test_commify(self): - "Test that commas are added correctly" - self.assertEqual(commify(1), '1') - self.assertEqual(commify(123), '123') - self.assertEqual(commify(1234), '1234') - self.assertEqual(commify(12345), '12,345') - self.assertEqual(commify(1234567890), '1,234,567,890') - self.assertEqual(commify(123.0), '123.0') - self.assertEqual(commify(1234.5), '1234.5') - self.assertEqual(commify(1234.56789), '1234.56789') - self.assertEqual(commify(123456.789), '123,456.789') - self.assertEqual(commify(None), None) diff --git a/wqflask/tests/utility/test_hmac.py b/wqflask/tests/utility/test_hmac.py deleted file mode 100644 index 4e3652f8..00000000 --- a/wqflask/tests/utility/test_hmac.py +++ /dev/null @@ -1,52 +0,0 @@ -# -*- coding: utf-8 -*- -"""Test hmac utility functions""" - -import unittest -from unittest import mock - -from utility.hmac import data_hmac -from utility.hmac import url_for_hmac -from utility.hmac import hmac_creation - - -class TestHmacUtil(unittest.TestCase): - """Test Utility method for hmac creation""" - - @mock.patch("utility.hmac.app.config", {'SECRET_HMAC_CODE': "secret"}) - def test_hmac_creation(self): - """Test hmac creation with a utf-8 string""" - self.assertEqual(hmac_creation("ファイ"), "7410466338cfe109e946") - - @mock.patch("utility.hmac.app.config", - {'SECRET_HMAC_CODE': ('\x08\xdf\xfa\x93N\x80' - '\xd9\\H@\\\x9f`\x98d^' - '\xb4a;\xc6OM\x946a\xbc' - '\xfc\x80:*\xebc')}) - def test_hmac_creation_with_cookie(self): - """Test hmac creation with a cookie""" - cookie = "3f4c1dbf-5b56-4260-87d6-f35445bda37e:af4fcf5eace9e7c864ce" - uuid_, _, signature = cookie.partition(":") - self.assertEqual( - hmac_creation(uuid_), - "af4fcf5eace9e7c864ce") - - @mock.patch("utility.hmac.app.config", {'SECRET_HMAC_CODE': "secret"}) - def test_data_hmac(self): - """Test data_hmac fn with a utf-8 string""" - self.assertEqual(data_hmac("ファイ"), "ファイ:7410466338cfe109e946") - 
- @mock.patch("utility.hmac.app.config", {'SECRET_HMAC_CODE': "secret"}) - @mock.patch("utility.hmac.url_for") - def test_url_for_hmac_with_plain_url(self, mock_url): - """Test url_for_hmac without params""" - mock_url.return_value = "https://mock_url.com/ファイ/" - self.assertEqual(url_for_hmac("ファイ"), - "https://mock_url.com/ファイ/?hm=05bc39e659b1948f41e7") - - @mock.patch("utility.hmac.app.config", {'SECRET_HMAC_CODE': "secret"}) - @mock.patch("utility.hmac.url_for") - def test_url_for_hmac_with_param_in_url(self, mock_url): - """Test url_for_hmac with params""" - mock_url.return_value = "https://mock_url.com/?ファイ=1" - self.assertEqual(url_for_hmac("ファイ"), - "https://mock_url.com/?ファイ=1&hm=4709c1708270644aed79") diff --git a/wqflask/tests/wqflask/__init__.py b/wqflask/tests/wqflask/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/wqflask/tests/wqflask/api/__init__.py b/wqflask/tests/wqflask/api/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/wqflask/tests/wqflask/api/test_gen_menu.py b/wqflask/tests/wqflask/api/test_gen_menu.py deleted file mode 100644 index 84898bd1..00000000 --- a/wqflask/tests/wqflask/api/test_gen_menu.py +++ /dev/null @@ -1,413 +0,0 @@ -"""Test cases for wqflask.api.gen_menu""" -import unittest -from unittest import mock - -from wqflask import app -from wqflask.api.gen_menu import gen_dropdown_json -from wqflask.api.gen_menu import get_species -from wqflask.api.gen_menu import get_groups -from wqflask.api.gen_menu import get_types -from wqflask.api.gen_menu import get_datasets -from wqflask.api.gen_menu import phenotypes_exist -from wqflask.api.gen_menu import genotypes_exist -from wqflask.api.gen_menu import build_datasets -from wqflask.api.gen_menu import build_types - - -class TestGenMenu(unittest.TestCase): - """Tests for the gen_menu module""" - - def setUp(self): - self.app_context = app.app_context() - self.app_context.push() - self.test_group = { - 'mouse': [ - ['H_T1', - 'H_T', - 
'Family:DescriptionA' - ], - ['H_T2', "H_T'", 'Family:None'] - ], - 'human': [ - ['BXD', 'BXD', 'Family:None'], - ['HLC', 'Liver: Normal Gene Expression with Genotypes (Merck)', - 'Family:Test'] - ] - } - - self.test_type = { - 'mouse': { - 'H_T2': [('Phenotypes', - 'Traits and Cofactors', - 'Phenotypes'), - ('Genotypes', - 'DNA Markers and SNPs', - 'Genotypes'), - ['M', 'M', 'Molecular Trait Datasets']], - 'H_T1': [('Phenotypes', - 'Traits and Cofactors', - 'Phenotypes'), - ('Genotypes', - 'DNA Markers and SNPs', - 'Genotypes'), - ['M', 'M', 'Molecular Trait Datasets']] - }, - 'human': { - 'HLC': [('Phenotypes', - 'Traits and Cofactors', - 'Phenotypes'), - ('Genotypes', - 'DNA Markers and SNPs', - 'Genotypes'), - ['M', 'M', 'Molecular Trait Datasets']], - 'BXD': [('Phenotypes', - 'Traits and Cofactors', - 'Phenotypes'), - ('Genotypes', - 'DNA Markers and SNPs', - 'Genotypes'), - ['M', 'M', 'Molecular Trait Datasets']] - } - } - - def tearDown(self): - self.app_context.pop() - - @mock.patch('wqflask.api.gen_menu.g') - def test_get_species(self, db_mock): - """Test that assertion is raised when dataset and dataset_name - are defined""" - db_mock.db.execute.return_value.fetchall.return_value = ( - ('human', 'Human'), - ('mouse', 'Mouse')) - self.assertEqual(get_species(), - [['human', 'Human'], ['mouse', 'Mouse']]) - db_mock.db.execute.assert_called_once_with( - "SELECT Name, MenuName FROM Species ORDER BY OrderId" - ) - - @mock.patch('wqflask.api.gen_menu.g') - def test_get_groups(self, db_mock): - """Test that species groups are grouped correctly""" - db_mock.db.execute.return_value.fetchall.side_effect = [ - # Mouse - (('BXD', 'BXD', None), - ('HLC', 'Liver: Normal Gene Expression with Genotypes (Merck)', - 'Test')), - # Human - (('H_T1', "H_T", "DescriptionA"), - ('H_T2', "H_T'", None)) - ] - - self.assertEqual(get_groups([["human", "Human"], ["mouse", "Mouse"]]), - self.test_group) - - for name in ["mouse", "human"]: - db_mock.db.execute.assert_any_call( - 
("SELECT InbredSet.Name, InbredSet.FullName, " + - "IFNULL(InbredSet.Family, 'None') " + - "FROM InbredSet, Species WHERE Species.Name " + - "= '{}' AND InbredSet.SpeciesId = Species.Id GROUP by " + - "InbredSet.Name ORDER BY IFNULL(InbredSet.FamilyOrder, " + - "InbredSet.FullName) ASC, IFNULL(InbredSet.Family, " + - "InbredSet.FullName) ASC, InbredSet.FullName ASC, " + - "InbredSet.MenuOrderId ASC").format(name) - ) - - @mock.patch('wqflask.api.gen_menu.g') - def test_phenotypes_exist_called_with_correct_query(self, db_mock): - """Test that phenotypes_exist is called with the correct query""" - db_mock.db.execute.return_value.fetchone.return_value = None - phenotypes_exist("test") - db_mock.db.execute.assert_called_with( - "SELECT Name FROM PublishFreeze " - "WHERE PublishFreeze.Name = 'testPublish'" - ) - - @mock.patch('wqflask.api.gen_menu.g') - def test_phenotypes_exist_with_falsy_values(self, db_mock): - """Test that phenotype check returns correctly when given - a None value""" - for x in [None, False, (), [], ""]: - db_mock.db.execute.return_value.fetchone.return_value = x - self.assertFalse(phenotypes_exist("test")) - - @mock.patch('wqflask.api.gen_menu.g') - def test_phenotypes_exist_with_truthy_value(self, db_mock): - """Test that phenotype check returns correctly when given Truthy """ - for x in ["x", ("result"), ["result"], [1]]: - db_mock.db.execute.return_value.fetchone.return_value = (x) - self.assertTrue(phenotypes_exist("test")) - - @mock.patch('wqflask.api.gen_menu.g') - def test_genotypes_exist_called_with_correct_query(self, db_mock): - """Test that genotypes_exist is called with the correct query""" - db_mock.db.execute.return_value.fetchone.return_value = None - genotypes_exist("test") - db_mock.db.execute.assert_called_with( - "SELECT Name FROM GenoFreeze WHERE GenoFreeze.Name = 'testGeno'" - ) - - @mock.patch('wqflask.api.gen_menu.g') - def test_genotypes_exist_with_falsy_values(self, db_mock): - """Test that genotype check returns correctly 
when given - a None value""" - for x in [None, False, (), [], ""]: - db_mock.db.execute.return_value.fetchone.return_value = x - self.assertFalse(genotypes_exist("test")) - - @mock.patch('wqflask.api.gen_menu.g') - def test_genotypes_exist_with_truthy_value(self, db_mock): - """Test that genotype check returns correctly when given Truthy """ - for x in ["x", ("result"), ["result"], [1]]: - db_mock.db.execute.return_value.fetchone.return_value = (x) - self.assertTrue(phenotypes_exist("test")) - - @mock.patch('wqflask.api.gen_menu.g') - def test_build_datasets_with_type_phenotypes(self, db_mock): - """Test that correct dataset is returned for a phenotype type""" - db_mock.db.execute.return_value.fetchall.return_value = ( - (602, "BXDPublish", "BXD Published Phenotypes"), - ) - self.assertEqual(build_datasets("Mouse", "BXD", "Phenotypes"), - [['602', "BXDPublish", "BXD Published Phenotypes"]]) - db_mock.db.execute.assert_called_with( - "SELECT InfoFiles.GN_AccesionId, PublishFreeze.Name, " + - "PublishFreeze.FullName FROM InfoFiles, PublishFreeze, " + - "InbredSet WHERE InbredSet.Name = 'BXD' AND " + - "PublishFreeze.InbredSetId = InbredSet.Id AND " + - "InfoFiles.InfoPageName = PublishFreeze.Name " + - "ORDER BY PublishFreeze.CreateTime ASC" - ) - self.assertEqual(build_datasets("Mouse", "MDP", "Phenotypes"), - [['602', "BXDPublish", "Mouse Phenome Database"]]) - - db_mock.db.execute.return_value.fetchall.return_value = () - db_mock.db.execute.return_value.fetchone.return_value = ( - "BXDPublish", "Mouse Phenome Database" - ) - self.assertEqual(build_datasets("Mouse", "MDP", "Phenotypes"), - [["None", "BXDPublish", "Mouse Phenome Database"]]) - - @mock.patch('wqflask.api.gen_menu.g') - def test_build_datasets_with_type_phenotypes_and_no_results(self, db_mock): - """Test that correct dataset is returned for a phenotype type with no - results - - """ - db_mock.db.execute.return_value.fetchall.return_value = None - db_mock.db.execute.return_value.fetchone.return_value = 
(121, - "text value") - self.assertEqual(build_datasets("Mouse", "BXD", "Phenotypes"), - [["None", "121", "text value"]]) - db_mock.db.execute.assert_called_with( - "SELECT PublishFreeze.Name, PublishFreeze.FullName " - "FROM PublishFreeze, InbredSet " - "WHERE InbredSet.Name = 'BXD' AND " - "PublishFreeze.InbredSetId = InbredSet.Id " - "ORDER BY PublishFreeze.CreateTime ASC" - ) - - @mock.patch('wqflask.api.gen_menu.g') - def test_build_datasets_with_type_genotypes(self, db_mock): - """Test that correct dataset is returned for a phenotype type""" - db_mock.db.execute.return_value.fetchone.return_value = ( - 635, "HLCPublish", "HLC Published Genotypes" - ) - - self.assertEqual(build_datasets("Mouse", "HLC", "Genotypes"), - [["635", "HLCGeno", "HLC Genotypes"]]) - db_mock.db.execute.assert_called_with( - "SELECT InfoFiles.GN_AccesionId FROM InfoFiles, " - "GenoFreeze, InbredSet WHERE InbredSet.Name = 'HLC' AND " - "GenoFreeze.InbredSetId = InbredSet.Id AND " - "InfoFiles.InfoPageName = GenoFreeze.ShortName " + - "ORDER BY GenoFreeze.CreateTime DESC" - ) - db_mock.db.execute.return_value.fetchone.return_value = () - self.assertEqual(build_datasets("Mouse", "HLC", "Genotypes"), - [["None", "HLCGeno", "HLC Genotypes"]]) - - @mock.patch('wqflask.api.gen_menu.g') - def test_build_datasets_with_type_mrna(self, db_mock): - """Test that correct dataset is returned for a mRNA - expression/ Probeset""" - db_mock.db.execute.return_value.fetchall.return_value = ( - (112, "HC_M2_0606_P", - "Hippocampus Consortium M430v2 (Jun06) PDNN"), ) - self.assertEqual(build_datasets("Mouse", "HLC", "mRNA"), [[ - "112", 'HC_M2_0606_P', "Hippocampus Consortium M430v2 (Jun06) PDNN" - ]]) - db_mock.db.execute.assert_called_once_with( - "SELECT ProbeSetFreeze.Id, ProbeSetFreeze.Name, " + - "ProbeSetFreeze.FullName FROM ProbeSetFreeze, " + - "ProbeFreeze, InbredSet, Tissue, Species WHERE " + - "Species.Name = 'Mouse' AND Species.Id = " + - "InbredSet.SpeciesId AND InbredSet.Name = 'HLC' AND " + - 
"ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id and " + - "Tissue.Name = 'mRNA' AND ProbeFreeze.TissueId = " + - "Tissue.Id and ProbeFreeze.InbredSetId = InbredSet.Id " + - "ORDER BY ProbeSetFreeze.CreateTime DESC") - - @mock.patch('wqflask.api.gen_menu.build_datasets') - @mock.patch('wqflask.api.gen_menu.g') - def test_build_types(self, db_mock, datasets_mock): - """Test that correct tissue metadata is returned""" - datasets_mock.return_value = [ - ["112", 'HC_M2_0606_P', - "Hippocampus Consortium M430v2 (Jun06) PDNN"] - ] - db_mock.db.execute.return_value.fetchall.return_value = ( - ('Mouse Tissue'), ('Human Tissue'), ('Rat Tissue') - ) - self.assertEqual(build_types('mouse', 'random group'), - [['M', 'M', 'Molecular Traits'], - ['H', 'H', 'Molecular Traits'], - ['R', 'R', 'Molecular Traits']]) - db_mock.db.execute.assert_called_once_with( - "SELECT DISTINCT Tissue.Name " + - "FROM ProbeFreeze, ProbeSetFreeze, InbredSet, " + - "Tissue, Species WHERE Species.Name = 'mouse' " + - "AND Species.Id = InbredSet.SpeciesId AND " + - "InbredSet.Name = 'random group' AND " + - "ProbeFreeze.TissueId = Tissue.Id AND " + - "ProbeFreeze.InbredSetId = InbredSet.Id AND " + - "ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id " + - "ORDER BY Tissue.Name" - ) - - @mock.patch('wqflask.api.gen_menu.build_types') - @mock.patch('wqflask.api.gen_menu.genotypes_exist') - @mock.patch('wqflask.api.gen_menu.phenotypes_exist') - def test_get_types_with_existing_genotype_and_phenotypes( - self, - phenotypes_exist_mock, - genotypes_exist_mock, - build_types_mock): - """Test that build types are constructed correctly if phenotypes and genotypes - exist - - """ - phenotypes_exist_mock.return_value = True - genotypes_exist_mock.return_value = True - - expected_result = self.test_type - - build_types_mock.return_value = [ - ['M', 'M', 'Molecular Trait Datasets'] - ] - self.assertEqual(get_types(self.test_group), expected_result) - - @mock.patch('wqflask.api.gen_menu.build_types') - 
@mock.patch('wqflask.api.gen_menu.genotypes_exist') - @mock.patch('wqflask.api.gen_menu.phenotypes_exist') - def test_get_types_with_buildtype_and_non_existent_genotype_and_phenotypes( - self, - phenotypes_exist_mock, - genotypes_exist_mock, - build_types_mock): - """Test that build types are constructed correctly if phenotypes_exist and - genotypes_exist are false but build_type is falsy - - """ - phenotypes_exist_mock.return_value = False - genotypes_exist_mock.return_value = False - - build_types_mock.return_value = [] - self.assertEqual(get_types(self.test_group), { - 'mouse': {}, - 'human': {} - }) - - @mock.patch('wqflask.api.gen_menu.build_types') - @mock.patch('wqflask.api.gen_menu.genotypes_exist') - @mock.patch('wqflask.api.gen_menu.phenotypes_exist') - def test_get_types_with_non_existent_genotype_phenotypes_and_buildtype( - self, - phenotypes_exist_mock, - genotypes_exist_mock, - build_types_mock): - """Test that build types are constructed correctly if phenotypes_exist, - genotypes_exist and build_types are truthy - - """ - phenotypes_exist_mock.return_value = False - genotypes_exist_mock.return_value = False - - build_types_mock.return_value = [ - ['M', 'M', 'Molecular Trait Datasets'] - ] - expected_result = { - 'mouse': { - 'H_T2': [['M', 'M', 'Molecular Trait Datasets']], - 'H_T1': [['M', 'M', 'Molecular Trait Datasets']]}, - 'human': { - 'HLC': [['M', 'M', 'Molecular Trait Datasets']], - 'BXD': [['M', 'M', 'Molecular Trait Datasets']]}} - self.assertEqual(get_types(self.test_group), - expected_result) - - @mock.patch('wqflask.api.gen_menu.build_datasets') - def test_get_datasets_with_existent_datasets(self, - build_datasets_mock): - """Test correct dataset is returned with existent build_datasets""" - build_datasets_mock.return_value = "Test" - expected_result = { - 'mouse': { - 'H_T2': {'Genotypes': 'Test', - 'M': 'Test', - 'Phenotypes': 'Test'}, - 'H_T1': {'Genotypes': 'Test', - 'M': 'Test', - 'Phenotypes': 'Test'}}, - 'human': {'HLC': 
{'Genotypes': 'Test', - 'M': 'Test', - 'Phenotypes': 'Test'}, - 'BXD': {'Genotypes': 'Test', - 'M': 'Test', - 'Phenotypes': 'Test'}}} - self.assertEqual(get_datasets(self.test_type), - expected_result) - - @mock.patch('wqflask.api.gen_menu.build_datasets') - def test_get_datasets_with_non_existent_datasets(self, - build_datasets_mock): - """Test correct dataset is returned with non-existent build_datasets""" - build_datasets_mock.return_value = None - expected_result = { - 'mouse': { - 'H_T2': {}, - 'H_T1': {}}, - 'human': {'HLC': {}, - 'BXD': {}}} - self.assertEqual(get_datasets(self.test_type), - expected_result) - - @mock.patch('wqflask.api.gen_menu.get_datasets') - @mock.patch('wqflask.api.gen_menu.get_types') - @mock.patch('wqflask.api.gen_menu.get_groups') - @mock.patch('wqflask.api.gen_menu.get_species') - def test_gen_dropdown_json(self, - species_mock, - groups_mock, - types_mock, - datasets_mock): - "Test that the correct dictionary is constructed properly" - species_mock.return_value = ("speciesA speciesB speciesC speciesD" - .split(" ")) - datasets_mock.return_value = ("datasetA datasetB datasetC datasetD" - .split(" ")) - groups_mock.return_value = ("groupA groupB groupC groupD" - .split(" ")) - types_mock.return_value = ("typeA typeB typeC typeD" - .split(" ")) - datasets_mock.return_value = ("datasetA datasetB datasetC datasetD" - .split(" ")) - - expected_result = { - 'datasets': ['datasetA', 'datasetB', 'datasetC', 'datasetD'], - 'types': ['typeA', 'typeB', 'typeC', 'typeD'], - 'groups': ['groupA', 'groupB', 'groupC', 'groupD'], - 'species': ['speciesA', 'speciesB', 'speciesC', 'speciesD']} - - self.assertEqual(gen_dropdown_json(), expected_result) diff --git a/wqflask/tests/wqflask/marker_regression/__init__.py b/wqflask/tests/wqflask/marker_regression/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/wqflask/tests/wqflask/marker_regression/test_display_mapping_results.py 
b/wqflask/tests/wqflask/marker_regression/test_display_mapping_results.py deleted file mode 100644 index 8ae0f09f..00000000 --- a/wqflask/tests/wqflask/marker_regression/test_display_mapping_results.py +++ /dev/null @@ -1,156 +0,0 @@ -import unittest - -import htmlgen as HT -from wqflask.marker_regression.display_mapping_results import ( - DisplayMappingResults, - HtmlGenWrapper -) - - -class TestDisplayMappingResults(unittest.TestCase): - """Basic Methods to test Mapping Results""" - def test_pil_colors(self): - """Test that colors use PILLOW color format""" - self.assertEqual(DisplayMappingResults.CLICKABLE_WEBQTL_REGION_COLOR, - (245, 211, 211)) - - -class TestHtmlGenWrapper(unittest.TestCase): - """Test Wrapper around HTMLGen""" - def test_create_image(self): - """Test HT.Image method""" - self.assertEqual( - str(HtmlGenWrapper.create_image_tag(src="test.png", - alt="random", - border="0", - width="10", - height="13", - usemap="#webqtlmap")), - ("""random""") - ) - - def test_create_form(self): - """Test HT.Form method""" - test_form = HtmlGenWrapper.create_form_tag( - cgi="/testing/", - enctype='multipart/form-data', - name="formName", - submit=HtmlGenWrapper.create_input_tag(type_='hidden', name='Default_Name') - ) - test_image = HtmlGenWrapper.create_image_tag( - src="test.png", - alt="random", - border="0", - width="10", - height="13", - usemap="#webqtlmap" - ) - self.assertEqual( - str(test_form).replace("\n", ""), - ("""
""")) - hddn = { - 'FormID': 'showDatabase', - 'ProbeSetID': '_', - 'database': "TestGeno", - 'CellID': '_', - 'RISet': "Test", - 'incparentsf1': 'ON' - } - for key in hddn.keys(): - test_form.append( - HtmlGenWrapper.create_input_tag( - name=key, - value=hddn[key], - type_='hidden')) - test_form.append(test_image) - - self.assertEqual(str(test_form).replace("\n", ""), ( - """
""" - """""" - """""" - """""" - """""" - """""" - """""" - """""" - """random""" - """
""")) - - def test_create_paragraph(self): - """Test HT.Paragraph method""" - test_p_element = HtmlGenWrapper.create_p_tag(id="smallSize") - par_text = ( - "Mapping using genotype data as " - "a trait will result in infinity LRS at one locus. " - "In order to display the result properly, all LRSs " - "higher than 100 are capped at 100." - ) - self.assertEqual( - str(test_p_element), - """

""" - ) - test_p_element.append(HtmlGenWrapper.create_br_tag()) - test_p_element.append(par_text) - self.assertEqual( - str(test_p_element), - """


{}

""".format(par_text) - ) - - def test_create_br_tag(self): - """Test HT.BR() method""" - self.assertEqual(str(HtmlGenWrapper.create_br_tag()), - "
") - - def test_create_input_tag(self): - """Test HT.Input method""" - self.assertEqual( - str(HtmlGenWrapper.create_input_tag( - type_="hidden", - name="name", - value="key", - Class="trait trait_")).replace("\n", ""), - ("""""")) - - def test_create_map_tag(self): - """Test HT.Map method""" - self.assertEqual(str(HtmlGenWrapper.create_map_tag( - name="WebqTLImageMap")).replace("\n", ""), - """""") - gifmap = HtmlGenWrapper.create_map_tag(name="test") - gifmap.append(HtmlGenWrapper.create_area_tag(shape="rect", - coords='1 2 3', href='#area1')) - gifmap.append(HtmlGenWrapper.create_area_tag(shape="rect", - coords='1 2 3', href='#area2')) - self.assertEqual( - str(gifmap).replace("\n", ""), - ("""""" - """""" - """""" - """""")) - - def test_create_area_tag(self): - """Test HT.Area method""" - self.assertEqual( - str(HtmlGenWrapper.create_area_tag( - shape="rect", - coords="1 2", - href="http://test.com", - title="Some Title")).replace("\n", ""), - ("""""")) - - def test_create_link_tag(self): - """Test HT.HREF method""" - self.assertEqual( - str(HtmlGenWrapper.create_link_tag( - "www.test.com", "test", target="_blank")).replace("\n", ""), - """test""") diff --git a/wqflask/tests/wqflask/show_trait/__init__.py b/wqflask/tests/wqflask/show_trait/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/wqflask/tests/wqflask/show_trait/test_export_trait_data.py b/wqflask/tests/wqflask/show_trait/test_export_trait_data.py deleted file mode 100644 index 41761944..00000000 --- a/wqflask/tests/wqflask/show_trait/test_export_trait_data.py +++ /dev/null @@ -1,212 +0,0 @@ -import unittest -from unittest import mock -from wqflask.show_trait.export_trait_data import dict_to_sorted_list -from wqflask.show_trait.export_trait_data import cmp_samples -from wqflask.show_trait.export_trait_data import export_sample_table -from wqflask.show_trait.export_trait_data import get_export_metadata - - -class AttributesSetter: - def __init__(self, obj): - for key, value in 
obj.items(): - setattr(self, key, value) - - -class TestExportTraits(unittest.TestCase): - """Test methods related to converting dict to sortedlist""" - @mock.patch("wqflask.show_trait.export_trait_data.create_trait") - @mock.patch("wqflask.show_trait.export_trait_data.data_set") - def test_get_export_metadata_no_publish(self, mock_dataset, mock_trait): - """test for exporting metadata with no publish""" - mock_dataset_attributes = AttributesSetter( - {"type": "no_publish", "dataset_name": "Temp", "name": "Temp"}) - - mock_nested_attributes = AttributesSetter({"name": "name"}) - mock_dataset_attributes.group = mock_nested_attributes - mock_dataset.create_dataset.return_value = mock_dataset_attributes - mock_trait.return_value = AttributesSetter({"symbol": "", "description_display": "Description", - "title": "research1", "journal": "", "authors": ""}) - - results = get_export_metadata("random_id", "Temp") - expected = [["Record ID: random_id"], - ["Trait URL: http://genenetwork.org/show_trait?trait_id=random_id&dataset=Temp"], - ["Dataset: Temp"], - ["Group: name"], []] - - mock_dataset.create_dataset.assert_called_with("Temp") - mock_trait.assert_called_with( - dataset=mock_dataset_attributes, name="random_id", cellid=None, get_qtl_info=False) - self.assertEqual(results, expected) - - @mock.patch("wqflask.show_trait.export_trait_data.create_trait") - @mock.patch("wqflask.show_trait.export_trait_data.data_set") - def test_get_export_metadata_with_publish(self, data_mock, trait_mock): - """test for exporting metadata with dataset.type=Publish""" - mock_dataset_attributes = AttributesSetter({"type": "Publish", "dataset_name": "Temp", - "name": "Temp", "description_display": "Description goes here"}) - - mock_nested_attributes = AttributesSetter({"name": "name"}) - mock_dataset_attributes.group = mock_nested_attributes - data_mock.create_dataset.return_value = mock_dataset_attributes - trait_instance = AttributesSetter({"symbol": "", "description_display": 
"Description", - "title": "research1", "journal": "", "authors": ""}) - trait_mock.return_value = trait_instance - - results = get_export_metadata( - "29ae0615-0d77-4814-97c7-c9e91f6bfd7b", "Temp") - - expected = [['Phenotype ID: 29ae0615-0d77-4814-97c7-c9e91f6bfd7b'], - ['Phenotype URL: http://genenetwork.org/show_trait?trait_id=29ae0615-0d77-4814-97c7-c9e91f6bfd7b&dataset=Temp'], [ - 'Group: name'], ['Phenotype: Description'], - ['Authors: N/A'], ['Title: research1'], - ['Journal: N/A'], ['Dataset Link: http://gn1.genenetwork.org/webqtl/main.py?FormID=sharinginfo&InfoPageName=Temp'], []] - - self.assertEqual(results, expected) - - @mock.patch("wqflask.show_trait.export_trait_data.dict_to_sorted_list") - @mock.patch("wqflask.show_trait.export_trait_data.get_export_metadata") - def test_export_sample_table(self, exp_metadata, dict_list): - """test for exporting sample table""" - targs_obj = { - "export_data": """{ - "primary_samples": [ - { - "other": "germanotta", - "name": "Sauroniops", - "se":{ - "name":"S2" - }, - "num_cases":{ - "k1":"value" - - } - } - ], - "other_samples": [ - { - "se": 1, - "num_cases": 4, - "value": 6, - "name": 3 - } - ] - }""", - "trait_display_name": "Hair_color", - "trait_id": "23177fdc-312e-4084-ad0c-f3eae785fff5", - "dataset": { - } - } - exp_metadata.return_value = [ - ["Phenotype ID:0a2be192-57f5-400b-bbbd-0cf50135995f"], ['Group:gp1'], - ["Phenotype:p1"], [ - "Authors:N/A"], - ["Title:research1"], - ["Journal:N/A"], - ["Dataset Link: http://gn1.genenetwork.org/webqtl/main.py?FormID=sharinginfo&InfoPageName=name1"], []] - expected = ('Hair_color', - [['Phenotype ID:0a2be192-57f5-400b-bbbd-0cf50135995f'], - ['Group:gp1'], - ['Phenotype:p1'], - ['Authors:N/A'], - ['Title:research1'], - ['Journal:N/A'], - ['Dataset Link: ' - 'http://gn1.genenetwork.org/webqtl/main.py?FormID=sharinginfo&InfoPageName=name1'], - [], - ['Name', 'Value', 'SE', 'N'], - ['Sauroniops', 'germanotta'], - [3, 6, 1, 4]]) - - dict_list.side_effect = 
[['Sauroniops', 'germanotta'], [3, 6, 1, 4]] - - self.assertEqual(export_sample_table(targs_obj), expected) - exp_metadata.assert_called_with( - "23177fdc-312e-4084-ad0c-f3eae785fff5", {}) - self.assertEqual(dict_list.call_count, 2) - - def test_dict_to_sortedlist(self): - """test for conversion of dict to sorted list""" - sample1 = { - "other": "exp1", - "name": "exp2" - } - sample2 = { - "se": 1, - "num_cases": 4, - "value": 6, - "name": 3 - - } - rever = { - "name": 3, - "value": 6, - "num_cases": 4, - "se": 1 - } - oneItem = { - "item1": "one" - } - - self.assertEqual(["exp2", "exp1"], dict_to_sorted_list(sample1)) - self.assertEqual([3, 6, 1, 4], dict_to_sorted_list(sample2)) - self.assertEqual([3, 6, 1, 4], dict_to_sorted_list(rever)) - self.assertEqual(["one"], dict_to_sorted_list(oneItem)) - """test that the func returns the values not the keys""" - self.assertFalse(["other", "name"] == dict_to_sorted_list(sample1)) - - def test_cmp_samples(self): - """test for comparing samples function""" - sampleA = [ - [ - ("value", "other"), - ("name", "test_name") - ] - ] - sampleB = [ - [ - ("value", "other"), - ("unknown", "test_name") - ] - ] - sampleC = [ - [("other", "value"), - ("name", "value") - ], - [ - ("name", "value"), - ("value", "name") - ], - [ - ("other", "value"), - ("name", "value" - )], - [ - ("name", "name1"), - ("se", "valuex") - ], - [( - "value", "name1"), - ("se", "valuex") - ], - [( - "other", "name1"), - ("se", "valuex" - ) - ], - [( - "name", "name_val"), - ("num_cases", "num_val") - ], - [( - "other_a", "val_a"), - ("other_b", "val" - ) - ] - ] - results = [cmp_samples(val[0], val[1]) for val in sampleA] - resultB = [cmp_samples(val[0], val[1]) for val in sampleB] - resultC = [cmp_samples(val[0], val[1]) for val in sampleC] - - self.assertEqual(1, *results) - self.assertEqual(-1, *resultB) - self.assertEqual([1, -1, 1, -1, -1, 1, -1, -1], resultC) diff --git a/wqflask/tests/wqflask/test_collect.py b/wqflask/tests/wqflask/test_collect.py 
deleted file mode 100644 index 9a36132d..00000000 --- a/wqflask/tests/wqflask/test_collect.py +++ /dev/null @@ -1,73 +0,0 @@ -"""Test cases for some methods in collect.py""" - -import unittest -from unittest import mock - -from flask import Flask -from wqflask.collect import process_traits - -app = Flask(__name__) - - -class MockSession: - """Helper class for mocking wqflask.collect.g.user_session.logged_in""" - def __init__(self, is_logged_in=False): - self.is_logged_in = is_logged_in - - @property - def logged_in(self): - return self.is_logged_in - - -class MockFlaskG: - """Helper class for mocking wqflask.collect.g.user_session""" - def __init__(self, is_logged_in=False): - self.is_logged_in = is_logged_in - - @property - def user_session(self): - if self.is_logged_in: - return MockSession(is_logged_in=True) - return MockSession() - - -class TestCollect(unittest.TestCase): - - def setUp(self): - self.app_context = app.app_context() - self.app_context.push() - - def tearDown(self): - self.app_context.pop() - - @mock.patch("wqflask.collect.g", MockFlaskG()) - def test_process_traits_with_bytestring(self): - """ - Test that the correct traits are returned when the user is logged - out and bytes are used. - """ - self.assertEqual(process_traits( - b'1452452_at:HC_M2_0606_P:163d04f7db7c9e110de6,' - b'1452447_at:HC_M2_0606_P:eeece8fceb67072debea,' - b'1451401_a_at:HC_M2_0606_P:a043d23b3b3906d8318e,' - b'1429252_at:HC_M2_0606_P:6fa378b349bc9180e8f5'), - set(['1429252_at:HC_M2_0606_P', - '1451401_a_at:HC_M2_0606_P', - '1452447_at:HC_M2_0606_P', - '1452452_at:HC_M2_0606_P'])) - - @mock.patch("wqflask.collect.g", MockFlaskG()) - def test_process_traits_with_normal_string(self): - """ - Test that the correct traits are returned when the user is logged - out and a normal string is used. 
- """ - self.assertEqual(process_traits( - '1452452_at:HC_M2_0606_P:163d04f7db7c9e110de6,' - '1452447_at:HC_M2_0606_P:eeece8fceb67072debea,' - '1451401_a_at:HC_M2_0606_P:a043d23b3b3906d8318e,' - '1429252_at:HC_M2_0606_P:6fa378b349bc9180e8f5'), - set(['1429252_at:HC_M2_0606_P', - '1451401_a_at:HC_M2_0606_P', - '1452447_at:HC_M2_0606_P', - '1452452_at:HC_M2_0606_P'])) diff --git a/wqflask/tests/wqflask/test_pbkdf2.py b/wqflask/tests/wqflask/test_pbkdf2.py deleted file mode 100644 index a33fbd4f..00000000 --- a/wqflask/tests/wqflask/test_pbkdf2.py +++ /dev/null @@ -1,61 +0,0 @@ -"""Test cases pbkdf2""" - -import unittest -from wqflask.pbkdf2 import pbkdf2_hex - - -class TestPbkdf2(unittest.TestCase): - def test_pbkdf2_hex(self): - """ - Test pbkdf2_hex function - """ - - for password, salt, iterations, keylen, expected_value in [ - ('password', 'salt', 1, 20, - '0c60c80f961f0e71f3a9b524af6012062fe037a6'), - ('password', 'salt', 2, 20, - 'ea6c014dc72d6f8ccd1ed92ace1d41f0d8de8957'), - ('password', 'salt', 4096, 20, - '4b007901b765489abead49d926f721d065a429c1'), - ('passwordPASSWORDpassword', - 'saltSALTsaltSALTsaltSALTsaltSALTsalt', - 4096, 25, - '3d2eec4fe41c849b80c8d83662c0e44a8b291a964cf2f07038'), - ('pass\x00word', 'sa\x00lt', 4096, 16, - '56fa6aa75548099dcc37d7f03425e0c3'), - ('password', 'ATHENA.MIT.EDUraeburn', 1, 16, - 'cdedb5281bb2f801565a1122b2563515'), - ('password', 'ATHENA.MIT.EDUraeburn', 1, 32, - ('cdedb5281bb2f80' - '1565a1122b256351' - '50ad1f7a04bb9f3a33' - '3ecc0e2e1f70837')), - ('password', 'ATHENA.MIT.EDUraeburn', 2, 16, - '01dbee7f4a9e243e988b62c73cda935d'), - ('password', 'ATHENA.MIT.EDUraeburn', 2, 32, - ('01dbee7f4a9e243e9' - '88b62c73cda935da05' - '378b93244ec8f48a99' - 'e61ad799d86')), - ('password', 'ATHENA.MIT.EDUraeburn', 1200, 32, - ('5c08eb61fdf71e' - '4e4ec3cf6ba1f55' - '12ba7e52ddbc5e51' - '42f708a31e2e62b1e13')), - ('X' * 64, 'pass phrase equals block size', 1200, 32, - ('139c30c0966bc32ba' - '55fdbf212530ac9c5' - 'ec59f1a452f5cc9ad' - 
'940fea0598ed1')), - ('X' * 65, 'pass phrase exceeds block size', 1200, 32, - ('9ccad6d468770cd' - '51b10e6a68721be6' - '11a8b4d282601db3' - 'b36be9246915ec82a')) - ]: - self.assertEqual( - pbkdf2_hex(data=password, - salt=salt, - iterations=iterations, - keylen=keylen), - expected_value) diff --git a/wqflask/tests/wqflask/test_user_login.py b/wqflask/tests/wqflask/test_user_login.py deleted file mode 100644 index 61cd9ab9..00000000 --- a/wqflask/tests/wqflask/test_user_login.py +++ /dev/null @@ -1,21 +0,0 @@ -"""Test cases for some methods in login.py""" - -import unittest -from wqflask.user_login import encode_password - - -class TestUserLogin(unittest.TestCase): - def test_encode_password(self): - """ - Test encode password - """ - pass_gen_fields = { - "salt": "salt", - "hashfunc": "sha1", - "iterations": 4096, - "keylength": 20, - } - self.assertEqual( - encode_password(pass_gen_fields, - "password").get("password"), - '4b007901b765489abead49d926f721d065a429c1') diff --git a/wqflask/tests/wqflask/test_user_session.py b/wqflask/tests/wqflask/test_user_session.py deleted file mode 100644 index ebb0334a..00000000 --- a/wqflask/tests/wqflask/test_user_session.py +++ /dev/null @@ -1,15 +0,0 @@ -"""Test cases for some methods in user_session.py""" - -import unittest -from wqflask.user_session import verify_cookie - - -class TestUserSession(unittest.TestCase): - def test_verify_cookie(self): - """ - Test cookie verification - """ - self.assertEqual( - "3f4c1dbf-5b56-4260-87d6-f35445bda37e", - verify_cookie(("3f4c1dbf-5b56-4260-87d6-" - "f35445bda37e:af4fcf5eace9e7c864ce"))) diff --git a/wqflask/wqflask/templates/glossary.html b/wqflask/wqflask/templates/glossary.html index 988297d3..80e74ceb 100644 --- a/wqflask/wqflask/templates/glossary.html +++ b/wqflask/wqflask/templates/glossary.html @@ -3,5 +3,5 @@ {% block title %}Glossary{% endblock %} {% block content %} -Test +

Test

{% endblock %} -- cgit v1.2.3 From 390dcc3c46495a8e316df36ceb57dae2089456da Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Tue, 3 Nov 2020 19:14:24 +0300 Subject: Remove encoding header for file In python3 the default encoding is utf-8 so this is redundant. --- wqflask/tests/unit/utility/test_hmac.py | 1 - wqflask/wqflask/views.py | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/wqflask/tests/unit/utility/test_hmac.py b/wqflask/tests/unit/utility/test_hmac.py index 4e3652f8..13d6261d 100644 --- a/wqflask/tests/unit/utility/test_hmac.py +++ b/wqflask/tests/unit/utility/test_hmac.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Test hmac utility functions""" import unittest diff --git a/wqflask/wqflask/views.py b/wqflask/wqflask/views.py index 08673f79..b7c4d142 100644 --- a/wqflask/wqflask/views.py +++ b/wqflask/wqflask/views.py @@ -1,6 +1,4 @@ -# -*- coding: utf-8 -*- -# -# Main routing table for GN2 +"""Main routing table for GN2""" import traceback # for error page import os # for error gifs -- cgit v1.2.3 From ddd071b2bd50d243c01c335e06955316df08fc45 Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Tue, 3 Nov 2020 19:16:17 +0300 Subject: Rename glossary.py to markdown_routes.py Evary page with markdown content will have routes added in markdown_routes.py * markdown_routes.py: New file. * wqflask/wqflask/glossary.py: Delete it. * wqflask/wqflask/__init__.py: Update import. 
--- wqflask/wqflask/__init__.py | 2 +- wqflask/wqflask/glossary.py | 9 --------- wqflask/wqflask/markdown_routes.py | 29 +++++++++++++++++++++++++++++ 3 files changed, 30 insertions(+), 10 deletions(-) delete mode 100644 wqflask/wqflask/glossary.py create mode 100644 wqflask/wqflask/markdown_routes.py diff --git a/wqflask/wqflask/__init__.py b/wqflask/wqflask/__init__.py index a3870ce6..874cde17 100644 --- a/wqflask/wqflask/__init__.py +++ b/wqflask/wqflask/__init__.py @@ -6,7 +6,7 @@ import jinja2 from flask import g from flask import Flask from utility import formatting -from wqflask.glossary import glossary_blueprint +from wqflask.markdown_routes import glossary_blueprint app = Flask(__name__) diff --git a/wqflask/wqflask/glossary.py b/wqflask/wqflask/glossary.py deleted file mode 100644 index a44e7c45..00000000 --- a/wqflask/wqflask/glossary.py +++ /dev/null @@ -1,9 +0,0 @@ -from flask import Blueprint -from flask import render_template - -glossary_blueprint = Blueprint('glossary_blueprint', __name__) - - -@glossary_blueprint.route('/') -def glossary(): - return render_template("glossary.html"), 200 diff --git a/wqflask/wqflask/markdown_routes.py b/wqflask/wqflask/markdown_routes.py new file mode 100644 index 00000000..e7e3c33e --- /dev/null +++ b/wqflask/wqflask/markdown_routes.py @@ -0,0 +1,29 @@ +"""Markdown routes + +Render pages from github, or if they are unavailable, look for it else where +""" +import requests +import mistune + +from flask import Blueprint +from flask import render_template + +glossary_blueprint = Blueprint('glossary_blueprint', __name__) + + +@glossary_blueprint.route('/') +def glossary(): + markdown_url = ("https://raw.githubusercontent.com" + "/genenetwork/genenetwork2/" + "wqflask/wqflask/static" + "/glossary.md") + md_content = requests.get(markdown_url) + if md_content.status_code == 200: + return render_template( + "glossary_html", + rendered_markdown=mistune.html( + md_content.content.decode("utf-8"))), 200 + + return 
render_template( + "glossary.html", + rendered_markdown=mistune.html("# Github Down!")), 200 -- cgit v1.2.3 From f2ffa6a8bcd94a3435a19d82700465c4b9508f00 Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Tue, 3 Nov 2020 20:54:09 +0300 Subject: Update glossary template with markdown content --- wqflask/wqflask/templates/glossary.html | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/wqflask/wqflask/templates/glossary.html b/wqflask/wqflask/templates/glossary.html index 80e74ceb..718baf13 100644 --- a/wqflask/wqflask/templates/glossary.html +++ b/wqflask/wqflask/templates/glossary.html @@ -3,5 +3,7 @@ {% block title %}Glossary{% endblock %} {% block content %} -

Test

+ +{{ rendered_markdown|safe }} + {% endblock %} -- cgit v1.2.3 From f31fe80702cd24fa3687e7254e9ad212340715fd Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Tue, 3 Nov 2020 21:05:52 +0300 Subject: Add simple md content in glossary.md --- wqflask/wqflask/static/markdown/glossary.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 wqflask/wqflask/static/markdown/glossary.md diff --git a/wqflask/wqflask/static/markdown/glossary.md b/wqflask/wqflask/static/markdown/glossary.md new file mode 100644 index 00000000..241dbf0a --- /dev/null +++ b/wqflask/wqflask/static/markdown/glossary.md @@ -0,0 +1 @@ +# Content -- cgit v1.2.3 From b526fc6f6b6f450e169c01db5ffea92f94512393 Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Tue, 3 Nov 2020 21:54:36 +0300 Subject: Use python3-genenetwork2 on commit bdce85d in container * .github/workflows/main.yml: Update gn2 container. --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index f27feb5f..d76a5433 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -11,7 +11,7 @@ on: jobs: unittest: runs-on: ubuntu-latest - container: bonfacekilz/python3-genenetwork2:latest + container: bonfacekilz/python3-genenetwork2:bdce85d steps: # First start with mariadb set then checkout. The checkout gives -- cgit v1.2.3 From a7622fc3d996407799cec166968c1e56baf07ea9 Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Tue, 3 Nov 2020 21:59:49 +0300 Subject: Move logic for fetching md files to it's own function * wqflask/wqflask/markdown_routes.py (render_markdown): New function. (glossary): use render_markdown function. 
--- wqflask/wqflask/markdown_routes.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/wqflask/wqflask/markdown_routes.py b/wqflask/wqflask/markdown_routes.py index e7e3c33e..33092947 100644 --- a/wqflask/wqflask/markdown_routes.py +++ b/wqflask/wqflask/markdown_routes.py @@ -2,6 +2,7 @@ Render pages from github, or if they are unavailable, look for it else where """ +import os import requests import mistune @@ -11,19 +12,27 @@ from flask import render_template glossary_blueprint = Blueprint('glossary_blueprint', __name__) -@glossary_blueprint.route('/') -def glossary(): - markdown_url = ("https://raw.githubusercontent.com" - "/genenetwork/genenetwork2/" - "wqflask/wqflask/static" - "/glossary.md") +def render_markdown(file_name): + """Try to fetch the file name from Github and if that fails, try to +look for it inside the file system + + """ + markdown_url = (f"https://raw.githubusercontent.com" + f"/genenetwork/genenetwork2/" + f"wqflask/wqflask/static/" + f"{file_name}") md_content = requests.get(markdown_url) if md_content.status_code == 200: - return render_template( - "glossary_html", - rendered_markdown=mistune.html( - md_content.content.decode("utf-8"))), 200 + return mistune.html(md_content.content.decode("utf-8")) + + with open(os.path.join(os.path.abspath(os.path.dirname(__file__)), + f"static/markdown/{file_name}")) as md_file: + markdown = md_file.read() + return mistune.html(markdown) + +@glossary_blueprint.route('/') +def glossary(): return render_template( "glossary.html", - rendered_markdown=mistune.html("# Github Down!")), 200 + rendered_markdown=render_markdown("glossary.md")), 200 -- cgit v1.2.3 From 74550ef0c76a941c473c8d024ccc0a0403631c49 Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Tue, 3 Nov 2020 22:03:26 +0300 Subject: Add basic structure for "/glossary" routes test --- wqflask/tests/integration/test_markdown_routes.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create 
mode 100644 wqflask/tests/integration/test_markdown_routes.py diff --git a/wqflask/tests/integration/test_markdown_routes.py b/wqflask/tests/integration/test_markdown_routes.py new file mode 100644 index 00000000..5e3e5045 --- /dev/null +++ b/wqflask/tests/integration/test_markdown_routes.py @@ -0,0 +1,21 @@ +"Integration tests for markdown routes" +import unittest + +from bs4 import BeautifulSoup + +from wqflask import app + + +class TestGenMenu(unittest.TestCase): + """Tests for glossary""" + + def setUp(self): + self.app = app.test_client() + + def tearDown(self): + pass + + def test_glossary_page(self): + """Test that the glossary page is rendered properly""" + response = self.app.get('/glossary', follow_redirects=True) + pass -- cgit v1.2.3 From bb46ab063cc86525946563c809a896532d87147a Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Tue, 3 Nov 2020 22:45:44 +0300 Subject: Add basic tests for rendering_markdown * wqflask/tests/unit/wqflask/test_markdown_routes.py: New tests. --- wqflask/tests/unit/wqflask/test_markdown_routes.py | 43 ++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 wqflask/tests/unit/wqflask/test_markdown_routes.py diff --git a/wqflask/tests/unit/wqflask/test_markdown_routes.py b/wqflask/tests/unit/wqflask/test_markdown_routes.py new file mode 100644 index 00000000..8b6f7490 --- /dev/null +++ b/wqflask/tests/unit/wqflask/test_markdown_routes.py @@ -0,0 +1,43 @@ +"""Test functions in markdown utils""" + +import unittest +from unittest import mock + +from wqflask.markdown_routes import render_markdown + + +class MockRequests404: + @property + def status_code(): + return 404 + +class MockRequests200: + @property + def status_code(): + return 200 + + @property + def content(): + return """ + # Glossary + + This is some content + + ## Sub-heading + This is another sub-heading + """ + +class TestMarkdownRoutesFunctions(unittest.TestCase): + """Test cases for functions in markdown_routes""" + + 
@mock.patch('wqflask.markdown_routes.requests.get') + def test_render_markdown(self, requests_mock): + requests_mock.return_value = MockRequests404 + markdown_content = render_markdown("glossary.md") + requests_mock.assert_called_with( + "https://raw.githubusercontent.com" + "/genenetwork/genenetwork2/" + "wqflask/wqflask/static/" + "glossary.md") + self.assertEqual("

Content

\n", + markdown_content) -- cgit v1.2.3 From 3ca276a1e9d3aa29f037696b640c64b2d8629c7f Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Tue, 3 Nov 2020 22:50:52 +0300 Subject: Delete file * wqflask/tests/integration/test_glossary.py: Delete it. Earlier renamed to test_markdown_routes. --- wqflask/tests/integration/test_glossary.py | 28 ---------------------------- 1 file changed, 28 deletions(-) delete mode 100644 wqflask/tests/integration/test_glossary.py diff --git a/wqflask/tests/integration/test_glossary.py b/wqflask/tests/integration/test_glossary.py deleted file mode 100644 index c9f1e62a..00000000 --- a/wqflask/tests/integration/test_glossary.py +++ /dev/null @@ -1,28 +0,0 @@ -"Integration tests for glossary" -import unittest - -from bs4 import BeautifulSoup - -from wqflask import app - - -class TestGenMenu(unittest.TestCase): - """Tests for glossary""" - - def setUp(self): - self.app = app.test_client() - - def tearDown(self): - pass - - def test_glossary_page(self): - """Test that the glossary page is rendered properly""" - response = self.app.get('/glossary', follow_redirects=True) - html_content = BeautifulSoup(response.data, "lxml") - self.assertEqual(html_content.find("title").get_text(), - "Glossary GeneNetwork 2") - self.assertEqual( - html_content.find( - 'p', - attrs={'id': 'mytest'}).get_text(), - "Test") -- cgit v1.2.3 From 5f6756f05baeeb6be34a079726d0df749a0557ec Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Tue, 3 Nov 2020 23:04:53 +0300 Subject: Fix false-positive tests * wqflask/tests/unit/wqflask/test_markdown_routes.py: (MockRequests404): Pass self in all properties. (MockRequests200): Ditto. (test_render_markdown): Rename to test_render_markdown_when_fetching_locally. (test_render_markdown_when_fetching_remotely): New test. 
--- wqflask/tests/unit/wqflask/test_markdown_routes.py | 37 +++++++++++++++------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/wqflask/tests/unit/wqflask/test_markdown_routes.py b/wqflask/tests/unit/wqflask/test_markdown_routes.py index 8b6f7490..3de14276 100644 --- a/wqflask/tests/unit/wqflask/test_markdown_routes.py +++ b/wqflask/tests/unit/wqflask/test_markdown_routes.py @@ -8,31 +8,30 @@ from wqflask.markdown_routes import render_markdown class MockRequests404: @property - def status_code(): + def status_code(self): return 404 class MockRequests200: @property - def status_code(): + def status_code(self): return 200 @property - def content(): - return """ - # Glossary + def content(self): + return b""" +# Glossary +This is some content - This is some content - - ## Sub-heading - This is another sub-heading +## Sub-heading +This is another sub-heading """ class TestMarkdownRoutesFunctions(unittest.TestCase): """Test cases for functions in markdown_routes""" @mock.patch('wqflask.markdown_routes.requests.get') - def test_render_markdown(self, requests_mock): - requests_mock.return_value = MockRequests404 + def test_render_markdown_when_fetching_locally(self, requests_mock): + requests_mock.return_value = MockRequests404() markdown_content = render_markdown("glossary.md") requests_mock.assert_called_with( "https://raw.githubusercontent.com" @@ -41,3 +40,19 @@ class TestMarkdownRoutesFunctions(unittest.TestCase): "glossary.md") self.assertEqual("

Content

\n", markdown_content) + + @mock.patch('wqflask.markdown_routes.requests.get') + def test_render_markdown_when_fetching_remotely(self, requests_mock): + requests_mock.return_value = MockRequests200() + markdown_content = render_markdown("glossary.md") + requests_mock.assert_called_with( + "https://raw.githubusercontent.com" + "/genenetwork/genenetwork2/" + "wqflask/wqflask/static/" + "glossary.md") + self.assertEqual("""

Glossary

+

This is some content

+

Sub-heading

+

This is another sub-heading

+""", + markdown_content) -- cgit v1.2.3 From 13db311203483f5ef6082e49e15158840ce072ef Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Wed, 4 Nov 2020 02:24:47 +0300 Subject: Use python3-genenetwork2 docker image on commit 0bf4ee6 --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index d76a5433..c78f6d85 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -11,7 +11,7 @@ on: jobs: unittest: runs-on: ubuntu-latest - container: bonfacekilz/python3-genenetwork2:bdce85d + container: bonfacekilz/python3-genenetwork2:0bf4ee6 steps: # First start with mariadb set then checkout. The checkout gives -- cgit v1.2.3 From 5899240349f6a943dc4dc1093ffbec08646c0376 Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Wed, 4 Nov 2020 17:41:53 +0300 Subject: Update glossary markdown file --- wqflask/wqflask/static/Congenic.png | Bin 0 -> 56578 bytes .../wqflask/static/images/Belknap_Fig1_1998.png | Bin 0 -> 117246 bytes wqflask/wqflask/static/images/Chrna1vsMyf6.gif | Bin 0 -> 59529 bytes wqflask/wqflask/static/images/Normal_Plot.gif | Bin 0 -> 47289 bytes wqflask/wqflask/static/images/SilverFig3_2.png | Bin 0 -> 61570 bytes wqflask/wqflask/static/images/SilverFig3_6.png | Bin 0 -> 22017 bytes wqflask/wqflask/static/images/Winsorize1.png | Bin 0 -> 15117 bytes wqflask/wqflask/static/images/Winsorize3.png | Bin 0 -> 17317 bytes wqflask/wqflask/static/markdown/glossary.md | 595 ++++++++++++++++++++- 9 files changed, 594 insertions(+), 1 deletion(-) create mode 100644 wqflask/wqflask/static/Congenic.png create mode 100644 wqflask/wqflask/static/images/Belknap_Fig1_1998.png create mode 100644 wqflask/wqflask/static/images/Chrna1vsMyf6.gif create mode 100644 wqflask/wqflask/static/images/Normal_Plot.gif create mode 100644 wqflask/wqflask/static/images/SilverFig3_2.png create mode 100644 wqflask/wqflask/static/images/SilverFig3_6.png create mode 100644 
wqflask/wqflask/static/images/Winsorize1.png create mode 100644 wqflask/wqflask/static/images/Winsorize3.png diff --git a/wqflask/wqflask/static/Congenic.png b/wqflask/wqflask/static/Congenic.png new file mode 100644 index 00000000..8cd489a4 Binary files /dev/null and b/wqflask/wqflask/static/Congenic.png differ diff --git a/wqflask/wqflask/static/images/Belknap_Fig1_1998.png b/wqflask/wqflask/static/images/Belknap_Fig1_1998.png new file mode 100644 index 00000000..46305fa1 Binary files /dev/null and b/wqflask/wqflask/static/images/Belknap_Fig1_1998.png differ diff --git a/wqflask/wqflask/static/images/Chrna1vsMyf6.gif b/wqflask/wqflask/static/images/Chrna1vsMyf6.gif new file mode 100644 index 00000000..881a08e8 Binary files /dev/null and b/wqflask/wqflask/static/images/Chrna1vsMyf6.gif differ diff --git a/wqflask/wqflask/static/images/Normal_Plot.gif b/wqflask/wqflask/static/images/Normal_Plot.gif new file mode 100644 index 00000000..dc239f8e Binary files /dev/null and b/wqflask/wqflask/static/images/Normal_Plot.gif differ diff --git a/wqflask/wqflask/static/images/SilverFig3_2.png b/wqflask/wqflask/static/images/SilverFig3_2.png new file mode 100644 index 00000000..5b4b2c70 Binary files /dev/null and b/wqflask/wqflask/static/images/SilverFig3_2.png differ diff --git a/wqflask/wqflask/static/images/SilverFig3_6.png b/wqflask/wqflask/static/images/SilverFig3_6.png new file mode 100644 index 00000000..5b91d991 Binary files /dev/null and b/wqflask/wqflask/static/images/SilverFig3_6.png differ diff --git a/wqflask/wqflask/static/images/Winsorize1.png b/wqflask/wqflask/static/images/Winsorize1.png new file mode 100644 index 00000000..f3a65f29 Binary files /dev/null and b/wqflask/wqflask/static/images/Winsorize1.png differ diff --git a/wqflask/wqflask/static/images/Winsorize3.png b/wqflask/wqflask/static/images/Winsorize3.png new file mode 100644 index 00000000..a9ed95d6 Binary files /dev/null and b/wqflask/wqflask/static/images/Winsorize3.png differ diff --git 
a/wqflask/wqflask/static/markdown/glossary.md b/wqflask/wqflask/static/markdown/glossary.md index 241dbf0a..3c14ab78 100644 --- a/wqflask/wqflask/static/markdown/glossary.md +++ b/wqflask/wqflask/static/markdown/glossary.md @@ -1 +1,594 @@ -# Content +# Glossary of Terms and Features + +
+ +[A](#a) | [B](#b) | [C](#c)| [D](#d) | [E](#e) | [F](#f) | [G](#g) | [H](#h) | [I](#i) | [J](#j) | [K](#k) | [L](#l) | [M](#m) | [N](#n) | [O](#o) | [P](#p) | [Q](#q) | [R](#r) | [S](#s) | [T](#t) | [U](#u) | [V](#v) | [W](#w) | [X](#x) | [Y](#y) | [Z](#z) + +You are welcome to cite or reproduce these glossary +definitions. Please cite or link: Author AA. "Insert Glossary Term +Here." From The WebQTL Glossary--A GeneNetwork +Resource. gn1.genenetwork.org/glossary.html + +
+ +## A + +#### Additive Allele Effect: + +The additive allele effect is an estimate of the change in the average phenotype that would be produced by substituting a single allele of one type with that of another type (e.g., a replaced by A) in a population. In a standard F2 intercross between two inbred parental lines there are two alleles at every polymorphic locus that are often referred to as the little "a" allele and big "A" allele. F2 progeny inherit the a/a, a/A, or A/A genotypes at every genetic locus in a ratio close to 1:2:1. The additive effect is half of the difference between the mean of all cases that are homozygous for one parental allele (aa) compared to the mean of all cases that are homozygous for the other parental allele (AA): + +[(mean of AA cases)-(mean of aa cases)]/2 + +GeneNetwork displays the additive values on the far right of many trait/QTL maps, usually as red or green lines along the maps. The units of measurement of additive effects (and dominance effects) are defined by the trait itself and are shown in Trait Data and Analysis windows. For mRNA estimates these units are usually normalized log2 expression values. For this reason an additive effect of 0.5 units indicates that the A/A and a/a genotypes at a particular locus or marker differ by 1 unit (twice the effect of swapping a single A allele for an a allele). On this log2 scale this is equivalent to a 2-fold difference (2 raised to the power of 1). + +On the QTL map plots the polarity of allele effects is represented by the color of the line. For example, in mouse BXD family maps, if the DBA/2J allele produces higher values than the C57BL/6J allele then the additive effect line is colored in green. In contrast, if the C57BL/6J allele produces higher values then the line is colored in red. For computational purposes, C57BL/6J red values are considered negative. + +The dominance effects of alleles are also computed on maps for F2 populations (e.g., B6D2F2 and B6BTBRF2). 
Orange and purple line colors are used to distinguish the polarity of effects. Purple is the positive dominance effect that matches the polarity of the green additive effect, whereas orange is the negative dominance effect that matches the polarity of the red additive effect. [Please also see entry on Dominance Effects: Williams RW, Oct 15, 2004; Sept 3, 2005; Dec 4, 2005; Oct 25, 2011] + +[Go back to index](#index) + +
+ +#### Bootstrap: + +A bootstrap sample is a randomly drawn sample (or resample) that is taken from the original data set and that has the same number of samples as the original data set. In a single bootstrap sample, some cases will by chance be represented one or more times; other cases may not be represented at all (in other words, the sampling is done "with replacement" after each selection). To get a better intuitive feel for the method, imagine a bag of 26 Scrabble pieces that contain each letter of the English alphabet. In a bootstrap sample of these 26 pieces, you would shake the bag, insert your hand, and draw out one piece. You would then write down that letter on a piece of paper, and then place that Scrabble piece back in the bag in preparation for the next random selection. You would repeat this process (shake, draw, replace) 25 more times to generate a single bootstrap resample of the alphabet. Some letters will be represented several times in each sample and others will not be represented at all. If you repeat this procedure 1000 times you would have a set of bootstrap resamples of the type that GN uses to remap data sets. + +Bootstrap resampling is a method that can be used to estimate statistical parameters and error terms. GeneNetwork uses a bootstrap procedure to evaluate approximate confidence limits of QTL peaks using a method proposed by Peter Visscher and colleagues (1996). We generate 2000 bootstraps, remap each, and keep track of the location of the single locus with the highest LRS score (equivalent to a "letter" in the Scrabble example). The 2000 "best" locations are used to produce the yellow histograms plotted on some of the QTL maps. If the position of a QTL is firm, then the particular composition of the sample will not shift the position of the QTL peak by very much. 
In such a case, the histogram of "best QTLs" (yellow bars in the maps) that is displayed in WebQTL maps will tend to have a sharp peak (the scale is the percentage of bootstrap resamples that fall into each bar of the bootstrap histogram). In contrast, if the yellow bootstrap histograms are spread out along a chromosome, then the precise location of a QTL may not be accurate, even in the original correct data set. Bootstrap results naturally vary between runs due to the random generation of the samples. See the related entry "Frequency of Peak LRS." + +KNOWN PROBLEMS and INTERPRETATION of BOOTSTRAP RESULTS: The reliability of bootstrap analysis of QTL confidence intervals has been criticized by Manichaikul and colleagues (2006). Their work applies in particular to standard intercrosses and backcrosses in which markers are spaced every 2 cM. They recommend that confidence intervals be estimated either by a conventional 1.5 to 2.0 LOD drop-off interval or by a Bayes credible interval method. + +There is a known flaw in the way in which GeneNetwork displays bootstrap results (Sept 2011). If a map has two or more adjacent markers with identical LOD scores and identical strain distribution patterns, all of the bootstrap results are assigned incorrectly to just one of the "twin" markers. This results in a false perception of precision. + +QTL mapping methods can be highly sensitive to cases with very high or very low phenotype values (outliers). The bootstrap method does not provide protection against the effects of outliers and their effects on QTL maps. Make sure you review your data for outliers before mapping. Options include (1) Do nothing, (2) Delete the outliers and see what happens to your maps, (3) Winsorize the values of the outliers. You might try all three options and determine if your main results are stable or not. With small samples or extreme outliers, you may find the mapping results to be quite volatile. 
In general, if the results (QTL position or value) depend highly on one or two outliers (5-10% of the samples) then you should probably delete or winsorize the outliers. [Williams RW, Oct 15, 2004, Mar 15, 2008, Mar 26, 2008; Sept 2011] + +[Go back to index](#index) + +
+ +#### CEL and DAT Files (Affymetrix): + +Array data begin as raw image files that are generated using a confocal microscope and video system. Affymetrix refers to these image data files as DAT files. The DAT image needs to be registered to a template that assigns pixel values to expected array coordinates (cells). The result is an assignment of a set of image intensity values (pixel intensities) to each probe. For example, each cell/probe value generated using Affymetrix arrays is associated with approximately 36 pixels (a 6x6 set of pixels, usually with an effective 11 or 12-bit range of intensity). Affymetrix uses a method that simply ranks the values of these pixels and picks as the "representative value" the pixel that has rank 24 from low to high. The range of variation in intensity among these ranked pixels provides a way to estimate the error of the estimate. The Affymetrix CEL files therefore consist of XY coordinates, the consensus value, and an error term. [Williams RW, April 30, 2005] + +#### Cluster Map or QTL Cluster Map: + +Cluster maps are sets of QTL maps for a group of traits. The QTL maps for the individual traits (up to 100) are run side by side to enable easy detection of common and unique QTLs. Traits are clustered along one axis of the map by phenotypic similarity (hierarchical clustering) using the Pearson product-moment correlation r as a measurement of similarity (we plot 1-r as the distance). Traits that are positively correlated will be located near to each other. The genome location is shown along the other, long axis of the cluster map, marker by marker, from Chromosome 1 to Chromosome X. Colors are used to encode the probability of linkage, as well as the additive effect polarity of alleles at each marker. These QTL maps are computed using the fast Marker Regression algorithm. P values for each trait are computed by permuting each trait 1000 times. 
Cluster maps could be considered trait gels because each lane is loaded with a trait that is run out along the genome. Cluster maps are a unique feature of the GeneNetwork developed by Elissa Chesler and implemented in WebQTL by J Wang and RW Williams, April 2004. [Williams RW, Dec 23, 2004, rev June 15, 2006 RWW]. + +#### Collections and Trait Collections: + +One of the most powerful features of GeneNetwork (GN) is the ability to study large sets of traits that have been measured using a common genetic reference population or panel (GRP). This is one of the key requirements of systems genetics--many traits studied in common. Under the main GN menu Search heading you will see a link to Trait Collections. You can assemble you own collection for any GRP by simply adding items using the Add to Collection button that you will find in many windows. Once you have a collection you will have access to a new set of tools for analysis of your collection, including QTL Cluster Map, Network Graph, Correlation Matrix, and Compare Correlates. [Williams RW, April 7, 2006] + +#### Complex Trait Analysis: + +Complex trait analysis is the study of multiple causes of variation of phenotypes within species. Essentially all traits that vary within a population are modulated by a set of genetic and environmental factors. Finding and characterizing the multiple genetic sources of variation is referred to as "genetic dissection" or "QTL mapping." In comparison, complex trait analysis has a slightly broader focus and includes the analysis of the effects of environmental perturbation, and gene-by-environment interactions on phenotypes; the "norm of reaction." Please also see the glossary term "Systems Genetics." [Williams RW, April 12, 2005] + +#### Composite Interval Mapping: + +Composite interval mapping is a method of mapping chromosomal regions that controls for some fraction of the genetic variability in a quantitative trait. 
Unlike simple interval mapping, composite interval mapping usually controls for variation produced at one or more background marker loci. These background markers are generally chosen because they are already known to be close to the location of a significant QTL. By factoring out a portion of the genetic variance produced by a major QTL, one can occasionally detect secondary QTLs. WebQTL allows users to control for a single background marker. To select this marker, first run the Marker Regression analysis (and, if necessary, check the box labeled display all LRS), select the appropriate locus, and then click on either Composite Interval Mapping or Composite Regression. A more powerful and effective alternative to composite interval mapping is pair-scan analysis. This latter method takes into account (models) both the independent effects of two loci and possible two-locus epistatic interactions. [Williams RW, Dec 20, 2004] + +#### Correlations: Pearson and Spearman: + +GeneNetwork provides tools to compute both Pearson product-moment correlations (the standard type of correlation) and Spearman rank order correlations. Wikipedia and introductory statistics texts will have a discussion of these major types of correlation. The quick advice is to use the more robust Spearman rank order correlation if the number of pairs of observations in a data set is less than about 30 and to use the more powerful but much more sensitive Pearson product-moment correlation when the number of observations is greater than 30 AND after you have dealt with any outliers. GeneNetwork automatically flags outliers for you in the Trait Data and Analysis form. GeneNetwork also allows you to modify values by either deleting or winsorising them. That means that you can use Pearson correlations even with smaller sample sizes after making sure that data are well distributed. Be sure to view the scatterplots associated with correlation values (just click on the value to generate a plot). 
Look for bivariate outliers. + +#### Cross: + +The term Cross refers to a group of offspring made by mating (crossing) one strain with another strain. There are several types of crosses including intercrosses, backcrosses, advanced intercrosses, and recombinant inbred intercrosses. Genetic crosses are almost always started by mating two different but fully inbred strains to each other. For example, a B6D2F2 cross is made by breeding C57BL/6J females (B6 or B for short) with DBA/2J males (D2 or D) and then intercrossing their F1 progeny to make the second filial generation (F2). By convention the female is always listed first in cross nomenclature; B6D2F2 and D2B6F2 are therefore so-called reciprocal F2 intercrosses (B6D2F1 females to B6D2F1 males or D2B6F1 females to D2B6F1 males). A cross may also consist of a set of recombinant inbred (RI) strains such as the BXD strains, that are actually inbred progeny of a set of B6D2F2s. Crosses can be thought of as a method to randomize the assignment of blocks of chromosomes and genetic variants to different individuals or strains. This random assignment is a key feature in testing for causal relations. The strength with which one can assert that a causal relation exists between a chromosomal location and a phenotypic variant is measured by the LOD score or the LRS score (they are directly convertable, where LOD = LRS/4.61) [Williams RW, Dec 26, 2004; Dec 4, 2005]. + +[Go back to index](#index) + +
+ +#### Dominance Effects: + +The term dominance indicates that the phenotype of intercross progeny closely resembles one of the two parental lines, rather than having an intermediate phenotype. Geneticists commonly refer to an allele as having a dominance effect or dominance deviation on a phenotype. Dominance deviations at a particular marker are calculated as the difference between the average phenotype of all cases that have the Aa genotype at that marker and the expected value half way between the averages of all cases that have the aa genotype and the AA genotype. For example, if the average phenotype value of 50 individuals with the aa genotype is 10 units whereas that of 50 individuals with the AA genotype is 20 units, then we would expect the average of 100 cases with the Aa genotype to be 15 units. We are assuming a linear and perfectly additive model of how the a and A alleles interact. If these 100 Aa cases actually have a mean of 11 units, then this additive model would be inadequate. A non-linear dominance term is now needed. In this case the low a allele is almost perfectly dominant (or semi-dominant) and the dominance deviation is -4 units. + +The dominance effects are computed at each location on the maps generated by the WebQTL module for F2 populations (e.g., B6D2F2 and B6BTBRF2). Orange and purple line colors are used to distinguish the polarity of the dominance effects. Purple is the positive dominance effect that matches the polarity of the green additive effect, whereas orange is the negative dominance effect that matches the polarity of the red additive effect. + +Note that dominance deviations cannot be computed from a set of recombinant inbred strains because there are only two classes of genotypes at any marker (aa and AA, more usually written AA and BB). However, when data for F1 hybrids are available one can estimate the dominance of the trait. 
This global phenotypic dominance has almost nothing to do with the dominance deviation at a single marker in the genome. In other words, the dominance deviation detected at a single marker may be reversed or neutralized by the action of many other polymorphic genes. [Williams RW, Dec 21, 2004; Sept 3, 2005] + +[Go back to index](#index) + +
+ +#### Epistasis: + +Epistasis means that combined effects of two or more different loci or polymorphic genes are not what one would expect given the addition of their individual effects. There is, in other words, evidence for non-linear interactions among two or more loci. This is similar to the dominance effects mentioned above, but now generalized to two or more distinct loci, rather than to two or more alleles at a single locus. For example, if QTL 1 has an A allele that has an additive effect of +5 and QTL 2 has an A allele that has an additive effect of +2, then the two locus genotype combination A/A would be expected to boost the mean by +7 units. But if the value of these A/A individuals was actually -7 we would be quite surprised and would refer to this as an epistatic interaction between QTL 1 and QTL 2. WebQTL will search for all possible epistatic interactions between pairs of loci in the genome. This function is called a Pair Scan because the software analyzes the LRS score for all possible pairs of loci. Instead of viewing an LRS plot along a single dimension, we now view a two-dimensional plot that shows a field of LRS scores computed for pairs of loci. Pair scan plots are extremely sensitive to outlier data. Be sure to review the primary data carefully using Basic Statistics. Also note that this more sophisticated method also demands a significantly larger sample size. While 25 to 50 cases may be adequate for a conventional LRS plot (sometimes called a "main scan"), a Pair-Scan is hard to apply safely with fewer than 60 cases. [Williams RW, Dec 21, 2004; Dec 5, 2005] + +#### Effect Size of a QTL: + +QTLs can be ranked by the amount of variance that they explain--their so-called "effect size"--when they are included in a statistical model. The concept of a genetic model may seem odd to some users of GeneNetwork. A model is just an explicit hypothesis of how QTLs and other factors cause variation in a trait. 
QTL mapping involves comparisons of the explanatory power of different models. Effect sizes can be measured in different units including (1) the percentage of total or genetic variance that is explained by adding the QTL into the model, (2) the mean shift in Z score, or (3) the additive effect size expressed in the original measurement scale. Effects of single QTLs are often dependent on genetic background (i.e., other QTLs and their interactions) and on the numbers and types of cases used in a study. For example, the variance explained is influenced strongly by whether the sample are from a family cohort, a case-control cohort, a group of fully inbred strains such as recombinant inbred lines, an outcross or backcross population. + +Please note that the functional importance of a locus, QTL, or GWAS hit can not be predicted by the size of its effect on the trait in one environment, at one stage of development, and in one population. Estimates of the effect size of QTLs are usually both noisy and upwardly biased (overestimated), and both of these problems are particularly acute when sample sizes are small. + +Estimates of effect size for families of inbred lines, such as the BXD, HXB, CC, and hybrid diversity panels (e.g. the hybrid mouse diversity panel and the hybrid rat diversity panel) are typically (and correctly) much higher than those measured in otherwise similar analysis of intercrosses, heterogeneous stock (HS), or diversity outbred stock. Two factors contribute to the much higher level of explained variance of QTLs when using inbred strain panels. + + +1. *Replication Rate:* The variance that can be explained by a locus is increased by sampling multiple cases that have identical genomes and by using the strain mean for genetic analysis. Increasing replication rates from 1 to 6 can easily double the apparent heritability of a trait and therefore the effect size of a locus. 
The reason is simple—resampling decreases the standard error of the mean, boosting the effective heritability (see Glossary entry on Heritability and focus on figure 1 from the Belknap 1998 paper reproduced below). Compare the genetically explained variance (labeled h2RI in this figure) of a single case (no replication) on the x-axis with the function at a replication rate of 4 on the y-axis. If the explained variance is 0.1 (10% of all variance explained) then the value is boosted to 0.3 (30% of strain mean variance explained) with n = 4. + +2. *Homozygosity:* The second factor has to do with the inherent genetic variance of populations. Recombinant inbred lines are homozygous at nearly all loci. This doubles the genetic variance in a family of recombinant inbred lines compared to a matched number of F2s. This also quadruples the variance compared to a matched number of backcross cases. As a result 40 BXDs sampled just one per genometype will average 2X the genetic variance and 2X the heritability of 40 BDF2 cases. Note that panels made up of isogenic F1 hybrids (so-called diallel crosses, DX) made by crossing recombinant inbred strains (BXD, CC, or HXB) are no longer homozygous at all loci, and while they do expose important new sources of variance associated with dominance, they do not benefit from the 2X gain in genetic variance relative to an F2 intercross. + +Homozygosity + +For the reasons listed above, a QTL effect size of 0.4 detected in a panel of BXD lines replicated four times each (160 cases total), corresponds approximately to an effect size of 0.18 in BXDs without replication (40 cases total), and to an effect size of 0.09 in an F2 of 40 cases total. [Williams RW, Dec 23, 2004; updated by RWW July 13, 2019] + +eQTL, cis eQTL, trans eQTL + +An expression QTL or eQTL. Differences in the expression of mRNA or proteins are often treated as standard phenotypes, much like body height or lung capacity. 
The variation in these microscopic traits (microtraits) can be mapped using conventional QTL methods. Damerval and colleagues were the first authors to use this kind of nomenclature and in their classic study of 1994 introduced the term PQLs for protein quantitative trait loci. Schadt and colleagues added the acronym eQTL in their early mRNA study of corn, mouse, and humans. We now are "blessed" with all kinds of prefixes to QTLs that highlight the type of trait that has been measured (m for metabolic, b for behavioral, p for physiological or protein). + +eQTLs of mRNAs and proteins have the unique property of (usually) having a single parent gene and genetic location. An eQTL that maps to the location of the parent gene that produces the mRNA or protein is referred to as a cis eQTL or local eQTL. In contrast, an eQTL that maps far away from its parent gene is referred to as a trans eQTL. You can use special search commands in GeneNetwork to find cis and trans eQTLs. [Williams RW, Nov 23, 2009, Dec 2009] + +[Go back to index](#index) + +
+ +## F + +#### Frequency of Peak LRS: + +The height of the yellow bars in some of the Map View windows provides a measure of the confidence with which a trait maps to a particular chromosomal region. WebQTL runs 2000 bootstrap samples of the original data. (A bootstrap sample is a "sample with replacement" of the same size as the original data set in which some samples will by chance be represented one or more times and others will not be represented at all.) For each of these 2000 bootstraps, WebQTL remaps each and keeps track of the location of the single locus with the highest LRS score. These accumulated locations are used to produce the yellow histogram of "best locations." A frequency of 10% means that 200 of 2000 bootstraps had a peak score at this location. If the mapping data are robust (for example, insensitive to the exclusion of any particular case), then the bootstrap bars should be confined to a short chromosomal interval. Bootstrap results will vary slightly between runs due to the random generation of the bootstrap samples. [Williams RW, Oct 15, 2004] + +#### False Discovery Rate (FDR): + +A false discovery is an apparently significant finding--usually determined using a particular P value alpha criterion--that is known to be insignificant or false given other information. When performing a single statistical test we often accept a false discovery rate of 1 in 20 (p = .05). False discovery rates can climb to high levels in large genomic and genetic studies in which hundreds to millions of tests are run and summarized using standard "single test" p values. There are various statistical methods to estimate and control false discovery rate and to compute genome-wide p values that correct for large numbers of implicit or explicit statistical tests. The Permutation test in GeneNetwork is one method that is used to prevent an excessive number of false QTL discoveries. 
Methods used to correct the FDR are approximations and may depend on a set of assumptions about data and sample structure. [Williams RW, April 5, 2008] + +[Go back to index](#index) + +
+ +## G + +#### Genes, GenBankID, UniGeneID, GeneID, LocusID: + +GeneNetwork provides summary information on most of the genes and their transcripts. Genes and their alternative splice variants are often poorly annotated and may not have proper names or symbols. However, almost all entries have a valid GenBank accession identifier. This is a unique code associated with a single sequence deposited in GenBank (Entrez Nucleotide). A single gene may have hundreds of GenBank entries. GenBank entries that share a genomic location and possibly a single gene are generally combined into a single UniGene entry. For mouse, these always begin with "Mm" (Mus musculus) and are followed by a period and then a number. More than half of all mouse UniGene identifiers are associated with a reputable gene, and these genes will have gene identifiers (GeneID). GeneIDs are identical to LocusLink identifiers (LocusID). Even a 10 megabase locus such as human Myopia 4 (MYP4) that is not yet associated with a specific gene is assigned a GeneID--a minor misnomer and one reason to prefer the term LocusID. + +See the related FAQ on "How many genes and transcripts are in your databases and what fraction of the genome is being surveyed?" [Williams RW, Dec 23, 2004, updated Jan 2, 2005] + +#### Genetic Reference Population (GRP): + +A genetic reference population consists of a set of genetically well characterized lines that are often used over a long period of time to study a multitude of different phenotypes. Once a GRP has been genotyped, subsequent studies can focus on the analysis of interesting and important phenotypes and their joint and independent relations. Most of the mouse GRPs, such as the BXDs used in the GeneNetwork, have been typed using a common set of over 14,000 markers (SNPs and microsatellites). Many of these same GRPs have been phenotyped extensively for more than 25 years, resulting in rich sets of phenotypes. 
A GRP is an ideal long-term resource for systems genetics because of the relative ease with which vast amounts of diverse data can be accumulated, analyzed, and combined. + +The power of GRPs and their compelling scientific advantages derive from the ability to study multiple phenotypes and substantial numbers of genetically defined individuals under one or more environmental conditions. When accurate phenotypes from 20 or more lines in a GRP have been acquired it becomes practical to explore and test the genetic correlations between that trait and any previously measured trait in the same GRP. This fact underlies the use of the term reference in GRP. Since each genetic individual is represented by an entire isogenic line--usually an inbred strain or an isogenic F1 hybrid--it is possible to obtain accurate mean phenotypes associated with each line simply by typing several individuals. GRPs are also ideal for developmental and aging studies because the same genetic individual can be phenotyped at multiple stages. + +A GRP can also be used as a conventional mapping panel. But unlike most other mapping panels, a GRP can be easily adapted to jointly map sets of functionally related traits (multitrait mapping); a more powerful method to extract causal relations from networks of genetic correlations. + +The largest GRPs now consist of more than 400 recombinant inbred lines of Arabidopsis and maize. The BayxSha Arabidopsis set in the GeneNetwork consists of 420 lines. Pioneer Hi-Bred International is rumored to have as many as 4000 maize RI lines. The largest mammalian GRPs are the LXS and BXD RI sets in the GeneNetwork. The Collaborative Cross is the largest mammalian GRP, and over 600 of these strains are now being bred by members of the Complex Trait Consortium. + +There are several subtypes of GRPs. 
In addition to recombinant inbred strains there are + + +- Recombinant congenic (RCC) strains such as the AcB set; consomic or chromosome substitution strains (CSS) of mice (Matin et al., 1999) and rats (Roman et al., 2002) + +- Recombinant intercross (RIX) F1 sets made by mating different RI strains to each other to generate a large set of RI first generation (F1) progeny (RIX). This is a standard (diallel cross) of RI inbred strains. Genetic analysis of a set of RIX progeny has some advantages over a corresponding analysis of RI strains. The first of these is that while each set of F1 progeny is fully isogenic (AXB1 x AXB2 gives a set of isogenic F1s), these F1s are not inbred but are heterozygous at many loci across the genome. RIX therefore retain the advantage of being genetically defined and replicable, but without the disadvantage of being fully inbred. RIX have a genetic architecture more like natural populations. The second correlated advantage is that it is possible to study patterns of dominance of allelic variants using an RIX cross. Almost all loci or genes that differ between the original stock strains (A and B) will be heterozygous among a sufficiently large set of RIX. A set of RIX progeny can therefore be mapped using the same methods used to map an F2 intercross. Mapping of QTLs may have somewhat more power and precision than when RI strains are used alone. A third advantage is that RIX sets make it possible to expand often limited RI resources to very large sizes to confirm and extend models of genetic or GXE effects. For example a set of 30 AXB strains can be used to generate a full matrix of 30 x 29 unique RIX progeny. The main current disadvantage of RIX panels is the comparative lack of extant phenotype data. + +- Recombinant F1 line sets can also be made by backcrossing an entire RI set to a single inbred line that carries an interesting mutation or transgene (RI backcross or RIB). 
GeneNetwork includes one RI backcross set generated by Kent Hunter. In this RIB each of 18 AKXD RI strains were crossed to an FVB/N line that carries a tumor susceptibility allele (polyoma middle T). + + All of these sets of lines are GRPs since each line is genetically defined and because the set as a whole can in principle be easily regenerated and phenotyped. Finally, each of these resources can be used to track down genetic loci that are causes of variation in phenotype using variants of standard linkage analysis. + + A Diversity Panel such as that used by the Mouse Phenome Project is not a standard GRP, although it also shares the ability to accumulate and study networks of phenotypes. The main difference is that a Diversity Panel cannot be used for conventional linkage analysis. A sufficiently large Diversity Panel could in principle be used for the equivalent of an association study. However, these are definitely NOT in silico studies, because hundreds of individuals need to be phenotyped for every trait. A survey of many diverse isogenic lines (inbred or F1 hybrids) is statistically the equivalent of a human association study (the main difference is the ability to replicate measurements and study sets of traits) and therefore, like human association studies, does require very high sample size to map polygenic traits. Like human association studies there is also a high risk of false positive results due to population stratification and non-syntenic marker association. + + A good use of a Diversity Panel is as a fine-mapping resource with which to dissect chromosomal intervals already mapped using a conventional cross or GRP. GeneNetwork now includes Mouse Diversity Panel (MDP) data for several data sets. We now typically include all 16 sequenced strains of mice, and add PWK/PhJ, NZO/HiLtJ (two of the eight members of the Collaborative Cross), and several F1 hybrids. 
The MDP data is often appended at the bottom of the GRP data set with which it was acquired (e.g., BXD hippocampal and BXD eye data sets). [Williams RW, June 19, 2005; Dec 4, 2005] + + Genotype: The state of a gene or DNA sequence, usually used to describe a contrast between two or more states, such as that between the normal state (wildtype) and a mutant state (mutation) or between the alleles inherited from two parents. All species that are included in GeneNetwork are diploid (derived from two parents) and have two copies of most genes (genes located on the X and Y chromosomes are exceptions). As a result the genotype of a particular diploid individual is actually a pair of genotypes, one from each parent. For example, the offspring of a mating between strain A and strain B will have one copy of the A genotype and one copy of the B genotype and therefore have an A/B genotype. In contrast, offspring of a mating between a female strain A and a male strain A will inherit only A genotypes and have an A/A genotype. + +Genotypes can be measured or inferred in many different ways, even by visual inspection of animals (e.g. as Gregor Mendel did long before DNA was discovered). But now the typical method is to directly test DNA that has a well-defined chromosomal location that has been obtained from one or usually many cases using molecular tests that often rely on polymerase chain reaction steps and sequence analysis. Each case is genotyped at many chromosomal locations (loci, markers, or genes). The entire collection of genotypes (as many as 1 million for a single case) is also sometimes referred to as the case's genotype, but the word "genometype" might be more appropriate to highlight the fact that we are now dealing with a set of genotypes spanning the entire genome (all chromosomes) of the case. + +For gene mapping purposes, genotypes are often translated from letter codes (A/A, A/B, and B/B) to simple numerical codes that are more suitable for computation. 
A/A might be represented by the value -1, A/B by the value 0, and B/B by the value +1. This recoding makes it easy to determine if there is a statistically significant correlation between genotypes across of a set of cases (for example, an F2 population or a Genetic Reference Panel) and a variable phenotype measured in the same population. A sufficiently high correlation between genotypes and phenotypes is referred to as a quantitative trait locus (QTL). If the correlation is almost perfect (r > 0.9) then correlation is usually referred to as a Mendelian locus. Despite the fact that we use the term "correlation" in the preceding sentences, the genotype is actually the cause of the phenotype. More precisely, variation in the genotypes of individuals in the sample population cause the variation in the phenotype. The statistical confidence of this assertion of causality is often estimated using LOD and LRS scores and permutation methods. If the LOD score is above 10, then we can be extremely confident that we have located a genetic cause of variation in the phenotype. While the location is defined usually with a precision ranging from 10 million to 100 thousand basepairs (the locus), the individual sequence variant that is responsible may be quite difficult to extract. Think of this in terms of police work: we may know the neighborhood where the suspect lives, we may have clues as to identity and habits, but we still may have a large list of suspects. + +Text here [Williams RW, July 15, 2010] + +[Go back to index](#index) + +
+ +## H + +#### Heritability, h2: + +Heritability is a rough measure of the ability to use genetic information to predict the level of variation in phenotypes among progeny. Values range from 0 to 1 (or 0 to 100%). A value of 1 or 100% means that a trait is entirely predictable based on paternal/maternal and genetic data (in other words, a Mendelian trait), whereas a value of 0 means that a trait is not at all predictable from information on gene variants. Estimates of heritability are highly dependent on the environment, stage, and age. + +Important traits that affect fitness often have low heritabilities because stabilizing selection reduces the frequency of DNA variants that produce suboptimal phenotypes. Conversely, less critical traits for which substantial phenotypic variation is well tolerated, may have high heritability. The environment of laboratory rodents is unnatural, and this allows the accumulation of somewhat deleterious mutations (for example, mutations that lead to albinism). This leads to an upward trend in heritability of unselected traits in laboratory populations--a desirable feature from the point of view of the biomedical analysis of the genetic basis of trait variance. Heritability is a useful parameter to measure at an early stage of a genetic analysis, because it provides a rough gauge of the likelihood of successfully understanding the allelic sources of variation. Highly heritable traits are more amenable to mapping studies. There are numerous ways to estimate heritability, a few of which are described below. [Williams RW, Dec 23, 2004] + +#### h2 Estimated by Intraclass Correlation: + +Heritability can be estimated using the intraclass correlation coefficient. This is essentially a one-way repeated measures analysis of variance (ANOVA) of the reliability of trait data. Differences among strains are considered due to a random effect, whereas variation among samples within a single strain are considered due to measurement error. 
One can use the method implemented by SAS (PROC VARCOMP) that exploits a restricted maximum likelihood (REML) approach to estimate the intraclass correlation coefficient instead of an ordinary least squares method. The general equation for the intraclass correlation is: + +r = (Between-strain MS - Within-strain MS)/(Between-strain MS + (n-1)x Within-strain MS) + +where n is the average number of cases per strain. The intraclass correlation approaches 1 when there is minimal variation within strains, and strain means differ greatly. In contrast, if differences between strains are less than what would be predicted from the differences within strain, then the intraclass correlation will produce negative estimates of heritability. Negative heritability is usually a clue that the design of the experiment has injected excessive within-strain variance. It is easy for this to happen inadvertently by failing to correct for a batch effect. For example, if one collects the first batch of data for strains 1 through 20 during a full moon, and a second batch of data for these same strains during a rare blue moon, then the apparent variation within strain may greatly exceed the among strain variance. A technical batch effect has been confounded with the within-strain variation and has swamped any among-strain variance. What to do? Fix the batch effect, sex effect, age effect, etc., first! [Williams RW, Chesler EJ, Dec 23, 2004] + +#### h2 Estimated using Hegmann and Possidente's Method (Adjusted Heritability in the Basic Statistics): + +A simple estimate of heritability for inbred lines involves comparing the variance between strain means (Va) to the total variance (Vt) of the phenotype, where Va is a rough estimate of the additive genetic variance and Vt is equal to the sum of Va and the average environmental variance, Ve. 
For example, if we study 10 cases of each of 20 strains, we have a total variance of the phenotype across 200 samples, and a strain mean variance across 20 strain averages. We can use this simple equation to estimate the heritability: + +h2 = Va / Vt + +This estimate of heritability will be an overestimate, and the severity of this bias will be a function of the within-strain standard error of the mean. Even a random data set of 10 each of 20 strains that should have an h2 of 0, will often give h2 values of 0.10 to 0.20. (Try this in a spreadsheet program using random numbers.) + +However, this estimate of h2 cannot be compared directly to those calculated using standard intercrosses and backcrosses. The reason is that all cases above are fully inbred and no genotypes are heterozygous. As a result the estimate of Va will be inflated two-fold. Hegmann and Possidente (1981) suggested a simple solution; adjust the equation as follows: + +h2 = 0.5Va / (0.5Va+Ve) + +The factor 0.5 is applied to Va to adjust for the overestimation of additive genetic variance among inbred strains. This estimate of heritability also does not make allowances for the within-strain error term. The 0.5 adjustment factor is not recommended any more because h2 is severely underestimated. This adjustment is really only needed if the goal is to compare h2 between intercrosses and those generated using panels of inbred strains. + +#### h2RIx̅ + +Finally, heritability calculations using strain means, such as those listed above, do not provide estimates of the effective heritability achieved by resampling a given line, strain, or genometype many times. Belknap (1998) provides corrected estimates of the effective heritability. Figure 1 from his paper (reproduced below) illustrates how resampling helps a great deal. Simply resampling each strain 8 times can boost the effective heritability from 0.2 to 0.8.
The graph also illustrates why it often does not make sense to resample much beyond 4 to 8, depending on heritability. Belknap used the term h2RIx̅ in this figure and paper, since he was focused on data generated using recombinant inbred (RI) strains, but the logic applies equally well to any panel of genomes for which replication of individual genometypes is practical. This h2RIx̅ can be calculated simply by: +h2RIx̅ = Va / (Va+(Ve/n)) where Va is the genetic variability (variability between strains), Ve is the environmental variability (variability within strains), and n is the number of within strain replicates. Of course, with many studies the number of within strain replicates will vary between strains, and this needs to be dealt with. A reasonable approach is to use the harmonic mean of n across all strains. + +Homozygosity + +An analysis of statistical power is useful to estimate numbers of replicates and strains needed to detect and resolve major sources of trait variance and covariance. A versatile method has been developed by Sen and colleagues (Sen et al., 2007) and implemented in the R program qtlDesign. David Ashbrook implemented a version of this within Shiny that can help you estimate power for different heritability values, QTL effect sizes, cohort sizes, and replication rates: + +### Power Calculator (D. Ashbrook) + +We can see that in all situations power is increased more by increasing the number of lines than by increasing the number of biological replicates. Dependent upon the heritability of the trait, there is little gain in power when going above 4-6 biological replicates. [DGA, Feb 1, 2019] [Chesler EJ, Dec 20, 2004; RWW updated March 7, 2018; Ashbrook DG, updated Feb 1, 2019] + +#### Hitchhiking Effect: + +Conventional knockout lines (KOs) of mice are often mixtures of the genomes of two strains of mice.
One important consequence of this fact is that a conventional comparison of wildtype and KO litter mates does not only test the effects of the KO gene itself but also tests the effects of thousands of "hitchhiking" sequence polymorphisms in genes that flank the KO gene. This experimental confound can be difficult to resolve (but see below). This problem was first highlighted by Robert Gerlai (1996). + +**Genetics of KO Lines**. The embryonic stem cells used to make KOs are usually derived from a 129 strain of mouse (e.g., 129/OlaHsd). Mutated stem cells are then added to a C57BL/6J blastocyst to generate B6x129 chimeric mice. Germline transmission of the KO allele is tested and carriers are then used to establish heterozygous +/- B6.129 KO stock. This stock is often crossed back to wildtype C57BL/6J strains for several generations. At each generation the transmission of the KO is checked by genotyping the gene or closely flanking markers in each litter of mice. Carriers are again selected for breeding. The end result of this process is a KO congenic line in which the genetic background is primarily C57BL/6J except for the region around the KO gene. + +It is often thought that 10 generations of backcrossing will result in a pure genetic background (99.8% C57BL/6J). Unfortunately, this is not true for the region around the KO, and even after many generations of backcrossing of KO stock to C57BL/6J, a large region around the KO is still derived from the 129 substrain (see the residual white "line" at N10 in the figure below). + +Congenic + +After 20 generations of backcrossing nearly +/-5 cM on either side of the KO will still usually be derived from 129 (see Figure 3.6). This amounts to an average of +/- 10 megabases of DNA around the KO. The wildtype littermates do NOT have this flanking DNA from 129 and they will be like a true C57BL/6J. The +/- 10 megabases to either side of the KO is known as the "hitchhiking" chromosomal interval.
Any polymorphism between 129 and B6 in this interval has the potential to have significant downstream effects on gene expression, protein expression, and higher order traits such as anxiety, activity, and maternal behavior. Much of the conventional KO literature is highly suspect due to this hitchhiker effect (see Gerlai R, Trends in Neurosci 1996 19:177). + +As one example, consider the thyroid hormone receptor alpha gene Thra and its KO. Thra maps to Chr 11 at about 99 Mb. A conventional KO made as described above will have a hitchhiking 129 chromosomal interval extending from about 89 Mb to 109 Mb even after 20 generations of backcrossing to B6. Since the mouse genome is about 2.6 billion base pairs and contains about 26,000 genes, this 20 Mb region will typically contain about 200 genes. The particular region of Chr 11 around Thra has an unusually high density of genes (2-3X) and includes many highly expressed and polymorphic genes, including Nog, Car10, Cdc34, Col1a1, Dlx4, Myst2, Ngfr, Igf2bp1, Gip, the entire Hoxb complex, Sp6, Socs7, Lasp1, Cacnb1, Pparbp, Pnmt, Erbb2, Grb7, Nr1d1, Casc3, Igfbp4, and the entire Krt1 complex. Of these genes, roughly half will be polymorphic between B6 and 129. It is like having a busload of noisy and possibly dangerous hitchhikers. Putative KO effects may be generated by a complex subset of these 100 polymorphic genes. + +What is the solution? + +1. Do not use litter mates as controls without great care. They are not really the correct genetic control. The correct genetic control is a congenic strain of the same general type without the KO or with a different KO in a nearby gene. These are often available as KOs in neighboring genes that are not of interest. For example, the gene Casc3 is located next to Thra. If a KO in Casc3 is available, then compare the two KOs and see if phenotypes of the two KOs differ in ways predicted given the known molecular functions of the gene. + +2.
Use a KO in which the KO has been backcrossed to a 129 strain--ideally the same strain from which ES cells were obtained. This eliminates the hitchhiker effect entirely and the KO, HET, and WT littermates really can be compared. + +3. Use a conditional KO. + +4. Compare the phenotype of the two parental strains--129 and C57BL/6J and see if they differ in ways that might be confounded with the effects of the KO. + +Homozygosity + +Legend:from Silver, L. (1995) Oxford University Press + +[Go back to index](#index) + +
+ +## I + +#### Interquartile Range: + +The interquartile range is the difference between the 75% and 25% percentiles of the distribution. We divide the sample into a high and low half and then compute the median for each of these halves. In other words we effectively split our sample into four ordered sets of values known as quartiles. The absolute value of the difference between the median of the lower half and the median of the upper half is also called the interquartile range. This estimate of range is insensitive to outliers. If you are curious you might double the IQR to get an interquartile-range-based estimate of the full range. Of course, keep in mind that range is dependent on the sample size. For this reason the coefficient of variation (the standard deviation divided by the mean) is a better overall indicator of dispersion of values around the mean that is less sensitive to sample size. [Williams RW, Oct 20, 2004; Jan 23, 2005] + +#### Interval Mapping: + +Interval mapping is a process in which the statistical significance of a hypothetical QTL is evaluated at regular points across a chromosome, even in the absence of explicit genotype data at those points. In the case of WebQTL, significance is calculated using an efficient and very rapid regression method, the Haley-Knott regression equations (Haley CS, Knott SA. 1992. A simple regression method for mapping quantitative trait loci in line crosses using flanking markers; Heredity 69:315–324), in which trait values are compared to the known genotype at a marker or to the probability of a specific genotype at a test location between two flanking markers. (The three genotypes are coded as -1, 0, and +1 at known markers, but often have fractional values in the intervals between markers.) The inferred probability of the genotypes in regions that have not been genotyped can be estimated from genotypes of the closest flanking markers. GeneNetwork/WebQTL compute linkage at intervals of 1 cM or less.
As a consequence of this approach to computing linkage statistics, interval maps often have a characteristic shape in which the markers appear as sharply defined inflection points, and the intervals between nodes are smooth curves. [Chesler EJ, Dec 20, 2004; RWW April 2005; RWW Man 2014] + +#### Interval Mapping Options: + +- Permutation Test: Select this option to determine the approximate LRS value that matches a genome-wide p-value of .05. + +- Bootstrap Test: Select this option to evaluate the consistency with which peak LRS scores cluster around a putative QTL. Deselect this option if it obscures the SNP track or the additive effect track. + +- Additive Effect: The additive effect (shown by the red lines in these plots) provides an estimate of the change in the average phenotype that is brought about by substituting a single allele of one type with that of another type. + +- SNP Track: The SNP Seismograph Track provides information on the regional density of segregating variants in the cross that may generate trait variants. It is plotted along the X axis. If a locus spans a region with both high and low SNP density, then the causal variant has a higher prior probability to be located in the region with high density than in the region with low density. + +- Gene Track: This track overlays the positions of known genes on the physical Interval Map Viewer. If you hover the cursor over genes on this track, minimal information (symbol, position, and exon number) will appear. + +- Display from X Mb to Y Mb: Enter values in megabases to regenerate a smaller or larger map view. + +- Graph width (in pixels): Adjust this value to obtain larger or smaller map views (x axis only). + +[Go back to index](#index) + +
+ +## J + +[Go back to index](#index) + +
+ +## K + +[Go back to index](#index) + +
+ +## L + +#### Literature Correlation: + +The literature correlation is a unique feature in GeneNetwork that quantifies the similarity of words used to describe genes and their functions. Sets of words associated with genes were extracted from MEDLINE/PubMed abstracts (Jan 2017 by Ramin Homayouni, Diem-Trang Pham, and Sujoy Roy). For example, about 2500 PubMed abstracts contain reference to the gene "Sonic hedgehog" (Shh) in mouse, human, or rat. The words in all of these abstracts were extracted and categorized by their information content. A word such as "the" is not interesting, but words such as "dopamine" or "development" are useful in quantifying similarity. Sets of informative words are then compared—one gene's word set is compared to the word set for all other genes. Similarity values are computed for a matrix of about 20,000 genes using latent semantic indexing (see Xu et al., 2011). Similarity values are also known as literature correlations. These values are always positive and range from 0 to 1. Values between 0.5 and 1.0 indicate moderate-to-high levels of overlap of vocabularies. + +The literature correlation can be used to compare the "semantic" signal-to-noise of different measurements of gene, mRNA, and protein expression. Consider this common situation: There are three probe sets that measure Kit gene expression (1459588\_at, 1415900\_a\_at, and 1452514\_a\_at) in the Mouse BXD Lung mRNA data set (HZI Lung M430v2 (Apr08) RMA). Which one of these three gives the best measurement of Kit expression? It is impractical to perform quantitative rtPCR studies to answer this question, but there is a solid statistical answer that relies on Literature Correlation. Do the following: For each of the three probe sets, generate the top 1000 literature correlates. This will generate three apparently identical lists of genes that are known from the PubMed literature to be associated with the Kit oncogene.
But the three lists are NOT actually identical when we look at the Sample Correlation column. To answer the question "which of the three probe sets is best", review the actual performance of the probe sets against this set of 1000 "friends of Kit". Do this by sorting all three lists by their Sample Correlation column (high to low). The clear winner is probe set 1415900_a_at. The 100th row in this probe set's list has a Sample Correlation of 0.620 (absolute value). In comparison, the 100th row for probe set 1452514_a_at has a Sample Correlation of 0.289. The probe set that targets the intron comes in last at 0.275. In conclusion, the probe set that targets the proximal half of the 3' UTR (1415900_a_at) has the highest "agreement" between Literature Correlation and Sample Correlation, and is our preferred measurement of Kit expression in the lung in this data set. (Updated by RWW and Ramin Homayouni, April 2017.) + +#### LOD: + +The logarithm of the odds (LOD) provides a measure of the association between variation in a phenotype and genetic differences (alleles) at a particular chromosomal locus (see Nyholt 2000 for a lovely review of LOD scores). + +A LOD score is defined as the logarithm of the ratio of two likelihoods: (1) in the numerator the likelihood for the alternative hypothesis, namely that there is linkage at the chromosomal marker, and (2) the likelihood of the null hypothesis that there is no linkage. Likelihoods are probabilities, but they are not Pr(hypothesis | data) but rather Pr(data | two alternative hypotheses). That's why they are called likelihoods rather than probabilities. (The "|" symbol above translates to "given the"). Since LOD and LRS scores are associated with two particular hypotheses or models, they are also associated with the degrees of freedom of those two alternative models. When the model only has one degree of freedom this conversion between LOD to p value will work: +
+    lodToPval <-
+    function(x)
+    {
+    pchisq(x*(2*log(10)),df=1,lower.tail=FALSE)/2
+    }
+    # (from https://www.biostars.org/p/88495/ )    
+
+ +In the two likelihoods, one has maximized over the various nuisance parameters (the mean phenotypes for each genotype group, or overall for the null hypothesis, and the residual variance). Or one can say, one has plugged in the maximum likelihood estimates for these nuisance parameters. + +With complete data at a marker, the log likelihood for the normal model reduces to (-n/2) times the log of the residual sum of squares. + +LOD values can be converted to LRS scores (likelihood ratio statistics) by multiplying by 4.61. The LOD is also roughly equivalent to the -log(P), where P is the probability of linkage (P = 0.001 => 3). The LOD itself is not a precise measurement of the probability of linkage, but in general for F2 crosses and RI strains, values above 3.3 will usually be worth attention for simple interval maps. [Williams RW, June 15, 2005, updated with text from Karl Broman, Oct 28, 2010, updated Apr 21, 2020 with Nyholt reference]. + +#### LRS: + +In the setting of mapping traits, the likelihood ratio statistic is used as a measurement of the association or linkage between differences in traits and differences in particular genotype markers. LRS or LOD values are usually plotted on the y-axis, whereas the chromosomal locations of the markers are usually plotted on the x-axis. In the case of a whole genome scan--a sequential analysis of many markers and locations across the entire genome--LRS values above 10 to 15 will usually be worth attention when mapping with standard experimental crosses (e.g., F2 intercrosses or recombinant inbred strains). The term "likelihood ratio" is used to describe the relative probability (likelihood) of two different explanations of the variation in a trait. The first explanation (or model or hypothesis H1) is that the differences in the trait ARE associated with that particular DNA sequence difference or marker. Very small probability values indicate that H1 is probably true.
The second "null" hypothesis (Hnull or H0) is that differences in the trait are NOT associated with that particular DNA sequence. We can use the ratio of these two probabilities and models (H1 divided by H0) as our score. The math is a little bit more complicated and the LRS score is actually equal to -2 times the difference between the natural logarithms of the two probabilities. For example, if the probability of H0 is 0.05 (only a one-in-twenty probability that the marker is associated with the trait by chance), whereas the probability of H1 is 1 (the marker is certainly not linked to the trait), then the LRS value is 5.991. In Excel the equation giving the LRS result of 5.991 would look like this "=-2*(LN(0.05)-LN(1))". [Williams RW, Dec 13, 2004, updated Nov 18, 2009, updated Dec 19, 2012] + +[Go back to index](#index) + +
+ +## M + +#### Marker Regression: + +The relationship between differences in a trait and differences in alleles at a marker (or gene variants) can be computed using a regression analysis (genotype vs phenotype) or as a simple Pearson product moment correlation. Here is a simple example that you can try in Excel to understand marker-phenotype regression or marker-phenotype correlation: enter a row of phenotype and genotype data for 20 strains in an Excel spreadsheet labeled "Brain weight." The strains are C57BL/6J, DBA/2J, and 20 BXD strains of mice (1, 2, 5, 6, 8, 9, 12, 13, 14, 15, 16, 18, 21, 22, 23, 24, 25, 27, 28, and 29). The brains of these strains weigh an average (in milligrams) of 465, 339, 450, 390, 477, 361, 421, 419, 412, 403, 429, 429, 436, 427, 409, 431, 432, 380, 394, 381, 389, and 375. (These values are taken from BXD Trait 10032; data by John Belknap and colleagues, 1992.) Notice that data are missing for several strains including the extinct lines BXD3, 4, and 7. Data for BXD11 and BXD19 (not extinct) are also missing. In the second row enter the genotypes at a single SNP marker on Chr 4 called "rs13478021" for the subset of strains for which we have phenotype data. The genotypes at rs1347801 are as follows for 20 BXDs listed above: D B D B D B D D D D D B D B D B D B D B. This string of alleles in the parents and 20 BXDs is called a strain distribution pattern (SDP). Let's convert these SDP letters into more useful numbers, so that we can "compute" with genotypes. Each B allele gets converted into a -1 and each D allele gets converted into a +1. In the spreadsheet, the data set of phenotypes and genotypes should look like this. + +
+    Strain BXD1 BXD2 BXD5 6 8 9 12 13 14 15 16 18 21 22 23 24 25 27 28 29
+    Brain_weight 450 390 477 361 421 419 412 403 429 429 436 427 409 431 432 380 394 381 389 375
+    Marker_rs1347801 D B D B D B D D D D D B D B D B D B D B
+    Marker_code 1 -1 1 -1 1 -1 1 1 1 1 1 -1 1 -1 1 -1 1 -1 1 -1
+
+ +To compute the marker regression (or correlation) we just compare values in Rows 2 and 4. A Pearson product moment correlation gives a value of r = 0.494. A regression analysis indicates that on average those strains with a D allele have a heavier brain with roughly a 14 mg increase for each 1 unit change in genotype; that is a total of about 28 mg if all B-type strains are compared to all D-type strains at this particular marker. This difference is associated with a p value of 0.0268 (two-tailed test) and an LRS of about 9.8 (LOD = 9.8/4.6 or about 2.1). Note that the number of strains is modest and the results are therefore not robust. If you were to add the two parent strains (C57BL/6J and DBA/2J) back into this analysis, which is perfectly fair, then the significance of this marker is lost (r = 0.206 and p = 0.3569). Bootstrap and permutation analyses can help you decide whether results are robust or not and whether a nominally significant p value for a single marker is actually significant when you test many hundreds of markers across the whole genome (a so-called genome-wide test with a genome-wide p value that is estimated by permutation testing). [RWW, Feb 20, 2007, Dec 14, 2012] + +[Go back to index](#index) + +
+ +## N + +#### Normal Probability Plot: + +A normal probability plot is a powerful tool to evaluate the extent to which a distribution of values conforms to (or deviates from) a normal Gaussian distribution. The Basic Statistics tools in GeneNetwork provides these plots for any trait. If a distribution of numbers is normal then the actual values and the predicted values based on a z score (units of deviation from the mean measured in standard deviation units) will form a nearly straight line. These plots can also be used to efficiently flag outlier samples in either tail of the distribution. + +In genetic studies, the probability plot can be used to detect the effects of major effect loci. A classical Mendelian locus will typically be associated with either a bimodal or trimodal distribution. In the plot below based on 99 samples, the points definitely do not fall on a single line. Three samples (green squares) have unusually high values; the majority of samples fall on a straight line between z = -0.8 to z = 2; and 16 values have much lower trait values than would be predicted based on a single normal distribution (a low mode group). The abrupt discontinuity in the distribution at -0.8 z is due to the effect of a single major Mendelian effect. + +Deviations from normality of the sort in the figure below should be considered good news from the point of view of likely success of tracking down the locations of QTLs. However, small numbers of outliers may require special statistical handling, such as their exclusion or winsorising (see more below on "Winsorizing"). [RWW June 2011] + +Homozygosity + +[Go back to index](#index) + +
+ +## O + +#### Outliers: (also see Wikipedia) + +Statistical methods often assume that the distribution of trait values is close to a Gaussian normal bell-shaped curve and that there are no outlier values that are extremely high or low compared to the average. Some traits can be clearly split into two or more groups (affected cases and unaffected cases) and this is not a problem as long as the number of cases in each group is close to the number that you expected by chance and that your sample size is reasonably high (40 or more for recombinant inbred strains). Mapping functions and most statistical procedures in GeneNetwork should work reasonably well (the pair scan function for epistatic interactions is one possible exception). + +However, correlations and QTL mapping methods can be highly sensitive to outlier values. Make sure you review your data for outliers before mapping. GeneNetwork flags all outliers for you in the Trait Data and Analysis window and gives you the option of zapping these extreme values. Options include (1) do nothing, (2) delete the outliers and see what happens to your maps, (3) Winsorize the values of the outliers. You might try all three options and determine if your main results are stable or not. With small samples or extreme outliers, you may find the correlation and mapping results to be volatile. In general, if results (correlations, QTL positions or QTL LRS score) depend highly on one or two outliers (5-10% of the samples) then you should probably delete or winsorize the outliers. + +In order to calculate outliers, we first determine the Q1 (25%) and Q3 (75%) values and then multiply the interquartile range (Q3 - Q1) by a constant (in our case 1.5; a higher constant is less sensitive to outliers). This value is then subtracted from the Q1 value and added to the Q3 value in order to determine the lower and upper bounds. Values that fall above the upper bound or below the lower bound are considered outliers.
+ +The method is summarized [here](http://www.wikihow.com/Calculate-Outliers). [Sloan ZA, Oct 2013] + +[Go back to index](#index) + +
+ +## P + +#### Pair-Scan, 2D Genome Scan, or Two-QTL Model: + +The pair scan function evaluates pairs of intervals (loci) across the genome to determine how much of the variability in the trait can be explained jointly by two putative QTLs. The pair scan function in GeneNetwork is used to detect effects of pairs of QTLs that have epistatic interactions, although this function also evaluates summed additive effects of two loci. Trait variance is evaluated using a general linear model that has this structure (called a "model"): + +Variance V(trait) = QTL1 + QTL2 + QTL1xQTL2 + error (where the = sign should be read "a function of") + +This model is also known as the Full Model (LRS Full in the output table), where QTL1 and QTL2 are the independent additive effects associated with two unlinked loci (the so-called main effects) and QTL1xQTL2 is the interaction term (LRS Interact in the output table). An LRS score is computed for this full model. This computation is identical to computing an ANOVA that allows for an interaction term between two predictors. The additive model that neglects the QTL1xQTL2 term is also computed. + +The output table in GeneNetwork lists the two intervals at the top of the table (Interval 1 to the left and Interval 2 to the far right). The LRS values for different components of the model are shown in the middle of the table (LRS Full, LRS Additive, LRS Interact, LRS 1, and LRS 2). Note that LRS 1 and LRS 2 will usually NOT sum to LRS Additive. + +CAUTIONS and LIMITATIONS: Pair-scan is only implemented for recombinant inbred strains. We do not recommend the use of this function with sample sizes of less than 60 recombinant inbred strains. Pair-scan procedures need careful diagnostics and can be very sensitive to outliers and to the balance among the four possible two-locus genotype classes among a set of RI strains. Pair-scan is not yet implemented for F2 progeny.
+ +GeneNetwork implements a rapid but non-exhaustive DIRECT algorithm (Lundberg et al., 2004) that efficiently searches for epistatic interactions. This method is so fast that it is possible to compute 500 permutations to evaluate non-parametric significance of the joint LRS value within a minute. This makes DIRECT ideal for an interactive web service. Karl Broman's R/qtl implements an exhaustive search using the "scantwo" function. [RWW, May 2011] + +#### Partial Correlation: + +Partial correlation is the correlation between two variables that remains after controlling for one or more other variables. Ideas and techniques used to compute partial correlations are important in testing causal models (Cause and Correlation in Biology, Bill Shipley, 2000). For instance, r1,2||3,4 is the partial correlation between variables 1 and 2, while controlling for variables 3 and 4 (the || symbol is equivalent to "while controlling for"). We can compare partial correlations (e.g., r1,2||3,4) with original correlations (e.g., r1,2). If there is an insignificant difference, we infer that the controlled variables have minimal effect and may not influence the variables or even be part of the model. In contrast, if the partial correlations change significantly, the inference is that the causal link between the two variables is dependent to some degree on the controlled variables. These control variables are either anteceding causes or intervening variables. (text adapted from D Garson's original by RWW). + +For more on partial correlation please link to this great site by David Garson at NC State. + +For more on dependence separation ( d-separation) and constructing causal models see Richard Scheines' site. + +Why would you use or need partial correlations in GeneNetwork? It is often useful to compute correlations among traits while controlling for additional variables. Partial correlations may reveal more about the causality of relations.
In a genetic context, partial correlations can be used to remove much of the variance associated with linkage and linkage disequilibrium. You can also control for age, sex, and other common cofactors. + +Please see the related Glossary terms "Tissue Correlation". [RWW, Aug 21, 2009; Jan 30, 2010] + +#### PCA Trait or Eigentrait: + +If you place a number of traits in a Trait Collection you can carry out some of the key steps of a principal component analysis, including defining the variance directed along specific principal component eigenvectors. You can also plot the positions of cases against the first two eigenvectors; in essence a type of scatterplot. Finally, GeneNetwork allows you to exploit PCA methods to make new "synthetic" eigentraits from collections of correlated traits. These synthetic traits are the values of cases along specific eigenvectors and they may be less noisy than single traits. If this seems puzzling, then have a look at these useful PCA explanations by G. Dallas and by Powell and Lehe. How to do it: You can select and assemble many different traits into a single Trait Collection window using the check boxes and Add To Collection buttons. One of the most important function buttons in the Collection window is labeled Correlation Matrix. This function computes Pearson product moment correlations and Spearman rank order correlations for all possible pairs of traits in the Collection window. It also performs a principal component or factor analysis. For example, if you have 20 traits in the Collection window, the correlation matrix will consist of (20*19)/2 or 190 correlations and the identity diagonal. Principal components analysis is a linear algebraic procedure that finds a small number of independent factors or principal components that efficiently explain variation in the original 20 traits. It is an effective method to reduce the dimensionality of a group of traits.
If the 20 traits share a great deal of variation, then only two or three factors may explain variation among the traits. Instead of analyzing 20 traits as if they were independent, we can now analyze the main principal components labeled PC01, PC02, etc. PC01 and PC02 can be treated as new synthetic traits that represent the main sources of variation among original traits. You can treat a PC trait like any other trait except that it is not stored permanently in a database table. You can put a PC trait in your Collection window and see how well correlated each of the 20 original traits is with this new synthetic trait. You can also map a PC trait. [RWW, Aug 23, 2005] + +#### Permutation Test: + +A permutation test is a computationally intensive but conceptually simple method used to evaluate the statistical significance of findings. Permutation tests are often used to evaluate QTL significance. Some background: In order to detect parts of chromosomes that apparently harbor genes that contribute to differences in a trait's value, it is common to search for associations (linkage) across the entire genome. This is referred to as a "whole genome" scan, and it usually involves testing hundreds of independently segregating regions of the genome using hundreds, or even thousands of genetic markers (SNPs and microsatellites). A parametric test such as a conventional t test or F test can be used to estimate the probability of the null hypothesis at any single location in the genome (the null hypothesis is that there is no QTL at this particular location). But a parametric test of this type makes assumptions about the distribution of the trait (its normality), and also does not provide a way to correct for the large number of independent tests that are performed while scanning the whole genome. We need protection against many false discoveries as well as some assurance that we are not neglecting truly interesting locations.
A permutation test is an elegant solution to both problems. The procedure involves randomly reassigning (permuting) trait values and genotypes of all cases used in the analysis. The permuted data sets have the same set of phenotypes and genotypes (in other words, distributions are the same), but obviously the permutation procedure almost invariably obliterates genuine gene-to-phenotype relation in large data sets. We typically generate several thousand permutations of the data. Each of these is analyzed using precisely the same method that was used to analyze the correctly ordered data set. We then compare statistical results of the original data set with the collection of values generated by the many permuted data sets. The hope is that the correctly ordered data are associated with larger LRS and LOD values than more than 95% of the permuted data sets. This is how we define the p = .05 whole genome significance threshold for a QTL. Please see the related Glossary terms "Significant threshold" and "Suggestive threshold". [RWW, July 15, 2005] + +#### Power to detect QTLs: + +An analysis of statistical power is useful to estimate numbers of replicates and strains needed to detect and resolve major sources of trait variance and covariance. A versatile method has been developed by Sen and colleagues (Sen et al., 2007) and implemented in the R program qtlDesign. David Ashbrook implemented a version of this within Shiny that can help you estimate power for different QTL effect sizes, cohort sizes, and replication rates: + +#### Power Calculator (D. Ashbrook) + +We can see that in all situations power is increased more by increasing the number of lines than by increasing the number of biological replicates. Dependent upon the heritability of the trait, there is little gain in power when going above 4-6 biological replicates. 
[DGA, Mar 3, 2018] + +#### Probes and Probe Sets: + +In microarray experiments the probe is the immobilized sequence on the array that is complementary to the target message washed over the array surface. Affymetrix probes are 25-mer DNA sequences synthesized on a quartz substrate. There are a few million of these 25-mers in each 120-square micron cell of the array. The abundance of a single transcript is usually estimated by as many as 16 perfect match probes and 16 mismatch probes. The collection of probes that targets a particular message is called a probe set. [RWW, Dec 21, 2004] + + +[Go back to index](#index) + +
+ +## Q + +#### QTL: + +A quantitative trait locus is a chromosome region that contains one or more sequence variants that modulates the distribution of a variable trait measured in a sample of genetically diverse individuals from an interbreeding population. Variation in a quantitative trait may be generated by a single QTL with the addition of some environmental noise. Variation may be oligogenic and be modulated by a few independently segregating QTLs. In many cases however, variation in a trait will be polygenic and influenced by large number of QTLs distributed on many chromosomes. Environment, technique, experimental design and a host of other factors also affect the apparent distribution of a trait. Most quantitative traits are therefore the product of complex interactions of genetic factors, developmental and epigenetics factors, environmental variables, and measurement error. [Williams RW, Dec 21, 2004] + +[Go back to index](#index) + +
+ +## R + +#### Recombinant Inbred Strain (RI or RIS) or Recombinant Inbred Line (RIL): + +An inbred strain whose chromosomes incorporate a fixed and permanent set of recombinations of chromosomes originally descended from two or more parental strains. Sets of RI strains (from 10 to 5000) are often used to map the chromosomal positions of polymorphic loci that control variance in phenotypes. + +For a terrific short summary of the uses of RI strains see 2007). + +Chromosomes of RI strains typically consist of alternating haplotypes of highly variable length that are inherited intact from the parental strains. In the case of a typical rodent RI strain made by crossing maternal strain C with paternal strain B (called a CXB RI strain), a chromosome will typically incorporate 3 to 5 alternating haplotype blocks with a structure such as BBBBBCCCCBBBCCCCCCCC, where each letter represents a genotype, series of similar genotypes represent haplotypes, and where a transition between haplotypes represents a recombination. Both pairs of each chromosome will have the same alternating pattern, and all markers will be homozygous. Each of the different chromosomes (Chr 1, Chr 2, etc.) will have a different pattern of haplotypes and recombinations. The only exceptions are the Y chromosome and the mitochondrial genome, which are inherited intact from the paternal and maternal strain, respectively. For an RI strain to be useful for mapping purposes, the approximate position of recombinations along each chromosome need to be well defined either in terms of centimorgan or DNA basepair position. The precision with which these recombinations are mapped is a function of the number and position of the genotypes used to type the chromosomes--20 in the example above. Because markers and genotypes are often spaced quite far apart, often more than 500 Kb, the actual data entered into GeneNetwork will have some ambiguity at each recombination locus. 
The haplotype block BBBBBCCCCBBBCCCCCCCC will be entered as BBBBB?CCCC?BBB?CCCCCCCC where the ? mark indicates incomplete information over some (we hope) short interval. + +RI strains are almost always studied in sets or panels. All else being equal, the larger the set of RI strains, the greater the power and precision with which phenotypes can be mapped to chromosomal locations. The first set of eight RIs, the CXB RIs, were generated by Donald Bailey (By) from an intercross between a female BALB/cBy mouse (abbreviated C) and a male C57BL/6By mouse in the 1960s. The small panel of 8 CXB strains was originally used to determine if the major histocompatibility (MHC) locus on proximal Chr 17 was a key factor accounting for different immune responses such as tissue rejection. The methods used to determine the locations of recombinations relied on visible markers (coat color phenotypes such as the C and B loci) and the electrophoretic mobility of proteins. Somewhat larger RI sets were generated by Benjamin Taylor to map Mendelian and other major effect loci. In the 1990s the utility of RI sets for mapping was significantly improved thanks to higher density genotypes made possible by the use of microsatellite markers. Between 2005 and 2017, virtually all extant mouse and rat RI strains were regenotyped at many thousands of SNP markers, providing highly accurate maps of recombinations. + +While the potential utility of RI strains in mapping complex polygenic traits was obvious from the outset, the small number of strains only made it feasible to map quantitative traits with large effects. The first large RI sets were generated by plant geneticists (Burr et al. 2000) and the plant genetics community holds a strong lead in the production of very large RI sets to study multigenic and polygenic traits and trait covariance and pleiotropy. 
+ +By 2010 the number of mouse RI strains had increased to the point where defining causal gene and sequence variant was more practical. As of 2018 there are about 150 BXD strains (152 have been fully sequenced), ~100 Collaborative Cross strains (also all fully sequenced), and at least another 100 RI strains belonging to smaller sets that have been extremely well genotyped. + +Making RI strains: The usual procedure typically involves sib mating of the progeny of an F1 intercross for more than 20 generations. Even by the 5th filial (F) generation of successive matings, the RI lines are homozygous at 50% of loci and by F13, the value is above 90%. At F20 the lines are nearly fully inbred (~98%) and by convention are now referred to as inbred strains rather than inbred lines. + + +[Go back to index](#index) + +
+ +## S + +#### Scree Plots: + +GeneNetwork will often automatically generate a Scree Plot and the associated principal components (PCs) when you compute a Correlation Matrix for a group of traits that you have placed in your Trait Collection (a set of phenotypes and/or expression data for a specific population). Here is a nice definition of what a Scree plot is trying to tell you adopted and adapted from IOS (www.improvedoutcomes.com). + +A Scree Plot is a simple line segment plot that shows the fraction of total variance in the data as explained or represented by each PC. The PCs are ordered, and by definition are therefore assigned a number label, by decreasing order of contribution to total variance. The PC with the largest fraction contribution is labeled PC01. Such a plot when read left-to-right across the abscissa can often show a clear separation in fraction of total variance where the 'most important' components cease and the 'least important' components begin. The point of separation is often called the 'elbow'. (In the PCA literature, the plot is called a 'Scree' Plot because it often looks like a 'scree' slope, where rocks have fallen down and accumulated on the side of a mountain.) [Williams RW, Dec 20, 2008] + +#### Significant threshold: + +The significant threshold represents the approximate LRS value that corresponds to a genome-wide p-value of 0.05, or a 5% probability of falsely rejecting the null hypothesis that there is no linkage anywhere in the genome. This threshold is computed by evaluating the distribution of highest LRS scores generated by a set of 2000 random permutations of strain means. For example, a random permutation of the correctly ordered data may give a peak LRS score of 10 somewhere across the genome. The set of 1000 or more of these highest LRS scores is then compared to the actual LRS obtained for the correctly ordered (real) data at any location in the genome. 
If fewer than 50 (5%) of the 1000 permutations have peak LRS scores anywhere in the genome that exceed that obtained at a particular locus using the correctly ordered data, then one can usually claim that a QTL has been defined at a genome-wide p-value of .05. The threshold will vary slightly each time it is recomputed due to the random generation of the permutations. You can view the actual histogram of the permutation results by selecting the "Marker Regression" function in the Analysis Tools area of the Trait Data and Editing Form. WebQTL does make it possible to search through hundreds of traits for those that may have significant linkage somewhere in the genome. Keep in mind that this introduces a second tier of multiple testing problems for which the permutation test will not usually provide adequate protection. If you anticipate mapping many independent traits, then you will need to correct for the number of traits you have tested. [Williams RW, Nov 14, 2004] + +#### SNP Seismograph Track: + +SNP is an acronym for single nucleotide polymorphisms (SNPs). SNPs are simple one base pair variants that distinguish individuals and strains. The SNP Seismograph track is a unique feature of physical maps in the GeneNetwork. Each track is customized for a particular cross and shows only those SNPs that differ between the two parental strains. For example, on mouse BXD maps, only the SNPs that differ between C57BL/6J and DBA/2J will be displayed. Regions with high numbers of SNPs are characterised by wider excursions of the yellow traces that extend along the x axis. Since these regions have many SNPs they have a higher prior probability of containing functional sequence differences that might have downstream effects on phenotypes. Large genes with many SNPs close to the peak LRS and that also have a biological connection with the trait you are studying are high priority candidate genes. 
+ +The SNP track in WebQTL exploits the complete Celera Discovery System SNP set but adds an additional 500,000 inferred SNPs in both BXD and AXB/BXA crosses. These SNPs were inferred based on common haplotype structure using a Monte Carlo Markov chain algorithm developed by Gary Churchill and Natalie Blades and implemented by Robert Crowell, and RWW in July 2004. Raw data used to generate the SNP seismograph track were generated by Alex Williams and Chris Vincent, July 2003. The BXD track exploits a database of 1.75 million B vs D SNPs, whereas the AXB/BXA track exploits a database of 1.80 million A vs B SNPs. The names, sequences, and precise locations of most of these SNPs are the property of Celera Discovery Systems, whom we thank for allowing us to provide this level of display in WebQTL. + +Approximately 2.8 million additional SNPs generated by Perlegen for the NIEHS have been added to the SNP track by Robert Crowell (July-Aug 2005). We have also added all Wellcome-CTC SNPs and all relevant mouse SNPs from dbSNP. [Williams RW, Dec 25, 2004; Sept 3, 2005] + +#### Standard Error of the Mean (SE or SEM): + +In most GeneNetwork data sets, the SEM is computed as: +Standard Deviation (SD) divided by the square root of n - 1 +where n is the number of independent biological samples used to estimate the population mean. What this means in practice is that when n = 2 (as in many microarray data sets), the SEM and the SD are identical. This method of computing the SEM is conservative, but corrects to some extent for well known bias of the SEM discussed by Gurland and Tripathi (1971, A simple approximation for unbiased estimation of the standard deviation. Amer Stat 25:30-32). [Williams RW, Dec 17, 2008] + +#### Strain Distribution Pattern: + +A marker such as a SNP or microsatellite is genotyped using DNA obtained from each member of the mapping population. 
In the case of a genetic reference population, such as the BXD strains or the BayXSha Arabidopsis lines, this results in a text string of genotypes (e.g., BDDDBDBBBBDDBDDDBBBB... for BXD1 through BXD100). Each marker is associated with its own particular text string of genotypes that is often called the strain distribution pattern of the marker. (A more appropriate term would be the marker genotype string.) This string is converted to a numerical version, a genotype vector: -1111-11-1-1-1-111-1111-1-1-1-1..., where D=1, B=-1, H=0. Mapping a trait boils down to performing correlations between each trait and all of the genotype vectors. The genotype vector with the highest correlation (absolute value) is a good candidate for a QTL. [Williams RW, June 18, 2005] + +#### Suggestive Threshold: + +The suggestive threshold represents the approximate LRS value that corresponds to a genome-wide p-value of 0.63, or a 63% probability of falsely rejecting the null hypothesis that there is no linkage anywhere in the genome. This is not a typographical error. The Suggestive LRS threshold is defined as that which yields, on average, one false positive per genome scan. That is, roughly one-third of scans at this threshold will yield no false positive, one-third will yield one false positive, and one-third will yield two or more false positives. This is a very permissive threshold, but it is useful because it calls attention to loci that may be worth follow-up. Regions of the genome in which the LRS exceeds the suggestive threshold are often worth tracking and screening. They are particularly useful in combined multicross metaanalysis of traits. If two crosses pick up the same suggestive locus, then that locus may be significant when the joint probability is computed. The suggestive threshold may vary slightly each time it is recomputed due to the random generation of permutations. 
You can view the actual histogram of the permutation results by selecting the "Marker Regression" function in the Analysis Tools area of the Trait Data and Editing Form. [Williams RW and Manly KF, Nov 15, 2004] + +#### Systems Genetics: + +Systems genetics or "network genetics" is an emerging new branch of genetics that aims to understand complex causal networks of interactions at multiple levels of biological organization. To put this in a simple context: Mendelian genetics can be defined as the search for linkage between a single trait and a single gene variant (1 to 1); complex trait analysis can be defined as the search for linkage between a single trait and a set of gene variants (QTLs, QTGs, and QTNs) and environmental cofactors (1 to many); and systems genetics can be defined as the search for linkages among networks of traits and networks of gene and environmental variants (many to many). + +A hallmark of systems genetics is the simultaneous consideration of groups (systems) of phenotypes from the primary level of molecular and cellular interactions that ultimately modulate global phenotypes such as blood pressure, behavior, or disease resistance. Changes in environment are also often important determinants of multiscalar phenotypes; reversing the standard notion of causality as flowing inexorably upward from the genome. Scientists who use a systems genetics approach often have a broad interest in modules of linked phenotypes. Causality in these complex dynamic systems is often contingent on environmental or temporal context, and often will involve feedback modulation. A systems genetics approach can be unusually powerful, but does require the use of large numbers of observations (large sample size), and more advanced statistical and computational models. 
+ +Systems genetics is not really a new field and traces back to Sewall Wright's classical paper (Wright, 1921, "Correlation and Causation") that introduced path analysis to study systems of related phenotypes. Two factors have invigorated this field. The first factor is the advent of more sophisticated statistical methods including Structural Equation Modeling (SEM), System Dynamics Modeling, and Bayesian Network Modeling combined with powerful computer systems and efficient algorithms. The second factor is the relative ease with which it is now possible to acquire extensive and diverse phenotype data sets across genetic reference populations such as the BXD set of mice, the HXB set of rats, and the BayXSha lines of Arabidopsis (data are incorporated in the GeneNetwork). In the case of the BXD strains, a large research community has collectively generated hundreds of thousands of transcript phenotypes in different tissues and cells (level of expression), as well as hundreds of protein, cellular, pharmacological, and behavioral data types across a single genetic reference panel. Evaluating and modeling the associative and causal relations among these phenotypes is a major, and still relatively new area of research. Complex trait analysis and QTL mapping are both part of systems genetics in which causality is inferred using conventional genetic linkage (Li et al., 2005). One can often assert with confidence that a particular module of phenotypes (component of the variance and covariance) is modulated by sequence variants at a common locus. This provides a causal constraint that can be extremely helpful in more accurately modeling network architecture. Most models are currently static, but as the field matures, more sophisticated dynamic models will supplant steady-state models. + +The term "systems genetics" was coined by Grant Morahan, October 2004, during a visit to Memphis, as a more general and appropriate term to use instead of "genetical genomics." 
[Williams RW, April 11, 2005, revised Oct 22, 2005, April, 2008] + +[Go back to index](#index) + +
+ +## T + +#### Tissue Correlation: + +The tissue correlation is an estimate of the similarity of expression of two genes across different cells, tissues, or organs. In order to compute this type of correlation we first generate expression data for multiple different cell types, tissues, or whole organs from a single individual. There will be significant differences in gene expression across this sample and this variation can then be used to compute either Pearson product-moment correlations (r) or Spearman rank order correlations (rho) between any pair of genes, transcripts, or even exons. Since the samples are ideally all from one individual there should not be any genetic or environmental differences among samples. The difficulty in computing tissue correlations is that samples are not independent. For example, three samples of the small intestine (jejunum, ileum, and duodenum) will have expression patterns that are quite similar to each other in comparison to three other samples, such as heart, brain, and bone. For this reason the nature of the sampling and how those samples are combined will greatly affect the correlation values. The tissue correlations in GeneNetwork were computed in a way that attempts to reduce the impact of this fact by combining closely related sample types. For example multiple data sets for different brain regions were combined to generate a single average CNS tissue sample (generating a whole brain sample would have been an alternative method). + +However, there is really no optimal way to minimize the effects of this type of non-independence of samples. Some genes will have high expression in only a few tissues, for example the cholinergic receptor, nicotinic, alpha polypeptide 1 gene Chrna1 has high expression in muscle tissues (skeletal muscle = Mus, tongue = Ton, and esophagus = Eso) but lower expression in most other tissues. 
The very high correlation between Chrna1 and other genes with high expression only in muscle reflects their joint bimodality of expression. It does not mean that these genes or their proteins necessarily cooperate directly in molecular processes. [Williams RW, Dec 26, 2008] + + + +#### Transcript Location: + +The small orange triangle on the x-axis indicates the approximate position of the gene that corresponds to the transcript. These values were taken from the latest assembly of genome of the particular species. + +#### Transform: + +Most of the data sets in the GeneNetwork are ultimately derived from high resolution images of the surfaces of microarrays. Estimating gene expression therefore involves extensive low-level image analysis. These processing steps attempt to compensate for low spatial frequency "background" variation in image intensity that cannot be related to the actual hybridization signal, for example, a gradation of intensity across the whole array surface due to illumination differences, uneven hybridization, optical performance, scanning characteristics, etc. High spatial frequency artifacts are also removed if they are likely to be artifacts: dust, scratches on the array surface, and other "sharp" blemishes. The raw image data (for example, the Affymetrix DAT file) also needs to be registered to a template that assigns pixel values to expected array spots (cells). This image registration is an important process that users can usually take for granted. The end result is the reliable assignment of a set of image intensity values (pixels) to each probe. Each cell value generated using the Affymetrix U74Av2 array is associated with approximately 36 pixel intensity values (a 6x6 set of pixels, usually an effective 11 or 12-bit range of intensity). 
Affymetrix uses a method that simply ranks the values of these pixels and picks as the "representative value" the pixel that is closest to a particular rank order value, for example, the 24th highest of 36 pixels. The range of variation in intensity values among these ranked pixels provides a way to estimate the error of the estimate. The Affymetrix CEL files therefore consist of XY coordinates, the consensus value, and an error term. [Williams RW, April 30, 2005] + +#### Transgression: + +Most of us are familiar with the phrase "regression toward the mean." This refers to the tendency of progeny of a cross to have phenotypes that are intermediate to those of the parents. Transgression refers to the converse: progeny that have more phenotypes that are higher and lower than those of either parent. Transgression is common, and provided that a trait is influenced by many independent sequence variants (a polygenic trait), transgression is the expectation. This is particularly true if the parents are different genetically, but by chance have similar phenotypes. Consider a trait that is controlled by six independent genes, A through F. The "0" allele at these six genes lowers body weight whereas the "1" allele increases body weight. If one parent has a 000111 6-locus genotype and the other parent has 111000 genotype, then they will have closely matched weight. But their progeny may inherit combinations as extreme as 000000 and 111111. + +Transgression means that you can rarely predict the distribution of phenotypes among a set of progeny unless you already have a significant amount of information about the genetic architecture of a trait (numbers of segregating variants that affect the trait, either interactions, and GXE effects). 
In practical terms this means that if the parents of a cross do NOT differ and you have good reasons to believe that the trait you are interested in is genetically complex, then you can be fairly confident that the progeny will display a much wider range of variation than the parents. [May 2011 by RWW]. + +[Go back to index](#index) + +
+ +## U + +[Go back to index](#index) + +
+ +## V + +[Go back to index](#index) + +
+ +## W + +#### Winsorize, Winsorise: + +QTL mapping results can be greatly affected by inclusion of outlier data. GeneNetwork will do its best to flag outliers for you in the Trait Data and Analysis pages (yellow highlighting). Before mapping, review the data, and if necessary, change values. Options for handling outliers include: (1) do nothing, (2) delete the outliers (trimming), (3) transform the data (e.g., logarithmic, arcsine, or logistic regression transforms), or (4) winsorize the distribution of values. Winsorizing is usually the easiest method to implement directly in GeneNetwork. + +How to winsorize: First review the distribution of values and define outliers. You should only do this one time, so think before you leap. Look at the Probability Plot of the trait (by going to the Trait Data and Analysis page and selecting Basic Statistics). For example, the figure below from GeneNetwork shows that as many as seven cases have relatively high values and as many as three have relatively low values (this trait is taken from Species = Mouse, Group = LXS, Type = Phenotype, Trait 10182). GeneNetwork code only declares the highest two values to be outliers, but you can use a more liberal definition and give all seven high values a haircut. It is advisable to winsorize equal numbers of cases on each side of the distribution (high and low cases). In this case, the seven highest values were changed to match that of the 8th highest value (0.860). To retain the original rank order I added an incremental value of 0.01 to each (0.861, 0.862, etc). I did the same thing to the lowest seven values. Adding this increment is not necessary. + +The result in this case: a suggestive QTL on Chr 16 now reaches the significance threshold. + +The danger of winsorizing is doing it multiple times in different ways. You should transform or winsorize the data before mapping. And you should ideally only do any transformation/correction one time. 
If you fool around with different methods of transforming your data then you are asking for trouble by adding yet another level of multiple testing. If you feel compelled to experiment with different transforms, then you should/must report this in publications and explain why you did so. Demonstrating that mapping results are robust even using multiple transforms is one good excuse. [Williams RW, Jan 2, 2014] + + + + + +[Go back to index](#index) + +
+ +## X + +[Go back to index](#index) + +
+ +## Y +[Go back to index](#index) + + +
+ +## Z + +[Go back to index](#index) -- cgit v1.2.3 From 922be125592001f0e616351adcc05f1dbb16b38b Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Wed, 4 Nov 2020 17:49:11 +0300 Subject: Wrap rendered content in div container --- wqflask/wqflask/templates/glossary.html | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/wqflask/wqflask/templates/glossary.html b/wqflask/wqflask/templates/glossary.html index 718baf13..ed6f6ff7 100644 --- a/wqflask/wqflask/templates/glossary.html +++ b/wqflask/wqflask/templates/glossary.html @@ -3,7 +3,7 @@ {% block title %}Glossary{% endblock %} {% block content %} - -{{ rendered_markdown|safe }} - +
+ {{ rendered_markdown|safe }} +
{% endblock %} -- cgit v1.2.3 From 92dbf80bb45e66a93be6fea0fc0a82644cdbb326 Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Wed, 4 Nov 2020 17:53:49 +0300 Subject: Add an edit link button to glossary page --- wqflask/wqflask/templates/glossary.html | 3 +++ 1 file changed, 3 insertions(+) diff --git a/wqflask/wqflask/templates/glossary.html b/wqflask/wqflask/templates/glossary.html index ed6f6ff7..3b29f20e 100644 --- a/wqflask/wqflask/templates/glossary.html +++ b/wqflask/wqflask/templates/glossary.html @@ -3,7 +3,10 @@ {% block title %}Glossary{% endblock %} {% block content %} +
+ + [Edit on Github] {{ rendered_markdown|safe }}
{% endblock %} -- cgit v1.2.3 From fa852e4a293735215e10bff9198c812fb85912ce Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Wed, 4 Nov 2020 18:04:48 +0300 Subject: Update test_markdown_routes * wqflask/tests/unit/wqflask/test_markdown_routes.py (test_render_markdown_when_fetching_locally): Fix failing test. --- wqflask/tests/unit/wqflask/test_markdown_routes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wqflask/tests/unit/wqflask/test_markdown_routes.py b/wqflask/tests/unit/wqflask/test_markdown_routes.py index 3de14276..3adf63e5 100644 --- a/wqflask/tests/unit/wqflask/test_markdown_routes.py +++ b/wqflask/tests/unit/wqflask/test_markdown_routes.py @@ -38,8 +38,8 @@ class TestMarkdownRoutesFunctions(unittest.TestCase): "/genenetwork/genenetwork2/" "wqflask/wqflask/static/" "glossary.md") - self.assertEqual("

Content

\n", - markdown_content) + self.assertRegexpMatches(markdown_content, + "Glossary of Terms and Features") @mock.patch('wqflask.markdown_routes.requests.get') def test_render_markdown_when_fetching_remotely(self, requests_mock): -- cgit v1.2.3 From 057b322c2d8270231170c35bb735edf075635357 Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Wed, 4 Nov 2020 20:16:19 +0300 Subject: Add link markers to elements which are referenced from elsewhere --- wqflask/wqflask/static/markdown/glossary.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/wqflask/wqflask/static/markdown/glossary.md b/wqflask/wqflask/static/markdown/glossary.md index 3c14ab78..d2c80b40 100644 --- a/wqflask/wqflask/static/markdown/glossary.md +++ b/wqflask/wqflask/static/markdown/glossary.md @@ -13,6 +13,8 @@ Resource. gn1.genenetwork.org/glossary.html ## A +
+ #### Additive Allele Effect: The additive allele effect is an estimate of the change in the average phenotype that would be produced by substituting a single allele of one type with that of another type (e.g., a replaced by A) in a population. In a standard F2 intercross between two inbred parental lines there are two alleles at every polymorphic locus that are often referred to as the little "a" allele and big "A" allele. F2 progeny inherit the a/a, a/A, or A/A genotypes at every genetic locus in a ratio close to 1:2:1. The additive effect is half of the difference between the mean of all cases that are homozygous for one parental allele (aa) compared to the mean of all cases that are homozygous for the other parental allele (AA): @@ -29,6 +31,8 @@ The dominance effects of alleles are also computed on maps for F2 populations (e
+
+ #### Bootstrap: A bootstrap sample is a randomly drawn sample (or resample) that is taken from the original data set and that has the same number of samples as the original data set. In a single bootstrap sample, some cases will by chance be represented one or more times; other cases may not be represented at all (in other words, the sampling is done "with replacement" after each selection). To get a better intuitive feel for the method, imagine a bag of 26 Scrabble pieces that contain each letter of the English alphabet. In a bootstrap sample of these 26 pieces, you would shake the bag, insert your hand, and draw out one piece. You would then write down that letter on a piece of paper, and the place that Scrabble piece back in the bag in preparation for the next random selection. You would repeat this process (shake, draw, replace) 25 more times to generate a single bootstrap resample of the alphabet. Some letters will be represented several time in each sample and others will not be represented at al. If you repeat this procedure 1000 times you would have a set of bootstrap resamples of the type that GN uses to remap data sets. @@ -65,6 +69,8 @@ Complex trait analysis is the study of multiple causes of variation of phenotype Composite interval mapping is a method of mapping chromosomal regions that controls for some fraction of the genetic variability in a quantitative trait. Unlike simple interval mapping, composite interval mapping usually controls for variation produced at one or more background marker loci. These background markers are generally chosen because they are already known to be close to the location of a significant QTL. By factoring out a portion of the genetic variance produced by a major QTL, one can occasionally detect secondary QTLs. WebQTL allows users to control for a single background marker. 
To select this marker, first run the Marker Regression analysis (and if necessary, check the box labeled display all LRS), select the appropriate locus, and then click on either Composite Interval Mapping or Composite Regression. A more powerful and effective alternative to composite interval mapping is pair-scan analysis. This latter method takes into account (models) both the independent effects of two loci and possible two-locus epistatic interactions. [Williams RW, Dec 20, 2004] +
+ #### Correlations: Pearson and Spearman: GeneNetwork provides tools to compute both Pearson product-moment correlations (the standard type of correlation) and Spearman rank order correlations. Wikipedia and introductory statistics texts will have a discussion of these major types of correlation. The quick advice is to use the more robust Spearman rank order correlation if the number of pairs of observations in a data set is less than about 30 and to use the more powerful but much more sensitive Pearson product-moment correlation when the number of observations is greater than 30 AND after you have dealt with any outliers. GeneNetwork automatically flags outliers for you in the Trait Data and Analysis form. GeneNetwork also allows you to modify values by either deleting or winsorising them. That means that you can use Pearson correlations even with smaller sample sizes after making sure that data are well distributed. Be sure to view the scatterplots associated with correlation values (just click on the value to generate a plot). Look for bivariate outliers. @@ -298,12 +304,16 @@ Interval mapping is a process in which the statistical significance of a hypothe ## L +
+ #### Literature Correlation: The literature correlation is a unique feature in GeneNetwork that quantifies the similarity of words used to describe genes and their functions. Sets of words associated with genes were extracted from MEDLINE/PubMed abstracts (Jan 2017 by Ramin Homayouni, Diem-Trang Pham, and Sujoy Roy). For example, about 2500 PubMed abstracts contain reference to the gene "Sonic hedgehog" (Shh) in mouse, human, or rat. The words in all of these abstracts were extracted and categorized by their information content. A word such as "the" is not interesting, but words such as "dopamine" or "development" are useful in quantifying similarity. Sets of informative words are then compared—one gene's word set is compared to the word set for all other genes. Similarity values are computed for a matrix of about 20,000 genes using latent semantic indexing (see Xu et al., 2011). Similarity values are also known as literature correlations. These values are always positive and range from 0 to 1. Values between 0.5 and 1.0 indicate moderate-to-high levels of overlap of vocabularies. The literature correlation can be used to compare the "semantic" signal-to-noise of different measurements of gene, mRNA, and protein expression. Consider this common situation: There are three probe sets that measure Kit gene expression (1459588\_at, 1415900\_a\_at, and 1452514\_a\_at) in the Mouse BXD Lung mRNA data set (HZI Lung M430v2 (Apr08) RMA). Which one of these three gives the best measurement of Kit expression? It is impractical to perform quantitative rtPCR studies to answer this question, but there is a solid statistical answer that relies on Literature Correlation. Do the following: For each of the three probe sets, generate the top 1000 literature correlates. This will generate three apparently identical lists of genes that are known from the PubMed literature to be associated with the Kit oncogene. 
But the three lists are NOT actually identical when we look at the Sample Correlation column. To answer the question "which of the three probe sets is best", review the actual performance of the probe sets against this set of 1000 "friends of Kit". Do this by sorting all three lists by their Sample Correlation column (high to low). The clear winner is probe set 1415900_a_at. The 100th row in this probe set's list has a Sample Correlation of 0.620 (absolute value). In comparison, the 100th row for probe set 1452514_a_at has a Sample Correlation of 0.289. The probe set that targets the intron comes in last at 0.275. In conclusion, the probe set that targets the proximal half of the 3' UTR (1415900_a_at) has the highest "agreement" between Literature Correlation and Sample Correlation, and is our preferred measurement of Kit expression in the lung in this data set. (Updated by RWW and Ramin Homayouni, April 2017.) +
+ #### LOD: The logarithm of the odds (LOD) provides a measure of the association between variation in a phenotype and genetic differences (alleles) at a particular chromosomal locus (see Nyholt 2000 for a lovely review of LOD scores). @@ -324,6 +334,8 @@ With complete data at a marker, the log likelihood for the normal model reduces LOD values can be converted to LRS scores (likelihood ratio statistics) by multiplying by 4.61. The LOD is also roughly equivalent to the -log(P), where P is the probability of linkage (P = 0.001 => 3). The LOD itself is not a precise measurement of the probability of linkage, but in general for F2 crosses and RI strains, values above 3.3 will usually be worth attention for simple interval maps. [Williams RW, June 15, 2005, updated with text from Karl Broman, Oct 28, 2010, updated Apr 21, 2020 with Nyholt reference]. +
+ #### LRS: In the setting of mapping traits, the likelihood ratio statistic is used as a measurement of the association or linkage between differences in traits and differences in particular genotype markers. LRS or LOD values are usually plotted on the y-axis, whereas chromosomal locations of the markers are usually plotted on the x-axis. In the case of a whole genome scan--a sequential analysis of many markers and locations across the entire genome--LRS values above 10 to 15 will usually be worth attention when mapping with standard experimental crosses (e.g., F2 intercrosses or recombinant inbred strains). The term "likelihood ratio" is used to describe the relative probability (likelihood) of two different explanations of the variation in a trait. The first explanation (or model or hypothesis H1) is that the differences in the trait ARE associated with that particular DNA sequence difference or marker. Very small probability values indicate that H1 is probably true. The second "null" hypothesis (Hnull or H0) is that differences in the trait are NOT associated with that particular DNA sequence. We can use the ratio of these two probabilities and models (H1 divided by H0) as our score. The math is a little bit more complicated and the LRS score is actually equal to -2 times the natural logarithm of the ratio of the two probabilities. For example, if the probability of H0 is 0.05 (only a one-in-twenty probability that the marker is associated with the trait by chance), whereas the probability of H1 is 1 (the marker is certainly not linked to the trait), then the LRS value is 5.991. In Excel the equation giving the LRS result of 5.991 would look like this: "=-2*(LN(0.05)-LN(1))". [Williams RW, Dec 13, 2004, updated Nov 18, 2009, updated Dec 19, 2012] @@ -415,6 +427,8 @@ Please see the related Glossary terms "Tissue Correlation". 
[RWW, Aug 21, 2009; If you place a number of traits in a Trait Collection you can carry out some of the key steps of a principal component analysis, including defining the variance directed along specific principal component eigenvectors. You can also plot the positions of cases against the first two eigenvectors; in essence a type of scatterplot. Finally, GeneNetwork allows you to exploit PCA methods to make new "synthetic" eigentraits from collections of correlated traits. These synthetic traits are the values of cases along specific eigenvectors and they may be less noisy than single traits. If this seems puzzling, then have a look at these useful PCA explanations by G. Dallas and by Powell and Lehe. How to do it: You can select and assemble many different traits into a single Trait Collection window using the check boxes and Add To Collection buttons. One of the most important function buttons in the Collection window is labeled Correlation Matrix. This function computes Pearson product moment correlations and Spearman rank order correlations for all possible pairs of traits in the Collection window. It also performs a principal component or factor analysis. For example, if you have 20 traits in the Collection window, the correlation matrix will consist of 20*19/2 or 190 correlations and the identity diagonal. Principal components analysis is a linear algebraic procedure that finds a small number of independent factors or principal components that efficiently explain variation in the original 20 traits. It is an effective method to reduce the dimensionality of a group of traits. If the 20 traits share a great deal of variation, then only two or three factors may explain variation among the traits. Instead of analyzing 20 traits as if they were independent, we can now analyze the main principal components labeled PC01, PC02, etc. PC01 and PC02 can be treated as new synthetic traits that represent the main sources of variation among original traits. 
You can treat a PC trait like any other trait except that it is not stored permanently in a database table. You can put a PC trait in your Collection window and see how well correlated each of the 20 original traits is with this new synthetic trait. You can also map a PC trait. [RWW, Aug 23, 2005] +
+ #### Permutation Test: A permutation test is a computationally intensive but conceptually simple method used to evaluate the statistical significance of findings. Permutation tests are often used to evaluate QTL significance. Some background: In order to detect parts of chromosomes that apparently harbor genes that contribute to differences in a trait's value, it is common to search for associations (linkage) across the entire genome. This is referred to as a "whole genome" scan, and it usually involves testing hundreds of independently segregating regions of the genome using hundreds, or even thousands of genetic markers (SNPs and microsatellites). A parametric test such as a conventional t test or F test can be used to estimate the probability of the null hypothesis at any single location in the genome (the null hypothesis is that there is no QTL at this particular location). But a parametric test of this type makes assumptions about the distribution of the trait (its normality), and also does not provide a way to correct for the large number of independent tests that are performed while scanning the whole genome. We need protection against many false discoveries as well as some assurance that we are not neglecting truly interesting locations. A permutation test is an elegant solution to both problems. The procedure involves randomly reassigning (permuting) trait values and genotypes of all cases used in the analysis. The permuted data sets have the same set of phenotypes and genotypes (in other words, distributions are the same), but obviously the permutation procedure almost invariably obliterates genuine gene-to-phenotype relations in large data sets. We typically generate several thousand permutations of the data. Each of these is analyzed using precisely the same method that was used to analyze the correctly ordered data set. We then compare statistical results of the original data set with the collection of values generated by the many permuted data sets. 
The hope is that the correctly ordered data are associated with larger LRS and LOD values than more than 95% of the permuted data sets. This is how we define the p = .05 whole genome significance threshold for a QTL. Please see the related Glossary terms "Significant threshold" and "Suggestive threshold". [RWW, July 15, 2005] @@ -481,6 +495,8 @@ A Scree Plot is a simple line segment plot that shows the fraction of total vari The significant threshold represents the approximate LRS value that corresponds to a genome-wide p-value of 0.05, or a 5% probability of falsely rejecting the null hypothesis that there is no linkage anywhere in the genome. This threshold is computed by evaluating the distribution of highest LRS scores generated by a set of 2000 random permutations of strain means. For example, a random permutation of the correctly ordered data may give a peak LRS score of 10 somewhere across the genome. The set of 1000 or more of these highest LRS scores is then compared to the actual LRS obtained for the correctly ordered (real) data at any location in the genome. If fewer than 50 (5%) of the 1000 permutations have peak LRS scores anywhere in the genome that exceed that obtained at a particular locus using the correctly ordered data, then one can usually claim that a QTL has been defined at a genome-wide p-value of .05. The threshold will vary slightly each time it is recomputed due to the random generation of the permutations. You can view the actual histogram of the permutation results by selecting the "Marker Regression" function in the Analysis Tools area of the Trait Data and Editing Form. WebQTL does make it possible to search through hundreds of traits for those that may have significant linkage somewhere in the genome. Keep in mind that this introduces a second tier of multiple testing problems for which the permutation test will not usually provide adequate protection. 
If you anticipate mapping many independent traits, then you will need to correct for the number of traits you have tested. [Williams RW, Nov 14, 2004] +
+ #### SNP Seismograph Track: SNP is an acronym for single nucleotide polymorphism (plural: SNPs). SNPs are simple one base pair variants that distinguish individuals and strains. The SNP Seismograph track is a unique feature of physical maps in the GeneNetwork. Each track is customized for a particular cross and shows only those SNPs that differ between the two parental strains. For example, on mouse BXD maps, only the SNPs that differ between C57BL/6J and DBA/2J will be displayed. Regions with high numbers of SNPs are characterised by wider excursions of the yellow traces that extend along the x axis. Since these regions have many SNPs they have a higher prior probability of containing functional sequence differences that might have downstream effects on phenotypes. Large genes with many SNPs close to the peak LRS and that also have a biological connection with the trait you are studying are high priority candidate genes. @@ -519,6 +535,8 @@ The term "systems genetics" was coined by Grant Morahan, October 2004, during a ## T +
+ #### Tissue Correlation: The tissue correlation is an estimate of the similarity of expression of two genes across different cells, tissues, or organs. In order to compute this type of correlation we first generate expression data for multiple different cell types, tissues, or whole organs from a single individual. There will be significant differences in gene expression across this sample and this variation can then be used to compute either Pearson product-moment correlations (r) or Spearman rank order correlations (rho) between any pair of genes, transcripts, or even exons. Since the samples are ideally all from one individual there should not be any genetic or environmental differences among samples. The difficulty in computing tissue correlations is that samples are not independent. For example, three samples of the small intestine (jejunum, ileum, and duodenum) will have expression patterns that are quite similar to each other in comparison to three other samples, such as heart, brain, and bone. For this reason the nature of the sampling and how those samples are combined will greatly affect the correlation values. The tissue correlations in GeneNetwork were computed in a way that attempts to reduce the impact of this fact by combining closely related sample types. For example multiple data sets for different brain regions were combined to generate a single average CNS tissue sample (generating a whole brain sample would have been an alternative method). 
-- cgit v1.2.3 From 5fabe2763200b5607d2f8ec3eaa85a7c23c9875b Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Wed, 4 Nov 2020 20:33:45 +0300 Subject: Add missing image --- wqflask/wqflask/static/images/Congenic.png | Bin 0 -> 56578 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 wqflask/wqflask/static/images/Congenic.png diff --git a/wqflask/wqflask/static/images/Congenic.png b/wqflask/wqflask/static/images/Congenic.png new file mode 100644 index 00000000..8cd489a4 Binary files /dev/null and b/wqflask/wqflask/static/images/Congenic.png differ -- cgit v1.2.3 From 813204c39aebe17c43d8ea94c9e20a6a32914cba Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Wed, 4 Nov 2020 20:34:13 +0300 Subject: Use new glossary link from glossary_blueprint in html templates --- wqflask/wqflask/templates/base.html | 2 +- wqflask/wqflask/templates/collections/view.html | 4 ++-- wqflask/wqflask/templates/gsearch_gene.html | 8 ++++---- wqflask/wqflask/templates/gsearch_pheno.html | 4 ++-- wqflask/wqflask/templates/index_page.html | 6 +++--- wqflask/wqflask/templates/index_page_orig.html | 2 +- wqflask/wqflask/templates/search_result_page.html | 8 ++++---- wqflask/wqflask/templates/show_trait_calculate_correlations.html | 2 +- 8 files changed, 18 insertions(+), 18 deletions(-) diff --git a/wqflask/wqflask/templates/base.html b/wqflask/wqflask/templates/base.html index 0f4e5ef5..6b584751 100644 --- a/wqflask/wqflask/templates/base.html +++ b/wqflask/wqflask/templates/base.html @@ -66,7 +66,7 @@ diff --git a/wqflask/wqflask/templates/index_page_orig.html b/wqflask/wqflask/templates/index_page_orig.html index 6b3bec9a..13ca52df 100755 --- a/wqflask/wqflask/templates/index_page_orig.html +++ b/wqflask/wqflask/templates/index_page_orig.html @@ -177,7 +177,7 @@ Gene Ontology.
  • RIF=diabetes LRS=(9 999 Chr2 100 105) transLRS=(9 999 10) - finds diabetes-associated transcripts with peak + finds diabetes-associated transcripts with peak trans eQTLs on Chr 2 between 100 and 105 Mb with LRS scores between 9 and 999.
  • diff --git a/wqflask/wqflask/templates/search_result_page.html b/wqflask/wqflask/templates/search_result_page.html index 2318bfb8..6d9ea8fe 100644 --- a/wqflask/wqflask/templates/search_result_page.html +++ b/wqflask/wqflask/templates/search_result_page.html @@ -312,7 +312,7 @@ 'orderSequence': [ "desc", "asc"] }, { - 'title': "High P ?", + 'title': "High P ?", 'type': "natural-minus-na", 'data': "lrs_score", 'width': "60px", @@ -325,7 +325,7 @@ 'data': "lrs_location" }, { - 'title': "Effect Size ?", + 'title': "Effect Size ?", 'type': "natural-minus-na", 'data': "additive", 'width': "85px", @@ -381,7 +381,7 @@ 'orderSequence': [ "desc", "asc"] }, { - 'title': "Max LRS ?", + 'title': "Max LRS ?", 'type': "natural-minus-na", 'data': "lrs_score", 'width': "80px", @@ -394,7 +394,7 @@ 'data': "lrs_location" }, { - 'title': "Effect Size ?", + 'title': "Effect Size ?", 'type': "natural-minus-na", 'width': "120px", 'data': "additive", diff --git a/wqflask/wqflask/templates/show_trait_calculate_correlations.html b/wqflask/wqflask/templates/show_trait_calculate_correlations.html index bc1d4091..4f25e90a 100644 --- a/wqflask/wqflask/templates/show_trait_calculate_correlations.html +++ b/wqflask/wqflask/templates/show_trait_calculate_correlations.html @@ -120,7 +120,7 @@ is computed between trait data and any other traits in the sample database selected above. Use - Spearman + Spearman Rank when the sample size is small (<20) or when there are influential outliers. 
-- cgit v1.2.3 From 6b4bcc526c27a17f96704b54fbfa6a506b3a22f9 Mon Sep 17 00:00:00 2001 From: Alexanderlacuna Date: Thu, 5 Nov 2020 22:30:33 +0300 Subject: add test for samplelist and show_trait --- wqflask/tests/wqflask/show_trait/testSampleList.py | 18 +++++++ .../tests/wqflask/show_trait/test_show_trait.py | 55 ++++++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 wqflask/tests/wqflask/show_trait/testSampleList.py diff --git a/wqflask/tests/wqflask/show_trait/testSampleList.py b/wqflask/tests/wqflask/show_trait/testSampleList.py new file mode 100644 index 00000000..4cc8c4da --- /dev/null +++ b/wqflask/tests/wqflask/show_trait/testSampleList.py @@ -0,0 +1,18 @@ +import unittest +import re +from unittest import mock +from wqflask.show_trait.SampleList import natural_sort + + +class TestSampleList(unittest.TestCase): + def test_natural_sort(self): + "Sort the list into natural alphanumeric order." + + characters_list = ["z", "f", "q", "s", "t", "a", "g"] + names_list = ["temp1", "publish", "Sample", "Dataset"] + + natural_sort(characters_list) + natural_sort(names_list) + + self.assertEqual(characters_list, ["a", "f", "g", "q", "s", "t", "z"]) + self.assertEqual(names_list, ["Dataset", "Sample", "publish", "temp1"]) diff --git a/wqflask/tests/wqflask/show_trait/test_show_trait.py b/wqflask/tests/wqflask/show_trait/test_show_trait.py index 24150738..b1a91bbe 100644 --- a/wqflask/tests/wqflask/show_trait/test_show_trait.py +++ b/wqflask/tests/wqflask/show_trait/test_show_trait.py @@ -11,6 +11,7 @@ from wqflask.show_trait.show_trait import get_trait_units from wqflask.show_trait.show_trait import get_nearest_marker from wqflask.show_trait.show_trait import get_genotype_scales from wqflask.show_trait.show_trait import requests +from wqflask.show_trait.show_trait import get_scales_from_genofile class TraitObject: @@ -240,3 +241,57 @@ class TestTraits(unittest.TestCase): expected_results = {f"{file_location}": [["physic", "Mb"]]} 
self.assertEqual(get_genotype_scales(file_location), expected_results) mock_get_scales.assert_called_once_with(file_location) + + @mock.patch("wqflask.show_trait.show_trait.locate_ignore_error") + def test_get_scales_from_genofile_found(self, mock_location_ignore): + # test no complete to be continued with + # a .geno file + mock_location_ignore.return_value = True + mock_file_with_one_line = mock.mock_open( + read_data="Some data from opened file") + + mock_file = """#@scale and random data:is here_returned\n + This is second with spaced with tabs\n """ + mock_file_result = mock.mock_open(read_data=mock_file) + + with mock.patch("builtins.open", mock_file_with_one_line): + result = get_scales_from_genofile("~/data/file") + self.assertEqual(result, [['morgan', 'cM']]) + + with mock.patch("builtins.open", mock_file_result): + results = get_scales_from_genofile("~data/file_geno") + self.assertEqual(results, [['physic', 'Mb']]) + + @mock.patch("wqflask.show_trait.show_trait.locate_ignore_error") + def test_get_scales_from_genofile_found(self, mock_ingore_location): + """"add test for get scales from genofile where file is found""" + mock_ingore_location.return_value = True + geno_file = """ + #sample line with no @scales:other\n + #sample line @scales and :separated by semicolon\n + This attempts to check whether\n + """ + + geno_file_string = "@line start with @ and has @scale:morgan" + + file_location = "~/data/file.geno" + + mock_open_geno_file = mock.mock_open(read_data=geno_file) + with mock.patch("builtins.open", mock_open_geno_file): + results = get_scales_from_genofile(file_location) + self.assertEqual(results, [["morgan", "cM"]]) + + mock_open_string = mock.mock_open(read_data=geno_file_string) + + with mock.patch("builtins.open", mock_open_string): + result2 = get_scales_from_genofile(file_location) + self.assertEqual([['morgan', 'cM']], result2) + + @mock.patch("wqflask.show_trait.show_trait.locate_ignore_error") + def 
test_get_scales_from_genofile_not_found(self, mock_location_ignore): + mock_location_ignore.return_value = False + + expected_results = [["physic", "Mb"]] + results = get_scales_from_genofile("~/data/file") + mock_location_ignore.assert_called_once_with("~/data/file", "genotype") + self.assertEqual(results, expected_results) -- cgit v1.2.3 From 162e3e1acc1fea2548f7caa71fa3fc425c0a4ccd Mon Sep 17 00:00:00 2001 From: Alexanderlacuna Date: Thu, 5 Nov 2020 22:47:30 +0300 Subject: remove duplicates --- wqflask/tests/wqflask/show_trait/test_show_trait.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/wqflask/tests/wqflask/show_trait/test_show_trait.py b/wqflask/tests/wqflask/show_trait/test_show_trait.py index b1a91bbe..600baefb 100644 --- a/wqflask/tests/wqflask/show_trait/test_show_trait.py +++ b/wqflask/tests/wqflask/show_trait/test_show_trait.py @@ -242,25 +242,6 @@ class TestTraits(unittest.TestCase): self.assertEqual(get_genotype_scales(file_location), expected_results) mock_get_scales.assert_called_once_with(file_location) - @mock.patch("wqflask.show_trait.show_trait.locate_ignore_error") - def test_get_scales_from_genofile_found(self, mock_location_ignore): - # test no complete to be continued with - # a .geno file - mock_location_ignore.return_value = True - mock_file_with_one_line = mock.mock_open( - read_data="Some data from opened file") - - mock_file = """#@scale and random data:is here_returned\n - This is second with spaced with tabs\n """ - mock_file_result = mock.mock_open(read_data=mock_file) - - with mock.patch("builtins.open", mock_file_with_one_line): - result = get_scales_from_genofile("~/data/file") - self.assertEqual(result, [['morgan', 'cM']]) - - with mock.patch("builtins.open", mock_file_result): - results = get_scales_from_genofile("~data/file_geno") - self.assertEqual(results, [['physic', 'Mb']]) @mock.patch("wqflask.show_trait.show_trait.locate_ignore_error") def test_get_scales_from_genofile_found(self, 
mock_ingore_location): -- cgit v1.2.3 From 7078bd8a2d9d3a2ab8f31e102deeece8bc0e5fcf Mon Sep 17 00:00:00 2001 From: Alexanderlacuna Date: Fri, 6 Nov 2020 00:45:50 +0300 Subject: fix typo --- wqflask/tests/wqflask/show_trait/test_show_trait.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wqflask/tests/wqflask/show_trait/test_show_trait.py b/wqflask/tests/wqflask/show_trait/test_show_trait.py index 600baefb..8c866874 100644 --- a/wqflask/tests/wqflask/show_trait/test_show_trait.py +++ b/wqflask/tests/wqflask/show_trait/test_show_trait.py @@ -244,9 +244,9 @@ class TestTraits(unittest.TestCase): @mock.patch("wqflask.show_trait.show_trait.locate_ignore_error") - def test_get_scales_from_genofile_found(self, mock_ingore_location): + def test_get_scales_from_genofile_found(self, mock_ignore_location): """"add test for get scales from genofile where file is found""" - mock_ingore_location.return_value = True + mock_ignore_location.return_value = True geno_file = """ #sample line with no @scales:other\n #sample line @scales and :separated by semicolon\n -- cgit v1.2.3 From 6b23bf4a0698339a1c7672b8d84dfef9b9066a79 Mon Sep 17 00:00:00 2001 From: Alexanderlacuna Date: Fri, 6 Nov 2020 10:59:12 +0300 Subject: modify tests to for changes in the natural sort function --- wqflask/tests/wqflask/show_trait/testSampleList.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/wqflask/tests/wqflask/show_trait/testSampleList.py b/wqflask/tests/wqflask/show_trait/testSampleList.py index 4cc8c4da..34c51e3e 100644 --- a/wqflask/tests/wqflask/show_trait/testSampleList.py +++ b/wqflask/tests/wqflask/show_trait/testSampleList.py @@ -10,9 +10,7 @@ class TestSampleList(unittest.TestCase): characters_list = ["z", "f", "q", "s", "t", "a", "g"] names_list = ["temp1", "publish", "Sample", "Dataset"] - - natural_sort(characters_list) - natural_sort(names_list) - - self.assertEqual(characters_list, ["a", "f", "g", "q", "s", "t", "z"]) - 
self.assertEqual(names_list, ["Dataset", "Sample", "publish", "temp1"]) + sorted_list_a=natural_sort(characters_list) + sorted_list_b=natural_sort(names_list) + self.assertEqual(sorted_list_a, ["a", "f", "g", "q", "s", "t", "z"]) + self.assertEqual(sorted_list_b,["Dataset", "Sample", "publish", "temp1"]) -- cgit v1.2.3 From fd402e9248f32bd79c4527d7d2cec518d1079586 Mon Sep 17 00:00:00 2001 From: Alexanderlacuna Date: Fri, 6 Nov 2020 11:02:05 +0300 Subject: modify natural_sort and return list to avoid having side effect --- wqflask/wqflask/show_trait/SampleList.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/wqflask/wqflask/show_trait/SampleList.py b/wqflask/wqflask/show_trait/SampleList.py index 37c1d6d5..6a056144 100644 --- a/wqflask/wqflask/show_trait/SampleList.py +++ b/wqflask/wqflask/show_trait/SampleList.py @@ -162,7 +162,7 @@ class SampleList(object): return first_attr_col -def natural_sort(list, key=lambda s: s): +def natural_sort(a_list, key=lambda s: s): """ Sort the list into natural alphanumeric order. 
""" @@ -170,4 +170,8 @@ def natural_sort(list, key=lambda s: s): def convert(text): return int(text) if text.isdigit() else text return lambda s: [convert(c) for c in re.split('([0-9]+)', key(s))] sort_key = get_alphanum_key_func(key) - list.sort(key=sort_key) \ No newline at end of file + + sorted_list=sorted(a_list,key=sort_key) + + return sorted_list + \ No newline at end of file -- cgit v1.2.3 From 55aa5bf7ec00fb5b5d28a1847e7f8682a62aa547 Mon Sep 17 00:00:00 2001 From: Alexanderlacuna Date: Fri, 6 Nov 2020 11:09:11 +0300 Subject: remove whitespace --- wqflask/wqflask/show_trait/SampleList.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/wqflask/wqflask/show_trait/SampleList.py b/wqflask/wqflask/show_trait/SampleList.py index 6a056144..191c29bd 100644 --- a/wqflask/wqflask/show_trait/SampleList.py +++ b/wqflask/wqflask/show_trait/SampleList.py @@ -8,6 +8,7 @@ from pprint import pformat as pf from utility import Plot from utility import Bunch + class SampleList(object): def __init__(self, dataset, @@ -67,7 +68,8 @@ class SampleList(object): self.sample_list.append(sample) self.se_exists = any(sample.variance for sample in self.sample_list) - self.num_cases_exists = any(sample.num_cases for sample in self.sample_list) + self.num_cases_exists = any( + sample.num_cases for sample in self.sample_list) first_attr_col = self.get_first_attr_col() for sample in self.sample_list: @@ -162,6 +164,7 @@ class SampleList(object): return first_attr_col + def natural_sort(a_list, key=lambda s: s): """ Sort the list into natural alphanumeric order. 
@@ -170,8 +173,5 @@ def natural_sort(a_list, key=lambda s: s): def convert(text): return int(text) if text.isdigit() else text return lambda s: [convert(c) for c in re.split('([0-9]+)', key(s))] sort_key = get_alphanum_key_func(key) - - sorted_list=sorted(a_list,key=sort_key) - + sorted_list = sorted(a_list, key=sort_key) return sorted_list - \ No newline at end of file -- cgit v1.2.3 From 74ba91ade4ed0a875abf7417c33249f42f7c367e Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Tue, 10 Nov 2020 16:48:14 +0300 Subject: Add missing links and some markdown formatting(underline, bold) * wqflask/wqflask/static/markdown/glossary.md: Update file. --- wqflask/wqflask/static/markdown/glossary.md | 154 +++++++++++++++------------- 1 file changed, 80 insertions(+), 74 deletions(-) diff --git a/wqflask/wqflask/static/markdown/glossary.md b/wqflask/wqflask/static/markdown/glossary.md index d2c80b40..68276796 100644 --- a/wqflask/wqflask/static/markdown/glossary.md +++ b/wqflask/wqflask/static/markdown/glossary.md @@ -21,11 +21,11 @@ The additive allele effect is an estimate of the change in the average phenotype [(mean of AA cases)-(mean of aa cases)]/2 -GeneNetwork displays the additive values on the far right of many trait/QTL maps, usually as red or green lines along the maps. The units of measurement of additive effects (and dominance effects) are defined by the trait itself and are shown in Trait Data and Analysis windows. For mRNA estimates these units are usually normalized log2 expression values. For this reason an additive effect of 0.5 units indicates that the A/A and a/a genotypes at a particular locus or marker differ by 1 unit (twice the effect of swapping a single A allele for an a allele). On this log2 scale this is equivalent to a 2-fold difference (2 raised to the power of 1). +GeneNetwork displays the additive values on the far right of many trait/QTL maps, usually as red or green lines along the maps. 
The units of measurement of additive effects (and dominance effects) are defined by the trait itself and are shown in **Trait Data and Analysis** windows. For mRNA estimates these units are usually normalized log2 expression values. For this reason an additive effect of 0.5 units indicates that the A/A and a/a genotypes at a particular locus or marker differ by 1 unit (twice the effect of swapping a single A allele for an a allele). On this log2 scale this is equivalent to a 2-fold difference (2 raised to the power of 1). On the QTL map plots the polarity of allele effects is represented by the color of the line. For example, in mouse BXD family maps, if the DBA/2J allele produces higher values than the C57BL/6J allele then the additive effect line is colored in green. In contrast, if the C57BL/6J allele produces higher values then the line is colored in red. For computational purposes, C57BL/6J red values are considered negative. -The dominance effects of alleles are also computed on maps for F2 populations (e.g., B6D2F2 and B6BTBRF2). Orange and purple line colors are used to distinguish the polarity of effects. Purple is the positive dominance effect that matches the polarity of the green additive effect, whereas orange is the negative dominance effect that matches the polarity of the red additive effect. [Please also see entry on Dominance Effects: Williams RW, Oct 15, 2004; Sept 3, 2005; Dec 4, 2005; Oct 25, 2011] +The dominance effects of alleles are also computed on maps for F2 populations (e.g., B6D2F2 and B6BTBRF2). Orange and purple line colors are used to distinguish the polarity of effects. Purple is the positive dominance effect that matches the polarity of the green additive effect, whereas orange is the negative dominance effect that matches the polarity of the red additive effect. 
[Please also see entry on **Dominance Effects**: Williams RW, Oct 15, 2004; Sept 3, 2005; Dec 4, 2005; Oct 25, 2011] [Go back to index](#index) @@ -35,15 +35,15 @@ The dominance effects of alleles are also computed on maps for F2 populations (e #### Bootstrap: -A bootstrap sample is a randomly drawn sample (or resample) that is taken from the original data set and that has the same number of samples as the original data set. In a single bootstrap sample, some cases will by chance be represented one or more times; other cases may not be represented at all (in other words, the sampling is done "with replacement" after each selection). To get a better intuitive feel for the method, imagine a bag of 26 Scrabble pieces that contain each letter of the English alphabet. In a bootstrap sample of these 26 pieces, you would shake the bag, insert your hand, and draw out one piece. You would then write down that letter on a piece of paper, and the place that Scrabble piece back in the bag in preparation for the next random selection. You would repeat this process (shake, draw, replace) 25 more times to generate a single bootstrap resample of the alphabet. Some letters will be represented several time in each sample and others will not be represented at al. If you repeat this procedure 1000 times you would have a set of bootstrap resamples of the type that GN uses to remap data sets. +A [bootstrap sample](http://en.wikipedia.org/wiki/Bootstrapping_%28statistics%29) is a randomly drawn sample (or resample) that is taken from the original data set and that has the same number of samples as the original data set. In a single bootstrap sample, some cases will by chance be represented one or more times; other cases may not be represented at all (in other words, the sampling is done "with replacement" after each selection). To get a better intuitive feel for the method, imagine a bag of 26 Scrabble pieces that contain each letter of the English alphabet. 
In a bootstrap sample of these 26 pieces, you would shake the bag, insert your hand, and draw out one piece. You would then write down that letter on a piece of paper, and then place that Scrabble piece back in the bag in preparation for the next random selection. You would repeat this process (shake, draw, replace) 25 more times to generate a single bootstrap resample of the alphabet. Some letters will be represented several times in each sample and others will not be represented at all. If you repeat this procedure 1000 times you would have a set of bootstrap resamples of the type that GN uses to remap data sets. -Bootstrap resampling is a method that can be used to estimate statistical parameters and error terms.
GeneNetwork uses a bootstrap procedure to evaluate approximate confidence limits of QTL peaks using a method proposed by Peter Visscher and colleagues ([1996](http://www.genetics.org/content/143/2/1013.full.pdf)). We generate 2000 bootstraps, remap each, and keep track of the location of the single locus with the highest LRS score (equivalent to a "letter" in the Scrabble example). The 2000 "best" locations are used to produce the yellow histograms plotted on some of the QTL maps. If the position of a QTL is firm, then the particular composition of the sample will not shift the position of the QTL peak by very much. In such a case, the histogram of "best QTLs" (yellow bars in the maps) that is displayed in WebQTL maps will tend to have a sharp peak (the scale is the percentage of bootstrap resamples that fall into each bar of the bootstrap histogram). In contrast, if the yellow bootstrap histograms are spread out along a chromosome, then the precise location of a QTL may not be accurate, even in the original correct data set. Bootstrap results naturally vary between runs due to the random generation of the samples. See the related entry "Frequency of Peak LRS." -KNOWN PROBLEMS and INTERPRETATION of BOOTSTRAP RESULTS: The reliability of bootstrap analysis of QTL confidence intervals has been criticized by Manichaikul and colleagues ([2006](http://www.genetics.org/cgi/content/full/174/1/481)). Their work applies in particular to standard intercrosses and backcrosses in which markers are spaced every 2 cM.
They recommend that confidence intervals be estimated either by a conventional 1.5 to 2.0 LOD drop-off interval or by a Bayes credible Interval method. There is a known flaw in the way in which GeneNetwork displays bootstrap results (Sept 2011). If a map has two or more adjacent markers with identical LOD score and identical strain distribution patterns, all of the bootstrap results are assigned incorrectly to just one of the "twin" markers. This results in a false perception of precision. -QTL mapping methods can be highly sensitive to cases with very high or very low phenotype values (outliers). The bootstrap method does not provide protection against the effects of outliers and their effects on QTL maps. Make sure you review your data for outliers before mapping. Options include (1) Do nothing, (2) Delete the outliers and see what happens to your maps, (3) Winsorize the values of the outliers. You might try all three options and determine if your main results are stable or not. With small samples or extreme outliers, you may find the mapping results to be quite volatile. In general, if the results (QTL position or value) depend highly on one or two outliers (5-10% of the samples) then you should probably delete or winsorize the outliers. [Williams RW, Oct 15, 2004, Mar 15, 2008, Mar 26, 2008; Sept 2011] +QTL mapping methods can be highly sensitive to cases with very high or very low phenotype values (outliers). The bootstrap method does not provide protection against the effects of outliers and their effects on QTL maps. Make sure you review your data for outliers before mapping. Options include (1) Do nothing, (2) Delete the outliers and see what happens to your maps, (3) [Winsorize](http://en.wikipedia.org/wiki/Winsorising) the values of the outliers. You might try all three options and determine if your main results are stable or not. With small samples or extreme outliers, you may find the mapping results to be quite volatile. 
In general, if the results (QTL position or value) depend highly on one or two outliers (5-10% of the samples) then you should probably delete or winsorize the outliers. [Williams RW, Oct 15, 2004, Mar 15, 2008, Mar 26, 2008; Sept 2011] [Go back to index](#index) @@ -59,7 +59,7 @@ Cluster maps are sets of QTL maps for a group of traits. The QTL maps for the in #### Collections and Trait Collections: -One of the most powerful features of GeneNetwork (GN) is the ability to study large sets of traits that have been measured using a common genetic reference population or panel (GRP). This is one of the key requirements of systems genetics--many traits studied in common. Under the main GN menu Search heading you will see a link to Trait Collections. You can assemble you own collection for any GRP by simply adding items using the Add to Collection button that you will find in many windows. Once you have a collection you will have access to a new set of tools for analysis of your collection, including QTL Cluster Map, Network Graph, Correlation Matrix, and Compare Correlates. [Williams RW, April 7, 2006] +One of the most powerful features of GeneNetwork (GN) is the ability to study large sets of traits that have been measured using a common genetic reference population or panel (GRP). This is one of the key requirements of systems genetics--many traits studied in common. Under the main GN menu **Search** heading you will see a link to **Trait Collections**. You can assemble you own collection for any GRP by simply adding items using the Add to Collection button that you will find in many windows. Once you have a collection you will have access to a new set of tools for analysis of your collection, including **QTL Cluster Map, Network Graph, Correlation Matrix**, and **Compare Correlates**. 
[Williams RW, April 7, 2006] #### Complex Trait Analysis: @@ -67,13 +67,13 @@ Complex trait analysis is the study of multiple causes of variation of phenotype #### Composite Interval Mapping: -Composite interval mapping is a method of mapping chromosomal regions that controls for some fraction of the genetic variability in a quantitative trait. Unlike simple interval mapping, composite interval mapping usually controls for variation produced at one or more background marker loci. These background markers are generally chosen because they are already known to be close to the location of a significant QTL. By factoring out a portion of the genetic variance produced by a major QTL, one can occasionally detect secondary QTLs. WebQTL allows users to control for a single background marker. To select this marker, first run the Marker Regression analysis (and if necessary, check the box labeled display all LRS, select the appropriate locus, and the click on either Composite Interval Mapping or Composite Regression. A more powerful and effective alternative to composite interval mapping is pair-scan analysis. This latter method takes into accounts (models) both the independent effects of two loci and possible two-locus epistatic interactions. [Williams RW, Dec 20, 2004] +Composite interval mapping is a method of mapping chromosomal regions that controls for some fraction of the genetic variability in a quantitative trait. Unlike simple interval mapping, composite interval mapping usually controls for variation produced at one or more background marker loci. These background markers are generally chosen because they are already known to be close to the location of a significant QTL. By factoring out a portion of the genetic variance produced by a major QTL, one can occasionally detect secondary QTLs. WebQTL allows users to control for a single background marker. 
To select this marker, first run the **Marker Regression** analysis (and, if necessary, check the box labeled display all LRS), select the appropriate locus, and then click on either **Composite Interval Mapping** or **Composite Regression**. A more powerful and effective alternative to composite interval mapping is pair-scan analysis. This latter method takes into account (models) both the independent effects of two loci and possible two-locus epistatic interactions. [Williams RW, Dec 20, 2004]
    #### Correlations: Pearson and Spearman: -GeneNetwork provides tools to compute both Pearson product-moment correlations (the standard type of correlation), Spearman rank order correlations. Wikipedia and introductory statistics text will have a discussion of these major types of correlation. The quick advice is to use the more robust Spearman rank order correlation if the number of pairs of observations in a data set is less than about 30 and to use the more powerful but much more sensitive Pearson product-moment correlation when the number of observations is greater than 30 AND after you have dealt with any outliers. GeneNetwork automatically flags outliers for you in the Trait Data and Analysis form. GeneNetwork also allows you to modify values by either deleting or winsorising them. That means that you can use Pearson correlations even with smaller sample sizes after making sure that data are well distributed. Be sure to view the scatterplots associated with correlation values (just click on the value to generate a plot). Look for bivariate outliers. +GeneNetwork provides tools to compute both Pearson product-moment correlations (the standard type of correlation), Spearman rank order correlations. [Wikipedia](http://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient) and introductory statistics text will have a discussion of these major types of correlation. The quick advice is to use the more robust Spearman rank order correlation if the number of pairs of observations in a data set is less than about 30 and to use the more powerful but much more sensitive Pearson product-moment correlation when the number of observations is greater than 30 AND after you have dealt with any outliers. GeneNetwork automatically flags outliers for you in the Trait Data and Analysis form. GeneNetwork also allows you to modify values by either deleting or winsorising them. 
That means that you can use Pearson correlations even with smaller sample sizes after making sure that data are well distributed. Be sure to view the scatterplots associated with correlation values (just click on the value to generate a plot). Look for bivariate outliers. #### Cross: @@ -97,7 +97,7 @@ Note that dominance deviations cannot be computed from a set of recombinant inbr #### Epistasis: -Epistasis means that combined effects of two or more different loci or polymorphic genes are not what one would expect given the addition of their individual effects. There is, in other words, evidence for non-linear interactions among two or more loci. This is similar to the dominance effects mentioned above, but now generalized to two or more distinct loci, rather than to two or more alleles at a single locus. For example, if QTL 1 has an A allele that has an additive effects of +5 and QTL 2 has an A alleles that has an additive effect of +2, then the two locus genotype combination A/A would be expected to boost the mean by +7 units. But if the value of these A/A individuals was actually -7 we would be quite surprised and would refer to this as an epistatic interaction between QTL 1 and QTL 2. WebQTL will search for all possible epistatic interactions between pairs of loci in the genome. This function is called a Pair Scan becasue the software analyzes the LRS score for all possible pairs of loci. Instead of viewing an LRS plot along a single dimension, we now view a two-dimensional plot that shows a field of LRS scores computed for pairs of loci. Pair scan plots are extremely sensitive to outlier data. Be sure to review the primary data carefully using Basic Statistics. Also note that this more sophisiticated method also demands a significantly larger sample size. While 25 to 50 cases may be adequate for a conventional LRS plot (sometimes called a "main scan"), a Pair-Scan is hard to apply safely with fewer than 60 cases. 
[Williams RW, Dec 21, 2004; Dec 5, 2005] +Epistasis means that the combined effects of two or more different loci or polymorphic genes are not what one would expect given the addition of their individual effects. There is, in other words, evidence for non-linear interactions among two or more loci. This is similar to the dominance effects mentioned above, but now generalized to two or more distinct loci, rather than to two or more alleles at a single locus. For example, if QTL 1 has an A allele that has an additive effect of +5 and QTL 2 has an A allele that has an additive effect of +2, then the two locus genotype combination A/A would be expected to boost the mean by +7 units. But if the value of these A/A individuals was actually -7 we would be quite surprised and would refer to this as an epistatic interaction between QTL 1 and QTL 2. WebQTL will search for all possible epistatic interactions between pairs of loci in the genome. This function is called a **Pair Scan** because the software analyzes the LRS score for all possible pairs of loci. Instead of viewing an LRS plot along a single dimension, we now view a two-dimensional plot that shows a field of LRS scores computed for pairs of loci. Pair scan plots are extremely sensitive to outlier data. Be sure to review the primary data carefully using **Basic Statistics**. Also note that this more sophisticated method also demands a significantly larger sample size. While 25 to 50 cases may be adequate for a conventional LRS plot (sometimes called a "main scan"), a **Pair-Scan** is hard to apply safely with fewer than 60 cases. [Williams RW, Dec 21, 2004; Dec 5, 2005] #### Effect Size of a QTL: @@ -108,19 +108,19 @@ Please note that the functional importance of a locus, QTL, or GWAS hit can not Estimates of effect size for families of inbred lines, such as the BXD, HXB, CC, and hybrid diversity panels (e.g.
the hybrid mouse diversity panel and the hybrid rat diversity panel) are typically (and correctly) much higher than those measured in otherwise similar analysis of intercrosses, heterogeneous stock (HS), or diversity outbred stock. Two factors contribute to the much higher level of explained variance of QTLs when using inbred strain panels. -1. *Replication Rate:* The variance that can be explained by a locus is increased by sampling multiple cases that have identical genomes and by using the strain mean for genetic analysis. Increasing replication rates from 1 to 6 can easily double the apparent heritability of a trait and therefore the effect size of a locus. The reason is simple—resampling decrease the standard error of mean, boosting the effective heritability (see Glossary entry on Heritability and focus on figure 1 from the Belknap 1998 paper reproduced below). Compare the genetically explained variance (labeled h2RI in this figure) of a single case (no replication) on the x-axis with the function at a replication rate of 4 on the y-axis. If the explained variance is 0.1 (10% of all variance explained) then the value is boosted to 0.3 (30% of strain mean variance explained) with n = 4. +1. **Replication Rate:** The variance that can be explained by a locus is increased by sampling multiple cases that have identical genomes and by using the strain mean for genetic analysis. Increasing replication rates from 1 to 6 can easily double the apparent heritability of a trait and therefore the effect size of a locus. The reason is simple—resampling decrease the standard error of mean, boosting the effective heritability (see Glossary entry on *Heritability* and focus on figure 1 from the Belknap [1998](http://gn1.genenetwork.org/images/upload/Belknap_Heritability_1998.pdf) paper reproduced below).
    Compare the genetically explained variance (labeled h2RI in this figure) of a single case (no replication) on the x-axis with the function at a replication rate of 4 on the y-axis. If the explained variance is 0.1 (10% of all variance explained) then the value is boosted to 0.3 (30% of strain mean variance explained) with n = 4. -2. *Homozygosity:* The second factor has to do with the inherent genetic variance of populations. Recombinant inbred lines are homozygous at nearly all loci. This doubles the genetic variance in a family of recombinant inbred lines compared to a matched number of F2s. This also quadruples the variance compared to a matched number of backcross cases. As a result 40 BXDs sampled just one per genometype will average 2X the genetic variance and 2X the heritability of 40 BDF2 cases. Note that panels made up of isogenic F1 hybrids (so-called diallel crosses, DX) made by crossing recombinant inbred strains (BXD, CC, or HXB) are no longer homozygous at all loci, and while they do expose important new sources of variance associated with dominance, they do not benefit from the 2X gain in genetic variance relative to an F2 intercross. +2. **Homozygosity:** The second factor has to do with the inherent genetic variance of populations. Recombinant inbred lines are homozygous at nearly all loci. This doubles the genetic variance in a family of recombinant inbred lines compared to a matched number of F2s. This also quadruples the variance compared to a matched number of backcross cases. As a result 40 BXDs sampled just one per genometype will average 2X the genetic variance and 2X the heritability of 40 BDF2 cases. 
Note that panels made up of isogenic F1 hybrids (so-called diallel crosses, DX) made by crossing recombinant inbred strains (BXD, CC, or HXB) are no longer homozygous at all loci, and while they do expose important new sources of variance associated with dominance, they do not benefit from the 2X gain in genetic variance relative to an F2 intercross. Homozygosity For the reasons listed above a QTL effect size of 0.4 detected a panel of BXD lines replicated four times each (160 cases total), corresponds approximately to an effect size of 0.18 in BXDs without replication (40 cases total), and to an effect size of 0.09 in an F2 of 40 cases total. [Williams RW, Dec 23, 2004; updated by RWW July 13, 2019] -eQTL, cis eQTL, trans eQTL +#### eQTL, cis eQTL, trans eQTL -An expression QTL or eQTL. Differences in the expression of mRNA or proteins are often treated as standard phenotypes, much like body height or lung capacity. The variation in these microscopic traits (microtraits) can be mapped using conventional QTL methods. Damerval and colleagues were the first authors to use this kind of nomenclature and in their classic study of 1994 introduced the term PQLs for protein quantitative trait loci. Schadt and colleagues added the acronym eQTL in their early mRNA study of corn, mouse, and humans. We now are "blessed" with all kinds of prefixes to QTLs that highlight the type of trait that has been measured (m for metabolic, b for behavioral, p for physiological or protein). +An expression QTL or eQTL. Differences in the expression of mRNA or proteins are often treated as standard phenotypes, much like body height or lung capacity. The variation in these microscopic traits (microtraits) can be mapped using conventional QTL methods. [Damerval](http://www.genetics.org/cgi/reprint/137/1/289) and colleagues were the first authors to use this kind of nomenclature and in their classic study of 1994 introduced the term PQLs for protein quantitative trait loci. 
Schadt and colleagues added the acronym eQTL in their early mRNA study of corn, mouse, and humans. We now are "blessed" with all kinds of prefixes to QTLs that highlight the type of trait that has been measured (m for metabolic, b for behavioral, p for physiological or protein). -eQTLs of mRNAs and proteins have the unique property of (usually) having a single parent gene and genetic location. An eQTL that maps to the location of the parent gene that produces the mRNA or protein is referred to as a cis eQTL or local eQTL. In contrast, an eQTL that maps far away from its parent gene is referred to as a trans eQTL. You can use special search commands in GeneNetwork to find cis and trans eQTLs. [Williams RW, Nov 23, 2009, Dec 2009] +eQTLs of mRNAs and proteins have the unique property of (usually) having a single parent gene and genetic location. An eQTL that maps to the location of the parent gene that produces the mRNA or protein is referred to as a **cis eQTL** or local eQTL. In contrast, an eQTL that maps far away from its parent gene is referred to as a **trans eQTL**. You can use special search commands in GeneNetwork to find cis and trans eQTLs. [Williams RW, Nov 23, 2009, Dec 2009] [Go back to index](#index) @@ -134,7 +134,7 @@ The height of the yellow bars in some of the Map View windows provides a measure #### False Discovery Rate (FDR): -A false discovery is an apparently significant finding--usually determined using a particular P value alpha criterion--that given is known to be insignificant or false given other information. When performing a single statistical test we often accept a false discovery rate of 1 in 20 (p = .05). False discovery rates can climb to high levels in large genomic and genetic studies in which hundreds to millions of tests are run and summarized using standard "single test" p values. 
There are various statistical methods to estimate and control false discovery rate and to compute genome-wide p values that correct for large numbers of implicit or explicit statistical test. The Permutation test in GeneNetwork is one method that is used to prevent and excessive number of false QTL discoveries. Methods used to correct the FDR are approximations and may depend on a set of assumptions about data and sample structure. [Williams RW, April 5, 2008] +A [false discovery](http://en.wikipedia.org/wiki/False_discovery_rate) is an apparently significant finding--usually determined using a particular P value alpha criterion--that is known to be insignificant or false given other information. When performing a single statistical test we often accept a false discovery rate of 1 in 20 (p = .05). False discovery rates can climb to high levels in large genomic and genetic studies in which hundreds to millions of tests are run and summarized using standard "single test" p values. There are various statistical methods to estimate and control false discovery rate and to compute genome-wide p values that correct for large numbers of implicit or explicit statistical tests. The Permutation test in GeneNetwork is one method that is used to prevent an excessive number of false QTL discoveries. Methods used to correct the FDR are approximations and may depend on a set of assumptions about data and sample structure. [Williams RW, April 5, 2008] [Go back to index](#index) @@ -146,34 +146,36 @@ A false discovery is an apparently significant finding--usually determined using GeneNetwork provides summary information on most of the genes and their transcripts. Genes and their alternative splice variants are often are poorly annotated and may not have proper names or symbols. However, almost all entries have a valid GenBank accession identifier. This is a unique code associated with a single sequence deposited in GenBank (Entrez Nucleotide).
A single gene may have hundreds of GenBank entries. GenBank entries that share a genomic location and possibly a single gene are generally combined into a single UniGene entry. For mouse, these always begin with "Mm" (Mus musculus) and are followed by a period and then a number. More than half of all mouse UniGene identifiers are associated with a reputable gene, and these genes will have gene identifiers (GeneID). GeneIDs are identical to LocusLink identifiers (LocusID). Even a 10 megabase locus such as human Myopia 4 (MYP4) that is not yet associated with a specific gene is assigned a GeneID--a minor misnomer and one reason to prefer the term LocusID. -See the related FAQ on "How many genes and transcripts are in your databases and what fraction of the genome is being surveyed?" [Williams RW, Dec 23, 2004, updated Jan 2, 2005] +See the related [FAQ](http://gn1.genenetwork.org/faq.html#Q-6) on "How many genes and transcripts are in your databases and what fraction of the genome is being surveyed?" [Williams RW, Dec 23, 2004, updated Jan 2, 2005] #### Genetic Reference Population (GRP): A genetic reference population consists of a set of genetically well characterized lines that are often used over a long period of time to study a multitude of different phenotypes. Once a GRP has been genotyped, subsequent studies can focus on the analysis of interesting and important phenotypes and their joint and independent relations. Most of the mouse GRPs, such as the BXDs used in the GeneNetwork, have been typed using a common set of over 14,000 makers (SNPs and microsatellites). Many of these same GRPs have been phenotyped extensively for more than 25 years, resulting in rich sets of phenotypes. A GRP is an ideal long-term resource for systems genetics because of the relative ease with which vast amounts of diverse data can be accumulated, analyzed, and combined. 
-The power of GRPs and their compelling scientific advantages derive from the ability to study multiple phenotypes and substantial numbers of genetically defined individuals under one or more environmental conditions. When accurate phenotypes from 20 or more lines in a GRP have been acquired it becomes practical to explore and test the genetic correlations between that trait and any previously measured trait in the same GRP. This fact underlies the use of the term reference in GRP. Since each genetic individual is represented by an entire isogenic line--usually an inbred strain or an isogenic F1 hybrid--it is possible to obtain accurate mean phenotypes associated with each line simply by typing several individuals. GRPs are also ideal for developmental and aging studies because the same genetic individual can be phenotyped at multiple stages. +The power of GRPs and their compelling scientific advantages derive from the ability to study multiple phenotypes and substantial numbers of genetically defined individuals under one or more environmental conditions. When accurate phenotypes from 20 or more lines in a GRP have been acquired it becomes practical to explore and test the genetic correlations between that trait and any previously measured trait in the same GRP. This fact underlies the use of the term **reference** in GRP. Since each genetic individual is represented by an entire isogenic line--usually an inbred strain or an isogenic F1 hybrid--it is possible to obtain accurate mean phenotypes associated with each line simply by typing several individuals. GRPs are also ideal for developmental and aging studies because the same genetic individual can be phenotyped at multiple stages. A GRP can also be used a conventional mapping panel. But unlike most other mapping panel, a GRP can be easily adapted to jointly map sets of functionally related traits (multitrait mapping); a more powerful method to extract causal relations from networks of genetic correlations. 
-The largest GRPs now consist of more than 400 recombinant inbred lines of Arabidopsis and maize. The BayxSha Arabidopsis set in the GeneNetwork consists of 420 lines. Pioneer Hi-Bred International is rumored to have as many as 4000 maize RI lines. The largest mammalian GRPs are the LXS and BXD RI sets in the GeneNetwork. The Collaborative Cross is the largest mammalian GRP, and over 600 of these strains are now being bred by members of the Complex Trait Consortium. +The largest GRPs now consist of more than 400 recombinant inbred lines of *Arabidopsis* and [maize](http://www.maizegdb.org/cgi-bin/stockadvquery.cgi?check=true&name=&typebox=true&type=701&linkage_group=0&genvar1=&genvar2=&genvar3=&karyovar=0&phenotype=0&attribution=&avail_from=0&parent=0). The BayxSha Arabidopsis set in the GeneNetwork consists of 420 lines. Pioneer Hi-Bred International is rumored to have as many as 4000 maize RI lines. The largest mammalian GRPs are the LXS and BXD RI sets in the GeneNetwork. The Collaborative Cross is the largest mammalian GRP, and over 600 of these strains are now being bred by members of the Complex Trait Consortium. There are several subtypes of GRPs. 
In addition to recombinant inbred strains there are -- Recombinant congenic (RCC) strains such as the AcB set Consomic or chromosome substitution strains (CSS) of mice (Matin et al., 1999) and rats (Roman et al., 2002) +- Recombinant congenic ([RCC](http://research.jax.org/grs/type/recombcong.html)) strains such as the [AcB](http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=pubmed&dopt=Abstract&list_uids=11374899&query_hl=4) set Consomic or chromosome substitution strains ([CSS](http://research.jax.org/grs/type/consomic.html)) of mice (Matin et al., [1999](http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=pubmed&dopt=Abstract&list_uids=10508525&query_hl=11)) and rats (Roman et al., [2002](http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=pubmed&dopt=Abstract&list_uids=12858554&query_hl=7)) -- Recombinant intercross (RIX) F1 sets made by mating different RI strains to each other to generate large set of R! first generation (F1) progeny (RIX). This is a standard (diallel cross) of RI inbred strains. Genetic analysis of a set of RIX progeny has some advantages over a corresponding analysis of RI strains. The first of these is that while each set of F1 progeny is fully isogenic (AXB1 x AXB2 gives a set of isogenic F1s), these F1s are not inbred but are heterozygous at many loci across the genome. RIX therefore retain the advance of being genetically defined and replicable, but without the disadvantage of being fully inbred. RIX have a genetic architecture more like natural populations. The second correlated advantage is that it is possible to study patterns of dominance of allelic variants using an RIX cross. Almost all loci or genes that differs between the original stock strains (A and B) will be heterozygous among a sufficiently larges set of RIX. A set of RIX progeny can therefore be mapped using the same methods used to map an F2 intercross.
Mapping of QTLs may have somewhat more power and precision than when RI strains are used alone. A third advantage is that RIX sets make it possible to expand often limited RI resources to very large sizes to confirm and extend models of genetic or GXE effects. For example a set of 30 AXB strains can be used to generate a full matrix of 30 x 29 unique RIX progeny. The main current disadvantage of RIX panels is the comparative lack of extant phenotype data. +- Recombinant intercross ([RIX](http://www.ncbi.nlm.nih.gov/pubmed/?term=15879512)) F1 sets made by mating different RI strains to each other to generate a large set of RI first generation (F1) progeny (RIX). This is a standard ([diallel cross](http://en.wikipedia.org/wiki/Diallel_cross)) of RI inbred strains. Genetic analysis of a set of RIX progeny has some advantages over a corresponding analysis of RI strains. The first of these is that while each set of F1 progeny is fully isogenic (AXB1 x AXB2 gives a set of isogenic F1s), these F1s are not inbred but are heterozygous at many loci across the genome. RIX therefore retain the advantage of being genetically defined and replicable, but without the disadvantage of being fully inbred. RIX have a genetic architecture more like natural populations. The second correlated advantage is that it is possible to study patterns of dominance of allelic variants using an RIX cross. Almost all loci or genes that differ between the original stock strains (A and B) will be heterozygous among a sufficiently large set of RIX. A set of RIX progeny can therefore be mapped using the same methods used to map an F2 intercross. Mapping of QTLs may have somewhat more power and precision than when RI strains are used alone. A third advantage is that RIX sets make it possible to expand often limited RI resources to very large sizes to confirm and extend models of genetic or GXE effects. For example a set of 30 AXB strains can be used to generate a full matrix of 30 x 29 unique RIX progeny.
The main current disadvantage of RIX panels is the comparative lack of extant phenotype data. - Recombinant F1 line sets can also be made by backcrossing an entire RI sets to a single inbred line that carries an interesting mutation or transgene (RI backcross or RIB). GeneNetwork includes one RI backcross sets generated by Kent Hunter. In this RIB each of 18 AKXD RI strains were crossed to an FVB/N line that carries a tumor susceptibility allele (polyoma middle T). - All of these sets of lines are GRPs since each line is genetically defined and because the set as a whole can in principle be easily regenerated and phenotyped. Finally, each of these resources can be used to track down genetic loci that are causes of variation in phenotype using variants of standard linkage analysis. +All of these sets of lines are GRPs since each line is genetically defined and because the set as a whole can in principle be easily regenerated and phenotyped. Finally, each of these resources can be used to track down genetic loci that are causes of variation in phenotype using variants of standard linkage analysis. - A Diversity Panel such as that used by the Mouse Phenome Project is not a standard GRPs, although its also shares the ability to accumulate and study networks of phenotypes. The main difference is that a Diversity Panel cannot be used for conventional linkage analysis. A sufficiently large Diversity Panel could in principle be used for the equivalent of an assocation study. However, these are definitely NOT in silico studies, because hundreds of individuals need to be phenotyped for every trait. Surveys of many diverse isogenic lines (inbred or F1 hybrids) is statistically the equivalent of a human association study (the main difference is the ability to replicate measurements and study sets of traits) and therefore, like human association studies, does require very high sample size to map polygenic traits. 
Like human association studies there is also a high risk of false positive results due to population stratification and non-syntenic marker association. +A Diversity Panel such as that used by the Mouse Phenome Project is not a standard GRP, although it also shares the ability to accumulate and study networks of phenotypes. The main difference is that a Diversity Panel cannot be used for conventional linkage analysis. A sufficiently large Diversity Panel could in principle be used for the equivalent of an association study. However, these are definitely NOT in silico studies, because hundreds of individuals need to be phenotyped for every trait. Surveys of many diverse isogenic lines (inbred or F1 hybrids) is statistically the equivalent of a human association study (the main difference is the ability to replicate measurements and study sets of traits) and therefore, like human association studies, does require very high sample size to map polygenic traits. Like human association studies there is also a high risk of false positive results due to population stratification and non-syntenic marker association. - A good use of a Diversity Panel is as a fine-mapping resource with which to dissect chromosomal intervals already mapped using a conventional cross or GRP. GeneNetwork now includes Mouse Diversity Panel (MDP) data for several data sets. We now typically include all 16 sequenced strains of mice, and add PWK/PhJ, NZO/HiLtJ (two of the eight members of the Collaborative Cross), and several F1 hybrids. The MDP data is often appended at the bottom of the GRP data set with which is was acquired (e.g., BXD hippocampal and BXD eye data sets). [Williams RW, June 19, 2005; Dec 4, 2005] +A good use of a Diversity Panel is as a fine-mapping resource with which to dissect chromosomal intervals already mapped using a conventional cross or GRP. GeneNetwork now includes Mouse Diversity Panel (MDP) data for several data sets.
We now typically include all 16 sequenced strains of mice, and add PWK/PhJ, NZO/HiLtJ (two of the eight members of the Collaborative Cross), and several F1 hybrids. The MDP data is often appended at the bottom of the GRP data set with which it was acquired (e.g., BXD hippocampal and BXD eye data sets). [Williams RW, June 19, 2005; Dec 4, 2005] - Genotype: The state of a gene or DNA sequence, usually used to describe a contrast between two or more states, such as that between the normal state (wildtype) and a mutant state (mutation) or between the alleles inherited from two parents. All species that are included in GeneNetwork are diploid (derived from two parents) and have two copies of most genes (genes located on the X and Y chromosomes are exceptions). As a result the genotype of a particular diploid individual is actually a pair of genotypes, one from each parents. For example, the offspring of a mating between strain A and strain B will have one copy of the A genotype and one copy of the B genotype and therefore have an A/B genotype. In contrast, offspring of a mating between a female strain A and a male strain A will inherit only A genotypes and have an A/A genotype. +#### Genotype + +The state of a gene or DNA sequence, usually used to describe a contrast between two or more states, such as that between the normal state (wildtype) and a mutant state (mutation) or between the alleles inherited from two parents. All species that are included in GeneNetwork are diploid (derived from two parents) and have two copies of most genes (genes located on the X and Y chromosomes are exceptions). As a result the genotype of a particular diploid individual is actually a pair of genotypes, one from each parent. For example, the offspring of a mating between strain A and strain B will have one copy of the A genotype and one copy of the B genotype and therefore have an A/B genotype.
In contrast, offspring of a mating between a female strain A and a male strain A will inherit only A genotypes and have an A/A genotype. Genotypes can be measured or inferred in many different ways, even by visual inspection of animals (e.g. as Gregor Mendel did long before DNA was discovered). But now the typical method is to directly test DNA that has a well define chromosomal location that has been obtained from one or usually many cases using molecular tests that often rely on polymerase chain reaction steps and sequence analysis. Each case is genotyped at many chromosomal locations (loci, markers, or genes). The entire collection of genotypes (as many a 1 million for a single case) is also sometimes referred to as the cases genotype, but the word "genometype" might be more appropriate to highlight the fact that we are now dealing with a set of genotypes spanning the entire genome (all chromosomes) of the case. @@ -187,13 +189,13 @@ Text here [Williams RW, July 15, 2010] ## H -#### Heritability, h2: +#### Heritability, h2: Heritability is a rough measure of the ability to use genetic information to predict the level of variation in phenotypes among progeny. Values range from 0 to 1 (or 0 to 100%). A value of 1 or 100% means that a trait is entirely predictable based on paternal/materinal and genetic data (in other words, a Mendelian trait), whereas a value of 0 means that a trait is not at all predictable from information on gene variants. Estimates of heritability are highly dependent on the environment, stage, and age. Important traits that affect fitness often have low heritabilities because stabilizing selection reduces the frequency of DNA variants that produce suboptimal phenotypes. Conversely, less critical traits for which substantial phenotypic variation is well tolerated, may have high heritability. 
The environment of laboratory rodents is unnatural, and this allows the accumulation of somewhat deleterious mutations (for example, mutations that lead to albinism). This leads to an upward trend in heritability of unselected traits in laboratory populations--a desirable feature from the point of view of the biomedical analysis of the genetic basis of trait variance. Heritability is a useful parameter to measure at an early stage of a genetic analysis, because it provides a rough gauge of the likelihood of successfully understanding the allelic sources of variation. Highly heritable traits are more amenable to mapping studies. There are numerous ways to estimate heritability, a few of which are described below. [Williams RW, Dec 23, 2004] -#### h2 Estimated by Intraclass Correlation: +#### h2 Estimated by Intraclass Correlation: Heritability can be estimated using the intraclass correlation coefficient. This is essentially a one-way repeated measures analysis of variance (ANOVA) of the reliability of trait data. Difference among strains are considered due to a random effect, whereas variation among samples within a single strain are considered due to measurement error. One can use the method implemented by SAS (PROC VARCOMP) that exploits a restricted maximum likelihood (REML) approach to estimate the intraclass correlation coefficient instead of an ordinary least squares method. The general equation for the intraclass correlation is: @@ -201,36 +203,36 @@ Heritability can be estimated using the intraclass correlation coefficient. This where n is the average number of cases per strain. The intraclass correlation approaches 1 when there is minimal variation within strains, and strain means differ greatly. In contrast, if difference between strains are less than what would be predicted from the differences within strain, then the intraclass correlation will produce negative estimates of heritability. 
Negative heritability is usually a clue that the design of the experiment has injected excessive within-strain variance. It is easy for this to happen inadvertently by failing to correct for a batch effect. For example, if one collects the first batch of data for strains 1 through 20 during a full moon, and a second batch of data for these same strains during a rare blue moon, then the apparent variation within strain may greatly exceed the among strain variance. A technical batch effect has been confounded with the within-strain variation and has swamped any among-strain variance. What to do? Fix the batch effect, sex effect, age effect, etc., first! [Williams RW, Chesler EJ, Dec 23, 2004] -#### h2 Estimated using Hegmann and Possidente's Method (Adjusted Heritability in the Basic Statisics): +#### h2 Estimated using Hegmann and Possidente's Method (Adjusted Heritability in the Basic Statistics): A simple estimate of heritability for inbred lines involves comparing the variance between strain means (Va) to the total variance (Vt) of the phenotype, where Va is the a rough estimate of the additive genetic variance and Vt is the equal to Va and the average environmental variance, Ve. For example, if we study 10 cases of each of 20 strains, we have a total variance of the phenotype across 200 samples, and a strain mean variance across 20 strain averages. We can use this simple equation to estimate the heritability: -h2 = Va / Vt +h2 = Va / Vt -This estimate of heritability will be an overestimate, and the severity of this bias will be a function of the within-strain standard error of the mean. Even a random data set of 10 each of 20 strains that should have an h2 of 0, will often give h2 values of 0.10 to 0.20. (Try this in a spreadsheet program using random numbers.) +This estimate of heritability will be an **overestimate**, and the severity of this bias will be a function of the within-strain standard error of the mean.
Even a random data set of 10 each of 20 strains that should have an h2 of 0, will often give h2 values of 0.10 to 0.20. (Try this in a spreadsheet program using random numbers.) However, this estimate of h2 cannot be compared directly to those calculated using standard intercrosses and backcrosses. The reason is that all cases above are fully inbred and no genotypes are heterozygous. As a result the estimate of Va will be inflated two-fold. Hegmann and Possidente (1981 suggested a simple solution; adjust the equation as follows: -h2 = 0.5Va / (0.5Va+Ve) +h2 = 0.5Va / (0.5Va+Ve) -The factor 0.5 is applied to Va to adjust for the overestimation of additive genetic variance among inbred strains. This estimate of heritability also does not make allowances for the within-strain error term. The 0.5 adjustment factor is not recommended any more because h2 is severely underestimated. This adjustment is really only needed if the goal is to compare h2 between intercrosses and those generated using panels of inbred strains. +The factor 0.5 is applied to Va to adjust for the overestimation of additive genetic variance among inbred strains. This estimate of heritability also does not make allowances for the within-strain error term. The 0.5 adjustment factor is not recommended any more because h2 is severely **underestimated**. This adjustment is really only needed if the goal is to compare h2 between intercrosses and those generated using panels of inbred strains. -#### h2RIx̅ +#### h2RIx̅ -Finally, heritability calculations using strain means, such as those listed above, do not provide estimates of the effective heritability achieved by resampling a given line, strain, or genometype many times. Belknap (1998) provides corrected estimates of the effective heritability. Figure 1 from his paper (reproduced below) illustrates how resampling helps a great deal. Simply resampling each strain 8 times can boost the effective heritability from 0.2 to 0.8. 
The graph also illustrates why it often does not make sense to resample much beyond 4 to 8, depending on heritability. Belknap used the term h2RIx̅ in this figure and paper, since he was focused on data generated using recombinant inbred (RI) strains, but the logic applies equally well to any panel of genomes for which replication of individual genometypes is practical. This h2RIx̅ can be calculated simply by: -h2RIx̅ = Va / (Va+(Ve/n)) where Va is the genetic variability (variability between strains), Ve is the environmental variability (variability within strains), and n is the number of within strain replicates. Of course, with many studies the number of within strain replicates will vary between strains, and this needs to be dealt with. A reasonable approach is to use the harmonic mean of n across all strains. +Finally, heritability calculations using strain means, such as those listed above, do not provide estimates of the effective heritability achieved by resampling a given line, strain, or genometype many times. Belknap ([1998](http://gn1.genenetwork.org/images/upload/Belknap_Heritability_1998.pdf)) provides corrected estimates of the effective heritability. Figure 1 from his paper (reproduced below) illustrates how resampling helps a great deal. Simply resampling each strain 8 times can boost the effective heritability from 0.2 to 0.8. The graph also illustrates why it often does not make sense to resample much beyond 4 to 8, depending on heritability. Belknap used the term h2RIx̅ in this figure and paper, since he was focused on data generated using recombinant inbred (RI) strains, but the logic applies equally well to any panel of genomes for which replication of individual genometypes is practical. This h2RIx̅ can be calculated simply by: +h2RIx̅ = Va / (Va+(Ve/n)) where Va is the genetic variability (variability between strains), Ve is the environmental variability (variability within strains), and n is the number of within strain replicates. 
Of course, with many studies the number of within strain replicates will vary between strains, and this needs to be dealt with. A reasonable approach is to use the harmonic mean of n across all strains. Homozygosity An analysis of statistical power is useful to estimate numbers of replicates and strains needed to detect and resolve major sources of trait variance and covariance. A versatile method has been developed by Sen and colleagues (Sen et al., 2007) and implemented in the R program. qtlDesign. David Ashbrook implemented a version of this within Shiny that can help you estimate power for different heritability values QTL effect sizes, cohort sizes, and replication rates: -### Power Calculator (D. Ashbrook) +**[Power Calculator (D. Ashbrook)](https://dashbrook1.shinyapps.io/bxd_power_calculator_app/)** We can see that in all situations power is increased more by increasing the number of lines than by increasing the number of biological replicates. Dependent upon the heritability of the trait, there is little gain in power when going above 4-6 biological replicates. [DGA, Feb 1, 2019] [Chesler EJ, Dec 20, 2004; RWW updated March 7, 2018; Ashbrook DG, updated Feb 1, 2019] #### Hitchhiking Effect: -Conventional knockout lines (KOs) of mice are often mixtures of the genomes of two strains of mice. One important consequence of this fact is that a conventional comparison of wildtype and KO litter mates does not only test of the effects of the KO gene itself but also tests the effects of thousands of "hitchhiking" sequence polymorphisms in genes that flank the KO gene. This experimental confound can be difficult to resolve (but see below). This problem was first highlighted by Robert Gerlai (1996). +Conventional knockout lines (KOs) of mice are often mixtures of the genomes of two strains of mice. 
One important consequence of this fact is that a conventional comparison of wildtype and KO litter mates does not only test of the effects of the KO gene itself but also tests the effects of thousands of "hitchhiking" sequence polymorphisms in genes that flank the KO gene. This experimental confound can be difficult to resolve (but see below). This problem was first highlighted by Robert Gerlai ([1996](http://gn1.genenetwork.org/images/upload/Gerlai_TINS_1996.pdf)). **Genetics of KO Lines**. The embryonic stem cells used to make KOs are usually derived from a 129 strain of mouse (e.g., 129/OlaHsd). Mutated stem cells are then added to a C57BL/6J blastocyst to generate B6x129 chimeric mice. Germline transmission of the KO allele is tested and carriers are then used to establish heterozygous +/- B6.129 KO stock. This stock is often crossed back to wildtype C57BL/6J strains for several generations. At each generation the transmission of the KO is checked by genotyping the gene or closely flanking markers in each litter of mice. Carriers are again selected for breeding. The end result of this process is a KO congenic line in which the genetic background is primarily C57BL/6J except for the region around the KO gene. @@ -238,13 +240,13 @@ It is often thought that 10 generations of backcrossing will result in a pure ge Congenic -After 20 generations of backcrossing nearly +/-5 cM on either side of the KO will still usually be derived from 129 (see Figure 3.6) This amounts to an average of +/- 10 megabases of DNA around the KO. The wildtype littermates do NOT have this flanking DNA from 129 and they will be like a true C57BL/6J. The +/- 10 megabases to either side of the KO is known as the "hitchhiking" chromosomal interval. Any polymorphism between 129 and B6 in this interval has the potential to have significant downstream effects on gene expression, protein expression, and higher order traits such as anxiety, activity, and maternal behavior. 
Much of the conventional KO literature is highly suspect due to this hitchhiker effect (see Gerlai R, Trends in Neurosci 1996 19:177). +After 20 generations of backcrossing nearly +/-5 cM on either side of the KO will still usually be derived from 129 (see [Figure 3.6](http://www.informatics.jax.org/silverbook/frames/frame3-3.shtml)) This amounts to an average of +/- 10 megabases of DNA around the KO. The wildtype littermates do NOT have this flanking DNA from 129 and they will be like a true C57BL/6J. The +/- 10 megabases to either side of the KO is known as the "hitchhiking" chromosomal interval. Any polymorphism between 129 and B6 in this interval has the potential to have significant downstream effects on gene expression, protein expression, and higher order traits such as anxiety, activity, and maternal behavior. Much of the conventional KO literature is highly suspect due to this hitchhiker effect (see Gerlai R, [Trends in Neurosci 1996 19:177](http://gn1.genenetwork.org/images/upload/Gerlai_TINS_1996.pdf)). -As one example, consider the thyroid alpha receptor hormone gene Thra and its KO. Thra maps to Chr 11 at about 99 Mb. A conventional KO made as described above will have a hitchhiking 129 chromosomal interval extending from about 89 Mb to 109 Mb even after 20 generations of backcrossing to B6. Since the mouse genome is about 2.6 billion base pairs and contains about 26,000 genes, this 20 Mb region will typically contain about 200 genes. The particular region of Chr 11 around Thra has an unusually high density of genes (2-3X) and includes many highly expressed and polymorphic genes, including Nog, Car10, Cdc34, Col1a1, Dlx4, Myst2, Ngfr, Igf2bp1, Gip, the entire Hoxb complex, Sp6, Socs7, Lasp1, Cacnb1, Pparbp, Pnmt, Erbb2, Grb7, Nr1d1, Casc3, Igfbp4, and the entire Krt1 complex. Of these gene roughly half will be polymorphic between B6 and 129. It is like having a busload of noisy and possibly dangerous hitchhikers. 
Putative KO effects may be generated by a complex subset of these 100 polymorphic genes. +As one example, consider the thyroid alpha receptor hormone gene Thra and its KO. Thra maps to Chr 11 at about 99 Mb. A conventional KO made as described above will have a hitchhiking 129 chromosomal interval extending from about 89 Mb to 109 Mb even after 20 generations of backcrossing to B6. Since the mouse genome is about 2.6 billion base pairs and contains about 26,000 genes, this 20 Mb region will typically contain about 200 genes. The particular region of Chr 11 around Thra has an unusually high density of genes (2-3X) and includes many highly expressed and polymorphic genes, including *Nog*, *Car10*, *Cdc34*, *Col1a1*, *Dlx4*, *Myst2*, *Ngfr*, *Igf2bp1*, *Gip*, the entire *Hoxb* complex, *Sp6*, *Socs7*, *Lasp1*, *Cacnb1*, *Pparbp*, *Pnmt*, *Erbb2*, *Grb7*, *Nr1d1*, *Casc3*, *Igfbp4*, and the entire *Krt1* complex. Of these gene roughly half will be polymorphic between B6 and 129. It is like having a busload of noisy and possibly dangerous hitchhikers. Putative KO effects may be generated by a complex subset of these 100 polymorphic genes. What is the solution? -1. Do not use litter mates as controls without great care. They are not really the correct genetic control. The correct genetic control is a congenic strain of the same general type without the KO or with a different KO in a nearby gene. These are often available as KOs in neighboring genes that are not of interest. For example, the gene Casc3 is located next to Thra. If a KO in Casc3 is available, then compare the two KOs and see if phenotypes of the two KOs differ ways predicted given the known molecular functions of the gene. +1. Do not use litter mates as controls without great care. They are not really the correct genetic control. The correct genetic control is a congenic strain of the same general type without the KO or with a different KO in a nearby gene. 
These are often available as KOs in neighboring genes that are not of interest. For example, the gene *Casc3* is located next to Thra. If a KO in Casc3 is available, then compare the two KOs and see if phenotypes of the two KOs differ ways predicted given the known molecular functions of the gene. 2. Use a KO in which the KO has been backcrossed to a 129 strain--ideally the same strain from which ES cells were obtained. This eliminates the hitchhiker effect entirely and the KO, HET, and WT littermates really can be compared. @@ -254,7 +256,7 @@ What is the solution? Homozygosity -Legend:from Silver, L. (1995) Oxford University Press +Legend:from [Silver, L. (1995) Oxford University Press](http://www.informatics.jax.org/silver/index.shtml) [Go back to index](#index) @@ -268,23 +270,23 @@ The interquartile range is the difference between the 75% and 25% percentiles of #### Interval Mapping: -Interval mapping is a process in which the statistical significance of a hypothetical QTL is evaluated at regular points across a chromosome, even in the absence of explicit genotype data at those points. In the case of WebQTL, significance is calculated using an efficient and very rapid regression method, the Haley-Knott regression equations (Haley CS, Knott SA. 1992. A simple regression method for mapping quantitative trait loci in line crosses using flanking markers; Heredity 69:315–324), in which trait values are compared to the known genotype at a marker or to the probability of a specific genotype at a test location between two flanking markers. (The three genotypes are coded as -1, 0, and +1 at known markers, but often have fractional values in the intervals between markers.) The inferred probability of the genotypes in regions that have not been genotyped can be estimated from genotypes of the closest flanking markers. GeneNetwork/WebQTL compute linkage at intervals of 1 cM or less. 
As a consequence of this approach to computing linkage statistics, interval maps often have a characteristic shape in which the markers appear as sharply defined inflection points, and the intervals between nodes are smooth curves. [Chesler EJ, Dec 20, 2004; RWW April 2005; RWW Man 2014] +Interval mapping is a process in which the statistical significance of a hypothetical QTL is evaluated at regular points across a chromosome, even in the absence of explicit genotype data at those points. In the case of WebQTL, significance is calculated using an efficient and very rapid regression method, the Haley-Knott regression equations ([Haley CS, Knott SA. 1992. A simple regression method for mapping quantitative trait loci in line crosses using flanking markers; Heredity 69:315–324](http://www.ncbi.nlm.nih.gov/pubmed/16718932)), in which trait values are compared to the known genotype at a marker or to the probability of a specific genotype at a test location between two flanking markers. (The three genotypes are coded as -1, 0, and +1 at known markers, but often have fractional values in the intervals between markers.) The inferred probability of the genotypes in regions that have not been genotyped can be estimated from genotypes of the closest flanking markers. GeneNetwork/WebQTL compute linkage at intervals of 1 cM or less. As a consequence of this approach to computing linkage statistics, interval maps often have a characteristic shape in which the markers appear as sharply defined inflection points, and the intervals between nodes are smooth curves. [Chesler EJ, Dec 20, 2004; RWW April 2005; RWW Man 2014] #### Interval Mapping Options: -- Permutation Test: Select this option to determine the approximate LRS value that matches a genome-wide p-value of .05. +- _Permutation Test_: Select this option to determine the approximate LRS value that matches a genome-wide p-value of .05. 
-- Bootstrap Test: Select this option to evaluate the consistency with which peak LRS scores cluster around a putative QTL. Deselect this option if it obscures the SNP track or the additive effect track. +- _Bootstrap Test_: Select this option to evaluate the consistency with which peak LRS scores cluster around a putative QTL. Deselect this option if it obscures the SNP track or the additive effect track. -- Additive Effect: The additive effect (shown by the red lines in these plots) provide an estimate of the change in the average phenotype that is brought about by substituting a single allele of one type with that of another type. +- _Additive Effect_: The additive effect (shown by the red lines in these plots) provides an estimate of the change in the average phenotype that is brought about by substituting a single allele of one type with that of another type. -- SNP Track: The SNP Seismograph Track provides information on the regional density of segregating variants in the cross that may generate trait variants. It is plotted along the X axis. If a locus spans a region with both high and low SNP density, then the causal variant has a higher prior probability to be located in the region with high density than in the region with low density. +- _SNP Track_: The SNP Seismograph Track provides information on the regional density of segregating variants in the cross that may generate trait variants. It is plotted along the X axis. If a locus spans a region with both high and low SNP density, then the causal variant has a higher prior probability to be located in the region with high density than in the region with low density. -- Gene Track: This track overlays the positions of known genes on the physical Interval Map Viewer. If you hover the cursor over genes on this track, minimal information (symbol, position, and exon number) will appear. +- _Gene Track_: This track overlays the positions of known genes on the physical Interval Map Viewer. 
If you hover the cursor over genes on this track, minimal information (symbol, position, and exon number) will appear. -- Display from X Mb to Y Mb: Enter values in megabases to regenerate a smaller or large map view. +- _Display from X Mb to Y Mb_: Enter values in megabases to regenerate a smaller or larger map view. -- Graph width (in pixels): Adjust this value to obtain larger or smaller map views (x axis only). +- _Graph width (in pixels)_: Adjust this value to obtain larger or smaller map views (x axis only). [Go back to index](#index) @@ -308,15 +310,15 @@ Interval mapping is a process in which the statistical significance of a hypothe #### Literature Correlation: -The literature correlation is a unique feature in GeneNetwork that quantifies the similarity of words used to describe genes and their functions. Sets of words associated with genes were extracted from MEDLINE/PubMed abstracts (Jan 2017 by Ramin Homayouni, Diem-Trang Pham, and Sujoy Roy). For example, about 2500 PubMed abstracts contain reference to the gene "Sonic hedgehog" (Shh) in mouse, human, or rat. The words in all of these abstracts were extracted and categorize by their information content. A word such as "the" is not interesting, but words such as "dopamine" or "development" are useful in quantifying similarity. Sets of informative words are then compared—one gene's word set is compared the word set for all other genes. Similarity values are computed for a matrix of about 20,000 genes using latent semantic indexing (see Xu et al., 2011). Similarity values are also known as literature correlations. These values are always positive and range from 0 to 1. Values between 0.5 and 1.0 indicate moderate-to-high levels of overlap of vocabularies. +The literature correlation is a unique feature in GeneNetwork that quantifies the similarity of words used to describe genes and their functions. 
Sets of words associated with genes were extracted from MEDLINE/PubMed abstracts (Jan 2017 by Ramin Homayouni, Diem-Trang Pham, and Sujoy Roy). For example, about 2500 PubMed abstracts contain reference to the gene "Sonic hedgehog" (Shh) in mouse, human, or rat. The words in all of these abstracts were extracted and categorized by their information content. A word such as "the" is not interesting, but words such as "dopamine" or "development" are useful in quantifying similarity. Sets of informative words are then compared—one gene's word set is compared with the word set for all other genes. Similarity values are computed for a matrix of about 20,000 genes using latent semantic indexing [(see Xu et al., 2011)](http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0018851). Similarity values are also known as literature correlations. These values are always positive and range from 0 to 1. Values between 0.5 and 1.0 indicate moderate-to-high levels of overlap of vocabularies. -The literature correlation can be used to compare the "semantic" signal-to-noise of different measurements of gene, mRNA, and protein expression. Consider this common situation:There are three probe sets that measure Kit gene expression (1459588\_at, 1415900\_a\_at, and 1452514\_a\_at) in the Mouse BXD Lung mRNA data set (HZI Lung M430v2 (Apr08) RMA). Which one of these three gives the best measurement of Kit expression? It is impractical to perform quantitative rtPCR studies to answer this question, but there is a solid statistical answer that relies on Literature Correlation. Do the following: For each of the three probe sets, generate the top 1000 literature correlates. This will generate three apparently identical lists of genes that are known from the PubMed literature to be associated with the Kit oncogene. But the three lists are NOT actually identical when we look at the Sample Correlation column. 
To answer the question "which of the three probe sets is best", review the actual performance of the probe sets against this set of 1000 "friends of Kit". Do this by sorting all three lists by their Sample Correlation column (high to low). The clear winner is probe set 1415900_a_at. The 100th row in this probe set's list has a Sample Correlation of 0.620 (absolute value). In comparison, the 100th row for probe set 1452514_a_at has a Sample Correlation of 0.289. The probe set that targets the intron comes in last at 0.275. In conclusion, the probe set that targets the proximal half of the 3' UTR (1415900_a_at) has the highest "agreement" between Literature Correlation and Sample Correlation, and is our preferred measurement of Kit expression in the lung in this data set. (Updated by RWW and Ramin Homayouni, April 2017.) +The literature correlation can be used to compare the "semantic" signal-to-noise of different measurements of gene, mRNA, and protein expression. Consider this common situation:There are three probe sets that measure Kit gene expression (1459588\_at, 1415900\_a\_at, and 1452514\_a\_at) in the Mouse BXD Lung mRNA data set (HZI Lung M430v2 (Apr08) RMA). Which one of these three gives the best measurement of Kit expression? It is impractical to perform quantitative rtPCR studies to answer this question, but there is a solid statistical answer that relies on **Literature Correlation**. Do the following: For each of the three probe sets, generate the top 1000 literature correlates. This will generate three apparently identical lists of genes that are known from the PubMed literature to be associated with the Kit oncogene. But the three lists are NOT actually identical when we look at the **Sample Correlation** column. To answer the question "which of the three probe sets is best", review the actual performance of the probe sets against this set of 1000 "friends of Kit". Do this by sorting all three lists by their Sample Correlation column (high to low). 
The clear winner is probe set 1415900_a_at. The 100th row in this probe set's list has a Sample Correlation of 0.620 (absolute value). In comparison, the 100th row for probe set 1452514_a_at has a Sample Correlation of 0.289. The probe set that targets the intron comes in last at 0.275. In conclusion, the probe set that targets the proximal half of the 3' UTR (1415900_a_at) has the highest "agreement" between Literature Correlation and Sample Correlation, and is our preferred measurement of Kit expression in the lung in this data set. (Updated by RWW and Ramin Homayouni, April 2017.)
    #### LOD: -The logarithm of the odds (LOD) provides a measure of the association between variation in a phenotype and genetic differences (alleles) at a particular chromosomal locus (see Nyholt 2000 for a lovely review of LOD scores). +The logarithm of the odds (LOD) provides a measure of the association between variation in a phenotype and genetic differences (alleles) at a particular chromosomal locus (see Nyholt [2000](http://www.sciencedirect.com/science/article/pii/S0002929707626391) for a lovely review of LOD scores). A LOD score is defined as the logarithm of the ratio of two likelihoods: (1) in the numerator the likelihood for the alternative hypothesis, namely that there is linkage at the chromosomal marker, and (2) the likelihood of the null hypothesis that there is no linkage. Likelihoods are probabilities, but they are not Pr(hypothesis | data) but rather Pr(data | two alternative hypotheses). That's why they are called likelihoods rather than probabilities. (The "|" symbol above translates to "given the"). Since LOD and LRS scores are associated with two particular hypotheses or models, they are also associated with the degrees of freedom of those two alternative models. When the model only has one degree of freedom this conversion between LOD to p value will work:
    @@ -367,11 +369,11 @@ To compute the marker regression (or correlation) we just compare values in Rows
     
     #### Normal Probability Plot:
     
    -A normal probability plot is a powerful tool to evaluate the extent to which a distribution of values conforms to (or deviates from) a normal Gaussian distribution. The Basic Statistics tools in GeneNetwork provides these plots for any trait. If a distribution of numbers is normal then the actual values and the predicted values based on a z score (units of deviation from the mean measured in standard deviation units) will form a nearly straight line. These plots can also be used to efficiently flag outlier samples in either tail of the distribution.
    +A [normal probability plot](http://en.wikipedia.org/wiki/Normal_probability_plot) is a powerful tool to evaluate the extent to which a distribution of values conforms to (or deviates from) a normal Gaussian distribution. The Basic Statistics tools in GeneNetwork provide these plots for any trait. If a distribution of numbers is normal then the actual values and the predicted values based on a z score (units of deviation from the mean measured in standard deviation units) will form a nearly straight line. These plots can also be used to efficiently flag outlier samples in either tail of the distribution.
     
     In genetic studies, the probability plot can be used to detect the effects of major effect loci. A classical Mendelian locus will typically be associated with either a bimodal or trimodal distribution. In the plot below based on 99 samples, the points definitely do not fall on a single line. Three samples (green squares) have unusually high values; the majority of samples fall on a straight line between z = -0.8 to z = 2; and 16 values have much lower trait values than would be predicted based on a single normal distribution (a low mode group). The abrupt discontinuity in the distribution at -0.8 z is due to the effect of a single major Mendelian effect.
     
    -Deviations from normality of the sort in the figure below should be considered good news from the point of view of likely success of tracking down the locations of QTLs. However, small numbers of outliers may require special statistical handling, such as their exclusion or winsorising (see more below on "Winsorizing"). [RWW June 2011] 
    +Deviations from normality of the sort in the figure below should be considered good news from the point of view of likely success of tracking down the locations of QTLs. However, small numbers of outliers may require special statistical handling, such as their exclusion or [winsorising](http://en.wikipedia.org/wiki/Winsorising) (see more below on "Winsorizing"). [RWW June 2011] 
     
     Homozygosity
     
    @@ -381,11 +383,11 @@ Deviations from normality of the sort in the figure below should be considered g
     
     ## O
     
    -#### Outliers: (also see Wikipedia)
    +#### Outliers: (also see [Wikipedia](http://en.wikipedia.org/wiki/Outlier))
     
     Statistical methods often assume that the distribution of trait values is close to a Gaussian normal bell-shaped curve and that there are no outlier values that are extremely high or low compared to the average. Some traits can be clearly split into two or more groups (affected cases and unaffected cases) and this is not a problem as long as the number of cases in each group is close to the number that you expected by chance and that your sample size is reasonable high (40 or more for recombinant inbred strains). Mapping functions and most statistical procedure in GeneNetwork should work reasonable well (the pair scan function for epistatic interactions is one possible exception).
     
    -However, correlations and QTL mapping methods can be highly sensitive to outlier values. Make sure you review your data for outliers before mapping. GeneNetwork flags all outliers for you in the Trait Data and Analysis window and gives you the option of zapping these extreme values. Options include (1) do nothing, (2) delete the outliers and see what happens to your maps, (3) Winsorize the values of the outliers. You might try all three options and determine if your main results are stable or not. With small samples or extreme outliers, you may find the correlation and mapping results to be volatile. In general, if results (correlations, QTL positions or QTL LRS score) depend highly on one or two outliers (5-10% of the samples) then you should probably delete or winsorize the outliers.
    +However, correlations and QTL mapping methods can be highly sensitive to outlier values. Make sure you review your data for outliers before mapping. GeneNetwork flags all outliers for you in the Trait Data and Analysis window and gives you the option of zapping these extreme values. Options include (1) do nothing, (2) delete the outliers and see what happens to your maps, (3) [Winsorize](http://en.wikipedia.org/wiki/Winsorising) the values of the outliers. You might try all three options and determine if your main results are stable or not. With small samples or extreme outliers, you may find the correlation and mapping results to be volatile. In general, if results (correlations, QTL positions or QTL LRS score) depend highly on one or two outliers (5-10% of the samples) then you should probably delete or winsorize the outliers.
     
     In order to calculate outliers, we first determine the Q1(25%) and Q3(75%) values and then multiply by a constant (in our case 1.5; a higher constant is less sensitive to outliers). This value is then subtracted from the Q1 value and added to the Q3 value in order to determine the lower and upper bounds. Values that fall above the upper bound or below the lower bound are considered outliers.
     
    @@ -409,15 +411,15 @@ The output table in GeneNetwork list the the two intervals at the top of the tab
     
     CAUTIONS and LIMITATIONS: Pair-scan is only implemented for recombinant inbred strains. We do not recommend the use of this function with sample sizes of less than 60 recombinant inbred strains. Pair-scan procedures need careful diagnostics and an be very sensitive to outliers and to the balance among the four possible two-locus genotype classes among a set of RI strains. Pair-scan is not yet implemented for F2 progeny.
     
    -GeneNetwork implements a rapid but non-exhaustive DIRECT algorithm (Lundberg et al., 2004) that efficiently searches for epistatic interactions. This method is so fast that it is possible to compute 500 permutations to evaluate non-parametric significance of the joint LRS value within a minute. This makes DIRECT ideal for an interactive web service. Karl Broman's R/qtl implements an exhaustive search using the "scantwo" function. [RWW, May 2011]
    +GeneNetwork implements a rapid but non-exhaustive DIRECT algorithm (Lundberg et al., [2004](http://bioinformatics.oxfordjournals.org/content/20/12/1887.full.pdf)) that efficiently searches for epistatic interactions. This method is so fast that it is possible to compute 500 permutations to evaluate non-parametric significance of the joint LRS value within a minute. This makes DIRECT ideal for an interactive web service. Karl Broman's [R/qtl](http://www.rqtl.org/tutorials/rqtltour.pdf) implements an exhaustive search using the "scantwo" function. [RWW, May 2011]
     
     #### Partial Correlation:
     
    -Partial correlation is the correlation between two variables that remains after controlling for one or more other variables. Idea and techniques used to compute partial correlations are important in testing causal models (Cause and Correlation in Biology, Bill Shipley, 2000). For instance, r1,2||3,4 is the partial correlation between variables 1 and 2, while controlling for variables 3 and 4 (the || symbol is equivalent to "while controlling for"). We can compare partial correlations (e.g., r1,2||3,4) with original correlations (e.g., r1,2). If there is an insignificant difference, we infer that the controlled variables have minimal effect and may not influence the variables or even be part of the model. In contrast, if the partial correlations change significantly, the inference is that the causal link between the two variables is dependent to some degree on the controlled variables. These control variables are either anteceding causes or intervening variables. (text adapted from D Garson's original by RWW).
    +Partial correlation is the correlation between two variables that remains after controlling for one or more other variables. Ideas and techniques used to compute partial correlations are important in testing causal models ([Cause and Correlation in Biology](http://www.amazon.com/Cause-Correlation-Biology-Structural-Equations/dp/0521529212), Bill Shipley, 2000). For instance, r1,2||3,4 is the partial correlation between variables 1 and 2, while controlling for variables 3 and 4 (the || symbol is equivalent to "while controlling for"). We can compare partial correlations (e.g., r1,2||3,4) with original correlations (e.g., r1,2). If there is an insignificant difference, we infer that the controlled variables have minimal effect and may not influence the variables or even be part of the model. In contrast, if the partial correlations change significantly, the inference is that the causal link between the two variables is dependent to some degree on the controlled variables. These control variables are either anteceding causes or intervening variables. (text adapted from D Garson's original by RWW).
     
    -For more on partial correlation please link to this great site by David Garson at NC State.
    +For more on [partial correlation](http://faculty.chass.ncsu.edu/garson/PA765/partialr.htm) please link to this great site by David Garson at NC State.
     
    -For more on dependence separation ( d-separation) and constructing causal models see Richard Scheines' site.
    +For more on dependence separation ([d-separation](http://www.andrew.cmu.edu/user/scheines/tutor/d-sep.html)) and constructing causal models see Richard Scheines' site.
     
     Why would you use of need partial correlations in GeneNetwork? It is often useful to compute correlations among traits while controlling for additional variables. Partial correlations may reveal more about the causality of relations. In a genetic context, partial correlations can be used to remove much of the variance associated with linkage and linkage disequilibrium. You can also control for age, age, and other common cofactors.
     
    @@ -425,19 +427,19 @@ Please see the related Glossary terms "Tissue Correlation". [RWW, Aug 21, 2009;
     
     #### PCA Trait or Eigentrait:
     
    -If you place a number of traits in a Trait Collection you can carry out some of the key steps of a principal component analysis, including defining the variance directed along specific principal component eigenvectors. You can also plot the positions of cases against the first two eigenvectors; in essence a type of scatterplot. Finally, GeneNetwork allows you to exploit PCA methods to make new "synthetic" eigentraits from collections of correlated traits. These synthetic traits are the values of cases along specific eigenvectors and they may be less noisy than single traits. If this seems puzzling, then have a look at these useful PCA explanation by G. Dallas and by Powell and Lehe. How to do it: You can select and assemble many different traits into a single Trait Collection window using the check boxes and Add To Collection buttons. One of the most important function buttons in the Collection window is labeled Correlation Matrix. This function computes Pearson product moment correlations and Spearman rank order correlations for all possible pairs of traits in the Collection window. It also perfoms a principal component or factor analysis. For example, if you have 20 traits in the Collection window, the correlation matrix will consist of 20*19 or 190 correlations and the identity diagonal. Principal components analysis is a linear algebraic procedure that finds a small number of independent factors or principal components that efficiently explain variation in the original 20 traits. It is a effective method to reduce the dimensionality of a group of traits. If the 20 traits share a great deal of variation, then only two or three factors may explain variation among the traits. Instead of analyzing 20 traits as if they were independent, we can now analyze the main principal components labeled PC01, PC02, etc. PC01 and PC02 can be treated as new synthetic traits that represent the main sources of variation among original traits. 
You can treat a PC trait like any other trait except that it is not stored permanently in a database table. You can put a PC trait in your Collection window and see how well correlated each of the 20 original traits is with this new synthetic trait. You can also map a PC trait. [RWW, Aug 23, 2005]
    +If you place a number of traits in a Trait Collection you can carry out some of the key steps of a principal component analysis, including defining the variance directed along specific principal component eigenvectors. You can also plot the positions of cases against the first two eigenvectors; in essence a type of scatterplot. Finally, GeneNetwork allows you to exploit PCA methods to make new "synthetic" eigentraits from collections of correlated traits. These synthetic traits are the values of cases along specific eigenvectors and they may be less noisy than single traits. If this seems puzzling, then have a look at these useful PCA explanations by [G. Dallas](http://georgemdallas.wordpress.com/2013/10/30/principal-component-analysis-4-dummies-eigenvectors-eigenvalues-and-dimension-reduction/) and by [Powell and Lehe](http://setosa.io/ev/principal-component-analysis/). **How to do it:** You can select and assemble many different traits into a single **Trait Collection** window using the check boxes and **Add To Collection** buttons. One of the most important function buttons in the **Collection** window is labeled **Correlation Matrix**. This function computes Pearson product moment correlations and Spearman rank order correlations for all possible pairs of traits in the Collection window. It also performs a principal component or factor analysis. For example, if you have 20 traits in the Collection window, the correlation matrix will consist of 20*19/2 or 190 correlations and the identity diagonal. Principal components analysis is a linear algebraic procedure that finds a small number of independent factors or principal components that efficiently explain variation in the original 20 traits. It is an effective method to reduce the dimensionality of a group of traits. If the 20 traits share a great deal of variation, then only two or three factors may explain variation among the traits. 
Instead of analyzing 20 traits as if they were independent, we can now analyze the main principal components labeled PC01, PC02, etc. PC01 and PC02 can be treated as new synthetic traits that represent the main sources of variation among original traits. You can treat a PC trait like any other trait except that it is not stored permanently in a database table. You can put a PC trait in your Collection window and see how well correlated each of the 20 original traits is with this new synthetic trait. You can also map a PC trait. [RWW, Aug 23, 2005]
     
     
    #### Permutation Test: -A permutation test is a computationally intensive but conceptually simple method used to evaluate the statisical significance of findings. Permutation tests are often used to evaluate QTL significance. Some background: In order to detect parts of chromosomes that apparently harbor genes that contribute to differences in a trait's value, it is common to search for associations (linkage) across the entire genome. This is referred to as a "whole genome" scan, and it usually involves testing hundreds of independently segregating regions of the genome using hundreds, or even thousands of genetic markers (SNPs and microsatellites). A parametric test such as a conventional t test of F test can be used to estimate the probability of the null hypothesis at any single location in the genome (the null hypothesis is that there is no QTL at this particular location). But a parametric test of this type makes assumptions about the distribution of the trait (its normality), and also does not provide a way to correct for the large number of independent tests that are performed while scanning the whole genome. We need protection against many false discoveries as well as some assurance that we are not neglecting truly interesting locations. A permutation test is an elegant solution to both problems. The procedure involves randomly reassigning (permuting) traits values and genotypes of all cases used in the analysis. The permuted data sets have the same set of phenotypes and genotypes (in other words, distributions are the same), but obviously the permutation procedure almost invariably obliterates genuine gene-to-phenotype relation in large data sets. We typically generate several thousand permutations of the data. Each of these is analyzed using precisely the same method that was used to analyze the correctly ordered data set. 
We then compare statistical results of the original data set with the collection of values generated by the many permuted data sets. The hope is that the correctly ordered data are associated with larger LRS and LOD values than more than 95% of the permuted data sets. This is how we define the p = .05 whole genome significance threshold for a QTL. Please see the related Glossary terms "Significant threshold" and "Suggestive threshold". [RWW, July 15, 2005] +A permutation test is a computationally intensive but conceptually simple method used to evaluate the statistical significance of findings. Permutation tests are often used to evaluate QTL significance. _Some background_: In order to detect parts of chromosomes that apparently harbor genes that contribute to differences in a trait's value, it is common to search for associations (linkage) across the entire genome. This is referred to as a "whole genome" scan, and it usually involves testing hundreds of independently segregating regions of the genome using hundreds, or even thousands of genetic markers (SNPs and microsatellites). A parametric test such as a conventional t test or F test can be used to estimate the probability of the null hypothesis at any single location in the genome (the null hypothesis is that there is no QTL at this particular location). But a parametric test of this type makes assumptions about the distribution of the trait (its normality), and also does not provide a way to correct for the large number of independent tests that are performed while scanning the whole genome. We need protection against many false discoveries as well as some assurance that we are not neglecting truly interesting locations. A permutation test is an elegant solution to both problems. The procedure involves randomly reassigning (permuting) trait values and genotypes of all cases used in the analysis. 
The permuted data sets have the same set of phenotypes and genotypes (in other words, distributions are the same), but obviously the permutation procedure almost invariably obliterates genuine gene-to-phenotype relation in large data sets. We typically generate several thousand permutations of the data. Each of these is analyzed using precisely the same method that was used to analyze the correctly ordered data set. We then compare statistical results of the original data set with the collection of values generated by the many permuted data sets. The hope is that the correctly ordered data are associated with larger LRS and LOD values than more than 95% of the permuted data sets. This is how we define the p = .05 whole genome significance threshold for a QTL. Please see the related Glossary terms "Significant threshold" and "Suggestive threshold". [RWW, July 15, 2005] #### Power to detect QTLs: An analysis of statistical power is useful to estimate numbers of replicates and strains needed to detect and resolve major sources of trait variance and covariance. A versatile method has been developed by Sen and colleagues (Sen et al., 2007) and implemented in the R program. qtlDesign. David Ashbrook implemented a version of this within Shiny that can help you estimate power for different QTL effect sizes, cohort sizes, and replication rates: -#### Power Calculator (D. Ashbrook) +#### [Power Calculator (D. Ashbrook)](https://dashbrook1.shinyapps.io/bxd_power_calculator_app/) We can see that in all situations power is increased more by increasing the number of lines than by increasing the number of biological replicates. Dependent upon the heritability of the trait, there is little gain in power when going above 4-6 biological replicates. 
[DGA, Mar 3, 2018] @@ -466,34 +468,38 @@ A quantitative trait locus is a chromosome region that contains one or more sequ An inbred strain whose chromosomes incorporate a fixed and permanent set of recombinations of chromosomes originally descended from two or more parental strains. Sets of RI strains (from 10 to 5000) are often used to map the chromosomal positions of polymorphic loci that control variance in phenotypes. -For a terrific short summary of the uses of RI strains see 2007). +For a terrific short summary of the uses of RI strains see [2007](http://www.informatics.jax.org/silverbook/chapters/9-2.shtml)). Chromosomes of RI strains typically consist of alternating haplotypes of highly variable length that are inherited intact from the parental strains. In the case of a typical rodent RI strain made by crossing maternal strain C with paternal strain B (called a CXB RI strain), a chromosome will typically incorporate 3 to 5 alternating haplotype blocks with a structure such as BBBBBCCCCBBBCCCCCCCC, where each letter represents a genotype, series of similar genotype represent haplotypes, and where a transition between haplotypes represents a recombination. Both pairs of each chromosome will have the same alternating pattern, and all markers will be homozygous. Each of the different chromosomes (Chr 1, Chr 2, etc.) will have a different pattern of haplotypes and recombinations. The only exception is that the Y chromosome and the mitochondial genome, both of which are inherited intact from the paternal and maternal strain, respectively. For an RI strain to be useful for mapping purposes, the approximate position of recombinations along each chromsome need to be well defined either in terms of centimorgan or DNA basepair position. The precision with which these recombinations are mapped is a function of the number and position of the genotypes used to type the chromosomes--20 in the example above. 
Because markers and genotypes are often space quite far apart, often more than 500 Kb, the actual data entered into GeneNetwork will have some ambiguity at each recombination locus. The haplotype block BBBBBCCCCBBBCCCCCCCC will be entered as BBBBB?CCCC?BBB?CCCCCCCC where the ? mark indicates incomplete information over some (we hope) short interval. RI strains are almost always studied in sets or panels. All else being equal, the larger the set of RI strains, the greater the power and precision with which phenotypes can be mapped to chromosomal locations. The first set of eight RIs, the CXB RIs, were generated by Donald Bailey (By) from an intercross between a female BALB/cBy mouse (abbreviated C) and a male C57BL/6By mouse in the 1960s. The small panel of 8 CXB strains was originally used to determine if the major histocompatibility (MHC) locus on proximal Chr 17 was a key factor accounting for different immune responses such as tissue rejection. The methods used to determine the locations of recombinations relied on visible markers (coat color phenotypes such as the C and B loci) and the electrophoretic mobility of proteins. Somewhat larger RI sets were generated by Benjamin Taylor to map Mendelian and other major effect loci. In the 1990s the utility of RI sets for mapping was significantly improved thanks to higher density genotypes made possible by the use of microsatellite markers. Between 2005 and 2017, virtually all extant mouse and rat RI strains were regenotyped at many thousands of SNP markers, providing highly accurate maps of recombinations. -While the potential utility of RI strains in mapping complex polygenic traits was obvious from the outset, the small number of strains only made it feasible to map quantitative traits with large effects. The first large RI sets were generated by plant geneticists (Burr et al. 
2000) and this the plant genetics community holds a strong lead in the production of very large RI sets to study multigenic and polygenic traits and trait covariance and pleiotropy. +While the potential utility of RI strains in mapping complex polygenic traits was obvious from the outset, the small number of strains only made it feasible to map quantitative traits with large effects. The first large RI sets were generated by plant geneticists (Burr et al. [2000](http://demeter.bio.bnl.gov/RIchap_rev.pdf)) and thus the plant genetics community holds a strong lead in the production of very large RI sets to study multigenic and polygenic traits and trait covariance and pleiotropy. By 2010 the number of mouse RI strains had increased to the point where defining causal gene and sequence variant was more practical. As of 2018 there are about 150 BXD strains (152 have been fully sequenced), ~100 Collaborative Cross strains (also all fully sequenced), and at least another 100 RI strains belonging to smaller sets that have been extremely well genotyped. -Making RI strains: The usual procedure typically involves sib mating of the progeny of an F1 intercross for more than 20 generations. Even by the 5th filial (F) generation of successive matings, the RI lines are homozygous at 50% of loci and by F13, the value is above 90%. At F20 the lines are nearly fully inbred (~98%) and by convention are now referred to as inbred strains rather than inbred lines. +**Making RI strains**: The usual procedure typically involves sib mating of the progeny of an F1 intercross for more than 20 generations. Even by the 5th filial (F) generation of successive matings, the RI lines are homozygous at 50% of loci and by F13, the value is above 90%. At F20 the lines are nearly fully inbred (~98%) and by convention are now referred to as inbred strains rather than inbred lines. [Go back to index](#index) +Legend: from [Silver, L. 
(1995) Oxford University Press](http://www.informatics.jax.org/silverbook/frames/frame3-3.shtml) + +[Williams RW, June 20, 2005; significant extension, Sept 21, 2007, added Crow ref, Oct 2009] +
    ## S #### Scree Plots: -GeneNetwork will often automatically generate a Scree Plot and the associated principal components (PCs) when you compute a Correlation Matrix for a group of traits that you have placed in your Trait Collection (a set of phenotypes and/or expression data for a specific population). Here is a nice definition of what a Scree plot is trying to tell you adopted and adapted from IOS (www.improvedoutcomes.com). +GeneNetwork will often automatically generate a [Scree Plot](http://www.improvedoutcomes.com/docs/WebSiteDocs/PCA/Creating_a_Scree_Plot.htm) and the associated principal components (PCs) when you compute a Correlation Matrix for a group of traits that you have placed in your Trait Collection (a set of phenotypes and/or expression data for a specific population). Here is a nice definition of what a Scree plot is trying to tell you adopted and adapted from IOS (www.improvedoutcomes.com). A Scree Plot is a simple line segment plot that shows the fraction of total variance in the data as explained or represented by each PC. The PCs are ordered, and by definition are therefore assigned a number label, by decreasing order of contribution to total variance. The PC with the largest fraction contribution is labeled PC01. Such a plot when read left-to-right across the abscissa can often show a clear separation in fraction of total variance where the 'most important' components cease and the 'least important' components begin. The point of separation is often called the 'elbow'. (In the PCA literature, the plot is called a 'Scree' Plot because it often looks like a 'scree' slope, where rocks have fallen down and accumulated on the side of a mountain.) [Williams RW, Dec 20, 2008] #### Significant threshold: -The significant threshold represents the approximate LRS value that corresponds to a genome-wide p-value of 0.05, or a 5% probability of falsely rejecting the null hypothesis that there is no linkage anywhere in the genome. 
This threshold is computed by evaluating the distribution of highest LRS scores generated by a set of 2000 random permutations of strain means. For example, a random permutation of the correctly ordered data may give a peak LRS score of 10 somewhere across the genome. The set of 1000 or more of these highest LRS scores is then compared to the actual LRS obtained for the correctly ordered (real) data at any location in the genome. If fewer than 50 (5%) of the 1000 permutations have peak LRS scores anywhere in the genome that exceed that obtained at a particular locus using the correctly ordered data, then one can usually claim that a QTL has been defined at a genome-wide p-value of .05. The threshold will vary slightly each time it is recomputed due to the random generation of the permutations. You can view the actual histogram of the permutation results by selecting the "Marker Regression" function in the Analysis Tools area of the Trait Data and Editing Form. WebQTL does make it possible to search through hundreds of traits for those that may have significant linkage somewhere in the genome. Keep in mind that this introduces a second tier of multiple testing problems for which the permutation test will not usually provide adequate protection. If you anticipate mapping many independent traits, then you will need to correct for the number of traits you have tested. [Williams RW, Nov 14, 2004] +The significant threshold represents the approximate LRS value that corresponds to a genome-wide p-value of 0.05, or a 5% probability of falsely rejecting the null hypothesis that there is no linkage anywhere in the genome. This threshold is computed by evaluating the distribution of highest LRS scores generated by a set of 2000 random permutations of strain means. For example, a random permutation of the correctly ordered data may give a peak LRS score of 10 somewhere across the genome. 
The set of 1000 or more of these highest LRS scores is then compared to the actual LRS obtained for the correctly ordered (real) data at any location in the genome. If fewer than 50 (5%) of the 1000 permutations have peak LRS scores anywhere in the genome that exceed that obtained at a particular locus using the correctly ordered data, then one can usually claim that a QTL has been defined at a genome-wide p-value of .05. The threshold will vary slightly each time it is recomputed due to the random generation of the permutations. You can view the actual histogram of the permutation results by selecting the "Marker Regression" function in the **Analysis Tools** area of the **Trait Data and Editing Form**. WebQTL does make it possible to search through hundreds of traits for those that may have significant linkage somewhere in the genome. Keep in mind that this introduces a second tier of multiple testing problems for which the permutation test will not usually provide adequate protection. If you anticipate mapping many independent traits, then you will need to correct for the number of traits you have tested. [Williams RW, Nov 14, 2004]
    @@ -513,11 +519,11 @@ where n is the number of independent biological samples used to estimate the pop #### Strain Distribution Pattern: -A marker such as a SNP or microsatellite is genotyped using DNA obtained from each member of the mapping population. In the case of a genetic reference population, such as the BXD strains or the BayXSha Arabadopsis lines, this results in a text string of genotypes (e.g., BDDDBDBBBBDDBDDDBBBB... for BXD1 through BXD100). Each marker is associated with its own particular text string of genotypes that is often called the strain distribution pattern of the marker. (A more appropriate term would be the marker genotype string.) This string is converted to a numerical version, a genotype vector: -1111-11-1-1-1-111-1111-1-1-1-1..., where D=1, B=-1, H=0. Mapping a trait boils down to performing correlations between each trait and all of the genotype vectors. The genotype vector with the highest correlation (absolute value) is a good candidate for a QTL. [Williams RW, June 18, 2005] +A marker such as a SNP or microsatellite is genotyped using DNA obtained from each member of the mapping population. In the case of a genetic reference population, such as the BXD strains or the BayXSha Arabadopsis lines, this results in a text string of genotypes (e.g., BDDDBDBBBBDDBDDDBBBB... for BXD1 through BXD100). Each marker is associated with its own particular text string of genotypes that is often called the **strain distribution pattern** of the marker. (A more appropriate term would be the **marker genotype string**.) This string is converted to a numerical version, a genotype vector: -1111-11-1-1-1-111-1111-1-1-1-1..., where D=1, B=-1, H=0. Mapping a trait boils down to performing correlations between each trait and all of the genotype vectors. The genotype vector with the highest correlation (absolute value) is a good candidate for a QTL. 
[Williams RW, June 18, 2005] #### Suggestive Threshold: -The suggestive threshold represents the approximate LRS value that corresponds to a genome-wide p-value of 0.63, or a 63% probability of falsely rejecting the null hypothesis that there is no linkage anywhere in the genome. This is not a typographical error. The Suggestive LRS threshold is defined as that which yields, on average, one false positive per genome scan. That is, roughly one-third of scans at this threshold will yield no false positive, one-third will yield one false positive, and one-third will yield two or more false positives. This is a very permissive threshold, but it is useful because it calls attention to loci that may be worth follow-up. Regions of the genome in which the LRS exceeds the suggestive threshold are often worth tracking and screening. They are particularly useful in combined multicross metaanalysis of traits. If two crosses pick up the same suggestive locus, then that locus may be significant when the joint probability is computed. The suggestive threshold may vary slightly each time it is recomputed due to the random generation of permutations. You can view the actual histogram of the permutation results by selecting the "Marker Regression" function in the Analysis Tools area of the Trait Data and Editing Form. [Williams RW and Manly KF, Nov 15, 2004] +The suggestive threshold represents the approximate LRS value that corresponds to a genome-wide p-value of 0.63, or a 63% probability of falsely rejecting the null hypothesis that there is no linkage anywhere in the genome. This is not a typographical error. The Suggestive LRS threshold is defined as that which yields, on average, one false positive per genome scan. That is, roughly one-third of scans at this threshold will yield no false positive, one-third will yield one false positive, and one-third will yield two or more false positives. 
This is a very permissive threshold, but it is useful because it calls attention to loci that may be worth follow-up. Regions of the genome in which the LRS exceeds the suggestive threshold are often worth tracking and screening. They are particularly useful in combined multicross metaanalysis of traits. If two crosses pick up the same suggestive locus, then that locus may be significant when the joint probability is computed. The suggestive threshold may vary slightly each time it is recomputed due to the random generation of permutations. You can view the actual histogram of the permutation results by selecting the "Marker Regression" function in the **Analysis Tools** area of the **Trait Data and Editing Form**. [Williams RW and Manly KF, Nov 15, 2004] #### Systems Genetics: @@ -525,7 +531,7 @@ Systems genetics or "network genetics" is an emerging new branch of genetics tha A hallmark of systems genetics is the simultaneous consideration of groups (systems) of phenotypes from the primary level of molecular and cellular interactions that ultimately modulate global phenotypes such as blood pressure, behavior, or disease resistance. Changes in environment are also often important determinants of multiscalar phenotypes; reversing the standard notion of causality as flowing inexorably upward from the genome. Scientists who use a systems genetics approach often have a broad interest in modules of linked phenotypes. Causality in these complex dynamic systems is often contingent on environmental or temporal context, and often will involve feedback modulation. A systems genetics approach can be unusually powerful, but does require the use of large numbers of observations (large sample size), and more advanced statistical and computational models. -Systems genetics is not really a new field and traces back to Sewall Wright's classical paper (Wright, 1921, "Correlation and Causation") that introduced path analysis to study systems of related phenotypes. 
Two factors have invigorated this field. The first factor is the advent of more sophisticated statistical methods including Structural Equation Modeling (SEM), System Dynamics Modeling, and Bayesian Network Modeling combined with powerful computer systems and efficient algorithms. The second factor is the relative ease with which it is now possible to acquire extensive and diverse phenotype data sets across genetic reference populations such as the BXD set of mice, the HXB set of rats, and the BayXSha lines of Arabidopsis (data are incorporated in the GeneNetwork). In the case of the BXD strains, a large research community has collectively generated hundreds of thousands of transcript phenotypes in different tissues and cells (level of expression), as well as hundreds of protein, cellular, pharmacological, and behavioral data types across a single genetic reference panel. Evaluating and modeling the associative and causal relations among these phenotypes is a major, and still relatively new area of research. Complex trait analysis and QTL mapping are both part of systems genetics in which causality is inferred using conventional genetic linkage (Li et al., 2005). One can often assert with confidence that a particular module of phenotypes (component of the variance and covariance) is modulated by sequence variants at a common locus. This provides a causal constraint that can be extremely helpful in more accurately modeling network architecture. Most models are currently static, but as the field matures, more sophisticated dynamic models will supplant steady-state models. +Systems genetics is not really a new field and traces back to [Sewall Wright's](http://www.amphilsoc.org/library/mole/w/wrights.htm) classical paper (Wright, 1921, "Correlation and Causation") that introduced path analysis to study systems of related phenotypes. Two factors have invigorated this field. 
The first factor is the advent of more sophisticated statistical methods including Structural [Equation Modeling](http://userwww.sfsu.edu/~efc/classes/biol710/path/SEMwebpage.htm) (SEM), [System Dynamics Modeling](http://www.public.asu.edu/~kirkwood/sysdyn/SDIntro/SDIntro.htm), and [Bayesian Network Modeling](http://bnj.sourceforge.net/) combined with powerful computer systems and efficient algorithms. The second factor is the relative ease with which it is now possible to acquire extensive and diverse phenotype data sets across genetic reference populations such as the BXD set of mice, the HXB set of rats, and the BayXSha lines of Arabidopsis (data are incorporated in the GeneNetwork). In the case of the BXD strains, a large research community has collectively generated hundreds of thousands of transcript phenotypes in different tissues and cells (level of expression), as well as hundreds of protein, cellular, pharmacological, and behavioral data types across a single genetic reference panel. Evaluating and modeling the associative and causal relations among these phenotypes is a major, and still relatively new area of research. Complex trait analysis and QTL mapping are both part of systems genetics in which causality is inferred using conventional genetic linkage (Li et al., [2005](http://hmg.oupjournals.org/cgi/content/abstract/ddi124v1)). One can often assert with confidence that a particular module of phenotypes (component of the variance and covariance) is modulated by sequence variants at a common locus. This provides a causal constraint that can be extremely helpful in more accurately modeling network architecture. Most models are currently static, but as the field matures, more sophisticated dynamic models will supplant steady-state models. The term "systems genetics" was coined by Grant Morahan, October 2004, during a visit to Memphis, as a more general and appropriate term to use instead of "genetical genomics." 
[Williams RW, April 11, 2005, revised Oct 22, 2005, April, 2008] @@ -579,13 +585,13 @@ Transgression means that you can rarely predict the distribution of phenotypes a #### Winsorize, Winsorise: -QTL mapping results can be greatly affected by inclusion of outlier data. GeneNetwork will do its best to flag outliers for you in the Trait Data and Analysis pages (yellow highlighting). Before mapping, review the data, and if necessary, change values. Options for handling outliers include: (1) do nothing, (2) delete the outliers (trimming), (3) transform the data (e.g., logarithmic, arcsine, or logistic regression transforms), or (4) winsorize the distribution of values. Winsorizing is usually the easiest method to implement directly in GeneNetwork. +QTL mapping results can be greatly affected by inclusion of outlier data. GeneNetwork will do its best to flag outliers for you in the **Trait Data and Analysis** pages (yellow highlighting). Before mapping, review the data, and if necessary, change values. Options for handling outliers include: (1) do nothing, (2) delete the outliers (trimming), (3) transform the data (e.g., logarithmic, arcsine, or logistic regression transforms), or (4) [winsorize](http://en.wikipedia.org/wiki/Winsorising) the distribution of values. Winsorizing is usually the easiest method to implement directly in GeneNetwork. -How to winsorize: First review the distribution of values and define outliers. You should only do this one time, so think before you leap. Look at the Probability Plot of the trait by going to Trait Data and Analysis page and selecting Basic Statistics). For example, the figure below from GeneNetwork shows that at many as seven cases have relatively high values and as many as three have relatively low values (this trait is taken from Species = Mouse, Group = LXS, Type = Phenotype, Trait 10182). 
GeneNetwork code only declares the highest two values to be outliers, but you can use a more liberal definition and give all seven high values a haircut. It is advisable to winsorizes equal numbers of cases on each side of the distribution (high and low cases). In this case, the seven highest values were changed to match that of the 8th highest value (0.860). To retain the original rank order I added an incremental value of 0.01 to each (0.861, 0.862, etc). I did the same thing to the lowest seven values. Adding this increment is not necessary. +**How to winsorize**: First review the distribution of values and define outliers. You should only do this one time, so think before you leap. Look at the **Probability Plot** of the trait (by going to the **Trait Data and Analysis** page and selecting **Basic Statistics**). For example, the figure below from GeneNetwork shows that as many as seven cases have relatively high values and as many as three have relatively low values (this trait is taken from Species = Mouse, Group = LXS, Type = Phenotype, Trait 10182). GeneNetwork code only declares the highest two values to be outliers, but you can use a more liberal definition and give all seven high values a haircut. It is advisable to winsorize equal numbers of cases on each side of the distribution (high and low cases). In this case, the seven highest values were changed to match that of the 8th highest value (0.860). To retain the original rank order I added an incremental value of 0.01 to each (0.861, 0.862, etc). I did the same thing to the lowest seven values. Adding this increment is not necessary. The result in this case: a suggestive QTL on Chr 16 now reaches the significance threshold. -The danger of winsorizing is doing it multiple times in different ways. You should transform or winsorize the data before mapping. And you should ideally only do any transformation/correction one time. 
If you fool around with different methods of transforming your data then you are asking for trouble by adding yet another level of multiple testing. If you feel compelled to experiment with different transforms, then you should/must report this in publications and explain why you did so. Demonstrating that mapping results are robust even using multiple transforms is one good excuse. [Williams RW, Jan 2, 2014] +The **danger of winsorizing** is doing it multiple times in different ways. You should transform or winsorize the data before mapping. And you should ideally only do any transformation/correction one time. If you fool around with different methods of transforming your data then you are asking for trouble by adding yet another level of multiple testing. If you feel compelled to experiment with different transforms, then you should/must report this in publications and explain why you did so. Demonstrating that mapping results are robust even using multiple transforms is one good excuse. [Williams RW, Jan 2, 2014] -- cgit v1.2.3 From ddcd54b505384b3d8f70c82cc97b9781672c5fb6 Mon Sep 17 00:00:00 2001 From: zsloan Date: Tue, 10 Nov 2020 13:02:15 -0600 Subject: Changed "BXD-Harvested" to "BXD-Longevity" since that group was renamed and using the wrong name was causing correlations to not work. --- wqflask/wqflask/show_trait/show_trait.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wqflask/wqflask/show_trait/show_trait.py b/wqflask/wqflask/show_trait/show_trait.py index 25ba1a1d..0c6ae198 100644 --- a/wqflask/wqflask/show_trait/show_trait.py +++ b/wqflask/wqflask/show_trait/show_trait.py @@ -372,7 +372,7 @@ class ShowTrait(object): # We're checking a string here! 
assert isinstance(this_group, str), "We need a string type thing here" - if this_group[:3] == 'BXD' and this_group != "BXD-Harvested": + if this_group[:3] == 'BXD' and this_group != "BXD-Longevity": this_group = 'BXD' if this_group: -- cgit v1.2.3 From 61280ad47d8c0486b776011391deadcf7df39819 Mon Sep 17 00:00:00 2001 From: zsloan Date: Tue, 10 Nov 2020 13:36:13 -0600 Subject: Fixed issue where strain values in bytes needed to be decoded when loading a Temp trait --- wqflask/wqflask/show_trait/SampleList.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wqflask/wqflask/show_trait/SampleList.py b/wqflask/wqflask/show_trait/SampleList.py index 37c1d6d5..99ca7f88 100644 --- a/wqflask/wqflask/show_trait/SampleList.py +++ b/wqflask/wqflask/show_trait/SampleList.py @@ -35,7 +35,7 @@ class SampleList(object): # ZS: self.this_trait will be a list if it is a Temp trait if isinstance(self.this_trait, list): if (counter <= len(self.this_trait) and - str(self.this_trait[counter-1]).upper() != 'X'): + self.this_trait[counter-1].decode("utf-8").lower() != 'x'): sample = webqtlCaseData.webqtlCaseData( name=sample_name, value=float(self.this_trait[counter-1])) -- cgit v1.2.3 From 25f7d30b9f052ec5d812bfe3bf9713df850cc267 Mon Sep 17 00:00:00 2001 From: zsloan Date: Tue, 10 Nov 2020 14:24:26 -0600 Subject: Changed logic for creating temp trait SampleList to account for both traits encoded as bytes and traits encoded as strings, since temp traits created before and after the Python 3 switchover will have different encoding --- wqflask/wqflask/show_trait/SampleList.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/wqflask/wqflask/show_trait/SampleList.py b/wqflask/wqflask/show_trait/SampleList.py index 99ca7f88..a535c493 100644 --- a/wqflask/wqflask/show_trait/SampleList.py +++ b/wqflask/wqflask/show_trait/SampleList.py @@ -34,13 +34,18 @@ class SampleList(object): # ZS: self.this_trait will be a list if it is a Temp trait if 
isinstance(self.this_trait, list): - if (counter <= len(self.this_trait) and - self.this_trait[counter-1].decode("utf-8").lower() != 'x'): - sample = webqtlCaseData.webqtlCaseData( - name=sample_name, - value=float(self.this_trait[counter-1])) - else: - sample = webqtlCaseData.webqtlCaseData(name=sample_name) + sample = webqtlCaseData.webqtlCaseData(name=sample_name) + if counter <= len(self.this_trait): + if isinstance(self.this_trait[counter-1], (bytes, bytearray)): + if (self.this_trait[counter-1].decode("utf-8").lower() != 'x'): + sample = webqtlCaseData.webqtlCaseData( + name=sample_name, + value=float(self.this_trait[counter-1])) + else: + if (self.this_trait[counter-1].lower() != 'x'): + sample = webqtlCaseData.webqtlCaseData( + name=sample_name, + value=float(self.this_trait[counter-1])) else: # ZS - If there's no value for the sample/strain, # create the sample object (so samples with no value -- cgit v1.2.3 From 0422cea46205ef6477949b1f0dc188f94024f8cf Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Wed, 11 Nov 2020 14:21:15 +0300 Subject: Add basic markdown styling * wqflask/wqflask/static/new/css/markdown.css: New file. * wqflask/wqflask/templates/glossary.html: add markdown id to markdown container. 
--- wqflask/wqflask/static/new/css/markdown.css | 13 +++++++++++++ wqflask/wqflask/templates/glossary.html | 6 +++++- 2 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 wqflask/wqflask/static/new/css/markdown.css diff --git a/wqflask/wqflask/static/new/css/markdown.css b/wqflask/wqflask/static/new/css/markdown.css new file mode 100644 index 00000000..91167908 --- /dev/null +++ b/wqflask/wqflask/static/new/css/markdown.css @@ -0,0 +1,13 @@ +#markdown { + padding: 20px; +} + +#markdown h2, #markdown h3, #markdown h4, #markdown h5 { + font-weight: bold; +} + +#markdown img { + display: block; + margin-right: auto; + margin-left: auto; +} diff --git a/wqflask/wqflask/templates/glossary.html b/wqflask/wqflask/templates/glossary.html index 3b29f20e..146c7e86 100644 --- a/wqflask/wqflask/templates/glossary.html +++ b/wqflask/wqflask/templates/glossary.html @@ -2,9 +2,13 @@ {% block title %}Glossary{% endblock %} +{% block css %} + +{% endblock %} + {% block content %} -
    +
    [Edit on Github] {{ rendered_markdown|safe }} -- cgit v1.2.3 From 394a67ca2f28b18aa3ea8398ca48661985be8f5a Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Wed, 11 Nov 2020 14:22:07 +0300 Subject: Update image width --- wqflask/wqflask/static/markdown/glossary.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wqflask/wqflask/static/markdown/glossary.md b/wqflask/wqflask/static/markdown/glossary.md index 68276796..db94ae18 100644 --- a/wqflask/wqflask/static/markdown/glossary.md +++ b/wqflask/wqflask/static/markdown/glossary.md @@ -238,7 +238,7 @@ Conventional knockout lines (KOs) of mice are often mixtures of the genomes of t It is often thought that 10 generations of backcrossing will result in a pure genetic background (99.8% C57BL/6J). Unfortunately, this is not true for the region around the KO, and even after many generations of backcrossing of KO stock to C57BL/6J, a large region around the KO is still derived from the 129 substrain (see the residual white "line" at N10 in the figure below. -Congenic +Congenic After 20 generations of backcrossing nearly +/-5 cM on either side of the KO will still usually be derived from 129 (see [Figure 3.6](http://www.informatics.jax.org/silverbook/frames/frame3-3.shtml)) This amounts to an average of +/- 10 megabases of DNA around the KO. The wildtype littermates do NOT have this flanking DNA from 129 and they will be like a true C57BL/6J. The +/- 10 megabases to either side of the KO is known as the "hitchhiking" chromosomal interval. Any polymorphism between 129 and B6 in this interval has the potential to have significant downstream effects on gene expression, protein expression, and higher order traits such as anxiety, activity, and maternal behavior. Much of the conventional KO literature is highly suspect due to this hitchhiker effect (see Gerlai R, [Trends in Neurosci 1996 19:177](http://gn1.genenetwork.org/images/upload/Gerlai_TINS_1996.pdf)). 
-- cgit v1.2.3 From 06102cab4b6a3bc898c65c6df485f328e3314980 Mon Sep 17 00:00:00 2001 From: zsloan Date: Wed, 11 Nov 2020 13:03:19 -0600 Subject: Added Bonface to the footer --- wqflask/wqflask/templates/base.html | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/wqflask/wqflask/templates/base.html b/wqflask/wqflask/templates/base.html index 0f4e5ef5..1e4301bc 100644 --- a/wqflask/wqflask/templates/base.html +++ b/wqflask/wqflask/templates/base.html @@ -159,9 +159,10 @@ Saunak Sen, Zachary Sloan, Arthur Centeno, - and Christian Fischer. + Christian Fischer + and Bonface Munyoki.

    -

    Design and code by Pjotr Prins, Zach Sloan, Arthur Centeno, Christan Fischer, Danny Arends, Sam Ockman, Lei Yan, Xiaodong Zhou, Christian Fernandez, +

    Design and code by Pjotr Prins, Zach Sloan, Arthur Centeno, Christian Fischer, Bonface Munyoki, Danny Arends, Sam Ockman, Lei Yan, Xiaodong Zhou, Christian Fernandez, Ning Liu, Rudi Alberts, Elissa Chesler, Sujoy Roy, Evan G. Williams, Alexander G. Williams, Kenneth Manly, Jintao Wang, Robert W. Williams, and colleagues.

    -- cgit v1.2.3 From 6feb7dd0f8020c7ed54b70be6a15c099dc62b490 Mon Sep 17 00:00:00 2001 From: zsloan Date: Thu, 12 Nov 2020 13:33:52 -0600 Subject: Show Mb positions to 6 decimal places in mapping results table --- wqflask/wqflask/marker_regression/run_mapping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wqflask/wqflask/marker_regression/run_mapping.py b/wqflask/wqflask/marker_regression/run_mapping.py index fa61272f..9625349c 100644 --- a/wqflask/wqflask/marker_regression/run_mapping.py +++ b/wqflask/wqflask/marker_regression/run_mapping.py @@ -414,7 +414,7 @@ class RunMapping(object): highest_chr = marker['chr'] if ('lod_score' in marker.keys()) or ('lrs_value' in marker.keys()): if 'Mb' in marker.keys(): - marker['display_pos'] = "Chr" + str(marker['chr']) + ": " + "{:.3f}".format(marker['Mb']) + marker['display_pos'] = "Chr" + str(marker['chr']) + ": " + "{:.6f}".format(marker['Mb']) elif 'cM' in marker.keys(): marker['display_pos'] = "Chr" + str(marker['chr']) + ": " + "{:.3f}".format(marker['cM']) else: -- cgit v1.2.3 From d6227b9e0a331b2cb3db705a503dcfa5bbaf1555 Mon Sep 17 00:00:00 2001 From: zsloan Date: Thu, 12 Nov 2020 13:42:18 -0600 Subject: Substituted -log(p) with -logP as mentioned in Rob's e-mail --- wqflask/wqflask/marker_regression/display_mapping_results.py | 6 +++--- wqflask/wqflask/marker_regression/run_mapping.py | 8 ++++---- wqflask/wqflask/templates/mapping_results.html | 8 ++++---- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/wqflask/wqflask/marker_regression/display_mapping_results.py b/wqflask/wqflask/marker_regression/display_mapping_results.py index 6d6572ff..3f6de2b2 100644 --- a/wqflask/wqflask/marker_regression/display_mapping_results.py +++ b/wqflask/wqflask/marker_regression/display_mapping_results.py @@ -2076,7 +2076,7 @@ class DisplayMappingResults(object): if self.lrsMax <= 0: #sliding scale if "lrs_value" in self.qtlresults[0]: LRS_LOD_Max = max([result['lrs_value'] for result in 
self.qtlresults]) - if self.LRS_LOD == "LOD" or self.LRS_LOD == "-log(p)": + if self.LRS_LOD == "LOD" or self.LRS_LOD == "-logP": LRS_LOD_Max = LRS_LOD_Max / self.LODFACTOR if self.permChecked and self.nperm > 0 and not self.multipleInterval: self.significant = min(self.significant / self.LODFACTOR, webqtlConfig.MAXLRS) @@ -2172,7 +2172,7 @@ class DisplayMappingResults(object): TEXT_X_DISPLACEMENT = -12 else: TEXT_X_DISPLACEMENT = -30 - if self.LRS_LOD == "-log(p)": + if self.LRS_LOD == "-logP": TEXT_Y_DISPLACEMENT = -242 else: TEXT_Y_DISPLACEMENT = -210 @@ -2397,7 +2397,7 @@ class DisplayMappingResults(object): if 'lrs_value' in qtlresult: - if self.LRS_LOD == "LOD" or self.LRS_LOD == "-log(p)": + if self.LRS_LOD == "LOD" or self.LRS_LOD == "-logP": if qtlresult['lrs_value'] > 460 or qtlresult['lrs_value']=='inf': #Yc = yZero - webqtlConfig.MAXLRS*LRSHeightThresh/(LRSAxisList[-1]*self.LODFACTOR) Yc = yZero - webqtlConfig.MAXLRS*LRSHeightThresh/(LRS_LOD_Max*self.LODFACTOR) diff --git a/wqflask/wqflask/marker_regression/run_mapping.py b/wqflask/wqflask/marker_regression/run_mapping.py index 9625349c..31d6a67c 100644 --- a/wqflask/wqflask/marker_regression/run_mapping.py +++ b/wqflask/wqflask/marker_regression/run_mapping.py @@ -228,7 +228,7 @@ class RunMapping(object): self.output_files = start_vars['output_files'] if 'first_run' in start_vars: #ZS: check if first run so existing result files can be used if it isn't (for example zooming on a chromosome, etc) self.first_run = False - self.score_type = "-log(p)" + self.score_type = "-logP" self.manhattan_plot = True with Bench("Running GEMMA"): if self.use_loco == "True": @@ -327,7 +327,7 @@ class RunMapping(object): self.control_marker, self.manhattan_plot) elif self.mapping_method == "plink": - self.score_type = "-log(p)" + self.score_type = "-logP" self.manhattan_plot = True results = plink_mapping.run_plink(self.this_trait, self.dataset, self.species, self.vals, self.maf) #results = self.run_plink() @@ -539,8 
+539,8 @@ def export_mapping_results(dataset, trait, markers, results_path, mapping_scale, output_file.write("Location: " + str(trait.chr) + " @ " + str(trait.mb) + " Mb\n") output_file.write("\n") output_file.write("Name,Chr,") - if score_type.lower() == "-log(p)": - score_type = "-log(p)" + if score_type.lower() == "-logP": + score_type = "-logP" if 'Mb' in markers[0]: output_file.write("Mb," + score_type) if 'cM' in markers[0]: diff --git a/wqflask/wqflask/templates/mapping_results.html b/wqflask/wqflask/templates/mapping_results.html index e68a792a..28d93542 100644 --- a/wqflask/wqflask/templates/mapping_results.html +++ b/wqflask/wqflask/templates/mapping_results.html @@ -99,7 +99,7 @@ LRS ? @@ -235,8 +235,8 @@ Row Marker - {% if LRS_LOD == "-log(p)" %} -
    –log(p)
    + {% if LRS_LOD == "-logP" %} +
    –logP
    {% else %}
    {{ LRS_LOD }}
    {% endif %} @@ -259,7 +259,7 @@ {{ loop.index }} {% if geno_db_exists == "True" %}
    {{ marker.name }}{% else %}{{ marker.name }}{% endif %} - {% if LRS_LOD == "LOD" or LRS_LOD == "-log(p)" %} + {% if LRS_LOD == "LOD" or LRS_LOD == "-logP" %} {% if 'lod_score' in marker %} {{ '%0.2f' | format(marker.lod_score|float) }} {% else %} -- cgit v1.2.3 From f1c15048407fa82557ecc89c1367b58ba65c859b Mon Sep 17 00:00:00 2001 From: zsloan Date: Thu, 12 Nov 2020 17:28:40 -0600 Subject: Import fontawesome css/js and used its info character for glossary links on search results page --- wqflask/wqflask/templates/search_result_page.html | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/wqflask/wqflask/templates/search_result_page.html b/wqflask/wqflask/templates/search_result_page.html index 36f144c2..8e2b06a4 100644 --- a/wqflask/wqflask/templates/search_result_page.html +++ b/wqflask/wqflask/templates/search_result_page.html @@ -4,6 +4,7 @@ + {% endblock %} @@ -184,6 +185,7 @@ + @@ -331,10 +333,10 @@ 'orderSequence': [ "desc", "asc"] }, { - 'title': "High P ?", + 'title': "High P ", 'type': "natural-minus-na", 'data': "lrs_score", - 'width': "60px", + 'width': "65px", 'orderSequence': [ "desc", "asc"] }, { @@ -344,10 +346,10 @@ 'data': "lrs_location" }, { - 'title': "Effect Size ?", + 'title': "Effect Size ", 'type': "natural-minus-na", 'data': "additive", - 'width': "85px", + 'width': "90px", 'orderSequence': [ "desc", "asc"] }{% elif dataset.type == 'Publish' %}, { @@ -400,7 +402,7 @@ 'orderSequence': [ "desc", "asc"] }, { - 'title': "High P ?", + 'title': "High P ", 'type': "natural-minus-na", 'data': "lrs_score", 'width': "80px", @@ -413,7 +415,7 @@ 'data': "lrs_location" }, { - 'title': "Effect Size ?", + 'title': "Effect Size ", 'type': "natural-minus-na", 'width': "120px", 'data': "additive", @@ -470,5 +472,4 @@ }); -{% endblock %} - +{% endblock %} \ No newline at end of file -- cgit v1.2.3 From 61d5c7710d044c507d69e600e04a40067cf7320f Mon Sep 17 00:00:00 2001 From: Alexanderlacuna Date: Mon, 16 Nov 2020 20:23:21 
+0300 Subject: update refactored natural_sort function in show_trait/SampleList.py --- wqflask/wqflask/show_trait/SampleList.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wqflask/wqflask/show_trait/SampleList.py b/wqflask/wqflask/show_trait/SampleList.py index 191c29bd..349f6b65 100644 --- a/wqflask/wqflask/show_trait/SampleList.py +++ b/wqflask/wqflask/show_trait/SampleList.py @@ -112,7 +112,7 @@ class SampleList(object): self.attributes[key].name = name self.attributes[key].distinct_values = [ item.Value for item in values] - natural_sort(self.attributes[key].distinct_values) + self.attributes[key].distinct_values=natural_sort(self.attributes[key].distinct_values) all_numbers = True for value in self.attributes[key].distinct_values: try: -- cgit v1.2.3 From 4ab6c9e9744e3a681a885599cddfeebde513ed2a Mon Sep 17 00:00:00 2001 From: zsloan Date: Wed, 18 Nov 2020 12:31:50 -0600 Subject: Fixed "back_to_collections" function in get_traits_from_collection.js (function that lets you return from viewing a collection to viewing the list of collections when selecting cofactors for scatterplot) --- wqflask/wqflask/static/new/javascript/get_traits_from_collection.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wqflask/wqflask/static/new/javascript/get_traits_from_collection.js b/wqflask/wqflask/static/new/javascript/get_traits_from_collection.js index 4ec62157..8f6f389f 100644 --- a/wqflask/wqflask/static/new/javascript/get_traits_from_collection.js +++ b/wqflask/wqflask/static/new/javascript/get_traits_from_collection.js @@ -397,8 +397,8 @@ process_traits = function(trait_data, textStatus, jqXHR) { }; back_to_collections = function() { - collection_list_html = $('#collection_list_html').html() - $("#collections_holder").html(collection_list_html); + console.log("collection_list:", collection_list); + $("#collections_holder").html(collection_list); $(document).on("click", ".collection_line", collection_click); return 
$('#collections_holder').colorbox.resize(); }; -- cgit v1.2.3 From 2256312c8d2fdf11b78c894bdf030f640e6a2158 Mon Sep 17 00:00:00 2001 From: zsloan Date: Wed, 18 Nov 2020 12:33:48 -0600 Subject: Removed console.log and commented out statements from get_traits_from_collection.js --- .../new/javascript/get_traits_from_collection.js | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/wqflask/wqflask/static/new/javascript/get_traits_from_collection.js b/wqflask/wqflask/static/new/javascript/get_traits_from_collection.js index 8f6f389f..a55ab356 100644 --- a/wqflask/wqflask/static/new/javascript/get_traits_from_collection.js +++ b/wqflask/wqflask/static/new/javascript/get_traits_from_collection.js @@ -2,10 +2,6 @@ var add_trait_data, assemble_into_json, back_to_collections, collection_click, collection_list, color_by_trait, create_trait_data_csv, get_this_trait_vals, get_trait_data, process_traits, selected_traits, submit_click, this_trait_data, trait_click, __indexOf = [].indexOf || function(item) { for (var i = 0, l = this.length; i < l; i++) { if (i in this && this[i] === item) return i; } return -1; }; -console.log("before get_traits_from_collection"); - -//collection_list = null; - this_trait_data = null; selected_traits = {}; @@ -69,7 +65,7 @@ if ( ! 
$.fn.DataTable.isDataTable( '#collection_table' ) ) { collection_click = function() { var this_collection_url; - //console.log("Clicking on:", $(this)); + this_collection_url = $(this).find('.collection_name').prop("href"); this_collection_url += "&json"; collection_list = $("#collections_holder").html(); @@ -87,9 +83,7 @@ submit_click = function() { $('#collections_holder').find('input[type=checkbox]:checked').each(function() { var this_dataset, this_trait, this_trait_url; this_trait = $(this).parents('tr').find('.trait').text(); - console.log("this_trait is:", this_trait); this_dataset = $(this).parents('tr').find('.dataset').text(); - console.log("this_dataset is:", this_dataset); this_trait_url = "/trait/get_sample_data?trait=" + this_trait + "&dataset=" + this_dataset; return $.ajax({ dataType: "json", @@ -147,7 +141,7 @@ create_trait_data_csv = function(selected_traits) { } all_vals.push(this_trait_vals); } - console.log("all_vals:", all_vals); + trait_vals_csv = trait_names.join(","); trait_vals_csv += "\n"; for (index = _k = 0, _len2 = samples.length; _k < _len2; index = ++_k) { @@ -168,7 +162,7 @@ create_trait_data_csv = function(selected_traits) { trait_click = function() { var dataset, this_trait_url, trait; - console.log("Clicking on:", $(this)); + trait = $(this).parent().find('.trait').text(); dataset = $(this).parent().find('.dataset').text(); this_trait_url = "/trait/get_sample_data?trait=" + trait + "&dataset=" + dataset; @@ -182,7 +176,6 @@ trait_click = function() { trait_row_click = function() { var dataset, this_trait_url, trait; - console.log("Clicking on:", $(this)); trait = $(this).find('.trait').text(); dataset = $(this).find('.dataset').data("dataset"); this_trait_url = "/trait/get_sample_data?trait=" + trait + "&dataset=" + dataset; @@ -256,7 +249,6 @@ populate_cofactor_info = function(trait_info) { get_trait_data = function(trait_data, textStatus, jqXHR) { var sample, samples, this_trait_vals, trait_sample_data, vals, _i, _len; 
trait_sample_data = trait_data[1]; - console.log("IN GET TRAIT DATA") if ( $('input[name=allsamples]').length ) { samples = $('input[name=allsamples]').val().split(" "); } else { @@ -362,13 +354,11 @@ assemble_into_json = function(this_trait_vals) { }; color_by_trait = function(trait_sample_data, textStatus, jqXHR) { - console.log('in color_by_trait:', trait_sample_data); return root.bar_chart.color_by_trait(trait_sample_data); }; process_traits = function(trait_data, textStatus, jqXHR) { var the_html, trait, _i, _len; - console.log('in process_traits with trait_data:', trait_data); the_html = ""; the_html += " "; @@ -397,13 +387,11 @@ process_traits = function(trait_data, textStatus, jqXHR) { }; back_to_collections = function() { - console.log("collection_list:", collection_list); $("#collections_holder").html(collection_list); $(document).on("click", ".collection_line", collection_click); return $('#collections_holder').colorbox.resize(); }; -console.log("inside get_traits_from_collection"); $(".collection_line").on("click", collection_click); $("#submit").on("click", submit_click); if ($('#scatterplot2').length){ -- cgit v1.2.3 From 7a2a59f3d225f94bda5a9bc51a958d3203ea690c Mon Sep 17 00:00:00 2001 From: zsloan Date: Wed, 18 Nov 2020 12:48:43 -0600 Subject: Fixed issue that was causing scatterplot cofactors to not work if the cofactors were genotypes (because it tried to get the "description_display" which doesn't exist for genotype traits) --- wqflask/base/trait.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/wqflask/base/trait.py b/wqflask/base/trait.py index cfc02f8b..7763dbe8 100644 --- a/wqflask/base/trait.py +++ b/wqflask/base/trait.py @@ -265,11 +265,12 @@ def get_sample_data(): trait_dict['species'] = trait_ob.dataset.group.species trait_dict['url'] = url_for( 'show_trait_page', trait_id=trait, dataset=dataset) - trait_dict['description'] = trait_ob.description_display if trait_ob.dataset.type == "ProbeSet": trait_dict['symbol'] = 
trait_ob.symbol trait_dict['location'] = trait_ob.location_repr + trait_dict['description'] = trait_ob.description_display elif trait_ob.dataset.type == "Publish": + trait_dict['description'] = trait_ob.description_display if trait_ob.pubmed_id: trait_dict['pubmed_link'] = trait_ob.pubmed_link trait_dict['pubmed_text'] = trait_ob.pubmed_text -- cgit v1.2.3 From 56445de585552ecb60c62d608f510f01fabc454b Mon Sep 17 00:00:00 2001 From: zsloan Date: Wed, 18 Nov 2020 13:44:38 -0600 Subject: Added some jquery closing the "Add To Collection" colorbox window after its form is submitted --- wqflask/wqflask/templates/collections/add.html | 3 +++ 1 file changed, 3 insertions(+) diff --git a/wqflask/wqflask/templates/collections/add.html b/wqflask/wqflask/templates/collections/add.html index 62b6abb5..b4e5385b 100644 --- a/wqflask/wqflask/templates/collections/add.html +++ b/wqflask/wqflask/templates/collections/add.html @@ -50,4 +50,7 @@ -- cgit v1.2.3 From 518bdbd8f956596e1cee189fe026a71863156fac Mon Sep 17 00:00:00 2001 From: zsloan Date: Wed, 18 Nov 2020 13:48:46 -0600 Subject: Changed default Low/High colors for the correlation scatterplot cofactor coloring to Blue/Red respectively (was originally Light Grey/Black) --- wqflask/wqflask/templates/corr_scatterplot.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wqflask/wqflask/templates/corr_scatterplot.html b/wqflask/wqflask/templates/corr_scatterplot.html index 1fd5cd15..1133fcd2 100644 --- a/wqflask/wqflask/templates/corr_scatterplot.html +++ b/wqflask/wqflask/templates/corr_scatterplot.html @@ -81,9 +81,9 @@ -- cgit v1.2.3 From 6ed037083f0b2bac95021e5fb00c0c8877422a47 Mon Sep 17 00:00:00 2001 From: zsloan Date: Wed, 18 Nov 2020 14:06:27 -0600 Subject: Fixed issue where the cofactor trait descriptions didn't work correctly for genotype/snp cofactors (the code previously only accounted for probeset/phenotype cofactors, so it was treating genotypes/snps like phenotype traits) --- 
wqflask/base/trait.py | 2 ++ .../new/javascript/get_traits_from_collection.js | 24 ++++++++++++++++------ 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/wqflask/base/trait.py b/wqflask/base/trait.py index 7763dbe8..0f8f937c 100644 --- a/wqflask/base/trait.py +++ b/wqflask/base/trait.py @@ -274,6 +274,8 @@ def get_sample_data(): if trait_ob.pubmed_id: trait_dict['pubmed_link'] = trait_ob.pubmed_link trait_dict['pubmed_text'] = trait_ob.pubmed_text + else: + trait_dict['location'] = trait_ob.location_repr return json.dumps([trait_dict, {key: value.value for key, value in list( diff --git a/wqflask/wqflask/static/new/javascript/get_traits_from_collection.js b/wqflask/wqflask/static/new/javascript/get_traits_from_collection.js index a55ab356..626357d4 100644 --- a/wqflask/wqflask/static/new/javascript/get_traits_from_collection.js +++ b/wqflask/wqflask/static/new/javascript/get_traits_from_collection.js @@ -201,13 +201,17 @@ populate_cofactor_info = function(trait_info) { if (trait_info['type'] == "ProbeSet"){ $('#cofactor1_trait_link').text(trait_info['species'] + " " + trait_info['group'] + " " + trait_info['tissue'] + " " + trait_info['db'] + ": " + trait_info['name']) $('#cofactor1_description').text("[" + trait_info['symbol'] + " on " + trait_info['location'] + " Mb]\n" + trait_info['description']) - } else { + } else if (trait_info['type'] == "Publish") { $('#cofactor1_trait_link').text(trait_info['species'] + " " + trait_info['group'] + " " + trait_info['db'] + ": " + trait_info['name']) if ('pubmed_link' in trait_info) { $('#cofactor1_description').html('PubMed: ' + trait_info['pubmed_text'] + '
    ' + trait_info['description']) } else { - $('#cofactor1_description').html('PubMed: ' + trait_info['pubmed_text'] + '
    ' + trait_info['description']) + $('#cofactor1_trait_link').text(trait_info['species'] + " " + trait_info['group'] + " " + trait_info['db'] + ": " + trait_info['name']) + $('#cofactor1_description').text("[" + trait_info['name'] + " on " + trait_info['location'] + " Mb]\n" + trait_info['description']) } + } else { + $('#cofactor1_trait_link').text(trait_info['species'] + " " + trait_info['group'] + " " + trait_info['db'] + ": " + trait_info['name']) + $('#cofactor1_description').text("[" + trait_info['name'] + " on " + trait_info['location'] + " Mb]\n") } $('#select_cofactor1').text("Change Cofactor 1"); $('#cofactor1_info_container').css("display", "inline"); @@ -217,13 +221,17 @@ populate_cofactor_info = function(trait_info) { if (trait_info['type'] == "ProbeSet"){ $('#cofactor2_trait_link').text(trait_info['species'] + " " + trait_info['group'] + " " + trait_info['tissue'] + " " + trait_info['db'] + ": " + trait_info['name']) $('#cofactor2_description').text("[" + trait_info['symbol'] + " on " + trait_info['location'] + " Mb]\n" + trait_info['description']) - } else { + } else if (trait_info['type'] == "Publish") { $('#cofactor2_trait_link').text(trait_info['species'] + " " + trait_info['group'] + " " + trait_info['db'] + ": " + trait_info['name']) if ('pubmed_link' in trait_info) { $('#cofactor2_description').html('PubMed: ' + trait_info['pubmed_text'] + '
    ' + trait_info['description']) } else { - $('#cofactor2_description').html('PubMed: ' + trait_info['pubmed_text'] + '
    ' + trait_info['description']) + $('#cofactor2_trait_link').text(trait_info['species'] + " " + trait_info['group'] + " " + trait_info['db'] + ": " + trait_info['name']) + $('#cofactor2_description').text("[" + trait_info['name'] + " on " + trait_info['location'] + " Mb]\n" + trait_info['description']) } + } else { + $('#cofactor2_trait_link').text(trait_info['species'] + " " + trait_info['group'] + " " + trait_info['db'] + ": " + trait_info['name']) + $('#cofactor2_description').text("[" + trait_info['name'] + " on " + trait_info['location'] + " Mb]\n") } $('#select_cofactor2').text("Change Cofactor 2"); $('#cofactor2_info_container').css("display", "inline"); @@ -233,13 +241,17 @@ populate_cofactor_info = function(trait_info) { if (trait_info['type'] == "ProbeSet"){ $('#cofactor3_trait_link').text(trait_info['species'] + " " + trait_info['group'] + " " + trait_info['tissue'] + " " + trait_info['db'] + ": " + trait_info['name']) $('#cofactor3_description').text("[" + trait_info['symbol'] + " on " + trait_info['location'] + " Mb]\n" + trait_info['description']) - } else { + } else if (trait_info['type'] == "Publish") { $('#cofactor3_trait_link').text(trait_info['species'] + " " + trait_info['group'] + " " + trait_info['db'] + ": " + trait_info['name']) if ('pubmed_link' in trait_info) { $('#cofactor3_description').html('PubMed: ' + trait_info['pubmed_text'] + '
    ' + trait_info['description']) } else { - $('#cofactor3_description').html('PubMed: ' + trait_info['pubmed_text'] + '
    ' + trait_info['description']) + $('#cofactor3_trait_link').text(trait_info['species'] + " " + trait_info['group'] + " " + trait_info['db'] + ": " + trait_info['name']) + $('#cofactor3_description').text("[" + trait_info['name'] + " on " + trait_info['location'] + " Mb]\n" + trait_info['description']) } + } else { + $('#cofactor3_trait_link').text(trait_info['species'] + " " + trait_info['group'] + " " + trait_info['db'] + ": " + trait_info['name']) + $('#cofactor3_description').text("[" + trait_info['name'] + " on " + trait_info['location'] + " Mb]\n") } $('#select_cofactor3').text("Change Cofactor 3"); $('#cofactor3_info_container').css("display", "inline"); -- cgit v1.2.3 From 6cc806e65bee5652bbe761c10079017a5b44a160 Mon Sep 17 00:00:00 2001 From: zsloan Date: Thu, 19 Nov 2020 14:38:04 -0600 Subject: Removed lines that check that all traits are part of the same group, since it might be the case in the future that different groups still share sample names (and it's not really necessary since we check how many samples are shared between each individual pair of traits) --- .../wqflask/correlation_matrix/show_corr_matrix.py | 210 ++++++++++----------- 1 file changed, 101 insertions(+), 109 deletions(-) diff --git a/wqflask/wqflask/correlation_matrix/show_corr_matrix.py b/wqflask/wqflask/correlation_matrix/show_corr_matrix.py index a394f548..0269ce68 100644 --- a/wqflask/wqflask/correlation_matrix/show_corr_matrix.py +++ b/wqflask/wqflask/correlation_matrix/show_corr_matrix.py @@ -55,11 +55,7 @@ class CorrelationMatrix(object): self.do_PCA = True this_group = self.trait_list[0][1].group.name #ZS: Getting initial group name before verifying all traits are in the same group in the following loop for trait_db in self.trait_list: - if trait_db[1].group.name != this_group: - self.insufficient_shared_samples = True - break - else: - this_group = trait_db[1].group.name + this_group = trait_db[1].group.name this_trait = trait_db[0] self.traits.append(this_trait) 
this_sample_data = this_trait.data @@ -68,119 +64,115 @@ class CorrelationMatrix(object): if sample not in self.all_sample_list: self.all_sample_list.append(sample) - if self.insufficient_shared_samples: - pass - else: - self.sample_data = [] - for trait_db in self.trait_list: - this_trait = trait_db[0] - this_sample_data = this_trait.data + self.sample_data = [] + for trait_db in self.trait_list: + this_trait = trait_db[0] + this_sample_data = this_trait.data - this_trait_vals = [] - for sample in self.all_sample_list: - if sample in this_sample_data: - this_trait_vals.append(this_sample_data[sample].value) - else: - this_trait_vals.append('') - self.sample_data.append(this_trait_vals) - - if len(this_trait_vals) < len(self.trait_list): #Shouldn't do PCA if there are more traits than observations/samples - self.do_PCA = False - - self.lowest_overlap = 8 #ZS: Variable set to the lowest overlapping samples in order to notify user, or 8, whichever is lower (since 8 is when we want to display warning) - - self.corr_results = [] - self.pca_corr_results = [] - self.shared_samples_list = self.all_sample_list - for trait_db in self.trait_list: - this_trait = trait_db[0] - this_db = trait_db[1] - - this_db_samples = this_db.group.all_samples_ordered() - this_sample_data = this_trait.data - - corr_result_row = [] - pca_corr_result_row = [] - is_spearman = False #ZS: To determine if it's above or below the diagonal - for target in self.trait_list: - target_trait = target[0] - target_db = target[1] - target_samples = target_db.group.all_samples_ordered() - target_sample_data = target_trait.data - - this_trait_vals = [] - target_vals = [] - for index, sample in enumerate(target_samples): - if (sample in this_sample_data) and (sample in target_sample_data): - sample_value = this_sample_data[sample].value - target_sample_value = target_sample_data[sample].value - this_trait_vals.append(sample_value) - target_vals.append(target_sample_value) - else: - if sample in 
self.shared_samples_list: - self.shared_samples_list.remove(sample) - - this_trait_vals, target_vals, num_overlap = corr_result_helpers.normalize_values(this_trait_vals, target_vals) - - if num_overlap < self.lowest_overlap: - self.lowest_overlap = num_overlap - if num_overlap < 2: - corr_result_row.append([target_trait, 0, num_overlap]) - pca_corr_result_row.append(0) - else: - pearson_r, pearson_p = scipy.stats.pearsonr(this_trait_vals, target_vals) - if is_spearman == False: - sample_r, sample_p = pearson_r, pearson_p - if sample_r == 1: - is_spearman = True - else: - sample_r, sample_p = scipy.stats.spearmanr(this_trait_vals, target_vals) - - corr_result_row.append([target_trait, sample_r, num_overlap]) - pca_corr_result_row.append(pearson_r) - - self.corr_results.append(corr_result_row) - self.pca_corr_results.append(pca_corr_result_row) - - self.trait_data_array = [] - for trait_db in self.trait_list: - this_trait = trait_db[0] - this_db = trait_db[1] - this_db_samples = this_db.group.all_samples_ordered() - this_sample_data = this_trait.data + this_trait_vals = [] + for sample in self.all_sample_list: + if sample in this_sample_data: + this_trait_vals.append(this_sample_data[sample].value) + else: + this_trait_vals.append('') + self.sample_data.append(this_trait_vals) + + if len(this_trait_vals) < len(self.trait_list): #Shouldn't do PCA if there are more traits than observations/samples + self.do_PCA = False + + self.lowest_overlap = 8 #ZS: Variable set to the lowest overlapping samples in order to notify user, or 8, whichever is lower (since 8 is when we want to display warning) + + self.corr_results = [] + self.pca_corr_results = [] + self.shared_samples_list = self.all_sample_list + for trait_db in self.trait_list: + this_trait = trait_db[0] + this_db = trait_db[1] + + this_db_samples = this_db.group.all_samples_ordered() + this_sample_data = this_trait.data + + corr_result_row = [] + pca_corr_result_row = [] + is_spearman = False #ZS: To determine if 
it's above or below the diagonal + for target in self.trait_list: + target_trait = target[0] + target_db = target[1] + target_samples = target_db.group.all_samples_ordered() + target_sample_data = target_trait.data this_trait_vals = [] - for index, sample in enumerate(this_db_samples): - if (sample in this_sample_data) and (sample in self.shared_samples_list): + target_vals = [] + for index, sample in enumerate(target_samples): + if (sample in this_sample_data) and (sample in target_sample_data): sample_value = this_sample_data[sample].value + target_sample_value = target_sample_data[sample].value this_trait_vals.append(sample_value) - self.trait_data_array.append(this_trait_vals) + target_vals.append(target_sample_value) + else: + if sample in self.shared_samples_list: + self.shared_samples_list.remove(sample) - corr_result_eigen = np.linalg.eig(np.array(self.pca_corr_results)) - corr_eigen_value, corr_eigen_vectors = sortEigenVectors(corr_result_eigen) + this_trait_vals, target_vals, num_overlap = corr_result_helpers.normalize_values(this_trait_vals, target_vals) - groups = [] - for sample in self.all_sample_list: - groups.append(1) - - try: - if self.do_PCA == True: - self.pca_works = "True" - self.pca_trait_ids = [] - pca = self.calculate_pca(list(range(len(self.traits))), corr_eigen_value, corr_eigen_vectors) - self.loadings_array = self.process_loadings() + if num_overlap < self.lowest_overlap: + self.lowest_overlap = num_overlap + if num_overlap < 2: + corr_result_row.append([target_trait, 0, num_overlap]) + pca_corr_result_row.append(0) else: - self.pca_works = "False" - except: - self.pca_works = "False" + pearson_r, pearson_p = scipy.stats.pearsonr(this_trait_vals, target_vals) + if is_spearman == False: + sample_r, sample_p = pearson_r, pearson_p + if sample_r == 1: + is_spearman = True + else: + sample_r, sample_p = scipy.stats.spearmanr(this_trait_vals, target_vals) + + corr_result_row.append([target_trait, sample_r, num_overlap]) + 
pca_corr_result_row.append(pearson_r) - self.js_data = dict(traits = [trait.name for trait in self.traits], - groups = groups, - cols = list(range(len(self.traits))), - rows = list(range(len(self.traits))), - samples = self.all_sample_list, - sample_data = self.sample_data,) - # corr_results = [result[1] for result in result_row for result_row in self.corr_results]) + self.corr_results.append(corr_result_row) + self.pca_corr_results.append(pca_corr_result_row) + + self.trait_data_array = [] + for trait_db in self.trait_list: + this_trait = trait_db[0] + this_db = trait_db[1] + this_db_samples = this_db.group.all_samples_ordered() + this_sample_data = this_trait.data + + this_trait_vals = [] + for index, sample in enumerate(this_db_samples): + if (sample in this_sample_data) and (sample in self.shared_samples_list): + sample_value = this_sample_data[sample].value + this_trait_vals.append(sample_value) + self.trait_data_array.append(this_trait_vals) + + corr_result_eigen = np.linalg.eig(np.array(self.pca_corr_results)) + corr_eigen_value, corr_eigen_vectors = sortEigenVectors(corr_result_eigen) + + groups = [] + for sample in self.all_sample_list: + groups.append(1) + + try: + if self.do_PCA == True: + self.pca_works = "True" + self.pca_trait_ids = [] + pca = self.calculate_pca(list(range(len(self.traits))), corr_eigen_value, corr_eigen_vectors) + self.loadings_array = self.process_loadings() + else: + self.pca_works = "False" + except: + self.pca_works = "False" + + self.js_data = dict(traits = [trait.name for trait in self.traits], + groups = groups, + cols = list(range(len(self.traits))), + rows = list(range(len(self.traits))), + samples = self.all_sample_list, + sample_data = self.sample_data,) def calculate_pca(self, cols, corr_eigen_value, corr_eigen_vectors): base = importr('base') -- cgit v1.2.3 From 97d3e0907640fd32f9ff49a79f8a453852727f72 Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Fri, 20 Nov 2020 18:45:02 +0300 Subject: Remove glossary markdown 
template * wqflask/wqflask/static/markdown/: Delete folder. Files moved to https://github.com/genenetwork/gn-docs --- wqflask/wqflask/static/markdown/glossary.md | 618 ---------------------------- 1 file changed, 618 deletions(-) delete mode 100644 wqflask/wqflask/static/markdown/glossary.md diff --git a/wqflask/wqflask/static/markdown/glossary.md b/wqflask/wqflask/static/markdown/glossary.md deleted file mode 100644 index db94ae18..00000000 --- a/wqflask/wqflask/static/markdown/glossary.md +++ /dev/null @@ -1,618 +0,0 @@ -# Glossary of Terms and Features - -
    - -[A](#a) | [B](#b) | [C](#c)| [D](#d) | [E](#e) | [F](#f) | [G](#g) | [H](#h) | [I](#i) | [J](#j) | [K](#k) | [L](#l) | [M](#m) | [N](#n) | [O](#o) | [P](#p) | [Q](#q) | [R](#r) | [S](#s) | [T](#t) | [U](#u) | [V](#v) | [W](#w) | [X](#x) | [Y](#y) | [Z](#z) - -You are welcome to cite or reproduce these glossary -definitions. Please cite or link: Author AA. "Insert Glossary Term -Here." From The WebQTL Glossary--A GeneNetwork -Resource. gn1.genenetwork.org/glossary.html - -
    - -## A - -
    - -#### Additive Allele Effect: - -The additive allele effect is an estimate of the change in the average phenotype that would be produced by substituting a single allele of one type with that of another type (e.g., a replaced by A) in a population. In a standard F2 intercross between two inbred parental lines there are two alleles at every polymorphic locus that are often referred to as the little "a" allele and big "A" allele. F2 progeny inherit the a/a, a/A, or A/A genotypes at every genetic locus in a ratio close to 1:2:1. The additive effect is half of the difference between the mean of all cases that are homozygous for one parental allele (aa) compared to the mean of all cases that are homozygous for the other parental allele (AA): - -[(mean of AA cases)-(mean of aa cases)]/2 - -GeneNetwork displays the additive values on the far right of many trait/QTL maps, usually as red or green lines along the maps. The units of measurement of additive effects (and dominance effects) are defined by the trait itself and are shown in **Trait Data and Analysis** windows. For mRNA estimates these units are usually normalized log2 expression values. For this reason an additive effect of 0.5 units indicates that the A/A and a/a genotypes at a particular locus or marker differ by 1 unit (twice the effect of swapping a single A allele for an a allele). On this log2 scale this is equivalent to a 2-fold difference (2 raised to the power of 1). - -On the QTL map plots the polarity of allele effects is represented by the color of the line. For example, in mouse BXD family maps, if the DBA/2J allele produces higher values than the C57BL/6J allele then the additive effect line is colored in green. In contrast, if the C57BL/6J allele produces higher values then the line is colored in red. For computational purposes, C57BL/6J red values are considered negative. - -The dominance effects of alleles are also computed on maps for F2 populations (e.g., B6D2F2 and B6BTBRF2). 
Orange and purple line colors are used to distinguish the polarity of effects. Purple is the positive dominance effect that matches the polarity of the green additive effect, whereas orange is the negative dominance effect that matches the polarity of the red additive effect. [Please also see entry on **Dominance Effects**: Williams RW, Oct 15, 2004; Sept 3, 2005; Dec 4, 2005; Oct 25, 2011] - -[Go back to index](#index) - -
    - -
    - -#### Bootstrap: - -A [bootstrap sample](http://en.wikipedia.org/wiki/Bootstrapping_%28statistics%29) is a randomly drawn sample (or resample) that is taken from the original data set and that has the same number of samples as the original data set. In a single bootstrap sample, some cases will by chance be represented one or more times; other cases may not be represented at all (in other words, the sampling is done "with replacement" after each selection). To get a better intuitive feel for the method, imagine a bag of 26 Scrabble pieces that contain each letter of the English alphabet. In a bootstrap sample of these 26 pieces, you would shake the bag, insert your hand, and draw out one piece. You would then write down that letter on a piece of paper, and then place that Scrabble piece back in the bag in preparation for the next random selection. You would repeat this process (shake, draw, replace) 25 more times to generate a single bootstrap resample of the alphabet. Some letters will be represented several times in each sample and others will not be represented at all. If you repeat this procedure 1000 times you would have a set of bootstrap resamples of the type that GN uses to remap data sets. - -Bootstrap resampling is a method that can be used to estimate statistical parameters and error terms. GeneNetwork uses a bootstrap procedure to evaluate approximate confidence limits of QTL peaks using a method proposed by Peter Visscher and colleagues ([1996](http://www.genetics.org/content/143/2/1013.full.pdf)). We generate 2000 bootstraps, remap each, and keep track of the location of the single locus with the highest LRS score (equivalent to a "letter" in the Scrabble example). The 2000 "best" locations are used to produce the yellow histograms plotted on some of the QTL maps. If the position of a QTL is firm, then the particular composition of the sample will not shift the position of the QTL peak by very much. 
In such a case, the histogram of "best QTLs" (yellow bars in the maps) that is displayed in WebQTL maps will tend to have a sharp peak (the scale is the percentage of bootstrap resamples that fall into each bar of the bootstrap histogram). In contrast, if the yellow bootstrap histograms are spread out along a chromosome, then the precise location of a QTL may not be accurate, even in the original correct data set. Bootstrap results naturally vary between runs due to the random generation of the samples. See the related entry "Frequency of Peak LRS." - -KNOWN PROBLEMS and INTERPRETATION of BOOTSTRAP RESULTS: The reliability of bootstrap analysis of QTL confidence intervals has been criticized by Manichaikul and colleagues ([2006](http://www.genetics.org/cgi/content/full/174/1/481)). Their work applies in particular to standard intercrosses and backcrosses in which markers are spaced every 2 cM. They recommend that confidence intervals be estimated either by a conventional 1.5 to 2.0 LOD drop-off interval or by a Bayes credible interval method. - -There is a known flaw in the way in which GeneNetwork displays bootstrap results (Sept 2011). If a map has two or more adjacent markers with identical LOD scores and identical strain distribution patterns, all of the bootstrap results are assigned incorrectly to just one of the "twin" markers. This results in a false perception of precision. - -QTL mapping methods can be highly sensitive to cases with very high or very low phenotype values (outliers). The bootstrap method does not provide protection against the effects of outliers and their effects on QTL maps. Make sure you review your data for outliers before mapping. Options include (1) Do nothing, (2) Delete the outliers and see what happens to your maps, (3) [Winsorize](http://en.wikipedia.org/wiki/Winsorising) the values of the outliers. You might try all three options and determine if your main results are stable or not. 
With small samples or extreme outliers, you may find the mapping results to be quite volatile. In general, if the results (QTL position or value) depend highly on one or two outliers (5-10% of the samples) then you should probably delete or winsorize the outliers. [Williams RW, Oct 15, 2004, Mar 15, 2008, Mar 26, 2008; Sept 2011] - -[Go back to index](#index) - -
    - -#### CEL and DAT Files (Affymetrix): - -Array data begin as raw image files that are generated using a confocal microscope and video system. Affymetrix refers to these image data files as DAT files. The DAT image needs to be registered to a template that assigns pixel values to expected array coordinates (cells). The result is an assignment of a set of image intensity values (pixel intensities) to each probe. For example, each cell/probe value generated using Affymetrix arrays is associated with approximately 36 pixels (a 6x6 set of pixels, usually with an effective 11 or 12-bit range of intensity). Affymetrix uses a method that simply ranks the values of these pixels and picks as the "representative value" the pixel that has rank 24 from low to high. The range of variation in intensity among these ranked pixels provides a way to estimate the error of the estimate. The Affymetrix CEL files therefore consist of XY coordinates, the consensus value, and an error term. [Williams RW, April 30, 2005] - -#### Cluster Map or QTL Cluster Map: - -Cluster maps are sets of QTL maps for a group of traits. The QTL maps for the individual traits (up to 100) are run side by side to enable easy detection of common and unique QTLs. Traits are clustered along one axis of the map by phenotypic similarity (hierarchical clustering) using the Pearson product-moment correlation r as a measurement of similarity (we plot 1-r as the distance). Traits that are positively correlated will be located near to each other. The genome location is shown along the other, long axis of the cluster map, marker by marker, from Chromosome 1 to Chromosome X. Colors are used to encode the probability of linkage, as well as the additive effect polarity of alleles at each marker. These QTL maps are computed using the fast Marker Regression algorithm. P values for each trait are computed by permuting each trait 1000 times. 
Cluster maps could be considered trait gels because each lane is loaded with a trait that is run out along the genome. Cluster maps are a unique feature of the GeneNetwork developed by Elissa Chesler and implemented in WebQTL by J Wang and RW Williams, April 2004. [Williams RW, Dec 23, 2004, rev June 15, 2006 RWW]. - -#### Collections and Trait Collections: - -One of the most powerful features of GeneNetwork (GN) is the ability to study large sets of traits that have been measured using a common genetic reference population or panel (GRP). This is one of the key requirements of systems genetics--many traits studied in common. Under the main GN menu **Search** heading you will see a link to **Trait Collections**. You can assemble you own collection for any GRP by simply adding items using the Add to Collection button that you will find in many windows. Once you have a collection you will have access to a new set of tools for analysis of your collection, including **QTL Cluster Map, Network Graph, Correlation Matrix**, and **Compare Correlates**. [Williams RW, April 7, 2006] - -#### Complex Trait Analysis: - -Complex trait analysis is the study of multiple causes of variation of phenotypes within species. Essentially all traits that vary within a population are modulated by a set of genetic and environmental factors. Finding and characterizing the multiple genetic sources of variation is referred to as "genetic dissection" or "QTL mapping." In comparison, complex trait analysis has a slightly broader focus and includes the analysis of the effects of environmental perturbation, and gene-by-environment interactions on phenotypes; the "norm of reaction." Please also see the glossary term "Systems Genetics." [Williams RW, April 12, 2005] - -#### Composite Interval Mapping: - -Composite interval mapping is a method of mapping chromosomal regions that controls for some fraction of the genetic variability in a quantitative trait. 
Unlike simple interval mapping, composite interval mapping usually controls for variation produced at one or more background marker loci. These background markers are generally chosen because they are already known to be close to the location of a significant QTL. By factoring out a portion of the genetic variance produced by a major QTL, one can occasionally detect secondary QTLs. WebQTL allows users to control for a single background marker. To select this marker, first run the **Marker Regression** analysis (and if necessary, check the box labeled display all LRS), select the appropriate locus, and then click on either **Composite Interval Mapping** or **Composite Regression**. A more powerful and effective alternative to composite interval mapping is pair-scan analysis. This latter method takes into account (models) both the independent effects of two loci and possible two-locus epistatic interactions. [Williams RW, Dec 20, 2004] - -
    - -#### Correlations: Pearson and Spearman: - -GeneNetwork provides tools to compute both Pearson product-moment correlations (the standard type of correlation) and Spearman rank order correlations. [Wikipedia](http://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient) and introductory statistics texts will have a discussion of these major types of correlation. The quick advice is to use the more robust Spearman rank order correlation if the number of pairs of observations in a data set is less than about 30 and to use the more powerful but much more sensitive Pearson product-moment correlation when the number of observations is greater than 30 AND after you have dealt with any outliers. GeneNetwork automatically flags outliers for you in the Trait Data and Analysis form. GeneNetwork also allows you to modify values by either deleting or winsorising them. That means that you can use Pearson correlations even with smaller sample sizes after making sure that data are well distributed. Be sure to view the scatterplots associated with correlation values (just click on the value to generate a plot). Look for bivariate outliers. - -#### Cross: - -The term Cross refers to a group of offspring made by mating (crossing) one strain with another strain. There are several types of crosses including intercrosses, backcrosses, advanced intercrosses, and recombinant inbred intercrosses. Genetic crosses are almost always started by mating two different but fully inbred strains to each other. For example, a B6D2F2 cross is made by breeding C57BL/6J females (B6 or B for short) with DBA/2J males (D2 or D) and then intercrossing their F1 progeny to make the second filial generation (F2). By convention the female is always listed first in cross nomenclature; B6D2F2 and D2B6F2 are therefore so-called reciprocal F2 intercrosses (B6D2F1 females to B6D2F1 males or D2B6F1 females to D2B6F1 males). 
A cross may also consist of a set of recombinant inbred (RI) strains such as the BXD strains, that are actually inbred progeny of a set of B6D2F2s. Crosses can be thought of as a method to randomize the assignment of blocks of chromosomes and genetic variants to different individuals or strains. This random assignment is a key feature in testing for causal relations. The strength with which one can assert that a causal relation exists between a chromosomal location and a phenotypic variant is measured by the LOD score or the LRS score (they are directly convertible, where LOD = LRS/4.61) [Williams RW, Dec 26, 2004; Dec 4, 2005]. - -[Go back to index](#index) - -
    - -#### Dominance Effects: - -The term dominance indicates that the phenotype of intercross progeny closely resemble one of the two parental lines, rather than having an intermediate phenotype. Geneticists commonly refer to an allele as having a dominance effect or dominance deviation on a phenotype. Dominance deviations at a particular marker are calculated as the difference between the average phenotype of all cases that have the Aa genotype at that marker and the expected value half way between all cases that have the aa genotype and the AA genotype. For example, if the average phenotype value of 50 individuals with the aa genotype is 10 units whereas that of 50 individuals with the AA genotype is 20 units, then we would expect the average of 100 cases with the Aa genotype to be 15 units. We are assuming a linear and perfectly additive model of how the a and A alleles interact. If these 100 Aa cases actually have a mean of 11 units, then this additive model would be inadequate. A non-linear dominance term is now needed. In this case the low a allele is almost perfectly dominant (or semi-dominant) and the dominance deviation is -4 units. - -The dominance effects are computed at each location on the maps generated by the WebQTL module for F2 populations (e.g., B6D2F2 and B6BTBRF2). Orange and purple line colors are used to distinguish the polarity of the dominance effects. Purple is the positive dominance effect that matches the polarity of the green additive effect, whereas orange is the negative dominance effect that matches the polarity of the red additive effect. - -Note that dominance deviations cannot be computed from a set of recombinant inbred strains because there are only two classes of genotypes at any marker (aa and AA, more usually written AA and BB). However, when data for F1 hybrids are available one can estimate the dominance of the trait. 
This global phenotypic dominance has almost nothing to do with the dominance deviation at a single marker in the genome. In other words, the dominance deviation detected at a single marker may be reversed or neutralized by the action of many other polymorphic genes. [Williams RW, Dec 21, 2004; Sept 3, 2005] - -[Go back to index](#index) - -
    - -#### Epistasis: - -Epistasis means that combined effects of two or more different loci or polymorphic genes are not what one would expect given the addition of their individual effects. There is, in other words, evidence for non-linear interactions among two or more loci. This is similar to the dominance effects mentioned above, but now generalized to two or more distinct loci, rather than to two or more alleles at a single locus. For example, if QTL 1 has an A allele that has an additive effect of +5 and QTL 2 has an A allele that has an additive effect of +2, then the two locus genotype combination A/A would be expected to boost the mean by +7 units. But if the value of these A/A individuals was actually -7 we would be quite surprised and would refer to this as an epistatic interaction between QTL 1 and QTL 2. WebQTL will search for all possible epistatic interactions between pairs of loci in the genome. This function is called a **Pair Scan** because the software analyzes the LRS score for all possible pairs of loci. Instead of viewing an LRS plot along a single dimension, we now view a two-dimensional plot that shows a field of LRS scores computed for pairs of loci. Pair scan plots are extremely sensitive to outlier data. Be sure to review the primary data carefully using **Basic Statistics**. Also note that this more sophisticated method also demands a significantly larger sample size. While 25 to 50 cases may be adequate for a conventional LRS plot (sometimes called a "main scan"), a **Pair-Scan** is hard to apply safely with fewer than 60 cases. [Williams RW, Dec 21, 2004; Dec 5, 2005] - -#### Effect Size of a QTL: - -QTLs can be ranked by the amount of variance that they explain--their so-called "effect size"--when they are included in a statistical model. The concept of a genetic model may seem odd to some users of GeneNetwork. A model is just an explicit hypothesis of how QTLs and other factors cause variation in a trait. 
QTL mapping involves comparisons of the explanatory power of different models. Effect sizes can be measured in different units including (1) the percentage of total or genetic variance that is explained by adding the QTL into the model, (2) the mean shift in Z score, or (3) the additive effect size expressed in the original measurement scale. Effects of single QTLs are often dependent on genetic background (i.e., other QTLs and their interactions) and on the numbers and types of cases used in a study. For example, the variance explained is influenced strongly by whether the sample are from a family cohort, a case-control cohort, a group of fully inbred strains such as recombinant inbred lines, an outcross or backcross population. - -Please note that the functional importance of a locus, QTL, or GWAS hit can not be predicted by the size of its effect on the trait in one environment, at one stage of development, and in one population. Estimates of the effect size of QTLs are usually both noisy and upwardly biased (overestimated), and both of these problems are particularly acute when sample sizes are small. - -Estimates of effect size for families of inbred lines, such as the BXD, HXB, CC, and hybrid diversity panels (e.g. the hybrid mouse diversity panel and the hybrid rat diversity panel) are typically (and correctly) much higher than those measured in otherwise similar analysis of intercrosses, heterogeneous stock (HS), or diversity outbred stock. Two factors contribute to the much higher level of explained variance of QTLs when using inbred strain panels. - - -1. **Replication Rate:** The variance that can be explained by a locus is increased by sampling multiple cases that have identical genomes and by using the strain mean for genetic analysis. Increasing replication rates from 1 to 6 can easily double the apparent heritability of a trait and therefore the effect size of a locus. 
The reason is simple—resampling decreases the standard error of the mean, boosting the effective heritability (see Glossary entry on *Heritability* and focus on figure 1 from the Belknap [1998](http://gn1.genenetwork.org/images/upload/Belknap_Heritability_1998.pdf) paper reproduced below).
    Compare the genetically explained variance (labeled h2RI in this figure) of a single case (no replication) on the x-axis with the function at a replication rate of 4 on the y-axis. If the explained variance is 0.1 (10% of all variance explained) then the value is boosted to 0.3 (30% of strain mean variance explained) with n = 4. - -2. **Homozygosity:** The second factor has to do with the inherent genetic variance of populations. Recombinant inbred lines are homozygous at nearly all loci. This doubles the genetic variance in a family of recombinant inbred lines compared to a matched number of F2s. This also quadruples the variance compared to a matched number of backcross cases. As a result 40 BXDs sampled just one per genometype will average 2X the genetic variance and 2X the heritability of 40 BDF2 cases. Note that panels made up of isogenic F1 hybrids (so-called diallel crosses, DX) made by crossing recombinant inbred strains (BXD, CC, or HXB) are no longer homozygous at all loci, and while they do expose important new sources of variance associated with dominance, they do not benefit from the 2X gain in genetic variance relative to an F2 intercross. - -Homozygosity - -For the reasons listed above a QTL effect size of 0.4 detected in a panel of BXD lines replicated four times each (160 cases total), corresponds approximately to an effect size of 0.18 in BXDs without replication (40 cases total), and to an effect size of 0.09 in an F2 of 40 cases total. [Williams RW, Dec 23, 2004; updated by RWW July 13, 2019] - -#### eQTL, cis eQTL, trans eQTL - -An expression QTL or eQTL. Differences in the expression of mRNA or proteins are often treated as standard phenotypes, much like body height or lung capacity. The variation in these microscopic traits (microtraits) can be mapped using conventional QTL methods. 
[Damerval](http://www.genetics.org/cgi/reprint/137/1/289) and colleagues were the first authors to use this kind of nomenclature and in their classic study of 1994 introduced the term PQLs for protein quantitative trait loci. Schadt and colleagues added the acronym eQTL in their early mRNA study of corn, mouse, and humans. We now are "blessed" with all kinds of prefixes to QTLs that highlight the type of trait that has been measured (m for metabolic, b for behavioral, p for physiological or protein). - -eQTLs of mRNAs and proteins have the unique property of (usually) having a single parent gene and genetic location. An eQTL that maps to the location of the parent gene that produces the mRNA or protein is referred to as a **cis eQTL** or local eQTL. In contrast, an eQTL that maps far away from its parent gene is referred to as a **trans eQTL**. You can use special search commands in GeneNetwork to find cis and trans eQTLs. [Williams RW, Nov 23, 2009, Dec 2009] - -[Go back to index](#index) - -
    - -## F - -#### Frequency of Peak LRS: - -The height of the yellow bars in some of the Map View windows provides a measure of the confidence with which a trait maps to a particular chromosomal region. WebQTL runs 2000 bootstrap samples of the original data. (A bootstrap sample is a "sample with replacement" of the same size as the original data set in which some samples will by chance be represented one or more times and others will not be represented at all.) For each of these 2000 bootstraps, WebQTL remaps each and keeps track of the location of the single locus with the highest LRS score. These accumulated locations are used to produce the yellow histogram of "best locations." A frequency of 10% means that 200 of 2000 bootstraps had a peak score at this location. If the mapping data are robust (for example, insensitive to the exclusion of any particular case), then the bootstrap bars should be confined to a short chromosomal interval. Bootstrap results will vary slightly between runs due to the random generation of the bootstrap samples. [Williams RW, Oct 15, 2004] - -#### False Discovery Rate (FDR): - -A [false discovery](http://en.wikipedia.org/wiki/False_discovery_rate) is an apparently significant finding--usually determined using a particular P value alpha criterion--that is known to be insignificant or false given other information. When performing a single statistical test we often accept a false discovery rate of 1 in 20 (p = .05). False discovery rates can climb to high levels in large genomic and genetic studies in which hundreds to millions of tests are run and summarized using standard "single test" p values. There are various statistical methods to estimate and control false discovery rate and to compute genome-wide p values that correct for large numbers of implicit or explicit statistical tests. The Permutation test in GeneNetwork is one method that is used to prevent an excessive number of false QTL discoveries. 
Methods used to correct the FDR are approximations and may depend on a set of assumptions about data and sample structure. [Williams RW, April 5, 2008] - -[Go back to index](#index) - -
    - -## G - -#### Genes, GenBankID, UniGeneID, GeneID, LocusID: - -GeneNetwork provides summary information on most of the genes and their transcripts. Genes and their alternative splice variants are often poorly annotated and may not have proper names or symbols. However, almost all entries have a valid GenBank accession identifier. This is a unique code associated with a single sequence deposited in GenBank (Entrez Nucleotide). A single gene may have hundreds of GenBank entries. GenBank entries that share a genomic location and possibly a single gene are generally combined into a single UniGene entry. For mouse, these always begin with "Mm" (Mus musculus) and are followed by a period and then a number. More than half of all mouse UniGene identifiers are associated with a reputable gene, and these genes will have gene identifiers (GeneID). GeneIDs are identical to LocusLink identifiers (LocusID). Even a 10 megabase locus such as human Myopia 4 (MYP4) that is not yet associated with a specific gene is assigned a GeneID--a minor misnomer and one reason to prefer the term LocusID. - -See the related [FAQ](http://gn1.genenetwork.org/faq.html#Q-6) on "How many genes and transcripts are in your databases and what fraction of the genome is being surveyed?" [Williams RW, Dec 23, 2004, updated Jan 2, 2005] - -#### Genetic Reference Population (GRP): - -A genetic reference population consists of a set of genetically well characterized lines that are often used over a long period of time to study a multitude of different phenotypes. Once a GRP has been genotyped, subsequent studies can focus on the analysis of interesting and important phenotypes and their joint and independent relations. Most of the mouse GRPs, such as the BXDs used in the GeneNetwork, have been typed using a common set of over 14,000 markers (SNPs and microsatellites). Many of these same GRPs have been phenotyped extensively for more than 25 years, resulting in rich sets of phenotypes. 
A GRP is an ideal long-term resource for systems genetics because of the relative ease with which vast amounts of diverse data can be accumulated, analyzed, and combined. - -The power of GRPs and their compelling scientific advantages derive from the ability to study multiple phenotypes and substantial numbers of genetically defined individuals under one or more environmental conditions. When accurate phenotypes from 20 or more lines in a GRP have been acquired it becomes practical to explore and test the genetic correlations between that trait and any previously measured trait in the same GRP. This fact underlies the use of the term **reference** in GRP. Since each genetic individual is represented by an entire isogenic line--usually an inbred strain or an isogenic F1 hybrid--it is possible to obtain accurate mean phenotypes associated with each line simply by typing several individuals. GRPs are also ideal for developmental and aging studies because the same genetic individual can be phenotyped at multiple stages. - -A GRP can also be used as a conventional mapping panel. But unlike most other mapping panels, a GRP can be easily adapted to jointly map sets of functionally related traits (multitrait mapping); a more powerful method to extract causal relations from networks of genetic correlations. - -The largest GRPs now consist of more than 400 recombinant inbred lines of *Arabidopsis* and [maize](http://www.maizegdb.org/cgi-bin/stockadvquery.cgi?check=true&name=&typebox=true&type=701&linkage_group=0&genvar1=&genvar2=&genvar3=&karyovar=0&phenotype=0&attribution=&avail_from=0&parent=0). The BayxSha Arabidopsis set in the GeneNetwork consists of 420 lines. Pioneer Hi-Bred International is rumored to have as many as 4000 maize RI lines. The largest mammalian GRPs are the LXS and BXD RI sets in the GeneNetwork. The Collaborative Cross is the largest mammalian GRP, and over 600 of these strains are now being bred by members of the Complex Trait Consortium. 
- -There are several subtypes of GRPs. In addition to recombinant inbred strains there are - - -- Recombinant congenic ([RCC](http://research.jax.org/grs/type/recombcong.html)) strains such as the [AcB](http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=pubmed&dopt=Abstract&list_uids=11374899&query_hl=4) set Consomic or chromosome substitution strains ([CSS](http://research.jax.org/grs/type/consomic.html)) of mice (Matin et al., [1999](http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=pubmed&dopt=Abstract&list_uids=10508525&query_hl=11)) and rats (Roman et al., [2002](http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=pubmed&dopt=Abstract&list_uids=12858554&query_hl=7)) - -- Recombinant intercross ([RIX](http://www.ncbi.nlm.nih.gov/pubmed/?term=15879512)) F1 sets made by mating different RI strains to each other to generate large set of RI first generation (F1) progeny (RIX). This is a standard ([diallel cross](http://en.wikipedia.org/wiki/Diallel_cross)) of RI inbred strains. Genetic analysis of a set of RIX progeny has some advantages over a corresponding analysis of RI strains. The first of these is that while each set of F1 progeny is fully isogenic (AXB1 x AXB2 gives a set of isogenic F1s), these F1s are not inbred but are heterozygous at many loci across the genome. RIX therefore retain the advantage of being genetically defined and replicable, but without the disadvantage of being fully inbred. RIX have a genetic architecture more like natural populations. The second correlated advantage is that it is possible to study patterns of dominance of allelic variants using an RIX cross. Almost all loci or genes that differ between the original stock strains (A and B) will be heterozygous among a sufficiently large set of RIX. A set of RIX progeny can therefore be mapped using the same methods used to map an F2 intercross. Mapping of QTLs may have somewhat more power and precision than when RI strains are used alone. 
A third advantage is that RIX sets make it possible to expand often limited RI resources to very large sizes to confirm and extend models of genetic or GXE effects. For example, a set of 30 AXB strains can be used to generate a full matrix of 30 x 29 unique RIX progeny. The main current disadvantage of RIX panels is the comparative lack of extant phenotype data. - -- Recombinant F1 line sets can also be made by backcrossing an entire RI set to a single inbred line that carries an interesting mutation or transgene (RI backcross or RIB). GeneNetwork includes one RI backcross set generated by Kent Hunter. In this RIB each of 18 AKXD RI strains was crossed to an FVB/N line that carries a tumor susceptibility allele (polyoma middle T). - -All of these sets of lines are GRPs since each line is genetically defined and because the set as a whole can in principle be easily regenerated and phenotyped. Finally, each of these resources can be used to track down genetic loci that are causes of variation in phenotype using variants of standard linkage analysis. - -A Diversity Panel such as that used by the Mouse Phenome Project is not a standard GRP, although it also shares the ability to accumulate and study networks of phenotypes. The main difference is that a Diversity Panel cannot be used for conventional linkage analysis. A sufficiently large Diversity Panel could in principle be used for the equivalent of an association study. However, these are definitely NOT in silico studies, because hundreds of individuals need to be phenotyped for every trait. A survey of many diverse isogenic lines (inbred or F1 hybrids) is statistically the equivalent of a human association study (the main difference is the ability to replicate measurements and study sets of traits) and therefore, like human association studies, does require a very large sample size to map polygenic traits. 
Like human association studies, there is also a high risk of false positive results due to population stratification and non-syntenic marker association. - -A good use of a Diversity Panel is as a fine-mapping resource with which to dissect chromosomal intervals already mapped using a conventional cross or GRP. GeneNetwork now includes Mouse Diversity Panel (MDP) data for several data sets. We now typically include all 16 sequenced strains of mice, and add PWK/PhJ, NZO/HiLtJ (two of the eight members of the Collaborative Cross), and several F1 hybrids. The MDP data is often appended at the bottom of the GRP data set with which it was acquired (e.g., BXD hippocampal and BXD eye data sets). [Williams RW, June 19, 2005; Dec 4, 2005] - -#### Genotype - -The state of a gene or DNA sequence, usually used to describe a contrast between two or more states, such as that between the normal state (wildtype) and a mutant state (mutation) or between the alleles inherited from two parents. All species that are included in GeneNetwork are diploid (derived from two parents) and have two copies of most genes (genes located on the X and Y chromosomes are exceptions). As a result the genotype of a particular diploid individual is actually a pair of genotypes, one from each parent. For example, the offspring of a mating between strain A and strain B will have one copy of the A genotype and one copy of the B genotype and therefore have an A/B genotype. In contrast, offspring of a mating between a female strain A and a male strain A will inherit only A genotypes and have an A/A genotype. - -Genotypes can be measured or inferred in many different ways, even by visual inspection of animals (e.g. as Gregor Mendel did long before DNA was discovered). But now the typical method is to directly test DNA that has a well-defined chromosomal location and that has been obtained from one or usually many cases, using molecular tests that often rely on polymerase chain reaction steps and sequence analysis. 
Each case is genotyped at many chromosomal locations (loci, markers, or genes). The entire collection of genotypes (as many as 1 million for a single case) is also sometimes referred to as the case's genotype, but the word "genometype" might be more appropriate to highlight the fact that we are now dealing with a set of genotypes spanning the entire genome (all chromosomes) of the case. - -For gene mapping purposes, genotypes are often translated from letter codes (A/A, A/B, and B/B) to simple numerical codes that are more suitable for computation. A/A might be represented by the value -1, A/B by the value 0, and B/B by the value +1. This recoding makes it easy to determine if there is a statistically significant correlation between genotypes across a set of cases (for example, an F2 population or a Genetic Reference Panel) and a variable phenotype measured in the same population. A locus at which there is a sufficiently high correlation between genotypes and phenotypes is referred to as a quantitative trait locus (QTL). If the correlation is almost perfect (r > 0.9) then the locus is usually referred to as a Mendelian locus. Despite the fact that we use the term "correlation" in the preceding sentences, the genotype is actually the cause of the phenotype. More precisely, variation in the genotypes of individuals in the sample population causes the variation in the phenotype. The statistical confidence of this assertion of causality is often estimated using LOD and LRS scores and permutation methods. If the LOD score is above 10, then we can be extremely confident that we have located a genetic cause of variation in the phenotype. While the location is defined usually with a precision ranging from 10 million to 100 thousand basepairs (the locus), the individual sequence variant that is responsible may be quite difficult to extract. 
Think of this in terms of police work: we may know the neighborhood where the suspect lives, we may have clues as to identity and habits, but we still may have a large list of suspects. - -Text here [Williams RW, July 15, 2010] - -[Go back to index](#index) - -
    - -## H - -#### Heritability, h2: - -Heritability is a rough measure of the ability to use genetic information to predict the level of variation in phenotypes among progeny. Values range from 0 to 1 (or 0 to 100%). A value of 1 or 100% means that a trait is entirely predictable based on paternal/maternal and genetic data (in other words, a Mendelian trait), whereas a value of 0 means that a trait is not at all predictable from information on gene variants. Estimates of heritability are highly dependent on the environment, stage, and age. - -Important traits that affect fitness often have low heritabilities because stabilizing selection reduces the frequency of DNA variants that produce suboptimal phenotypes. Conversely, less critical traits for which substantial phenotypic variation is well tolerated may have high heritability. The environment of laboratory rodents is unnatural, and this allows the accumulation of somewhat deleterious mutations (for example, mutations that lead to albinism). This leads to an upward trend in heritability of unselected traits in laboratory populations--a desirable feature from the point of view of the biomedical analysis of the genetic basis of trait variance. Heritability is a useful parameter to measure at an early stage of a genetic analysis, because it provides a rough gauge of the likelihood of successfully understanding the allelic sources of variation. Highly heritable traits are more amenable to mapping studies. There are numerous ways to estimate heritability, a few of which are described below. [Williams RW, Dec 23, 2004] - -#### h2 Estimated by Intraclass Correlation: - -Heritability can be estimated using the intraclass correlation coefficient. This is essentially a one-way repeated measures analysis of variance (ANOVA) of the reliability of trait data. Differences among strains are considered due to a random effect, whereas variation among samples within a single strain is considered due to measurement error. 
One can use the method implemented by SAS (PROC VARCOMP) that exploits a restricted maximum likelihood (REML) approach to estimate the intraclass correlation coefficient instead of an ordinary least squares method. The general equation for the intraclass correlation is: - -r = (Between-strain MS - Within-strain MS)/(Between-strain MS + (n-1) x Within-strain MS) - -where n is the average number of cases per strain. The intraclass correlation approaches 1 when there is minimal variation within strains, and strain means differ greatly. In contrast, if differences between strains are less than what would be predicted from the differences within strain, then the intraclass correlation will produce negative estimates of heritability. Negative heritability is usually a clue that the design of the experiment has injected excessive within-strain variance. It is easy for this to happen inadvertently by failing to correct for a batch effect. For example, if one collects the first batch of data for strains 1 through 20 during a full moon, and a second batch of data for these same strains during a rare blue moon, then the apparent variation within strain may greatly exceed the among-strain variance. A technical batch effect has been confounded with the within-strain variation and has swamped any among-strain variance. What to do? Fix the batch effect, sex effect, age effect, etc., first! [Williams RW, Chesler EJ, Dec 23, 2004] - -#### h2 Estimated using Hegmann and Possidente's Method (Adjusted Heritability in the Basic Statistics): - -A simple estimate of heritability for inbred lines involves comparing the variance between strain means (Va) to the total variance (Vt) of the phenotype, where Va is a rough estimate of the additive genetic variance and Vt is equal to the sum of Va and the average environmental variance, Ve. 
For example, if we study 10 cases of each of 20 strains, we have a total variance of the phenotype across 200 samples, and a strain mean variance across 20 strain averages. We can use this simple equation to estimate the heritability: - -h2 = Va / Vt - -This estimate of heritability will be an **overestimate**, and the severity of this bias will be a function of the within-strain standard error of the mean. Even a random data set of 10 cases from each of 20 strains, which should have an h2 of 0, will often give h2 values of 0.10 to 0.20. (Try this in a spreadsheet program using random numbers.) - -However, this estimate of h2 cannot be compared directly to those calculated using standard intercrosses and backcrosses. The reason is that all cases above are fully inbred and no genotypes are heterozygous. As a result the estimate of Va will be inflated two-fold. Hegmann and Possidente (1981) suggested a simple solution: adjust the equation as follows: - -h2 = 0.5Va / (0.5Va+Ve) - -The factor 0.5 is applied to Va to adjust for the overestimation of additive genetic variance among inbred strains. This estimate of heritability also does not make allowances for the within-strain error term. The 0.5 adjustment factor is not recommended any more because h2 is severely **underestimated**. This adjustment is really only needed if the goal is to compare h2 between intercrosses and those generated using panels of inbred strains. - -#### h2RIx̅ - -Finally, heritability calculations using strain means, such as those listed above, do not provide estimates of the effective heritability achieved by resampling a given line, strain, or genometype many times. Belknap ([1998](http://gn1.genenetwork.org/images/upload/Belknap_Heritability_1998.pdf)) provides corrected estimates of the effective heritability. Figure 1 from his paper (reproduced below) illustrates how resampling helps a great deal. Simply resampling each strain 8 times can boost the effective heritability from 0.2 to 0.8. 
The graph also illustrates why it often does not make sense to resample much beyond 4 to 8, depending on heritability. Belknap used the term h2RIx̅ in this figure and paper, since he was focused on data generated using recombinant inbred (RI) strains, but the logic applies equally well to any panel of genomes for which replication of individual genometypes is practical. This h2RIx̅ can be calculated simply by: -h2RIx̅ = Va / (Va+(Ve/n)) where Va is the genetic variability (variability between strains), Ve is the environmental variability (variability within strains), and n is the number of within-strain replicates. Of course, with many studies the number of within-strain replicates will vary between strains, and this needs to be dealt with. A reasonable approach is to use the harmonic mean of n across all strains. - -Homozygosity - -An analysis of statistical power is useful to estimate numbers of replicates and strains needed to detect and resolve major sources of trait variance and covariance. A versatile method has been developed by Sen and colleagues (Sen et al., 2007) and implemented in the R package qtlDesign. David Ashbrook implemented a version of this within Shiny that can help you estimate power for different heritability values, QTL effect sizes, cohort sizes, and replication rates: - -**[Power Calculator (D. Ashbrook)](https://dashbrook1.shinyapps.io/bxd_power_calculator_app/)** - -We can see that in all situations power is increased more by increasing the number of lines than by increasing the number of biological replicates. Depending on the heritability of the trait, there is little gain in power when going above 4-6 biological replicates. [DGA, Feb 1, 2019] [Chesler EJ, Dec 20, 2004; RWW updated March 7, 2018; Ashbrook DG, updated Feb 1, 2019] - -#### Hitchhiking Effect: - -Conventional knockout lines (KOs) of mice are often mixtures of the genomes of two strains of mice. 
One important consequence of this fact is that a conventional comparison of wildtype and KO litter mates tests not only the effects of the KO gene itself but also the effects of thousands of "hitchhiking" sequence polymorphisms in genes that flank the KO gene. This experimental confound can be difficult to resolve (but see below). This problem was first highlighted by Robert Gerlai ([1996](http://gn1.genenetwork.org/images/upload/Gerlai_TINS_1996.pdf)). - -**Genetics of KO Lines**. The embryonic stem cells used to make KOs are usually derived from a 129 strain of mouse (e.g., 129/OlaHsd). Mutated stem cells are then added to a C57BL/6J blastocyst to generate B6x129 chimeric mice. Germline transmission of the KO allele is tested and carriers are then used to establish heterozygous +/- B6.129 KO stock. This stock is often crossed back to wildtype C57BL/6J strains for several generations. At each generation the transmission of the KO is checked by genotyping the gene or closely flanking markers in each litter of mice. Carriers are again selected for breeding. The end result of this process is a KO congenic line in which the genetic background is primarily C57BL/6J except for the region around the KO gene. - -It is often thought that 10 generations of backcrossing will result in a pure genetic background (99.8% C57BL/6J). Unfortunately, this is not true for the region around the KO, and even after many generations of backcrossing of KO stock to C57BL/6J, a large region around the KO is still derived from the 129 substrain (see the residual white "line" at N10 in the figure below). - -Congenic - -After 20 generations of backcrossing nearly +/-5 cM on either side of the KO will still usually be derived from 129 (see [Figure 3.6](http://www.informatics.jax.org/silverbook/frames/frame3-3.shtml)). This amounts to an average of +/- 10 megabases of DNA around the KO. 
The wildtype littermates do NOT have this flanking DNA from 129 and they will be like a true C57BL/6J. The +/- 10 megabases to either side of the KO is known as the "hitchhiking" chromosomal interval. Any polymorphism between 129 and B6 in this interval has the potential to have significant downstream effects on gene expression, protein expression, and higher order traits such as anxiety, activity, and maternal behavior. Much of the conventional KO literature is highly suspect due to this hitchhiker effect (see Gerlai R, [Trends in Neurosci 1996 19:177](http://gn1.genenetwork.org/images/upload/Gerlai_TINS_1996.pdf)). - -As one example, consider the thyroid alpha receptor hormone gene Thra and its KO. Thra maps to Chr 11 at about 99 Mb. A conventional KO made as described above will have a hitchhiking 129 chromosomal interval extending from about 89 Mb to 109 Mb even after 20 generations of backcrossing to B6. Since the mouse genome is about 2.6 billion base pairs and contains about 26,000 genes, this 20 Mb region will typically contain about 200 genes. The particular region of Chr 11 around Thra has an unusually high density of genes (2-3X) and includes many highly expressed and polymorphic genes, including *Nog*, *Car10*, *Cdc34*, *Col1a1*, *Dlx4*, *Myst2*, *Ngfr*, *Igf2bp1*, *Gip*, the entire *Hoxb* complex, *Sp6*, *Socs7*, *Lasp1*, *Cacnb1*, *Pparbp*, *Pnmt*, *Erbb2*, *Grb7*, *Nr1d1*, *Casc3*, *Igfbp4*, and the entire *Krt1* complex. Of these gene roughly half will be polymorphic between B6 and 129. It is like having a busload of noisy and possibly dangerous hitchhikers. Putative KO effects may be generated by a complex subset of these 100 polymorphic genes. - -What is the solution? - -1. Do not use litter mates as controls without great care. They are not really the correct genetic control. The correct genetic control is a congenic strain of the same general type without the KO or with a different KO in a nearby gene. 
These are often available as KOs in neighboring genes that are not of interest. For example, the gene *Casc3* is located next to Thra. If a KO in Casc3 is available, then compare the two KOs and see if phenotypes of the two KOs differ in the ways predicted given the known molecular functions of the genes. - -2. Use a KO in which the KO has been backcrossed to a 129 strain--ideally the same strain from which ES cells were obtained. This eliminates the hitchhiker effect entirely and the KO, HET, and WT littermates really can be compared. - -3. Use a conditional KO. - -4. Compare the phenotype of the two parental strains--129 and C57BL/6J--and see if they differ in ways that might be confounded with the effects of the KO. - -Homozygosity - -Legend: from [Silver, L. (1995) Oxford University Press](http://www.informatics.jax.org/silver/index.shtml) - -[Go back to index](#index) - -
    - -## I - -#### Interquartile Range: - -The interquartile range is the difference between the 75th and 25th percentiles of the distribution. We divide the sample into a high and low half and then compute the median for each of these halves. In other words we effectively split our sample into four ordered sets of values known as quartiles. The absolute value of the difference between the median of the lower half and the median of the upper half is also called the interquartile range. This estimate of range is insensitive to outliers. If you are curious you might double the IQR to get an interquartile-range-based estimate of the full range. Of course, keep in mind that range is dependent on the sample size. For this reason the coefficient of variation (the standard deviation divided by the mean) is a better overall indicator of dispersion of values around the mean that is less sensitive to sample size. [Williams RW, Oct 20, 2004; Jan 23, 2005] - -#### Interval Mapping: - -Interval mapping is a process in which the statistical significance of a hypothetical QTL is evaluated at regular points across a chromosome, even in the absence of explicit genotype data at those points. In the case of WebQTL, significance is calculated using an efficient and very rapid regression method, the Haley-Knott regression equations ([Haley CS, Knott SA. 1992. A simple regression method for mapping quantitative trait loci in line crosses using flanking markers; Heredity 69:315–324](http://www.ncbi.nlm.nih.gov/pubmed/16718932)), in which trait values are compared to the known genotype at a marker or to the probability of a specific genotype at a test location between two flanking markers. (The three genotypes are coded as -1, 0, and +1 at known markers, but often have fractional values in the intervals between markers.) The inferred probability of the genotypes in regions that have not been genotyped can be estimated from genotypes of the closest flanking markers. 
GeneNetwork/WebQTL compute linkage at intervals of 1 cM or less. As a consequence of this approach to computing linkage statistics, interval maps often have a characteristic shape in which the markers appear as sharply defined inflection points, and the intervals between nodes are smooth curves. [Chesler EJ, Dec 20, 2004; RWW April 2005; RWW Man 2014] - -#### Interval Mapping Options: - -- _Permutation Test_: Select this option to determine the approximate LRS value that matches a genome-wide p-value of .05. - -- _Bootstrap Test_: Select this option to evaluate the consistency with which peak LRS scores cluster around a putative QTL. Deselect this option if it obscures the SNP track or the additive effect track. - -- _Additive Effect_: The additive effect (shown by the red lines in these plots) provide an estimate of the change in the average phenotype that is brought about by substituting a single allele of one type with that of another type. - -- _SNP Track_: The SNP Seismograph Track provides information on the regional density of segregating variants in the cross that may generate trait variants. It is plotted along the X axis. If a locus spans a region with both high and low SNP density, then the causal variant has a higher prior probability to be located in the region with high density than in the region with low density. - -- _Gene Track_: This track overlays the positions of known genes on the physical Interval Map Viewer. If you hover the cursor over genes on this track, minimal information (symbol, position, and exon number) will appear. - -- _Display from X Mb to Y Mb_: Enter values in megabases to regenerate a smaller or large map view. - -- _Graph width (in pixels)_: Adjust this value to obtain larger or smaller map views (x axis only). - -[Go back to index](#index) - -
    - -## J - -[Go back to index](#index) - -
    - -## K - -[Go back to index](#index) - -
    - -## L - -
    - -#### Literature Correlation: - -The literature correlation is a unique feature in GeneNetwork that quantifies the similarity of words used to describe genes and their functions. Sets of words associated with genes were extracted from MEDLINE/PubMed abstracts (Jan 2017 by Ramin Homayouni, Diem-Trang Pham, and Sujoy Roy). For example, about 2500 PubMed abstracts contain references to the gene "Sonic hedgehog" (Shh) in mouse, human, or rat. The words in all of these abstracts were extracted and categorized by their information content. A word such as "the" is not interesting, but words such as "dopamine" or "development" are useful in quantifying similarity. Sets of informative words are then compared—one gene's word set is compared to the word sets for all other genes. Similarity values are computed for a matrix of about 20,000 genes using latent semantic indexing [(see Xu et al., 2011)](http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0018851). Similarity values are also known as literature correlations. These values are always positive and range from 0 to 1. Values between 0.5 and 1.0 indicate moderate-to-high levels of overlap of vocabularies. - -The literature correlation can be used to compare the "semantic" signal-to-noise of different measurements of gene, mRNA, and protein expression. Consider this common situation: There are three probe sets that measure Kit gene expression (1459588\_at, 1415900\_a\_at, and 1452514\_a\_at) in the Mouse BXD Lung mRNA data set (HZI Lung M430v2 (Apr08) RMA). Which one of these three gives the best measurement of Kit expression? It is impractical to perform quantitative rtPCR studies to answer this question, but there is a solid statistical answer that relies on **Literature Correlation**. Do the following: For each of the three probe sets, generate the top 1000 literature correlates. 
This will generate three apparently identical lists of genes that are known from the PubMed literature to be associated with the Kit oncogene. But the three lists are NOT actually identical when we look at the **Sample Correlation** column. To answer the question "which of the three probe sets is best", review the actual performance of the probe sets against this set of 1000 "friends of Kit". Do this by sorting all three lists by their Sample Correlation column (high to low). The clear winner is probe set 1415900_a_at. The 100th row in this probe set's list has a Sample Correlation of 0.620 (absolute value). In comparison, the 100th row for probe set 1452514_a_at has a Sample Correlation of 0.289. The probe set that targets the intron comes in last at 0.275. In conclusion, the probe set that targets the proximal half of the 3' UTR (1415900_a_at) has the highest "agreement" between Literature Correlation and Sample Correlation, and is our preferred measurement of Kit expression in the lung in this data set. (Updated by RWW and Ramin Homayouni, April 2017.) - -
    - -#### LOD: - -The logarithm of the odds (LOD) provides a measure of the association between variation in a phenotype and genetic differences (alleles) at a particular chromosomal locus (see Nyholt [2000](http://www.sciencedirect.com/science/article/pii/S0002929707626391) for a lovely review of LOD scores). - -A LOD score is defined as the logarithm of the ratio of two likelihoods: (1) in the numerator the likelihood for the alternative hypothesis, namely that there is linkage at the chromosomal marker, and (2) the likelihood of the null hypothesis that there is no linkage. Likelihoods are probabilities, but they are not Pr(hypothesis | data) but rather Pr(data | two alternative hypotheses). That's why they are called likelihoods rather than probabilities. (The "|" symbol above translates to "given the"). Since LOD and LRS scores are associated with two particular hypotheses or models, they are also associated with the degrees of freedom of those two alternative models. When the model only has one degree of freedom this conversion between LOD to p value will work: -
    -    lodToPval <-
    -    function(x)
    -    {
    -    pchisq(x*(2*log(10)),df=1,lower.tail=FALSE)/2
    -    }
    -    # (from https://www.biostars.org/p/88495/ )    
    -
    - -In the two likelihoods, one has maximized over the various nuisance parameters (the mean phenotypes for each genotype group, or overall for the null hypothesis, and the residual variance). Or one can say, one has plugged in the maximum likelihood estimates for these nuisance parameters. - -With complete data at a marker, the log likelihood for the normal model reduces to the (-n/2) times the log of the residual sum of squares. - -LOD values can be converted to LRS scores (likelihood ratio statistics) by multiplying by 4.61. The LOD is also roughly equivalent to the -log(P), where P is the probability of linkage (P = 0.001 => 3). The LOD itself is not a precise measurement of the probability of linkage, but in general for F2 crosses and RI strains, values above 3.3 will usually be worth attention for simple interval maps. [Williams RW, June 15, 2005, updated with text from Karl Broman, Oct 28, 2010, updated Apr 21, 2020 with Nyholt reference]. - -
    - -#### LRS: - -In the setting of mapping traits, the likelihood ratio statistic is used as a measurement of the association or linkage between differences in traits and differences in particular genotype markers. LRS or LOD values are usually plotted on the y-axis, whereas chromosomal locations of the markers are usually plotted on the x-axis. In the case of a whole genome scan--a sequential analysis of many markers and locations across the entire genome--LRS values above 10 to 15 will usually be worth attention when mapping with standard experimental crosses (e.g., F2 intercrosses or recombinant inbred strains). The term "likelihood ratio" is used to describe the relative probability (likelihood) of two different explanations of the variation in a trait. The first explanation (or model or hypothesis H1) is that the differences in the trait ARE associated with that particular DNA sequence difference or marker. Very small probability values indicate that H1 is probably true. The second "null" hypothesis (Hnull or H0) is that differences in the trait are NOT associated with that particular DNA sequence. We can use the ratio of these two probabilities and models (H1 divided by H0) as our score. The math is a little bit more complicated and the LRS score is actually equal to 2 times the natural logarithm of that ratio; equivalently, -2 times the difference between the natural logarithms of the H0 and H1 probabilities. For example, if the probability of H0 is 0.05 (only a one-in-twenty probability that the marker is associated with the trait by chance), whereas the probability of H1 is 1 (the marker is certainly linked to the trait), then the LRS value is 5.991. In Excel the equation giving the LRS result of 5.991 would look like this "=-2*(LN(0.05)-LN(1))". [Williams RW, Dec 13, 2004, updated Nov 18, 2009, updated Dec 19, 2012] - -[Go back to index](#index) - -
    - -## M - -#### Marker Regression: - -The relationship between differences in a trait and differences in alleles at a marker (or gene variants) can be computed using a regression analysis (genotype vs phenotype) or as a simple Pearson product moment correlation. Here is a simple example that you can try in Excel to understand marker-phenotype regression or marker-phenotype correlation: enter a row of phenotype and genotype data for 20 strains in an Excel spreadsheet labeled "Brain weight." The strains are C57BL/6J, DBA/2J, and 20 BXD strains of mice (1, 2, 5, 6, 8, 9, 12, 13, 14, 15, 16, 18, 21, 22, 23, 24, 25, 27, 28, and 29). The brains of these strains weigh an average (in milligrams) of 465, 339, 450, 390, 477, 361, 421, 419, 412, 403, 429, 429, 436, 427, 409, 431, 432, 380, 394, 381, 389, and 375. (These values are taken from BXD Trait 10032; data by John Belknap and colleagues, 1992.) Notice that data are missing for several strains including the extinct lines BXD3, 4, and 7. Data for BXD11 and BXD19 (not extinct) are also missing. In the second row enter the genotypes at a single SNP marker on Chr 4 called "rs13478021" for the subset of strains for which we have phenotype data. The genotypes at rs1347801 are as follows for the 20 BXDs listed above: D B D B D B D D D D D B D B D B D B D B. This string of alleles in the parents and 20 BXDs is called a strain distribution pattern (SDP). Let's convert these SDP letters into more useful numbers, so that we can "compute" with genotypes. Each B allele gets converted into a -1 and each D allele gets converted into a +1. In the spreadsheet, the data set of phenotypes and genotypes should look like this. - -
    -    Strain BXD1 BXD2 BXD5 6 8 9 12 13 14 15 16 18 21 22 23 24 25 27 28 29
    -    Brain_weight 450 390 477 361 421 419 412 403 429 429 436 427 409 431 432 380 394 381 389 375
    -    Marker_rs1347801 D B D B D B D D D D D B D B D B D B D B
    -    Marker_code 1 -1 1 -1 1 -1 1 1 1 1 1 -1 1 -1 1 -1 1 -1 1 -1
    -
    - -To compute the marker regression (or correlation) we just compare values in Rows 2 and 4. A Pearson product moment correlation gives a value of r = 0.494. A regression analysis indicates that on average those strains with a D allele have a heavier brain with roughly a 14 mg increase for each 1 unit change in genotype; that is a total of about 28 mg if all B-type strains are compared to all D-type strains at this particular marker. This difference is associated with a p value of 0.0268 (two-tailed test) and an LRS of about 9.8 (LOD = 9.8/4.6 or about 2.1). Note that the number of strains is modest and the results are therefore not robust. If you were to add the two parent strains (C57BL/6J and DBA/2J) back into this analysis, which is perfectly fair, then the significance of this marker is lost (r = 0.206 and p = 0.3569). Bootstrap and permutation analyses can help you decide whether results are robust or not and whether a nominally significant p value for a single marker is actually significant when you test many hundreds of markers across the whole genome (a so-called genome-wide test with a genome-wide p value that is estimated by permutation testing). [RWW, Feb 20, 2007, Dec 14, 2012] - -[Go back to index](#index) - -
    - -## N - -#### Normal Probability Plot: - -A [normal probability plot](http://en.wikipedia.org/wiki/Normal_probability_plot) is a powerful tool to evaluate the extent to which a distribution of values conforms to (or deviates from) a normal Gaussian distribution. The Basic Statistics tools in GeneNetwork provides these plots for any trait. If a distribution of numbers is normal then the actual values and the predicted values based on a z score (units of deviation from the mean measured in standard deviation units) will form a nearly straight line. These plots can also be used to efficiently flag outlier samples in either tail of the distribution. - -In genetic studies, the probability plot can be used to detect the effects of major effect loci. A classical Mendelian locus will typically be associated with either a bimodal or trimodal distribution. In the plot below based on 99 samples, the points definitely do not fall on a single line. Three samples (green squares) have unusually high values; the majority of samples fall on a straight line between z = -0.8 to z = 2; and 16 values have much lower trait values than would be predicted based on a single normal distribution (a low mode group). The abrupt discontinuity in the distribution at -0.8 z is due to the effect of a single major Mendelian effect. - -Deviations from normality of the sort in the figure below should be considered good news from the point of view of likely success of tracking down the locations of QTLs. However, small numbers of outliers may require special statistical handling, such as their exclusion or [winsorising](http://en.wikipedia.org/wiki/Winsorising) (see more below on "Winsorizing"). [RWW June 2011] - -Homozygosity - -[Go back to index](#index) - -
    - -## O - -#### Outliers: (also see [Wikipedia](http://en.wikipedia.org/wiki/Outlier)) - -Statistical methods often assume that the distribution of trait values is close to a Gaussian normal bell-shaped curve and that there are no outlier values that are extremely high or low compared to the average. Some traits can be clearly split into two or more groups (affected cases and unaffected cases) and this is not a problem as long as the number of cases in each group is close to the number that you expected by chance and that your sample size is reasonably high (40 or more for recombinant inbred strains). Mapping functions and most statistical procedures in GeneNetwork should work reasonably well (the pair scan function for epistatic interactions is one possible exception). - -However, correlations and QTL mapping methods can be highly sensitive to outlier values. Make sure you review your data for outliers before mapping. GeneNetwork flags all outliers for you in the Trait Data and Analysis window and gives you the option of zapping these extreme values. Options include (1) do nothing, (2) delete the outliers and see what happens to your maps, (3) [Winsorize](http://en.wikipedia.org/wiki/Winsorising) the values of the outliers. You might try all three options and determine if your main results are stable or not. With small samples or extreme outliers, you may find the correlation and mapping results to be volatile. In general, if results (correlations, QTL positions or QTL LRS score) depend highly on one or two outliers (5-10% of the samples) then you should probably delete or winsorize the outliers. - -In order to calculate outliers, we first determine the Q1(25%) and Q3(75%) values and then multiply their difference (the interquartile range, Q3 - Q1) by a constant (in our case 1.5; a higher constant is less sensitive to outliers). This value is then subtracted from the Q1 value and added to the Q3 value in order to determine the lower and upper bounds. 
Values that fall above the upper bound or below the lower bound are considered outliers. - -The method is summarized [here](http://www.wikihow.com/Calculate-Outliers). [Sloan ZA, Oct 2013] - -[Go back to index](#index) - -
    - -## P - -#### Pair-Scan, 2D Genome Scan, or Two-QTL Model: - -The pair scan function evaluates pairs of intervals (loci) across the genome to determine how much of the variability in the trait can be explained jointly by two putative QTLs. The pair scan function in GeneNetwork is used to detect effects of pairs of QTLs that have epistatic interactions, although this function also evaluates summed additive effects of two loci. Trait variance is evaluated using a general linear model that has this structure (called a "model"): - -Variance V(trait) = QTL1 + QTL2 + QTL1xQTL2 + error (where the = sign should be read "a function of") - -This model is also known as the Full Model (LRS Full in the output table), where QTL1 and QTL2 are the independent additive effects associated with two unlinked loci (the so-called main effects) and QTL1xQTL2 is the interaction term (LRS Interact in the output table). An LRS score is computed for this full model. This computation is identical to computing an ANOVA that allows for an interaction term between two predictors. The additive model that neglects the QTL1XQTL2 term is also computed. - -The output table in GeneNetwork lists the two intervals at the top of the table (Interval 1 to the left and Interval 2 to the far right). The LRS values for different components of the model are shown in the middle of the table (LRS Full, LRS Additive, LRS Interact, LRS 1, and LRS 2). Note that LRS 1 and LRS 2 will usually NOT sum to LRS Additive. - -CAUTIONS and LIMITATIONS: Pair-scan is only implemented for recombinant inbred strains. We do not recommend the use of this function with sample sizes of less than 60 recombinant inbred strains. Pair-scan procedures need careful diagnostics and can be very sensitive to outliers and to the balance among the four possible two-locus genotype classes among a set of RI strains. Pair-scan is not yet implemented for F2 progeny. 
- -GeneNetwork implements a rapid but non-exhaustive DIRECT algorithm (Lundberg et al., [2004](http://bioinformatics.oxfordjournals.org/content/20/12/1887.full.pdf)) that efficiently searches for epistatic interactions. This method is so fast that it is possible to compute 500 permutations to evaluate non-parametric significance of the joint LRS value within a minute. This makes DIRECT ideal for an interactive web service. Karl Broman's [R/qtl](http://www.rqtl.org/tutorials/rqtltour.pdf) implements an exhaustive search using the "scantwo" function. [RWW, May 2011] - -#### Partial Correlation: - -Partial correlation is the correlation between two variables that remains after controlling for one or more other variables. Idea and techniques used to compute partial correlations are important in testing causal models ([Cause and Correlation in Biology](http://www.amazon.com/Cause-Correlation-Biology-Structural-Equations/dp/0521529212), Bill Shipley, 2000). For instance, r1,2||3,4 is the partial correlation between variables 1 and 2, while controlling for variables 3 and 4 (the || symbol is equivalent to "while controlling for"). We can compare partial correlations (e.g., r1,2||3,4) with original correlations (e.g., r1,2). If there is an insignificant difference, we infer that the controlled variables have minimal effect and may not influence the variables or even be part of the model. In contrast, if the partial correlations change significantly, the inference is that the causal link between the two variables is dependent to some degree on the controlled variables. These control variables are either anteceding causes or intervening variables. (text adapted from D Garson's original by RWW). - -For more on [partial correlation](http://faculty.chass.ncsu.edu/garson/PA765/partialr.htm) please link to this great site by David Garson at NC State. 
- -For more on dependence separation ([d-separation](http://www.andrew.cmu.edu/user/scheines/tutor/d-sep.html)) and constructing causal models see Richard Scheines' site. - -Why would you use or need partial correlations in GeneNetwork? It is often useful to compute correlations among traits while controlling for additional variables. Partial correlations may reveal more about the causality of relations. In a genetic context, partial correlations can be used to remove much of the variance associated with linkage and linkage disequilibrium. You can also control for age and other common cofactors. - -Please see the related Glossary terms "Tissue Correlation". [RWW, Aug 21, 2009; Jan 30, 2010] - -#### PCA Trait or Eigentrait: - -If you place a number of traits in a Trait Collection you can carry out some of the key steps of a principal component analysis, including defining the variance directed along specific principal component eigenvectors. You can also plot the positions of cases against the first two eigenvectors; in essence a type of scatterplot. Finally, GeneNetwork allows you to exploit PCA methods to make new "synthetic" eigentraits from collections of correlated traits. These synthetic traits are the values of cases along specific eigenvectors and they may be less noisy than single traits. If this seems puzzling, then have a look at these useful PCA explanations by [G. Dallas](http://georgemdallas.wordpress.com/2013/10/30/principal-component-analysis-4-dummies-eigenvectors-eigenvalues-and-dimension-reduction/) and by [Powell and Lehe](http://setosa.io/ev/principal-component-analysis/). **How to do it:** You can select and assemble many different traits into a single **Trait Collection** window using the check boxes and **Add To Collection** buttons. One of the most important function buttons in the **Collection** window is labeled **Correlation Matrix**. 
This function computes Pearson product moment correlations and Spearman rank order correlations for all possible pairs of traits in the Collection window. It also performs a principal component or factor analysis. For example, if you have 20 traits in the Collection window, the correlation matrix will consist of 20*19/2 or 190 correlations and the identity diagonal. Principal components analysis is a linear algebraic procedure that finds a small number of independent factors or principal components that efficiently explain variation in the original 20 traits. It is an effective method to reduce the dimensionality of a group of traits. If the 20 traits share a great deal of variation, then only two or three factors may explain variation among the traits. Instead of analyzing 20 traits as if they were independent, we can now analyze the main principal components labeled PC01, PC02, etc. PC01 and PC02 can be treated as new synthetic traits that represent the main sources of variation among original traits. You can treat a PC trait like any other trait except that it is not stored permanently in a database table. You can put a PC trait in your Collection window and see how well correlated each of the 20 original traits is with this new synthetic trait. You can also map a PC trait. [RWW, Aug 23, 2005] - -
    - -#### Permutation Test: - -A permutation test is a computationally intensive but conceptually simple method used to evaluate the statistical significance of findings. Permutation tests are often used to evaluate QTL significance. _Some background_: In order to detect parts of chromosomes that apparently harbor genes that contribute to differences in a trait's value, it is common to search for associations (linkage) across the entire genome. This is referred to as a "whole genome" scan, and it usually involves testing hundreds of independently segregating regions of the genome using hundreds, or even thousands of genetic markers (SNPs and microsatellites). A parametric test such as a conventional t test or F test can be used to estimate the probability of the null hypothesis at any single location in the genome (the null hypothesis is that there is no QTL at this particular location). But a parametric test of this type makes assumptions about the distribution of the trait (its normality), and also does not provide a way to correct for the large number of independent tests that are performed while scanning the whole genome. We need protection against many false discoveries as well as some assurance that we are not neglecting truly interesting locations. A permutation test is an elegant solution to both problems. The procedure involves randomly reassigning (permuting) trait values and genotypes of all cases used in the analysis. The permuted data sets have the same set of phenotypes and genotypes (in other words, distributions are the same), but obviously the permutation procedure almost invariably obliterates genuine gene-to-phenotype relation in large data sets. We typically generate several thousand permutations of the data. Each of these is analyzed using precisely the same method that was used to analyze the correctly ordered data set. 
We then compare statistical results of the original data set with the collection of values generated by the many permuted data sets. The hope is that the correctly ordered data are associated with larger LRS and LOD values than more than 95% of the permuted data sets. This is how we define the p = .05 whole genome significance threshold for a QTL. Please see the related Glossary terms "Significant threshold" and "Suggestive threshold". [RWW, July 15, 2005] - -#### Power to detect QTLs: - -An analysis of statistical power is useful to estimate numbers of replicates and strains needed to detect and resolve major sources of trait variance and covariance. A versatile method has been developed by Sen and colleagues (Sen et al., 2007) and implemented in the R program qtlDesign. David Ashbrook implemented a version of this within Shiny that can help you estimate power for different QTL effect sizes, cohort sizes, and replication rates: - -#### [Power Calculator (D. Ashbrook)](https://dashbrook1.shinyapps.io/bxd_power_calculator_app/) - -We can see that in all situations power is increased more by increasing the number of lines than by increasing the number of biological replicates. Dependent upon the heritability of the trait, there is little gain in power when going above 4-6 biological replicates. [DGA, Mar 3, 2018] - -#### Probes and Probe Sets: - -In microarray experiments the probe is the immobilized sequence on the array that is complementary to the target message washed over the array surface. Affymetrix probes are 25-mer DNA sequences synthesized on a quartz substrate. There are a few million of these 25-mers in each 120-square micron cell of the array. The abundance of a single transcript is usually estimated by as many as 16 perfect match probes and 16 mismatch probes. The collection of probes that targets a particular message is called a probe set. [RWW, Dec 21, 2004] - - -[Go back to index](#index) - -
    - -## Q - -#### QTL: - -A quantitative trait locus is a chromosome region that contains one or more sequence variants that modulates the distribution of a variable trait measured in a sample of genetically diverse individuals from an interbreeding population. Variation in a quantitative trait may be generated by a single QTL with the addition of some environmental noise. Variation may be oligogenic and be modulated by a few independently segregating QTLs. In many cases however, variation in a trait will be polygenic and influenced by a large number of QTLs distributed on many chromosomes. Environment, technique, experimental design and a host of other factors also affect the apparent distribution of a trait. Most quantitative traits are therefore the product of complex interactions of genetic factors, developmental and epigenetic factors, environmental variables, and measurement error. [Williams RW, Dec 21, 2004] - -[Go back to index](#index) - -
    - -## R - -#### Recombinant Inbred Strain (RI or RIS) or Recombinant Inbred Line (RIL): - -An inbred strain whose chromosomes incorporate a fixed and permanent set of recombinations of chromosomes originally descended from two or more parental strains. Sets of RI strains (from 10 to 5000) are often used to map the chromosomal positions of polymorphic loci that control variance in phenotypes. - -For a terrific short summary of the uses of RI strains see [2007](http://www.informatics.jax.org/silverbook/chapters/9-2.shtml)). - -Chromosomes of RI strains typically consist of alternating haplotypes of highly variable length that are inherited intact from the parental strains. In the case of a typical rodent RI strain made by crossing maternal strain C with paternal strain B (called a CXB RI strain), a chromosome will typically incorporate 3 to 5 alternating haplotype blocks with a structure such as BBBBBCCCCBBBCCCCCCCC, where each letter represents a genotype, series of similar genotypes represent haplotypes, and where a transition between haplotypes represents a recombination. Both pairs of each chromosome will have the same alternating pattern, and all markers will be homozygous. Each of the different chromosomes (Chr 1, Chr 2, etc.) will have a different pattern of haplotypes and recombinations. The only exceptions are the Y chromosome and the mitochondrial genome, which are inherited intact from the paternal and maternal strain, respectively. For an RI strain to be useful for mapping purposes, the approximate positions of recombinations along each chromosome need to be well defined either in terms of centimorgan or DNA basepair position. The precision with which these recombinations are mapped is a function of the number and position of the genotypes used to type the chromosomes--20 in the example above. 
Because markers and genotypes are often spaced quite far apart, often more than 500 Kb, the actual data entered into GeneNetwork will have some ambiguity at each recombination locus. The haplotype block BBBBBCCCCBBBCCCCCCCC will be entered as BBBBB?CCCC?BBB?CCCCCCCC where the ? mark indicates incomplete information over some (we hope) short interval. - -RI strains are almost always studied in sets or panels. All else being equal, the larger the set of RI strains, the greater the power and precision with which phenotypes can be mapped to chromosomal locations. The first set of eight RIs, the CXB RIs, were generated by Donald Bailey (By) from an intercross between a female BALB/cBy mouse (abbreviated C) and a male C57BL/6By mouse in the 1960s. The small panel of 8 CXB strains was originally used to determine if the major histocompatibility (MHC) locus on proximal Chr 17 was a key factor accounting for different immune responses such as tissue rejection. The methods used to determine the locations of recombinations relied on visible markers (coat color phenotypes such as the C and B loci) and the electrophoretic mobility of proteins. Somewhat larger RI sets were generated by Benjamin Taylor to map Mendelian and other major effect loci. In the 1990s the utility of RI sets for mapping was significantly improved thanks to higher density genotypes made possible by the use of microsatellite markers. Between 2005 and 2017, virtually all extant mouse and rat RI strains were regenotyped at many thousands of SNP markers, providing highly accurate maps of recombinations. - -While the potential utility of RI strains in mapping complex polygenic traits was obvious from the outset, the small number of strains only made it feasible to map quantitative traits with large effects. The first large RI sets were generated by plant geneticists (Burr et al. 
[2000](http://demeter.bio.bnl.gov/RIchap_rev.pdf)) and the plant genetics community holds a strong lead in the production of very large RI sets to study multigenic and polygenic traits and trait covariance and pleiotropy. - -By 2010 the number of mouse RI strains had increased to the point where defining causal genes and sequence variants was more practical. As of 2018 there are about 150 BXD strains (152 have been fully sequenced), ~100 Collaborative Cross strains (also all fully sequenced), and at least another 100 RI strains belonging to smaller sets that have been extremely well genotyped. - -**Making RI strains**: The usual procedure typically involves sib mating of the progeny of an F1 intercross for more than 20 generations. Even by the 5th filial (F) generation of successive matings, the RI lines are homozygous at 50% of loci and by F13, the value is above 90%. At F20 the lines are nearly fully inbred (~98%) and by convention are now referred to as inbred strains rather than inbred lines. - - -[Go back to index](#index) - -Legend: from [Silver, L. (1995) Oxford University Press](http://www.informatics.jax.org/silverbook/frames/frame3-3.shtml) - -[Williams RW, June 20, 2005; significant extension, Sept 21, 2007, added Crow ref, Oct 2009] - -
    - -## S - -#### Scree Plots: - -GeneNetwork will often automatically generate a [Scree Plot](http://www.improvedoutcomes.com/docs/WebSiteDocs/PCA/Creating_a_Scree_Plot.htm) and the associated principal components (PCs) when you compute a Correlation Matrix for a group of traits that you have placed in your Trait Collection (a set of phenotypes and/or expression data for a specific population). Here is a nice definition of what a Scree plot is trying to tell you adopted and adapted from IOS (www.improvedoutcomes.com). - -A Scree Plot is a simple line segment plot that shows the fraction of total variance in the data as explained or represented by each PC. The PCs are ordered, and by definition are therefore assigned a number label, by decreasing order of contribution to total variance. The PC with the largest fraction contribution is labeled PC01. Such a plot when read left-to-right across the abscissa can often show a clear separation in fraction of total variance where the 'most important' components cease and the 'least important' components begin. The point of separation is often called the 'elbow'. (In the PCA literature, the plot is called a 'Scree' Plot because it often looks like a 'scree' slope, where rocks have fallen down and accumulated on the side of a mountain.) [Williams RW, Dec 20, 2008] - -#### Significant threshold: - -The significant threshold represents the approximate LRS value that corresponds to a genome-wide p-value of 0.05, or a 5% probability of falsely rejecting the null hypothesis that there is no linkage anywhere in the genome. This threshold is computed by evaluating the distribution of highest LRS scores generated by a set of 2000 random permutations of strain means. For example, a random permutation of the correctly ordered data may give a peak LRS score of 10 somewhere across the genome. 
The set of 1000 or more of these highest LRS scores is then compared to the actual LRS obtained for the correctly ordered (real) data at any location in the genome. If fewer than 50 (5%) of the 1000 permutations have peak LRS scores anywhere in the genome that exceed that obtained at a particular locus using the correctly ordered data, then one can usually claim that a QTL has been defined at a genome-wide p-value of .05. The threshold will vary slightly each time it is recomputed due to the random generation of the permutations. You can view the actual histogram of the permutation results by selecting the "Marker Regression" function in the **Analysis Tools** area of the **Trait Data and Editing Form**. WebQTL does make it possible to search through hundreds of traits for those that may have significant linkage somewhere in the genome. Keep in mind that this introduces a second tier of multiple testing problems for which the permutation test will not usually provide adequate protection. If you anticipate mapping many independent traits, then you will need to correct for the number of traits you have tested. [Williams RW, Nov 14, 2004] - -
    - -#### SNP Seismograph Track: - -SNP is an acronym for single nucleotide polymorphisms (SNPs). SNPs are simple one base pair variants that distinguish individuals and strains. The SNP Seismograph track is a unique feature of physical maps in the GeneNetwork. Each track is customized for a particular cross and shows only those SNPs that differ between the two parental strains. For example, on mouse BXD maps, only the SNPs that differ between C57BL/6J and DBA/2J will be displayed. Regions with high numbers of SNPs are characterised by wider excursions of the yellow traces that extend along the x axis. Since these regions have many SNPs they have a higher prior probability of containing functional sequence differences that might have downstream effects on phenotypes. Large genes with many SNPs close to the peak LRS and that also have a biological connection with the trait you are studying are high priority candidate genes. - -The SNP track in WebQTL exploits the complete Celera Discovery System SNP set but adds an additional 500,000 inferred SNPs in both BXD and AXB/BXA crosses. These SNPs were inferred based on common haplotype structure using a Monte Carlo Markov chain algorithm developed by Gary Churchill and Natalie Blades and implemented by Robert Crowell, and RWW in July 2004. Raw data used to generate the SNP seismograph track were generated by Alex Williams and Chris Vincent, July 2003. The BXD track exploits a database of 1.75 million B vs D SNPs, whereas the AXB/BXA track exploits a database of 1.80 million A vs B SNPs. The names, sequences, and precise locations of most of these SNPs are the property of Celera Discovery Systems, whom we thank for allowing us to provide this level of display in WebQTL. - -Approximately 2.8 million additional SNPs generated by Perlegen for the NIEHS have been added to the SNP track by Robert Crowell (July-Aug 2005). We have also added all Wellcome-CTC SNPs and all relevant mouse SNPs from dbSNP. 
[Williams RW, Dec 25, 2004; Sept 3, 2005] - -#### Standard Error of the Mean (SE or SEM): - -In most GeneNetwork data sets, the SEM is computed as: -Standard Deviation (SD) divided by the square root of n - 1 -where n is the number of independent biological samples used to estimate the population mean. What this means in practice is that when n = 2 (as in many microarray data sets), the SEM and the SD are identical. This method of computing the SEM is conservative, but corrects to some extent for well known bias of the SEM discussed by Gurland and Tripathi (1971, A simple approximation for unbiased estimation of the standard deviation. Amer Stat 25:30-32). [Williams RW, Dec 17, 2008] - -#### Strain Distribution Pattern: - -A marker such as a SNP or microsatellite is genotyped using DNA obtained from each member of the mapping population. In the case of a genetic reference population, such as the BXD strains or the BayXSha Arabidopsis lines, this results in a text string of genotypes (e.g., BDDDBDBBBBDDBDDDBBBB... for BXD1 through BXD100). Each marker is associated with its own particular text string of genotypes that is often called the **strain distribution pattern** of the marker. (A more appropriate term would be the **marker genotype string**.) This string is converted to a numerical version, a genotype vector: -1111-11-1-1-1-111-1111-1-1-1-1..., where D=1, B=-1, H=0. Mapping a trait boils down to performing correlations between each trait and all of the genotype vectors. The genotype vector with the highest correlation (absolute value) is a good candidate for a QTL. [Williams RW, June 18, 2005] - -#### Suggestive Threshold: - -The suggestive threshold represents the approximate LRS value that corresponds to a genome-wide p-value of 0.63, or a 63% probability of falsely rejecting the null hypothesis that there is no linkage anywhere in the genome. This is not a typographical error. 
The Suggestive LRS threshold is defined as that which yields, on average, one false positive per genome scan. That is, roughly one-third of scans at this threshold will yield no false positive, one-third will yield one false positive, and one-third will yield two or more false positives. This is a very permissive threshold, but it is useful because it calls attention to loci that may be worth follow-up. Regions of the genome in which the LRS exceeds the suggestive threshold are often worth tracking and screening. They are particularly useful in combined multicross metaanalysis of traits. If two crosses pick up the same suggestive locus, then that locus may be significant when the joint probability is computed. The suggestive threshold may vary slightly each time it is recomputed due to the random generation of permutations. You can view the actual histogram of the permutation results by selecting the "Marker Regression" function in the **Analysis Tools** area of the **Trait Data and Editing Form**. [Williams RW and Manly KF, Nov 15, 2004] - -#### Systems Genetics: - -Systems genetics or "network genetics" is an emerging new branch of genetics that aims to understand complex causal networks of interactions at multiple levels of biological organization. To put this in a simple context: Mendelian genetics can be defined as the search for linkage between a single trait and a single gene variant (1 to 1); complex trait analysis can be defined as the search for linkage between a single trait and a set of gene variants (QTLs, QTGs, and QTNs) and environmental cofactors (1 to many); and systems genetics can be defined as the search for linkages among networks of traits and networks of gene and environmental variants (many to many). 
- -A hallmark of systems genetics is the simultaneous consideration of groups (systems) of phenotypes from the primary level of molecular and cellular interactions that ultimately modulate global phenotypes such as blood pressure, behavior, or disease resistance. Changes in environment are also often important determinants of multiscalar phenotypes; reversing the standard notion of causality as flowing inexorably upward from the genome. Scientists who use a systems genetics approach often have a broad interest in modules of linked phenotypes. Causality in these complex dynamic systems is often contingent on environmental or temporal context, and often will involve feedback modulation. A systems genetics approach can be unusually powerful, but does require the use of large numbers of observations (large sample size), and more advanced statistical and computational models. - -Systems genetics is not really a new field and traces back to [Sewall Wright's](http://www.amphilsoc.org/library/mole/w/wrights.htm) classical paper (Wright, 1921, "Correlation and Causation") that introduced path analysis to study systems of related phenotypes. Two factors have invigorated this field. The first factor is the advent of more sophisticated statistical methods including Structural [Equation Modeling](http://userwww.sfsu.edu/~efc/classes/biol710/path/SEMwebpage.htm) (SEM), [System Dynamics Modeling](http://www.public.asu.edu/~kirkwood/sysdyn/SDIntro/SDIntro.htm), and [Bayesian Network Modeling](http://bnj.sourceforge.net/) combined with powerful computer systems and efficient algorithms. The second factor is the relative ease with which it is now possible to acquire extensive and diverse phenotype data sets across genetic reference populations such as the BXD set of mice, the HXB set of rats, and the BayXSha lines of Arabidopsis (data are incorporated in the GeneNetwork). 
In the case of the BXD strains, a large research community has collectively generated hundreds of thousands of transcript phenotypes in different tissues and cells (level of expression), as well as hundreds of protein, cellular, pharmacological, and behavioral data types across a single genetic reference panel. Evaluating and modeling the associative and causal relations among these phenotypes is a major, and still relatively new area of research. Complex trait analysis and QTL mapping are both part of systems genetics in which causality is inferred using conventional genetic linkage (Li et al., [2005](http://hmg.oupjournals.org/cgi/content/abstract/ddi124v1)). One can often assert with confidence that a particular module of phenotypes (component of the variance and covariance) is modulated by sequence variants at a common locus. This provides a causal constraint that can be extremely helpful in more accurately modeling network architecture. Most models are currently static, but as the field matures, more sophisticated dynamic models will supplant steady-state models. - -The term "systems genetics" was coined by Grant Morahan, October 2004, during a visit to Memphis, as a more general and appropriate term to use instead of "genetical genomics." [Williams RW, April 11, 2005, revised Oct 22, 2005, April, 2008] - -[Go back to index](#index) - -
    - -## T - -
    - -#### Tissue Correlation: - -The tissue correlation is an estimate of the similarity of expression of two genes across different cells, tissues, or organs. In order to compute this type of correlation we first generate expression data for multiple different cell types, tissues, or whole organs from a single individual. There will be significant differences in gene expression across this sample and this variation can then be used to compute either Pearson product-moment correlations (r) or Spearman rank order correlations (rho) between any pair of genes, transcripts, or even exons. Since the samples are ideally all from one individual there should not be any genetic or environmental differences among samples. The difficulty in computing tissue correlations is that samples are not independent. For example, three samples of the small intestine (jejunum, ileum, and duodenum) will have expression patterns that are quite similar to each other in comparison to three other samples, such as heart, brain, and bone. For this reason the nature of the sampling and how those samples are combined will greatly affect the correlation values. The tissue correlations in GeneNetwork were computed in a way that attempts to reduce the impact of this fact by combining closely related sample types. For example, multiple data sets for different brain regions were combined to generate a single average CNS tissue sample (generating a whole brain sample would have been an alternative method). - -However, there is really no optimal way to minimize the effects of this type of non-independence of samples. Some genes will have high expression in only a few tissues, for example the cholinergic receptor, nicotinic, alpha polypeptide 1 gene Chrna1 has high expression in muscle tissues (skeletal muscle = Mus, tongue = Ton, and esophagus = Eso) but lower expression in most other tissues. 
The very high correlation between Chrna1 and other genes with high expression only in muscle reflects their joint bimodality of expression. It does not mean that these genes or their proteins necessarily cooperate directly in molecular processes. [Williams RW, Dec 26, 2008] - - - -#### Transcript Location: - -The small orange triangle on the x-axis indicates the approximate position of the gene that corresponds to the transcript. These values were taken from the latest assembly of the genome of the particular species. - -#### Transform: - -Most of the data sets in the GeneNetwork are ultimately derived from high resolution images of the surfaces of microarrays. Estimating gene expression therefore involves extensive low-level image analysis. These processing steps attempt to compensate for low spatial frequency "background" variation in image intensity that cannot be related to the actual hybridization signal, for example, a gradation of intensity across the whole array surface due to illumination differences, uneven hybridization, optical performance, scanning characteristics, etc. High spatial frequency artifacts are also removed if they are likely to be artifacts: dust, scratches on the array surface, and other "sharp" blemishes. The raw image data (for example, the Affymetrix DAT file) also needs to be registered to a template that assigns pixel values to expected array spots (cells). This image registration is an important process that users can usually take for granted. The end result is the reliable assignment of a set of image intensity values (pixels) to each probe. Each cell value generated using the Affymetrix U74Av2 array is associated with approximately 36 pixel intensity values (a 6x6 set of pixels, usually an effective 11 or 12-bit range of intensity). 
Affymetrix uses a method that simply ranks the values of these pixels and picks as the "representative value" the pixel that is closest to a particular rank order value, for example, the 24th highest of 36 pixels. The range of variation in intensity values among these ranked pixels provides a way to estimate the error of the estimate. The Affymetrix CEL files therefore consist of XY coordinates, the consensus value, and an error term. [Williams RW, April 30, 2005] - -#### Transgression: - -Most of us are familiar with the phrase "regression toward the mean." This refers to the tendency of progeny of a cross to have phenotypes that are intermediate to those of the parents. Transgression refers to the converse: progeny that have phenotypes that are higher or lower than those of either parent. Transgression is common, and provided that a trait is influenced by many independent sequence variants (a polygenic trait), transgression is the expectation. This is particularly true if the parents are different genetically, but by chance have similar phenotypes. Consider a trait that is controlled by six independent genes, A through F. The "0" allele at these six genes lowers body weight whereas the "1" allele increases body weight. If one parent has a 000111 6-locus genotype and the other parent has a 111000 genotype, then they will have closely matched weight. But their progeny may inherit combinations as extreme as 000000 and 111111. - -Transgression means that you can rarely predict the distribution of phenotypes among a set of progeny unless you already have a significant amount of information about the genetic architecture of a trait (numbers of segregating variants that affect the trait, their interactions, and GXE effects). 
In practical terms this means that if the parents of a cross do NOT differ and you have good reasons to believe that the trait you are interested in is genetically complex, then you can be fairly confident that the progeny will display a much wider range of variation than the parents. [May 2011 by RWW]. - -[Go back to index](#index) - -
    - -## U - -[Go back to index](#index) - -
    - -## V - -[Go back to index](#index) - -
    - -## W - -#### Winsorize, Winsorise: - -QTL mapping results can be greatly affected by inclusion of outlier data. GeneNetwork will do its best to flag outliers for you in the **Trait Data and Analysis** pages (yellow highlighting). Before mapping, review the data, and if necessary, change values. Options for handling outliers include: (1) do nothing, (2) delete the outliers (trimming), (3) transform the data (e.g., logarithmic, arcsine, or logistic regression transforms), or (4) [winsorize](http://en.wikipedia.org/wiki/Winsorising) the distribution of values. Winsorizing is usually the easiest method to implement directly in GeneNetwork. - -**How to winsorize**: First review the distribution of values and define outliers. You should only do this one time, so think before you leap. Look at the **Probability Plot** of the trait by going to the **Trait Data and Analysis** page and selecting **Basic Statistics**. For example, the figure below from GeneNetwork shows that as many as seven cases have relatively high values and as many as three have relatively low values (this trait is taken from Species = Mouse, Group = LXS, Type = Phenotype, Trait 10182). GeneNetwork code only declares the highest two values to be outliers, but you can use a more liberal definition and give all seven high values a haircut. It is advisable to winsorize equal numbers of cases on each side of the distribution (high and low cases). In this case, the seven highest values were changed to match that of the 8th highest value (0.860). To retain the original rank order I added an incremental value of 0.001 to each (0.861, 0.862, etc). I did the same thing to the lowest seven values. Adding this increment is not necessary. - -The result in this case: a suggestive QTL on Chr 16 now reaches the significance threshold. - -The **danger of winsorizing** is doing it multiple times in different ways. You should transform or winsorize the data before mapping. 
And you should ideally only do any transformation/correction one time. If you fool around with different methods of transforming your data then you are asking for trouble by adding yet another level of multiple testing. If you feel compelled to experiment with different transforms, then you should/must report this in publications and explain why you did so. Demonstrating that mapping results are robust even using multiple transforms is one good excuse. [Williams RW, Jan 2, 2014] - - - - - -[Go back to index](#index) - -
    - -## X - -[Go back to index](#index) - -
    - -## Y -[Go back to index](#index) - - -
    - -## Z - -[Go back to index](#index) -- cgit v1.2.3 From 51f05b5fa896a2ed636ecb08bf69d929896db517 Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Fri, 20 Nov 2020 18:47:21 +0300 Subject: Point url for fetching md content to gn-docs repo * wqflask/wqflask/markdown_routes.py (render_markdown): Use gn-docs URL to fetch markdown content. --- wqflask/wqflask/markdown_routes.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/wqflask/wqflask/markdown_routes.py b/wqflask/wqflask/markdown_routes.py index 33092947..48a1ea81 100644 --- a/wqflask/wqflask/markdown_routes.py +++ b/wqflask/wqflask/markdown_routes.py @@ -17,11 +17,9 @@ def render_markdown(file_name): look for it inside the file system """ - markdown_url = (f"https://raw.githubusercontent.com" - f"/genenetwork/genenetwork2/" - f"wqflask/wqflask/static/" - f"{file_name}") - md_content = requests.get(markdown_url) + github_url = ("https://raw.githubusercontent.com/" + "genenetwork/gn-docs/master/") + md_content = requests.get(f"{github_url}{file_name}") if md_content.status_code == 200: return mistune.html(md_content.content.decode("utf-8")) -- cgit v1.2.3 From 0a6a5d0ac847489bb5db158bc54e73aa682e18a0 Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Fri, 20 Nov 2020 18:51:46 +0300 Subject: Replace mistunes with markdown library * wqflask/wqflask/markdown_routes.py (render_markdown): Use markdown library instead of mistunes. 
--- wqflask/wqflask/markdown_routes.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/wqflask/wqflask/markdown_routes.py b/wqflask/wqflask/markdown_routes.py index 48a1ea81..7a9fac41 100644 --- a/wqflask/wqflask/markdown_routes.py +++ b/wqflask/wqflask/markdown_routes.py @@ -2,9 +2,8 @@ Render pages from github, or if they are unavailable, look for it else where """ -import os import requests -import mistune +import markdown from flask import Blueprint from flask import render_template @@ -21,16 +20,20 @@ look for it inside the file system "genenetwork/gn-docs/master/") md_content = requests.get(f"{github_url}{file_name}") if md_content.status_code == 200: - return mistune.html(md_content.content.decode("utf-8")) + return markdown.Markdown().convert(md_content.content.decode("utf-8")) - with open(os.path.join(os.path.abspath(os.path.dirname(__file__)), - f"static/markdown/{file_name}")) as md_file: - markdown = md_file.read() - return mistune.html(markdown) + # TODO: Add fallback on our git server by checking the mirror. + + # Content not available + return (f"\nContent for {file_name} not available. " + "Please check " + "(here to see where content exists)" + "[https://github.com/genenetwork/gn-docs]. 
" + "Please reach out to the gn2 team to have a look at this") @glossary_blueprint.route('/') def glossary(): return render_template( "glossary.html", - rendered_markdown=render_markdown("glossary.md")), 200 + rendered_markdown=render_markdown("general/glossary/glossary.md")), 200 -- cgit v1.2.3 From e323c52889cfbab7f152e4969f6e2b528768587a Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Fri, 20 Nov 2020 20:00:29 +0300 Subject: Update tests to use new gn2-docs url --- wqflask/tests/unit/wqflask/test_markdown_routes.py | 24 ++++++++++------------ 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/wqflask/tests/unit/wqflask/test_markdown_routes.py b/wqflask/tests/unit/wqflask/test_markdown_routes.py index 3adf63e5..2d403d04 100644 --- a/wqflask/tests/unit/wqflask/test_markdown_routes.py +++ b/wqflask/tests/unit/wqflask/test_markdown_routes.py @@ -23,8 +23,7 @@ class MockRequests200: This is some content ## Sub-heading -This is another sub-heading - """ +This is another sub-heading""" class TestMarkdownRoutesFunctions(unittest.TestCase): """Test cases for functions in markdown_routes""" @@ -32,27 +31,26 @@ class TestMarkdownRoutesFunctions(unittest.TestCase): @mock.patch('wqflask.markdown_routes.requests.get') def test_render_markdown_when_fetching_locally(self, requests_mock): requests_mock.return_value = MockRequests404() - markdown_content = render_markdown("glossary.md") + markdown_content = render_markdown("general/glossary/glossary.md") requests_mock.assert_called_with( "https://raw.githubusercontent.com" - "/genenetwork/genenetwork2/" - "wqflask/wqflask/static/" - "glossary.md") + "/genenetwork/gn-docs/" + "master/general/" + "glossary/glossary.md") self.assertRegexpMatches(markdown_content, - "Glossary of Terms and Features") + "Content for general/glossary/glossary.md not available.") @mock.patch('wqflask.markdown_routes.requests.get') def test_render_markdown_when_fetching_remotely(self, requests_mock): requests_mock.return_value = 
MockRequests200() - markdown_content = render_markdown("glossary.md") + markdown_content = render_markdown("general/glossary/glossary.md") requests_mock.assert_called_with( "https://raw.githubusercontent.com" - "/genenetwork/genenetwork2/" - "wqflask/wqflask/static/" - "glossary.md") + "/genenetwork/gn-docs/" + "master/general/" + "glossary/glossary.md") self.assertEqual("""

    Glossary

    This is some content

    Sub-heading

    -

    This is another sub-heading

    -""", +

    This is another sub-heading

    """, markdown_content) -- cgit v1.2.3 From 8fc58655c9b98e816f31e4a4f99b6879fe304bca Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Fri, 20 Nov 2020 20:03:23 +0300 Subject: Replace mockobject with dataclasses * wqflask/tests/unit/wqflask/test_markdown_routes.py (MockRequests404): Use dataclasses. (MockRequests200): Ditto. --- wqflask/tests/unit/wqflask/test_markdown_routes.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/wqflask/tests/unit/wqflask/test_markdown_routes.py b/wqflask/tests/unit/wqflask/test_markdown_routes.py index 2d403d04..90e0f17c 100644 --- a/wqflask/tests/unit/wqflask/test_markdown_routes.py +++ b/wqflask/tests/unit/wqflask/test_markdown_routes.py @@ -3,28 +3,26 @@ import unittest from unittest import mock +from dataclasses import dataclass from wqflask.markdown_routes import render_markdown +@dataclass class MockRequests404: - @property - def status_code(self): - return 404 + status_code: int = 404 -class MockRequests200: - @property - def status_code(self): - return 200 - @property - def content(self): - return b""" +@dataclass +class MockRequests200: + status_code: int = 200 + content: str = b""" # Glossary This is some content ## Sub-heading This is another sub-heading""" + class TestMarkdownRoutesFunctions(unittest.TestCase): """Test cases for functions in markdown_routes""" -- cgit v1.2.3 From 4a16262f1f8097122ed0bedf2e92211627ad223a Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Sat, 21 Nov 2020 09:58:14 +0300 Subject: Use docker image with markdown packaged * .github/workflows/main.yml (jobs): [container]: Use image that is on python3-genenetwork "ad741c1" which has python-markdown packaged. 
--- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index c78f6d85..2342796a 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -11,7 +11,7 @@ on: jobs: unittest: runs-on: ubuntu-latest - container: bonfacekilz/python3-genenetwork2:0bf4ee6 + container: bonfacekilz/python3-genenetwork2:ad741c1 steps: # First start with mariadb set then checkout. The checkout gives -- cgit v1.2.3 From 37904013a43f99a3fffa73d16245d1b79a3d61ba Mon Sep 17 00:00:00 2001 From: zsloan Date: Mon, 23 Nov 2020 10:35:22 -0600 Subject: Changed correlation matrix cells for traits with zero shared samples to instead say N/A and be a shade of grey --- wqflask/wqflask/templates/correlation_matrix.html | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/wqflask/wqflask/templates/correlation_matrix.html b/wqflask/wqflask/templates/correlation_matrix.html index d556f31a..4e150618 100644 --- a/wqflask/wqflask/templates/correlation_matrix.html +++ b/wqflask/wqflask/templates/correlation_matrix.html @@ -51,8 +51,12 @@ {% if result[0].name == trait.name and result[0].dataset == trait.dataset %} n
    {{ result[2] }}
    {% else %} + {% if result[1] == 0 %} + N/A + {% else %} {{ '%0.2f' % result[1] }}
    {{ result[2] }}
    {% endif %} + {% endif %} {% endfor %} {% endfor %} -- cgit v1.2.3 From 3957634c1fd24eb61efe0adc74fe7c6c7841ad86 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Tue, 24 Nov 2020 10:27:29 +0300 Subject: solve issues to do with decoding in python3 --- wqflask/wqflask/docs.py | 4 +++- wqflask/wqflask/views.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/wqflask/wqflask/docs.py b/wqflask/wqflask/docs.py index d653c269..23fc3cad 100644 --- a/wqflask/wqflask/docs.py +++ b/wqflask/wqflask/docs.py @@ -19,8 +19,10 @@ class Docs(object): self.title = self.entry.capitalize() self.content = "" else: + self.title = result[0] - self.content = result[1] + self.content = result[1].decode("utf-8") + self.editable = "false" # ZS: Removing option to edit to see if text still gets vandalized diff --git a/wqflask/wqflask/views.py b/wqflask/wqflask/views.py index b7c4d142..bf2e9026 100644 --- a/wqflask/wqflask/views.py +++ b/wqflask/wqflask/views.py @@ -304,8 +304,8 @@ def news(): @app.route("/references") def references(): doc = Docs("references", request.args) + # return render_template("reference.html") return render_template("docs.html", **doc.__dict__) - #return render_template("reference.html") @app.route("/intro") def intro(): -- cgit v1.2.3 From a53302a077b8eea3591460a03a60c9621c3f4586 Mon Sep 17 00:00:00 2001 From: zsloan Date: Wed, 25 Nov 2020 13:11:11 -0600 Subject: Substituted CDN link to the font-awesome CSS with the local guix profile link --- wqflask/wqflask/templates/search_result_page.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wqflask/wqflask/templates/search_result_page.html b/wqflask/wqflask/templates/search_result_page.html index 8e2b06a4..35d8ed27 100644 --- a/wqflask/wqflask/templates/search_result_page.html +++ b/wqflask/wqflask/templates/search_result_page.html @@ -2,9 +2,9 @@ {% block title %}Search Results{% endblock %} {% block css %} + - {% endblock %} -- cgit v1.2.3 From 
c8a339616fb3dd3f63427378dc4285de9ad1ccef Mon Sep 17 00:00:00 2001 From: zsloan Date: Wed, 25 Nov 2020 13:11:48 -0600 Subject: Removed an unused console.timeEnd in the JS --- wqflask/wqflask/templates/search_result_page.html | 2 -- 1 file changed, 2 deletions(-) diff --git a/wqflask/wqflask/templates/search_result_page.html b/wqflask/wqflask/templates/search_result_page.html index 35d8ed27..9a28a78e 100644 --- a/wqflask/wqflask/templates/search_result_page.html +++ b/wqflask/wqflask/templates/search_result_page.html @@ -440,8 +440,6 @@ 'processing': 'Loading...' } } ); - - console.timeEnd("Creating table"); $('.toggle-vis').on( 'click', function (e) { e.preventDefault(); -- cgit v1.2.3 From ab6681e545bd4c84e1f74107b90ea52f7d2ca24c Mon Sep 17 00:00:00 2001 From: zsloan Date: Wed, 25 Nov 2020 13:15:25 -0600 Subject: Added form options for changing manhattan plot color scheme + added necessary imports and hidden inputs --- wqflask/wqflask/templates/mapping_results.html | 47 +++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/wqflask/wqflask/templates/mapping_results.html b/wqflask/wqflask/templates/mapping_results.html index 28d93542..9542c29d 100644 --- a/wqflask/wqflask/templates/mapping_results.html +++ b/wqflask/wqflask/templates/mapping_results.html @@ -40,6 +40,9 @@ + {% if manhattan_plot == True %} + + {% endif %} @@ -55,7 +58,7 @@ -
    +

    Map Viewer: Whole Genome


    Population: {{ dataset.group.species|capitalize }} {{ dataset.group.name }}
    @@ -77,7 +80,7 @@ - - + @@ -114,11 +117,31 @@ -
    Chr:  +
    View: View:  to
    Width:  + pixels (minimum=900)
    + {% if manhattan_plot == True and selectedChr == -1 %} + + + + + + +
    + Manhattan Plot Color Scheme:  + + + + + +
    + {% endif %}
    {% if (mapping_method == "reaper" or mapping_method == "rqtl_geno") and nperm > 0 %} @@ -328,6 +351,9 @@ + {% if manhattan_plot == True and selectedChr == -1 %} + + {% endif %} @@ -423,7 +449,7 @@ var mapping_input_list = ['temp_uuid', 'trait_id', 'dataset', 'tool_used', 'form_url', 'method', 'transform', 'trimmed_markers', 'selected_chr', 'chromosomes', 'mapping_scale', 'score_type', 'suggestive', 'significant', 'num_perm', 'permCheck', 'perm_output', 'perm_strata', 'categorical_vars', 'num_bootstrap', 'bootCheck', 'bootstrap_results', - 'LRSCheck', 'covariates', 'maf', 'use_loco', 'manhattan_plot', 'control_marker', 'control_marker_db', 'do_control', 'genofile', + 'LRSCheck', 'covariates', 'maf', 'use_loco', 'manhattan_plot', 'color_scheme', 'manhattan_single_color', 'control_marker', 'control_marker_db', 'do_control', 'genofile', 'pair_scan', 'startMb', 'endMb', 'graphWidth', 'lrsMax', 'additiveCheck', 'showSNP', 'showGenes', 'viewLegend', 'haplotypeAnalystCheck', 'mapmethod_rqtl_geno', 'mapmodel_rqtl_geno', 'temp_trait', 'group', 'species', 'reaper_version', 'primary_samples', 'n_samples'] @@ -449,10 +475,21 @@ remap = function() { $('input[name=selected_chr]').val($('select[name=chromosomes]').val()); + $('input[name=color_scheme]').val($('select#color_scheme').val()); $('#marker_regression_form').attr('action', '/loading'); return $('#marker_regression_form').submit(); }; + {% if manhattan_plot == True and selectedChr == -1 %} + $('#color_scheme').change(function(){ + if ($(this).val() == "single"){ + $('#point_color_picker').show(); + } else { + $('#point_color_picker').hide(); + } + }); + {% endif %} + {% if mapping_method != "gemma" and mapping_method != "plink" %} $('#download_perm').click(function(){ perm_info_dict = { -- cgit v1.2.3 From a1563aee47a54d5ff4253e1da7e851afe28805c0 Mon Sep 17 00:00:00 2001 From: zsloan Date: Wed, 25 Nov 2020 13:16:26 -0600 Subject: Added code for passing around form parameters related to manhattan plot coloration to 
run_mapping.py and views.py --- wqflask/wqflask/marker_regression/run_mapping.py | 7 ++++++- wqflask/wqflask/views.py | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/wqflask/wqflask/marker_regression/run_mapping.py b/wqflask/wqflask/marker_regression/run_mapping.py index 31d6a67c..c474e0e0 100644 --- a/wqflask/wqflask/marker_regression/run_mapping.py +++ b/wqflask/wqflask/marker_regression/run_mapping.py @@ -138,7 +138,12 @@ class RunMapping(object): mapping_results_filename = self.dataset.group.name + "_" + ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(6)) self.mapping_results_path = "{}{}.csv".format(webqtlConfig.GENERATED_IMAGE_DIR, mapping_results_filename) - if start_vars['manhattan_plot'] == "true": + if start_vars['manhattan_plot']: + self.color_scheme = "alternating" + if "color_scheme" in start_vars: + self.color_scheme = start_vars['color_scheme'] + if self.color_scheme == "single": + self.manhattan_single_color = start_vars['manhattan_single_color'] self.manhattan_plot = True else: self.manhattan_plot = False diff --git a/wqflask/wqflask/views.py b/wqflask/wqflask/views.py index b7c4d142..7118bab7 100644 --- a/wqflask/wqflask/views.py +++ b/wqflask/wqflask/views.py @@ -718,6 +718,8 @@ def mapping_results_page(): 'maf', 'use_loco', 'manhattan_plot', + 'color_scheme', + 'manhattan_single_color', 'control_marker', 'control_marker_db', 'do_control', -- cgit v1.2.3 From d1740605ffd36e49b9560e81c0e085c58ef8b688 Mon Sep 17 00:00:00 2001 From: zsloan Date: Wed, 25 Nov 2020 13:17:03 -0600 Subject: Added actual code for coloring manhattan plot differently to display_mapping_results.py --- .../marker_regression/display_mapping_results.py | 46 ++++++++++++++++++++-- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/wqflask/wqflask/marker_regression/display_mapping_results.py b/wqflask/wqflask/marker_regression/display_mapping_results.py index 3f6de2b2..87910401 100644 --- 
a/wqflask/wqflask/marker_regression/display_mapping_results.py +++ b/wqflask/wqflask/marker_regression/display_mapping_results.py @@ -74,6 +74,34 @@ DARKVIOLET = ImageColor.getrgb("darkviolet") MEDIUMPURPLE = ImageColor.getrgb("mediumpurple") # ---- END: Define common colours ---- # +# ZS: List of distinct colors for manhattan plot if user selects "varied" +DISTINCT_COLOR_LIST = [ + ImageColor.getrgb("#FF0000"), + ImageColor.getrgb("#00FF00"), + ImageColor.getrgb("#0000FF"), + ImageColor.getrgb("#FFFF00"), + ImageColor.getrgb("#FF00FF"), + ImageColor.getrgb("#00FFFF"), + ImageColor.getrgb("#000000"), + ImageColor.getrgb("#800000"), + ImageColor.getrgb("#008000"), + ImageColor.getrgb("#000080"), + ImageColor.getrgb("#808000"), + ImageColor.getrgb("#800080"), + ImageColor.getrgb("#008080"), + ImageColor.getrgb("#808080"), + ImageColor.getrgb("#C00000"), + ImageColor.getrgb("#00C000"), + ImageColor.getrgb("#0000C0"), + ImageColor.getrgb("#C0C000"), + ImageColor.getrgb("#C000C0"), + ImageColor.getrgb("#00C0C0"), + ImageColor.getrgb("#C0C0C0"), + ImageColor.getrgb("#400000"), + ImageColor.getrgb("#004000"), + ImageColor.getrgb("#000040"), +] + # ---- FONT FILES ---- # VERDANA_FILE = "./wqflask/static/fonts/verdana.ttf" VERDANA_BOLD_FILE = "./wqflask/static/fonts/verdanab.ttf" @@ -293,6 +321,12 @@ class DisplayMappingResults(object): self.plotScale = "physic" self.manhattan_plot = start_vars['manhattan_plot'] + if self.manhattan_plot: + self.color_scheme = "alternating" + if 'color_scheme' in start_vars: + self.color_scheme = start_vars['color_scheme'] + if self.color_scheme == "single": + self.manhattan_single_color = ImageColor.getrgb("#" + start_vars['manhattan_single_color']) if 'permCheck' in list(start_vars.keys()): self.permChecked = start_vars['permCheck'] @@ -2424,10 +2458,16 @@ class DisplayMappingResults(object): Yc = yZero - qtlresult['lod_score']*LRSHeightThresh/LRS_LOD_Max if self.manhattan_plot == True: - if self.selectedChr == -1 and (previous_chr_as_int 
% 2 == 1): - point_color = RED + if self.color_scheme == "single": + point_color = self.manhattan_single_color + elif self.color_scheme == "varied": + point_color = DISTINCT_COLOR_LIST[previous_chr_as_int] else: - point_color = BLUE + if self.selectedChr == -1 and (previous_chr_as_int % 2 == 1): + point_color = RED + else: + point_color = BLUE + im_drawer.text( text="5", xy=( -- cgit v1.2.3 From 78fd4d4a89a02fc5233a1f5bbdfd3946ec08f3b7 Mon Sep 17 00:00:00 2001 From: zsloan Date: Thu, 26 Nov 2020 16:06:59 -0600 Subject: Changed the way the chromosome colors for the "color by chr" manhattan plot option are stored to using a list comprehension on a list of the just the hex strings --- .../marker_regression/display_mapping_results.py | 32 ++++------------------ 1 file changed, 6 insertions(+), 26 deletions(-) diff --git a/wqflask/wqflask/marker_regression/display_mapping_results.py b/wqflask/wqflask/marker_regression/display_mapping_results.py index 87910401..08c2d750 100644 --- a/wqflask/wqflask/marker_regression/display_mapping_results.py +++ b/wqflask/wqflask/marker_regression/display_mapping_results.py @@ -75,32 +75,12 @@ MEDIUMPURPLE = ImageColor.getrgb("mediumpurple") # ---- END: Define common colours ---- # # ZS: List of distinct colors for manhattan plot if user selects "varied" -DISTINCT_COLOR_LIST = [ - ImageColor.getrgb("#FF0000"), - ImageColor.getrgb("#00FF00"), - ImageColor.getrgb("#0000FF"), - ImageColor.getrgb("#FFFF00"), - ImageColor.getrgb("#FF00FF"), - ImageColor.getrgb("#00FFFF"), - ImageColor.getrgb("#000000"), - ImageColor.getrgb("#800000"), - ImageColor.getrgb("#008000"), - ImageColor.getrgb("#000080"), - ImageColor.getrgb("#808000"), - ImageColor.getrgb("#800080"), - ImageColor.getrgb("#008080"), - ImageColor.getrgb("#808080"), - ImageColor.getrgb("#C00000"), - ImageColor.getrgb("#00C000"), - ImageColor.getrgb("#0000C0"), - ImageColor.getrgb("#C0C000"), - ImageColor.getrgb("#C000C0"), - ImageColor.getrgb("#00C0C0"), - 
ImageColor.getrgb("#C0C0C0"), - ImageColor.getrgb("#400000"), - ImageColor.getrgb("#004000"), - ImageColor.getrgb("#000040"), -] +COLOR_CODES = ["#FF0000", "#00FF00", "#0000FF", "#FFFF00", "#FF00FF", "#00FFFF", + "#000000", "#800000", "#008000", "#000080", "#808000", "#800080", + "#008080", "#808080", "#C00000", "#00C000", "#0000C0", "#C0C000", + "#C000C0", "#00C0C0", "#C0C0C0", "#400000", "#004000", "#000040"] + +DISTINCT_COLOR_LIST = [ImageColor.getrgb(color) for color in COLOR_CODES] # ---- FONT FILES ---- # VERDANA_FILE = "./wqflask/static/fonts/verdana.ttf" -- cgit v1.2.3