From a2f6406909951a80dc4ead809a09e8de2c15200d Mon Sep 17 00:00:00 2001
From: Muriithi Frederick Muriuki
Date: Tue, 17 Aug 2021 08:49:14 +0300
Subject: Provide top-level `riset` key-value pair

Issue:
https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi

* Provide the expected, top-level `riset` key-value pair and eliminate the
  redundant key-value pair.
---
 gn3/db/traits.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'gn3/db/traits.py')

diff --git a/gn3/db/traits.py b/gn3/db/traits.py
index 6ea24be..1031e44 100644
--- a/gn3/db/traits.py
+++ b/gn3/db/traits.py
@@ -418,9 +418,9 @@ def retrieve_trait_info(
         conn)
     if trait_info["haveinfo"]:
         return {
-            **trait_post_processing_functions_table[trait_dataset_type](trait_info),
-            "db": {**trait["db"], **trait_dataset},
-            "riset": trait_dataset["riset"]
+            **trait_post_processing_functions_table[trait_dataset_type](
+                {**trait_info, "riset": trait_dataset["riset"]}),
+            "db": {**trait["db"], **trait_dataset}
         }
     return trait_info
 
-- 
cgit v1.2.3


From e441509a59c20a051fd5ab94710513f1968a5e02 Mon Sep 17 00:00:00 2001
From: Muriithi Frederick Muriuki
Date: Tue, 31 Aug 2021 10:50:56 +0300
Subject: Update `heatmap_data` function: remove extraneous data

Issue:
https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi

* gn3/computations/heatmap.py: update function
* gn3/db/traits.py: new function

  Remove extraneous data and arguments from the `heatmap_data` function.
  - Load the genotype file
  - Generate the traits file
  - Provide both the raw traits data and the exported traits data in the
    return value
---
 gn3/computations/heatmap.py | 42 ++++++++++++++++++++++--------------------
 gn3/db/traits.py            |  5 +++++
 2 files changed, 27 insertions(+), 20 deletions(-)

(limited to 'gn3/db/traits.py')

diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py
index e0ff05b..92014cf 100644
--- a/gn3/computations/heatmap.py
+++ b/gn3/computations/heatmap.py
@@ -6,8 +6,12 @@ generate various kinds of heatmaps.
 from functools import reduce
 from typing import Any, Dict, Sequence
 from gn3.computations.slink import slink
-from gn3.db.traits import retrieve_trait_data, retrieve_trait_info
 from gn3.computations.correlations2 import compute_correlation
+from gn3.db.genotypes import build_genotype_file, load_genotype_samples
+from gn3.db.traits import (
+    retrieve_trait_data,
+    retrieve_trait_info,
+    generate_traits_filename)
 
 def export_trait_data(
         trait_data: dict, strainlist: Sequence[str], dtype: str = "val",
@@ -125,7 +129,7 @@ def cluster_traits(traits_data_list: Sequence[Dict]):
 
     return tuple(__cluster(tdata_i) for tdata_i in enumerate(traits_data_list))
 
-def heatmap_data(formd, search_result, conn: Any):
+def heatmap_data(traits_names, conn: Any):
     """
     heatmap function
 
@@ -142,39 +146,37 @@ def heatmap_data(formd, search_result, conn: Any):
     TODO: Elaborate on the parameters here...
     """
     threshold = 0 # webqtlConfig.PUBLICTHRESH
-    cluster_checked = formd.formdata.getvalue("clusterCheck", "")
-    strainlist = [
-        strain for strain in formd.strainlist if strain not in formd.parlist]
-    genotype = formd.genotype
-
     def __retrieve_traitlist_and_datalist(threshold, fullname):
         trait = retrieve_trait_info(threshold, fullname, conn)
         return (trait, retrieve_trait_data(trait, conn))
 
     traits_details = [
         __retrieve_traitlist_and_datalist(threshold, fullname)
-        for fullname in search_result]
+        for fullname in traits_names]
     traits_list = tuple(x[0] for x in traits_details)
     traits_data_list = [x[1] for x in traits_details]
     exported_traits_data_list = tuple(
         export_trait_data(td, strainlist) for td in traits_data_list)
+    genotype_filename = build_genotype_file(traits_list[0]["riset"])
+    strainlist = load_genotype_samples(genotype_filename)
+    slink_data = slink(cluster_traits(exported_traits_data_list))
+    ordering_data = compute_heatmap_order(slink_data)
+    strains_and_values = retrieve_strains_and_values(
+        orders, strainlist, exported_traits_data_list)
+    strains_values = strains_and_values[0][1]
+    trait_values = [t[2] for t in strains_and_values]
+    traits_filename = generate_traits_filename()
+    generate_traits_file(strains_values, trait_values, traits_filename)
 
     return {
-        "target_description_checked": formd.formdata.getvalue(
-            "targetDescriptionCheck", ""),
-        "cluster_checked": cluster_checked,
-        "slink_data": (
-            slink(cluster_traits(exported_traits_data_list))
-            if cluster_checked else False),
-        "sessionfile": formd.formdata.getvalue("session"),
-        "genotype": genotype,
-        "nLoci": sum(map(len, genotype)),
+        "slink_data": slink_data,
+        "ordering_data": ordering_data,
         "strainlist": strainlist,
-        "ppolar": formd.ppolar,
-        "mpolar":formd.mpolar,
+        "genotype_filename": genotype_filename,
         "traits_list": traits_list,
         "traits_data_list": traits_data_list,
-        "exported_traits_data_list": exported_traits_data_list
+        "exported_traits_data_list": exported_traits_data_list,
+        "traits_filename": traits_filename
     }
 
 def compute_heatmap_order(
diff --git a/gn3/db/traits.py b/gn3/db/traits.py
index 1031e44..ccb101a 100644
--- a/gn3/db/traits.py
+++ b/gn3/db/traits.py
@@ -1,4 +1,5 @@
 """This class contains functions relating to trait data manipulation"""
+from gn3.settings import TMPDIR
 from typing import Any, Dict, Union, Sequence
 from gn3.function_helpers import compose
 from gn3.db.datasets import retrieve_trait_dataset
@@ -666,3 +667,7 @@ def retrieve_trait_data(trait: dict, conn: Any, strainlist: Sequence[str] = tupl
                     {k:v for k, v in x.items() if x != "strain_name"}),
                 data))}
     return {}
+
+def generate_traits_filename(base_path: str = TMPDIR):
+    return "{}/traits_test_file_{}.txt".format(
+        os.path.abspath(base_path), random_string(10))
-- 
cgit v1.2.3


From b5e1d1176f1bf4f7c0b68b27beb15e99418f1650 Mon Sep 17 00:00:00 2001
From: Muriithi Frederick Muriuki
Date: Tue, 31 Aug 2021 11:16:29 +0300
Subject: Fix linting errors, minor bugs and reorganise code

* Fix some linting errors and some minor bugs caught by the linter.
  Move the `random_string` function to a separate module for use in multiple
  places in the code.
---
 gn3/computations/heatmap.py               |  7 ++++---
 gn3/computations/qtlreaper.py             | 27 ++++++++++++++-------------
 gn3/db/traits.py                          |  5 ++++-
 gn3/heatmaps/heatmaps.py                  | 25 +++++++++++++++++++------
 gn3/random.py                             | 11 +++++++++++
 tests/unit/computations/test_qtlreaper.py |  5 +++--
 6 files changed, 55 insertions(+), 25 deletions(-)
 create mode 100644 gn3/random.py

(limited to 'gn3/db/traits.py')

diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py
index 92014cf..1143450 100644
--- a/gn3/computations/heatmap.py
+++ b/gn3/computations/heatmap.py
@@ -6,6 +6,7 @@ generate various kinds of heatmaps.
 from functools import reduce
 from typing import Any, Dict, Sequence
 from gn3.computations.slink import slink
+from gn3.computations.qtlreaper import generate_traits_file
 from gn3.computations.correlations2 import compute_correlation
 from gn3.db.genotypes import build_genotype_file, load_genotype_samples
 from gn3.db.traits import (
@@ -155,14 +156,14 @@ def heatmap_data(traits_names, conn: Any):
         for fullname in traits_names]
     traits_list = tuple(x[0] for x in traits_details)
     traits_data_list = [x[1] for x in traits_details]
-    exported_traits_data_list = tuple(
-        export_trait_data(td, strainlist) for td in traits_data_list)
     genotype_filename = build_genotype_file(traits_list[0]["riset"])
     strainlist = load_genotype_samples(genotype_filename)
+    exported_traits_data_list = tuple(
+        export_trait_data(td, strainlist) for td in traits_data_list)
     slink_data = slink(cluster_traits(exported_traits_data_list))
     ordering_data = compute_heatmap_order(slink_data)
     strains_and_values = retrieve_strains_and_values(
-        orders, strainlist, exported_traits_data_list)
+        ordering_data, strainlist, exported_traits_data_list)
     strains_values = strains_and_values[0][1]
     trait_values = [t[2] for t in strains_and_values]
     traits_filename = generate_traits_filename()
diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py
index 3b8e4db..30c7051 100644
--- a/gn3/computations/qtlreaper.py
+++ b/gn3/computations/qtlreaper.py
@@ -3,17 +3,10 @@ This module contains functions to interact with the `qtlreaper` utility for
 computation of QTLs.
 """
 import os
-import random
-import string
 import subprocess
+from gn3.random import random_string
 from gn3.settings import TMPDIR, REAPER_COMMAND
 
-def random_string(length):
-    """Generate a random string of length `length`."""
-    return "".join(
-        random.choices(
-            string.ascii_letters + string.digits, k=length))
-
 def generate_traits_file(strains, trait_values, traits_filename):
     """
     Generate a traits file for use with `qtlreaper`.
@@ -25,11 +18,13 @@ def generate_traits_file(strains, trait_values, traits_filename):
         computation of QTLs.
     """
     header = "Trait\t{}\n".format("\t".join(strains))
-    data = [header] + [
-        "T{}\t{}\n".format(i+1, "\t".join([str(i) for i in t]))
-        for i, t in enumerate(trait_values[:-1])] + [
-        "T{}\t{}".format(len(trait_values), "\t".join([str(i) for i in t]))
-        for t in trait_values[-1:]]
+    data = (
+        [header] +
+        ["T{}\t{}\n".format(i+1, "\t".join([str(i) for i in t]))
+         for i, t in enumerate(trait_values[:-1])] +
+        ["T{}\t{}".format(
+            len(trait_values), "\t".join([str(i) for i in t]))
+         for t in trait_values[-1:]])
     with open(traits_filename, "w") as outfile:
         outfile.writelines(data)
 
@@ -93,6 +88,9 @@ def run_reaper(
 
 
 def parse_reaper_main_results(results_file):
+    """
+    Parse the results file of running QTLReaper into a list of dicts.
+    """
     with open(results_file, "r") as infile:
         lines = infile.readlines()
 
@@ -104,6 +102,9 @@ def parse_reaper_main_results(results_file):
     return [dict(zip(header, __parse_line(line))) for line in lines[1:]]
 
 def parse_reaper_permutation_results(results_file):
+    """
+    Parse the results QTLReaper permutations into a list of values.
+    """
     with open(results_file, "r") as infile:
         lines = infile.readlines()
 
diff --git a/gn3/db/traits.py b/gn3/db/traits.py
index ccb101a..bfe887e 100644
--- a/gn3/db/traits.py
+++ b/gn3/db/traits.py
@@ -1,6 +1,8 @@
 """This class contains functions relating to trait data manipulation"""
-from gn3.settings import TMPDIR
+import os
 from typing import Any, Dict, Union, Sequence
+from gn3.settings import TMPDIR
+from gn3.random import random_string
 from gn3.function_helpers import compose
 from gn3.db.datasets import retrieve_trait_dataset
 
@@ -669,5 +671,6 @@ def retrieve_trait_data(trait: dict, conn: Any, strainlist: Sequence[str] = tupl
     return {}
 
 def generate_traits_filename(base_path: str = TMPDIR):
+    """Generate a unique filename for use with generated traits files."""
     return "{}/traits_test_file_{}.txt".format(
         os.path.abspath(base_path), random_string(10))
diff --git a/gn3/heatmaps/heatmaps.py b/gn3/heatmaps/heatmaps.py
index 3bf7917..88f546d 100644
--- a/gn3/heatmaps/heatmaps.py
+++ b/gn3/heatmaps/heatmaps.py
@@ -14,6 +14,19 @@ def generate_random_data(data_stop: float = 2, width: int = 10, height: int = 30
     return [[random.uniform(0,data_stop) for i in range(0, width)]
             for j in range(0, height)]
 
+def generate_random_data2(data_stop: float = 2, width: int = 10, height: int = 30):
+    """
+    This is mostly a utility function to be used to generate random data, useful
+    for development of the heatmap generation code, without access to the actual
+    database data.
+    """
+    return [
+        [{
+            "value": item,
+            "category": random.choice(["C57BL/6J +", "DBA/2J +"])}
+         for item in axis]
+        for axis in generate_random_data(data_stop, width, height)]
+
 def heatmap_x_axis_names():
     return [
         "UCLA_BXDBXH_CARTILAGE_V2::ILM103710672",
@@ -30,13 +43,14 @@ def heatmap_x_axis_names():
 
 # Grey + Blue + Red
 def generate_heatmap():
-    rows = 20
-    data = generate_random_data(height=rows)
-    y = (["%s"%x for x in range(1, rows+1)][:-1] + ["X"]) #replace last item with x for now
+    cols = 20
+    y_axis = (["%s"%x for x in range(1, cols+1)][:-1] + ["X"]) #replace last item with x for now
+    x_axis = heatmap_x_axis_names()
+    data = generate_random_data(height=cols, width=len(x_axis))
     fig = px.imshow(
         data,
-        x=heatmap_x_axis_names(),
-        y=y,
+        x=x_axis,
+        y=y_axis,
         width=500)
     fig.update_traces(xtype="array")
     fig.update_traces(ytype="array")
@@ -49,6 +63,5 @@ def generate_heatmap():
         coloraxis_colorscale=[
             [0.0, '#3B3B3B'], [0.4999999999999999, '#ABABAB'],
             [0.5, '#F5DE11'], [1.0, '#FF0D00']])
-
     fig.write_html("%s/%s"%(heatmap_dir, "test_image.html"))
     return fig
diff --git a/gn3/random.py b/gn3/random.py
new file mode 100644
index 0000000..f0ba574
--- /dev/null
+++ b/gn3/random.py
@@ -0,0 +1,11 @@
+"""
+Functions to generate complex random data.
+"""
+import random
+import string
+
+def random_string(length):
+    """Generate a random string of length `length`."""
+    return "".join(
+        random.choices(
+            string.ascii_letters + string.digits, k=length))
diff --git a/tests/unit/computations/test_qtlreaper.py b/tests/unit/computations/test_qtlreaper.py
index ec23664..6c3b64d 100644
--- a/tests/unit/computations/test_qtlreaper.py
+++ b/tests/unit/computations/test_qtlreaper.py
@@ -1,5 +1,4 @@
 """Module contains tests for gn3.computations.qtlreaper"""
-import os
 from unittest import TestCase
 from gn3.computations.qtlreaper import (
     parse_reaper_main_results, parse_reaper_permutation_results)
@@ -8,6 +7,7 @@ class TestQTLReaper(TestCase):
     """Class for testing qtlreaper interface functions."""
 
     def test_parse_reaper_main_results(self):
+        """Test that the main results file is parsed correctly."""
         self.assertEqual(
             parse_reaper_main_results(
                 "tests/unit/computations/data/qtlreaper/main_output_sample.txt"),
@@ -65,9 +65,10 @@ class TestQTLReaper(TestCase):
             ])
 
     def test_parse_reaper_permutation_results(self):
+        """Test that the permutations results file is parsed correctly."""
         self.assertEqual(
             parse_reaper_permutation_results(
-            "tests/unit/computations/data/qtlreaper/permu_output_sample.txt"),
+                "tests/unit/computations/data/qtlreaper/permu_output_sample.txt"),
             [4.44174, 5.03825, 5.08167, 5.18119, 5.18578, 5.24563, 5.24619,
              5.24619, 5.27961, 5.28228, 5.43903, 5.50188, 5.51694, 5.56830,
              5.63874, 5.71346, 5.71936, 5.74275, 5.76764, 5.79815, 5.81671,
-- 
cgit v1.2.3


From cd7f301688fd9780df1f842f8bd2b7602775ba1f Mon Sep 17 00:00:00 2001
From: Frederick Muriuki Muriithi
Date: Wed, 22 Sep 2021 07:53:53 +0300
Subject: Fix pylint errors

* Add missing function and module docstrings
* Remove unused imports
* Fix import order
* Rework some code sections to fix issues
* Disable some pylint errors.
---
 gn3/api/heatmaps.py           |  8 ++++++++
 gn3/app.py                    |  5 +++--
 gn3/computations/qtlreaper.py |  8 ++++++++
 gn3/db/genotypes.py           |  1 +
 gn3/db/traits.py              |  2 +-
 gn3/heatmaps.py               | 28 ++++++++++++++++------------
 6 files changed, 37 insertions(+), 15 deletions(-)

(limited to 'gn3/db/traits.py')

diff --git a/gn3/api/heatmaps.py b/gn3/api/heatmaps.py
index 1022a35..fe47aee 100644
--- a/gn3/api/heatmaps.py
+++ b/gn3/api/heatmaps.py
@@ -1,3 +1,7 @@
+"""
+Module to hold the entrypoint functions that generate heatmaps
+"""
+
 import io
 from flask import jsonify
 from flask import request
@@ -9,6 +13,10 @@ heatmaps = Blueprint("heatmaps", __name__)
 
 @heatmaps.route("/clustered", methods=("POST",))
 def clustered_heatmaps():
+    """
+    Parses the incoming data and responds with the JSON-serialized plotly figure
+    representing the clustered heatmap.
+    """
     heatmap_request = request.get_json()
     traits_names = heatmap_request.get("traits_names", tuple())
     if len(traits_names) < 2:
diff --git a/gn3/app.py b/gn3/app.py
index 6b4c57e..8badb65 100644
--- a/gn3/app.py
+++ b/gn3/app.py
@@ -3,7 +3,10 @@ import os
 
 from typing import Dict
 from typing import Union
+
 from flask import Flask
+from flask_cors import CORS
+
 from gn3.api.gemma import gemma
 from gn3.api.rqtl import rqtl
 from gn3.api.general import general
@@ -11,8 +14,6 @@ from gn3.api.heatmaps import heatmaps
 from gn3.api.correlation import correlation
 from gn3.api.data_entry import data_entry
 
-from flask_cors import CORS
-
 def create_app(config: Union[Dict, str, None] = None) -> Flask:
     """Create a new flask object"""
     app = Flask(__name__)
diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py
index 377db9b..5d17fed 100644
--- a/gn3/computations/qtlreaper.py
+++ b/gn3/computations/qtlreaper.py
@@ -87,11 +87,17 @@ def run_reaper(
     return (output_filename, permu_output_filename)
 
 def chromosome_sorter_key_fn(val):
+    """
+    Useful for sorting the chromosomes
+    """
     if isinstance(val, int):
         return val
     return ord(val)
 
 def organise_reaper_main_results(parsed_results):
+    """
+    Provide the results of running reaper in a format that is easier to use.
+    """
     def __organise_by_chromosome(chr_name, items):
         chr_items = [item for item in items if item["Chr"] == chr_name]
         return {
@@ -129,12 +135,14 @@ def parse_reaper_main_results(results_file):
         lines = infile.readlines()
 
     def __parse_column_float_value(value):
+        # pylint: disable=W0702
         try:
             return float(value)
         except:
             return value
 
     def __parse_column_int_value(value):
+        # pylint: disable=W0702
         try:
             return int(value)
         except:
diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py
index 9d052d9..919c539 100644
--- a/gn3/db/genotypes.py
+++ b/gn3/db/genotypes.py
@@ -115,6 +115,7 @@ def parse_genotype_marker(line: str, geno_obj: dict, parlist: list):
     Reworks
     https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/utility/gen_geno_ob.py#L143-L190
     """
+    # pylint: disable=W0702
     marker_row = [item.strip() for item in line.split("\t")]
     geno_table = {
         geno_obj["mat"]: -1, geno_obj["pat"]: 1, geno_obj["het"]: 0,
diff --git a/gn3/db/traits.py b/gn3/db/traits.py
index bfe887e..747ed27 100644
--- a/gn3/db/traits.py
+++ b/gn3/db/traits.py
@@ -46,7 +46,7 @@ def update_sample_data(conn: Any,
                        count: Union[int, str]):
     """Given the right parameters, update sample-data from the relevant
     table."""
-    # pylint: disable=[R0913, R0914]
+    # pylint: disable=[R0913, R0914, C0103]
     STRAIN_ID_SQL: str = "UPDATE Strain SET Name = %s WHERE Id = %s"
     PUBLISH_DATA_SQL: str = ("UPDATE PublishData SET value = %s "
                              "WHERE StrainId = %s AND Id = %s")
diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py
index cd93b3f..9d82fb2 100644
--- a/gn3/heatmaps.py
+++ b/gn3/heatmaps.py
@@ -3,29 +3,28 @@ This module will contain functions to be used in computation of the data used to
 generate various kinds of heatmaps.
 """
 
+from functools import reduce
 from typing import Any, Dict, Sequence
+
 import numpy as np
-from functools import reduce
-from gn3.settings import TMPDIR
 import plotly.graph_objects as go
 import plotly.figure_factory as ff
+from plotly.subplots import make_subplots
+
+from gn3.settings import TMPDIR
 from gn3.random import random_string
 from gn3.computations.slink import slink
-from plotly.subplots import make_subplots
 from gn3.computations.correlations2 import compute_correlation
 from gn3.db.genotypes import (
-    build_genotype_file, load_genotype_samples, parse_genotype_file)
+    build_genotype_file, load_genotype_samples)
 from gn3.db.traits import (
-    retrieve_trait_data,
-    retrieve_trait_info,
-    generate_traits_filename)
+    retrieve_trait_data, retrieve_trait_info)
 from gn3.computations.qtlreaper import (
     run_reaper,
     generate_traits_file,
     chromosome_sorter_key_fn,
     parse_reaper_main_results,
-    organise_reaper_main_results,
-    parse_reaper_permutation_results)
+    organise_reaper_main_results)
 
 def export_trait_data(
         trait_data: dict, strainlist: Sequence[str], dtype: str = "val",
@@ -159,13 +158,13 @@ def build_heatmap(traits_names, conn: Any):
     PARAMETERS:
     TODO: Elaborate on the parameters here...
     """
+    # pylint: disable=[R0914]
     threshold = 0 # webqtlConfig.PUBLICTHRESH
     traits = [
         retrieve_trait_info(threshold, fullname, conn)
         for fullname in traits_names]
     traits_data_list = [retrieve_trait_data(t, conn) for t in traits]
     genotype_filename = build_genotype_file(traits[0]["riset"])
-    # genotype = parse_genotype_file(genotype_filename)
     strains = load_genotype_samples(genotype_filename)
     exported_traits_data_list = [
         export_trait_data(td, strains) for td in traits_data_list]
@@ -336,6 +335,7 @@ def generate_clustered_heatmap(
     Generate a dendrogram, and heatmaps for each chromosome, and put them all
     into one plot.
     """
+    # pylint: disable=[R0913, R0914]
     num_cols = 1 + len(x_axis)
     fig = make_subplots(
         rows=1,
@@ -359,14 +359,18 @@ def generate_clustered_heatmap(
             "height": 800,
             "xaxis": {
                 "mirror": False,
-                "showgrid": True
+                "showgrid": True,
+                "title": x_label
+            },
+            "yaxis": {
+                "title": y_label
             }
         })
 
     x_axes_layouts = {
         "xaxis{}".format(i+1 if i > 0 else ""): {
             "mirror": False,
-            "showticklabels": True if i == 0 else False,
+            "showticklabels": i == 0,
             "ticks": "outside" if i == 0 else ""
         }
         for i in range(num_cols)}
-- 
cgit v1.2.3


From 71cc35e5178904b512b9007e33be17a36f6656f2 Mon Sep 17 00:00:00 2001
From: Frederick Muriuki Muriithi
Date: Wed, 22 Sep 2021 08:36:11 +0300
Subject: Fix typing issues

* Ignore some errors
* Update typing definitions for some portions of code
* Add missing imports
---
 gn3/app.py                    |  2 +-
 gn3/computations/qtlreaper.py |  6 ++++--
 gn3/db/genotypes.py           | 10 ++++++----
 gn3/db/traits.py              |  8 ++++----
 gn3/heatmaps.py               |  8 +++-----
 5 files changed, 18 insertions(+), 16 deletions(-)

(limited to 'gn3/db/traits.py')

diff --git a/gn3/app.py b/gn3/app.py
index 8badb65..5e852e1 100644
--- a/gn3/app.py
+++ b/gn3/app.py
@@ -5,7 +5,7 @@ from typing import Dict
 from typing import Union
 
 from flask import Flask
-from flask_cors import CORS
+from flask_cors import CORS # type: ignore
 
 from gn3.api.gemma import gemma
 from gn3.api.rqtl import rqtl
diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py
index 5d17fed..5ddea76 100644
--- a/gn3/computations/qtlreaper.py
+++ b/gn3/computations/qtlreaper.py
@@ -4,6 +4,8 @@ computation of QTLs.
 """
 import os
 import subprocess
+from typing import Union
+
 from gn3.random import random_string
 from gn3.settings import TMPDIR, REAPER_COMMAND
 
@@ -70,9 +72,9 @@ def run_reaper(
         output_dir, random_string(10))
     output_list = ["--main_output", output_filename]
     if separate_nperm_output:
-        permu_output_filename = "{}/qtlreaper/permu_output_{}.txt".format(
+        permu_output_filename: Union[None, str] = "{}/qtlreaper/permu_output_{}.txt".format(
             output_dir, random_string(10))
-        output_list = output_list + ["--permu_output", permu_output_filename]
+        output_list = output_list + ["--permu_output", permu_output_filename] # type: ignore[list-item]
     else:
         permu_output_filename = None
 
diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py
index 919c539..9ea9f20 100644
--- a/gn3/db/genotypes.py
+++ b/gn3/db/genotypes.py
@@ -2,6 +2,8 @@
 
 import os
 import gzip
+from typing import Union, TextIO
+
 from gn3.settings import GENOTYPE_FILES
 
 def build_genotype_file(
@@ -44,17 +46,17 @@ def __load_genotype_samples_from_geno(genotype_filename: str):
     """
     gzipped_filename = "{}.gz".format(genotype_filename)
     if os.path.isfile(gzipped_filename):
-        genofile = gzip.open(gzipped_filename)
+        genofile: Union[TextIO, gzip.GzipFile] = gzip.open(gzipped_filename)
     else:
         genofile = open(genotype_filename)
 
     for row in genofile:
         line = row.strip()
-        if (not line) or (line.startswith(("#", "@"))):
+        if (not line) or (line.startswith(("#", "@"))): # type: ignore[arg-type]
             continue
         break
 
-    headers = line.split("\t")
+    headers = line.split("\t" ) # type: ignore[arg-type]
     if headers[3] == "Mb":
         return headers[4:]
     return headers[3:]
@@ -107,7 +109,7 @@ def parse_genotype_header(line: str, parlist: tuple = tuple()):
         ("prgy", prgy),
         ("nprgy", len(prgy)))
 
-def parse_genotype_marker(line: str, geno_obj: dict, parlist: list):
+def parse_genotype_marker(line: str, geno_obj: dict, parlist: tuple):
     """
     Parse a data line in a genotype file
 
diff --git a/gn3/db/traits.py b/gn3/db/traits.py
index 747ed27..4fc47c3 100644
--- a/gn3/db/traits.py
+++ b/gn3/db/traits.py
@@ -63,22 +63,22 @@ def update_sample_data(conn: Any,
     with conn.cursor() as cursor:
         # Update the Strains table
         cursor.execute(STRAIN_ID_SQL, (strain_name, strain_id))
-        updated_strains: int = cursor.rowcount
+        updated_strains = cursor.rowcount
         # Update the PublishData table
         cursor.execute(PUBLISH_DATA_SQL,
                        (None if value == "x" else value,
                         strain_id, publish_data_id))
-        updated_published_data: int = cursor.rowcount
+        updated_published_data = cursor.rowcount
         # Update the PublishSE table
         cursor.execute(PUBLISH_SE_SQL,
                        (None if error == "x" else error,
                         strain_id, publish_data_id))
-        updated_se_data: int = cursor.rowcount
+        updated_se_data = cursor.rowcount
         # Update the NStrain table
         cursor.execute(N_STRAIN_SQL,
                        (None if count == "x" else count,
                         strain_id, publish_data_id))
-        updated_n_strains: int = cursor.rowcount
+        updated_n_strains = cursor.rowcount
     return (updated_strains, updated_published_data,
             updated_se_data, updated_n_strains)
 
diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py
index 9d82fb2..45d0c22 100644
--- a/gn3/heatmaps.py
+++ b/gn3/heatmaps.py
@@ -7,9 +7,9 @@ from functools import reduce
 from typing import Any, Dict, Sequence
 
 import numpy as np
-import plotly.graph_objects as go
-import plotly.figure_factory as ff
-from plotly.subplots import make_subplots
+import plotly.graph_objects as go # type: ignore
+import plotly.figure_factory as ff # type: ignore
+from plotly.subplots import make_subplots # type: ignore
 
 from gn3.settings import TMPDIR
 from gn3.random import random_string
@@ -171,8 +171,6 @@ def build_heatmap(traits_names, conn: Any):
     clustered = cluster_traits(exported_traits_data_list)
     slinked = slink(clustered)
     traits_order = compute_traits_order(slinked)
-    ordered_traits_names = [
-        traits[idx]["trait_fullname"] for idx in traits_order]
     strains_and_values = retrieve_strains_and_values(
         traits_order, strains, exported_traits_data_list)
     traits_filename = "{}/traits_test_file_{}.txt".format(
-- 
cgit v1.2.3


From 19783a18c2bc7941fc5980e593f19fb1d18c3623 Mon Sep 17 00:00:00 2001
From: Frederick Muriuki Muriithi
Date: Mon, 27 Sep 2021 04:48:53 +0300
Subject: Update terminology: `strain` to `sample`

Issue:
https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi

* Update the terminology used: use `sample` in place of `strain` according to
  Zachary's direction at
  https://github.com/genenetwork/genenetwork3/pull/37#issuecomment-926043306
---
 gn3/computations/parsers.py             | 10 ++---
 gn3/computations/qtlreaper.py           |  8 ++--
 gn3/db/genotypes.py                     |  8 ++--
 gn3/db/traits.py                        | 44 ++++++++++-----------
 gn3/heatmaps.py                         | 62 ++++++++++++++---------------
 tests/unit/computations/test_parsers.py |  4 +-
 tests/unit/test_heatmaps.py             | 70 ++++++++++++++++-----------------
 7 files changed, 103 insertions(+), 103 deletions(-)

(limited to 'gn3/db/traits.py')

diff --git a/gn3/computations/parsers.py b/gn3/computations/parsers.py
index 94387ff..1af35d6 100644
--- a/gn3/computations/parsers.py
+++ b/gn3/computations/parsers.py
@@ -14,7 +14,7 @@ def parse_genofile(file_path: str) -> Tuple[List[str],
         'h': 0,
         'u': None,
     }
-    genotypes, strains = [], []
+    genotypes, samples = [], []
     with open(file_path, "r") as _genofile:
         for line in _genofile:
             line = line.strip()
@@ -22,8 +22,8 @@ def parse_genofile(file_path: str) -> Tuple[List[str],
                 continue
             cells = line.split()
             if line.startswith("Chr"):
-                strains = cells[4:]
-                strains = [strain.lower() for strain in strains]
+                samples = cells[4:]
+                samples = [sample.lower() for sample in samples]
                 continue
             values = [__map.get(value.lower(), None) for value in cells[4:]]
             genotype = {
@@ -32,7 +32,7 @@ def parse_genofile(file_path: str) -> Tuple[List[str],
                 "cm": cells[2],
                 "mb": cells[3],
                 "values":  values,
-                "dicvalues": dict(zip(strains, values)),
+                "dicvalues": dict(zip(samples, values)),
             }
             genotypes.append(genotype)
-        return strains, genotypes
+        return samples, genotypes
diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py
index 8b2893e..166d2dd 100644
--- a/gn3/computations/qtlreaper.py
+++ b/gn3/computations/qtlreaper.py
@@ -9,17 +9,17 @@ from typing import Union
 from gn3.random import random_string
 from gn3.settings import TMPDIR, REAPER_COMMAND
 
-def generate_traits_file(strains, trait_values, traits_filename):
+def generate_traits_file(samples, trait_values, traits_filename):
     """
     Generate a traits file for use with `qtlreaper`.
 
     PARAMETERS:
-    strains: A list of strains to use as the headers for the various columns.
-    trait_values: A list of lists of values for each trait and strain.
+    samples: A list of samples to use as the headers for the various columns.
+    trait_values: A list of lists of values for each trait and sample.
     traits_filename: The tab-separated value to put the values in for
         computation of QTLs.
     """
-    header = "Trait\t{}\n".format("\t".join(strains))
+    header = "Trait\t{}\n".format("\t".join(samples))
     data = (
         [header] +
         ["{}\t{}\n".format(i+1, "\t".join([str(i) for i in t]))
diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py
index 9987320..8f18cac 100644
--- a/gn3/db/genotypes.py
+++ b/gn3/db/genotypes.py
@@ -14,16 +14,16 @@ def build_genotype_file(
 
 def load_genotype_samples(genotype_filename: str, file_type: str = "geno"):
     """
-    Load sample of strains from genotype files.
+    Load the list of samples from genotype files.
 
     DESCRIPTION:
-    Traits can contain a varied number of strains, some of which do not exist in
+    Traits can contain a varied number of samples, some of which do not exist in
     certain genotypes. In order to compute QTLs, GEMMAs, etc, we need to ensure
-    to pick only those strains that exist in the genotype under consideration
+    to pick only those samples that exist in the genotype under consideration
     for the traits used in the computation.
 
     This function loads a list of samples from the genotype files for use in
-    filtering out unusable strains.
+    filtering out unusable samples.
 
 
     PARAMETERS:
diff --git a/gn3/db/traits.py b/gn3/db/traits.py
index 4fc47c3..c9d05d7 100644
--- a/gn3/db/traits.py
+++ b/gn3/db/traits.py
@@ -445,7 +445,7 @@ def retrieve_temp_trait_data(trait_info: dict, conn: Any):
             query,
             {"trait_name": trait_info["trait_name"]})
         return [dict(zip(
-            ["strain_name", "value", "se_error", "nstrain", "id"], row))
+            ["sample_name", "value", "se_error", "nstrain", "id"], row))
                 for row in cursor.fetchall()]
     return []
 
@@ -484,7 +484,7 @@ def retrieve_geno_trait_data(trait_info: Dict, conn: Any):
              "species_id": retrieve_species_id(
                  trait_info["db"]["riset"], conn)})
         return [dict(zip(
-            ["strain_name", "value", "se_error", "id"], row))
+            ["sample_name", "value", "se_error", "id"], row))
                 for row in cursor.fetchall()]
     return []
 
@@ -515,7 +515,7 @@ def retrieve_publish_trait_data(trait_info: Dict, conn: Any):
             {"trait_name": trait_info["trait_name"],
              "dataset_id": trait_info["db"]["dataset_id"]})
         return [dict(zip(
-            ["strain_name", "value", "se_error", "nstrain", "id"], row))
+            ["sample_name", "value", "se_error", "nstrain", "id"], row))
                 for row in cursor.fetchall()]
     return []
 
@@ -548,7 +548,7 @@ def retrieve_cellid_trait_data(trait_info: Dict, conn: Any):
              "trait_name": trait_info["trait_name"],
              "dataset_id": trait_info["db"]["dataset_id"]})
         return [dict(zip(
-            ["strain_name", "value", "se_error", "id"], row))
+            ["sample_name", "value", "se_error", "id"], row))
                 for row in cursor.fetchall()]
     return []
 
@@ -577,29 +577,29 @@ def retrieve_probeset_trait_data(trait_info: Dict, conn: Any):
             {"trait_name": trait_info["trait_name"],
              "dataset_name": trait_info["db"]["dataset_name"]})
         return [dict(zip(
-            ["strain_name", "value", "se_error", "id"], row))
+            ["sample_name", "value", "se_error", "id"], row))
                 for row in cursor.fetchall()]
     return []
 
-def with_strainlist_data_setup(strainlist: Sequence[str]):
+def with_samplelist_data_setup(samplelist: Sequence[str]):
     """
-    Build function that computes the trait data from provided list of strains.
+    Build function that computes the trait data from provided list of samples.
 
     PARAMETERS
-    strainlist: (list)
-      A list of strain names
+    samplelist: (list)
+      A list of sample names
 
     RETURNS:
       Returns a function that given some data from the database, computes the
-      strain's value, variance and ndata values, only if the strain is present
-      in the provided `strainlist` variable.
+      sample's value, variance and ndata values, only if the sample is present
+      in the provided `samplelist` variable.
     """
     def setup_fn(tdata):
-        if tdata["strain_name"] in strainlist:
+        if tdata["sample_name"] in samplelist:
             val = tdata["value"]
             if val is not None:
                 return {
-                    "strain_name": tdata["strain_name"],
+                    "sample_name": tdata["sample_name"],
                     "value": val,
                     "variance": tdata["se_error"],
                     "ndata": tdata.get("nstrain", None)
@@ -607,19 +607,19 @@ def with_strainlist_data_setup(strainlist: Sequence[str]):
         return None
     return setup_fn
 
-def without_strainlist_data_setup():
+def without_samplelist_data_setup():
     """
     Build function that computes the trait data.
 
     RETURNS:
       Returns a function that given some data from the database, computes the
-      strain's value, variance and ndata values.
+      sample's value, variance and ndata values.
     """
     def setup_fn(tdata):
         val = tdata["value"]
         if val is not None:
             return {
-                "strain_name": tdata["strain_name"],
+                "sample_name": tdata["sample_name"],
                 "value": val,
                 "variance": tdata["se_error"],
                 "ndata": tdata.get("nstrain", None)
@@ -627,7 +627,7 @@ def without_strainlist_data_setup():
         return None
     return setup_fn
 
-def retrieve_trait_data(trait: dict, conn: Any, strainlist: Sequence[str] = tuple()):
+def retrieve_trait_data(trait: dict, conn: Any, samplelist: Sequence[str] = tuple()):
     """
     Retrieve trait data
 
@@ -650,23 +650,23 @@ def retrieve_trait_data(trait: dict, conn: Any, strainlist: Sequence[str] = tupl
     if results:
         # do something with mysqlid
         mysqlid = results[0]["id"]
-        if strainlist:
+        if samplelist:
             data = [
                 item for item in
-                map(with_strainlist_data_setup(strainlist), results)
+                map(with_samplelist_data_setup(samplelist), results)
                 if item is not None]
         else:
             data = [
                 item for item in
-                map(without_strainlist_data_setup(), results)
+                map(without_samplelist_data_setup(), results)
                 if item is not None]
 
         return {
             "mysqlid": mysqlid,
             "data": dict(map(
                 lambda x: (
-                    x["strain_name"],
-                    {k:v for k, v in x.items() if x != "strain_name"}),
+                    x["sample_name"],
+                    {k:v for k, v in x.items() if x != "sample_name"}),
                 data))}
     return {}
 
diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py
index 45d0c22..b6fc6d3 100644
--- a/gn3/heatmaps.py
+++ b/gn3/heatmaps.py
@@ -27,10 +27,10 @@ from gn3.computations.qtlreaper import (
     organise_reaper_main_results)
 
 def export_trait_data(
-        trait_data: dict, strainlist: Sequence[str], dtype: str = "val",
+        trait_data: dict, samplelist: Sequence[str], dtype: str = "val",
         var_exists: bool = False, n_exists: bool = False):
     """
-    Export data according to `strainlist`. Mostly used in calculating
+    Export data according to `samplelist`. Mostly used in calculating
     correlations.
 
     DESCRIPTION:
@@ -40,8 +40,8 @@ def export_trait_data(
     PARAMETERS
     trait: (dict)
       The dictionary of key-value pairs representing a trait
-    strainlist: (list)
-      A list of strain names
+    samplelist: (list)
+      A list of sample names
     dtype: (str)
       ... verify what this is ...
     var_exists: (bool)
@@ -49,18 +49,18 @@ def export_trait_data(
     n_exists: (bool)
       A flag indicating existence of ndata
     """
-    def __export_all_types(tdata, strain):
+    def __export_all_types(tdata, sample):
         sample_data = []
-        if tdata[strain]["value"]:
-            sample_data.append(tdata[strain]["value"])
+        if tdata[sample]["value"]:
+            sample_data.append(tdata[sample]["value"])
             if var_exists:
-                if tdata[strain]["variance"]:
-                    sample_data.append(tdata[strain]["variance"])
+                if tdata[sample]["variance"]:
+                    sample_data.append(tdata[sample]["variance"])
                 else:
                     sample_data.append(None)
             if n_exists:
-                if tdata[strain]["ndata"]:
-                    sample_data.append(tdata[strain]["ndata"])
+                if tdata[sample]["ndata"]:
+                    sample_data.append(tdata[sample]["ndata"])
                 else:
                     sample_data.append(None)
         else:
@@ -73,17 +73,17 @@ def export_trait_data(
 
         return tuple(sample_data)
 
-    def __exporter(accumulator, strain):
+    def __exporter(accumulator, sample):
         # pylint: disable=[R0911]
-        if strain in trait_data["data"]:
+        if sample in trait_data["data"]:
             if dtype == "val":
-                return accumulator + (trait_data["data"][strain]["value"], )
+                return accumulator + (trait_data["data"][sample]["value"], )
             if dtype == "var":
-                return accumulator + (trait_data["data"][strain]["variance"], )
+                return accumulator + (trait_data["data"][sample]["variance"], )
             if dtype == "N":
-                return accumulator + (trait_data["data"][strain]["ndata"], )
+                return accumulator + (trait_data["data"][sample]["ndata"], )
             if dtype == "all":
-                return accumulator + __export_all_types(trait_data["data"], strain)
+                return accumulator + __export_all_types(trait_data["data"], sample)
             raise KeyError("Type `%s` is incorrect" % dtype)
         if var_exists and n_exists:
             return accumulator + (None, None, None)
@@ -91,7 +91,7 @@ def export_trait_data(
             return accumulator + (None, None)
         return accumulator + (None,)
 
-    return reduce(__exporter, strainlist, tuple())
+    return reduce(__exporter, samplelist, tuple())
 
 def trait_display_name(trait: Dict):
     """
@@ -165,19 +165,19 @@ def build_heatmap(traits_names, conn: Any):
         for fullname in traits_names]
     traits_data_list = [retrieve_trait_data(t, conn) for t in traits]
     genotype_filename = build_genotype_file(traits[0]["riset"])
-    strains = load_genotype_samples(genotype_filename)
+    samples = load_genotype_samples(genotype_filename)
     exported_traits_data_list = [
-        export_trait_data(td, strains) for td in traits_data_list]
+        export_trait_data(td, samples) for td in traits_data_list]
     clustered = cluster_traits(exported_traits_data_list)
     slinked = slink(clustered)
     traits_order = compute_traits_order(slinked)
-    strains_and_values = retrieve_strains_and_values(
-        traits_order, strains, exported_traits_data_list)
+    samples_and_values = retrieve_samples_and_values(
+        traits_order, samples, exported_traits_data_list)
     traits_filename = "{}/traits_test_file_{}.txt".format(
         TMPDIR, random_string(10))
     generate_traits_file(
-        strains_and_values[0][1],
-        [t[2] for t in strains_and_values],
+        samples_and_values[0][1],
+        [t[2] for t in samples_and_values],
         traits_filename)
 
     main_output, _permutations_output = run_reaper(
@@ -229,9 +229,9 @@ def compute_traits_order(slink_data, neworder: tuple = tuple()):
 
     return __order_maker(neworder, slink_data)
 
-def retrieve_strains_and_values(orders, strainlist, traits_data_list):
+def retrieve_samples_and_values(orders, samplelist, traits_data_list):
     """
-    Get the strains and their corresponding values from `strainlist` and
+    Get the samples and their corresponding values from `samplelist` and
     `traits_data_list`.
 
     This migrates the code in
@@ -240,17 +240,17 @@ def retrieve_strains_and_values(orders, strainlist, traits_data_list):
     # This feels nasty! There's a lot of mutation of values here, that might
     # indicate something untoward in the design of this function and its
     # dependents  ==>  Review
-    strains = []
+    samples = []
     values = []
     rets = []
     for order in orders:
         temp_val = traits_data_list[order]
-        for i, strain in enumerate(strainlist):
+        for i, sample in enumerate(samplelist):
             if temp_val[i] is not None:
-                strains.append(strain)
+                samples.append(sample)
                 values.append(temp_val[i])
-        rets.append([order, strains[:], values[:]])
-        strains = []
+        rets.append([order, samples[:], values[:]])
+        samples = []
         values = []
 
     return rets
diff --git a/tests/unit/computations/test_parsers.py b/tests/unit/computations/test_parsers.py
index 19c3067..b51b0bf 100644
--- a/tests/unit/computations/test_parsers.py
+++ b/tests/unit/computations/test_parsers.py
@@ -15,7 +15,7 @@ class TestParsers(unittest.TestCase):
 
     def test_parse_genofile_with_existing_file(self):
         """Test that a genotype file is parsed correctly"""
-        strains = ["bxd1", "bxd2"]
+        samples = ["bxd1", "bxd2"]
         genotypes = [
             {"chr": "1", "locus": "rs31443144",
              "cm": "1.50", "mb": "3.010274",
@@ -51,4 +51,4 @@ class TestParsers(unittest.TestCase):
             "../test_data/genotype.txt"
         ))
         self.assertEqual(parse_genofile(
-            test_genotype_file), (strains, genotypes))
+            test_genotype_file), (samples, genotypes))
diff --git a/tests/unit/test_heatmaps.py b/tests/unit/test_heatmaps.py
index fd91cf9..b54e2f3 100644
--- a/tests/unit/test_heatmaps.py
+++ b/tests/unit/test_heatmaps.py
@@ -5,41 +5,41 @@ from gn3.heatmaps import (
     get_lrs_from_chr,
     export_trait_data,
     compute_traits_order,
-    retrieve_strains_and_values,
+    retrieve_samples_and_values,
     process_traits_data_for_heatmap)
 from tests.unit.sample_test_data import organised_trait_1, organised_trait_2
 
-strainlist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"]
+samplelist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"]
 trait_data = {
     "mysqlid": 36688172,
     "data": {
-        "B6cC3-1": {"strain_name": "B6cC3-1", "value": 7.51879, "variance": None, "ndata": None},
-        "BXD1": {"strain_name": "BXD1", "value": 7.77141, "variance": None, "ndata": None},
-        "BXD12": {"strain_name": "BXD12", "value": 8.39265, "variance": None, "ndata": None},
-        "BXD16": {"strain_name": "BXD16", "value": 8.17443, "variance": None, "ndata": None},
-        "BXD19": {"strain_name": "BXD19", "value": 8.30401, "variance": None, "ndata": None},
-        "BXD2": {"strain_name": "BXD2", "value": 7.80944, "variance": None, "ndata": None},
-        "BXD21": {"strain_name": "BXD21", "value": 8.93809, "variance": None, "ndata": None},
-        "BXD24": {"strain_name": "BXD24", "value": 7.99415, "variance": None, "ndata": None},
-        "BXD27": {"strain_name": "BXD27", "value": 8.12177, "variance": None, "ndata": None},
-        "BXD28": {"strain_name": "BXD28", "value": 7.67688, "variance": None, "ndata": None},
-        "BXD32": {"strain_name": "BXD32", "value": 7.79062, "variance": None, "ndata": None},
-        "BXD39": {"strain_name": "BXD39", "value": 8.27641, "variance": None, "ndata": None},
-        "BXD40": {"strain_name": "BXD40", "value": 8.18012, "variance": None, "ndata": None},
-        "BXD42": {"strain_name": "BXD42", "value": 7.82433, "variance": None, "ndata": None},
-        "BXD6": {"strain_name": "BXD6", "value": 8.09718, "variance": None, "ndata": None},
-        "BXH14": {"strain_name": "BXH14", "value": 7.97475, "variance": None, "ndata": None},
-        "BXH19": {"strain_name": "BXH19", "value": 7.67223, "variance": None, "ndata": None},
-        "BXH2": {"strain_name": "BXH2", "value": 7.93622, "variance": None, "ndata": None},
-        "BXH22": {"strain_name": "BXH22", "value": 7.43692, "variance": None, "ndata": None},
-        "BXH4": {"strain_name": "BXH4", "value": 7.96336, "variance": None, "ndata": None},
-        "BXH6": {"strain_name": "BXH6", "value": 7.75132, "variance": None, "ndata": None},
-        "BXH7": {"strain_name": "BXH7", "value": 8.12927, "variance": None, "ndata": None},
-        "BXH8": {"strain_name": "BXH8", "value": 6.77338, "variance": None, "ndata": None},
-        "BXH9": {"strain_name": "BXH9", "value": 8.03836, "variance": None, "ndata": None},
-        "C3H/HeJ": {"strain_name": "C3H/HeJ", "value": 7.42795, "variance": None, "ndata": None},
-        "C57BL/6J": {"strain_name": "C57BL/6J", "value": 7.50606, "variance": None, "ndata": None},
-        "DBA/2J": {"strain_name": "DBA/2J", "value": 7.72588, "variance": None, "ndata": None}}}
+        "B6cC3-1": {"sample_name": "B6cC3-1", "value": 7.51879, "variance": None, "ndata": None},
+        "BXD1": {"sample_name": "BXD1", "value": 7.77141, "variance": None, "ndata": None},
+        "BXD12": {"sample_name": "BXD12", "value": 8.39265, "variance": None, "ndata": None},
+        "BXD16": {"sample_name": "BXD16", "value": 8.17443, "variance": None, "ndata": None},
+        "BXD19": {"sample_name": "BXD19", "value": 8.30401, "variance": None, "ndata": None},
+        "BXD2": {"sample_name": "BXD2", "value": 7.80944, "variance": None, "ndata": None},
+        "BXD21": {"sample_name": "BXD21", "value": 8.93809, "variance": None, "ndata": None},
+        "BXD24": {"sample_name": "BXD24", "value": 7.99415, "variance": None, "ndata": None},
+        "BXD27": {"sample_name": "BXD27", "value": 8.12177, "variance": None, "ndata": None},
+        "BXD28": {"sample_name": "BXD28", "value": 7.67688, "variance": None, "ndata": None},
+        "BXD32": {"sample_name": "BXD32", "value": 7.79062, "variance": None, "ndata": None},
+        "BXD39": {"sample_name": "BXD39", "value": 8.27641, "variance": None, "ndata": None},
+        "BXD40": {"sample_name": "BXD40", "value": 8.18012, "variance": None, "ndata": None},
+        "BXD42": {"sample_name": "BXD42", "value": 7.82433, "variance": None, "ndata": None},
+        "BXD6": {"sample_name": "BXD6", "value": 8.09718, "variance": None, "ndata": None},
+        "BXH14": {"sample_name": "BXH14", "value": 7.97475, "variance": None, "ndata": None},
+        "BXH19": {"sample_name": "BXH19", "value": 7.67223, "variance": None, "ndata": None},
+        "BXH2": {"sample_name": "BXH2", "value": 7.93622, "variance": None, "ndata": None},
+        "BXH22": {"sample_name": "BXH22", "value": 7.43692, "variance": None, "ndata": None},
+        "BXH4": {"sample_name": "BXH4", "value": 7.96336, "variance": None, "ndata": None},
+        "BXH6": {"sample_name": "BXH6", "value": 7.75132, "variance": None, "ndata": None},
+        "BXH7": {"sample_name": "BXH7", "value": 8.12927, "variance": None, "ndata": None},
+        "BXH8": {"sample_name": "BXH8", "value": 6.77338, "variance": None, "ndata": None},
+        "BXH9": {"sample_name": "BXH9", "value": 8.03836, "variance": None, "ndata": None},
+        "C3H/HeJ": {"sample_name": "C3H/HeJ", "value": 7.42795, "variance": None, "ndata": None},
+        "C57BL/6J": {"sample_name": "C57BL/6J", "value": 7.50606, "variance": None, "ndata": None},
+        "DBA/2J": {"sample_name": "DBA/2J", "value": 7.72588, "variance": None, "ndata": None}}}
 
 slinked = (
     (((0, 2, 0.16381088984330505),
@@ -66,7 +66,7 @@ class TestHeatmap(TestCase):
                 ["all", (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)]]:
             with self.subTest(dtype=dtype):
                 self.assertEqual(
-                    export_trait_data(trait_data, strainlist, dtype=dtype),
+                    export_trait_data(trait_data, samplelist, dtype=dtype),
                     expected)
 
     def test_export_trait_data_dtype_all_flags(self):
@@ -106,7 +106,7 @@ class TestHeatmap(TestCase):
             with self.subTest(dtype=dtype, vflag=vflag, nflag=nflag):
                 self.assertEqual(
                     export_trait_data(
-                        trait_data, strainlist, dtype=dtype, var_exists=vflag,
+                        trait_data, samplelist, dtype=dtype, var_exists=vflag,
                         n_exists=nflag),
                     expected)
 
@@ -164,8 +164,8 @@ class TestHeatmap(TestCase):
         self.assertEqual(
             compute_traits_order(slinked), (0, 2, 1, 7, 5, 9, 3, 6, 8, 4))
 
-    def test_retrieve_strains_and_values(self):
-        """Test retrieval of strains and values."""
+    def test_retrieve_samples_and_values(self):
+        """Test retrieval of samples and values."""
         for orders, slist, tdata, expected in [
                 [
                     [2],
@@ -185,9 +185,9 @@ class TestHeatmap(TestCase):
                      [6, None, None, 4, None]],
                     [[3, ["s1", "s4"], [6, 4]]]
                 ]]:
-            with self.subTest(strainlist=slist, traitdata=tdata):
+            with self.subTest(samplelist=slist, traitdata=tdata):
                 self.assertEqual(
-                    retrieve_strains_and_values(orders, slist, tdata), expected)
+                    retrieve_samples_and_values(orders, slist, tdata), expected)
 
     def test_get_lrs_from_chr(self):
         """Check that function gets correct LRS values"""
-- 
cgit v1.2.3


From 1d09a9222f8c661da3abd6d61c09ae19eeb5d793 Mon Sep 17 00:00:00 2001
From: Frederick Muriuki Muriithi
Date: Mon, 27 Sep 2021 05:02:09 +0300
Subject: Update terminology: `riset` to `group`

Issue:
https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi

* Rename `riset` to `group`, the appropriate domain term, following Zachary's
  direction at
  https://github.com/genenetwork/genenetwork3/pull/37#issuecomment-926041744
---
 gn3/db/datasets.py             | 52 +++++++++++++++++++++---------------------
 gn3/db/traits.py               | 16 ++++++-------
 gn3/heatmaps.py                |  2 +-
 tests/unit/db/test_datasets.py | 42 +++++++++++++++++-----------------
 4 files changed, 56 insertions(+), 56 deletions(-)

(limited to 'gn3/db/traits.py')

diff --git a/gn3/db/datasets.py b/gn3/db/datasets.py
index 4a05499..6c328f5 100644
--- a/gn3/db/datasets.py
+++ b/gn3/db/datasets.py
@@ -119,9 +119,9 @@ def retrieve_dataset_name(
     return fn_map[trait_type](threshold, dataset_name, conn)
 
 
-def retrieve_geno_riset_fields(name, conn):
+def retrieve_geno_group_fields(name, conn):
     """
-    Retrieve the RISet, and RISetID values for various Geno trait types.
+    Retrieve the Group, and GroupID values for various Geno trait types.
     """
     query = (
         "SELECT InbredSet.Name, InbredSet.Id "
@@ -130,12 +130,12 @@ def retrieve_geno_riset_fields(name, conn):
         "AND GenoFreeze.Name = %(name)s")
     with conn.cursor() as cursor:
         cursor.execute(query, {"name": name})
-        return dict(zip(["riset", "risetid"], cursor.fetchone()))
+        return dict(zip(["group", "groupid"], cursor.fetchone()))
     return {}
 
-def retrieve_publish_riset_fields(name, conn):
+def retrieve_publish_group_fields(name, conn):
     """
-    Retrieve the RISet, and RISetID values for various Publish trait types.
+    Retrieve the Group, and GroupID values for various Publish trait types.
     """
     query = (
         "SELECT InbredSet.Name, InbredSet.Id "
@@ -144,12 +144,12 @@ def retrieve_publish_riset_fields(name, conn):
         "AND PublishFreeze.Name = %(name)s")
     with conn.cursor() as cursor:
         cursor.execute(query, {"name": name})
-        return dict(zip(["riset", "risetid"], cursor.fetchone()))
+        return dict(zip(["group", "groupid"], cursor.fetchone()))
     return {}
 
-def retrieve_probeset_riset_fields(name, conn):
+def retrieve_probeset_group_fields(name, conn):
     """
-    Retrieve the RISet, and RISetID values for various ProbeSet trait types.
+    Retrieve the Group, and GroupID values for various ProbeSet trait types.
     """
     query = (
         "SELECT InbredSet.Name, InbredSet.Id "
@@ -159,12 +159,12 @@ def retrieve_probeset_riset_fields(name, conn):
         "AND ProbeSetFreeze.Name = %(name)s")
     with conn.cursor() as cursor:
         cursor.execute(query, {"name": name})
-        return dict(zip(["riset", "risetid"], cursor.fetchone()))
+        return dict(zip(["group", "groupid"], cursor.fetchone()))
     return {}
 
-def retrieve_temp_riset_fields(name, conn):
+def retrieve_temp_group_fields(name, conn):
     """
-    Retrieve the RISet, and RISetID values for `Temp` trait types.
+    Retrieve the Group, and GroupID values for `Temp` trait types.
     """
     query = (
         "SELECT InbredSet.Name, InbredSet.Id "
@@ -173,30 +173,30 @@ def retrieve_temp_riset_fields(name, conn):
         "AND Temp.Name = %(name)s")
     with conn.cursor() as cursor:
         cursor.execute(query, {"name": name})
-        return dict(zip(["riset", "risetid"], cursor.fetchone()))
+        return dict(zip(["group", "groupid"], cursor.fetchone()))
     return {}
 
-def retrieve_riset_fields(trait_type, trait_name, dataset_info, conn):
+def retrieve_group_fields(trait_type, trait_name, dataset_info, conn):
     """
-    Retrieve the RISet, and RISetID values for various trait types.
+    Retrieve the Group, and GroupID values for various trait types.
     """
-    riset_fns_map = {
-        "Geno": retrieve_geno_riset_fields,
-        "Publish": retrieve_publish_riset_fields,
-        "ProbeSet": retrieve_probeset_riset_fields
+    group_fns_map = {
+        "Geno": retrieve_geno_group_fields,
+        "Publish": retrieve_publish_group_fields,
+        "ProbeSet": retrieve_probeset_group_fields
     }
 
     if trait_type == "Temp":
-        riset_info = retrieve_temp_riset_fields(trait_name, conn)
+        group_info = retrieve_temp_group_fields(trait_name, conn)
     else:
-        riset_info = riset_fns_map[trait_type](dataset_info["dataset_name"], conn)
+        group_info = group_fns_map[trait_type](dataset_info["dataset_name"], conn)
 
     return {
         **dataset_info,
-        **riset_info,
-        "riset": (
-            "BXD" if riset_info.get("riset") == "BXD300"
-            else riset_info.get("riset", ""))
+        **group_info,
+        "group": (
+            "BXD" if group_info.get("group") == "BXD300"
+            else group_info.get("group", ""))
     }
 
 def retrieve_temp_trait_dataset():
@@ -281,11 +281,11 @@ def retrieve_trait_dataset(trait_type, trait, threshold, conn):
             trait_type, threshold, trait["trait_name"],
             trait["db"]["dataset_name"], conn)
     }
-    riset = retrieve_riset_fields(
+    group = retrieve_group_fields(
         trait_type, trait["trait_name"], dataset_name_info, conn)
     return {
         "display_name": dataset_name_info["dataset_name"],
         **dataset_name_info,
         **dataset_fns[trait_type](),
-        **riset
+        **group
     }
diff --git a/gn3/db/traits.py b/gn3/db/traits.py
index c9d05d7..f2673c8 100644
--- a/gn3/db/traits.py
+++ b/gn3/db/traits.py
@@ -226,7 +226,7 @@ def set_homologene_id_field_probeset(trait_info, conn):
     """
     query = (
         "SELECT HomologeneId FROM Homologene, Species, InbredSet"
-        " WHERE Homologene.GeneId = %(geneid)s AND InbredSet.Name = %(riset)s"
+        " WHERE Homologene.GeneId = %(geneid)s AND InbredSet.Name = %(group)s"
         " AND InbredSet.SpeciesId = Species.Id AND"
         " Species.TaxonomyId = Homologene.TaxonomyId")
     with conn.cursor() as cursor:
@@ -234,7 +234,7 @@ def set_homologene_id_field_probeset(trait_info, conn):
             query,
             {
                 k:v for k, v in trait_info.items()
-                if k in ["geneid", "riset"]
+                if k in ["geneid", "group"]
             })
         res = cursor.fetchone()
         if res:
@@ -422,7 +422,7 @@ def retrieve_trait_info(
     if trait_info["haveinfo"]:
         return {
             **trait_post_processing_functions_table[trait_dataset_type](
-                {**trait_info, "riset": trait_dataset["riset"]}),
+                {**trait_info, "group": trait_dataset["group"]}),
             "db": {**trait["db"], **trait_dataset}
         }
     return trait_info
@@ -449,14 +449,14 @@ def retrieve_temp_trait_data(trait_info: dict, conn: Any):
                 for row in cursor.fetchall()]
     return []
 
-def retrieve_species_id(riset, conn: Any):
+def retrieve_species_id(group, conn: Any):
     """
-    Retrieve a species id given the RISet value
+    Retrieve a species id given the Group value
     """
     with conn.cursor as cursor:
         cursor.execute(
-            "SELECT SpeciesId from InbredSet WHERE Name = %(riset)s",
-            {"riset": riset})
+            "SELECT SpeciesId from InbredSet WHERE Name = %(group)s",
+            {"group": group})
         return cursor.fetchone()[0]
     return None
 
@@ -482,7 +482,7 @@ def retrieve_geno_trait_data(trait_info: Dict, conn: Any):
             {"trait_name": trait_info["trait_name"],
              "dataset_name": trait_info["db"]["dataset_name"],
              "species_id": retrieve_species_id(
-                 trait_info["db"]["riset"], conn)})
+                 trait_info["db"]["group"], conn)})
         return [dict(zip(
             ["sample_name", "value", "se_error", "id"], row))
                 for row in cursor.fetchall()]
diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py
index b6fc6d3..a36940d 100644
--- a/gn3/heatmaps.py
+++ b/gn3/heatmaps.py
@@ -164,7 +164,7 @@ def build_heatmap(traits_names, conn: Any):
         retrieve_trait_info(threshold, fullname, conn)
         for fullname in traits_names]
     traits_data_list = [retrieve_trait_data(t, conn) for t in traits]
-    genotype_filename = build_genotype_file(traits[0]["riset"])
+    genotype_filename = build_genotype_file(traits[0]["group"])
     samples = load_genotype_samples(genotype_filename)
     exported_traits_data_list = [
         export_trait_data(td, samples) for td in traits_data_list]
diff --git a/tests/unit/db/test_datasets.py b/tests/unit/db/test_datasets.py
index 38de0e2..39f4af9 100644
--- a/tests/unit/db/test_datasets.py
+++ b/tests/unit/db/test_datasets.py
@@ -3,10 +3,10 @@
 from unittest import mock, TestCase
 from gn3.db.datasets import (
     retrieve_dataset_name,
-    retrieve_riset_fields,
-    retrieve_geno_riset_fields,
-    retrieve_publish_riset_fields,
-    retrieve_probeset_riset_fields)
+    retrieve_group_fields,
+    retrieve_geno_group_fields,
+    retrieve_publish_group_fields,
+    retrieve_probeset_group_fields)
 
 class TestDatasetsDBFunctions(TestCase):
     """Test cases for datasets functions."""
@@ -40,9 +40,9 @@ class TestDatasetsDBFunctions(TestCase):
                             table=table, cols=columns),
                         {"threshold": thresh, "name": dataset_name})
 
-    def test_retrieve_probeset_riset_fields(self):
+    def test_retrieve_probeset_group_fields(self):
         """
-        Test that the `riset` and `riset_id` fields are retrieved appropriately
+        Test that the `group` and `groupid` fields are retrieved appropriately
         for the 'ProbeSet' trait type.
         """
         for trait_name, expected in [
@@ -52,7 +52,7 @@ class TestDatasetsDBFunctions(TestCase):
                 with db_mock.cursor() as cursor:
                     cursor.execute.return_value = ()
                     self.assertEqual(
-                        retrieve_probeset_riset_fields(trait_name, db_mock),
+                        retrieve_probeset_group_fields(trait_name, db_mock),
                         expected)
                     cursor.execute.assert_called_once_with(
                         (
@@ -63,34 +63,34 @@ class TestDatasetsDBFunctions(TestCase):
                             " AND ProbeSetFreeze.Name = %(name)s"),
                         {"name": trait_name})
 
-    def test_retrieve_riset_fields(self):
+    def test_retrieve_group_fields(self):
         """
-        Test that the riset fields are set up correctly for the different trait
+        Test that the group fields are set up correctly for the different trait
         types.
         """
         for trait_type, trait_name, dataset_info, expected in [
                 ["Publish", "pubTraitName01", {"dataset_name": "pubDBName01"},
-                 {"dataset_name": "pubDBName01", "riset": ""}],
+                 {"dataset_name": "pubDBName01", "group": ""}],
                 ["ProbeSet", "prbTraitName01", {"dataset_name": "prbDBName01"},
-                 {"dataset_name": "prbDBName01", "riset": ""}],
+                 {"dataset_name": "prbDBName01", "group": ""}],
                 ["Geno", "genoTraitName01", {"dataset_name": "genoDBName01"},
-                 {"dataset_name": "genoDBName01", "riset": ""}],
-                ["Temp", "tempTraitName01", {}, {"riset": ""}],
+                 {"dataset_name": "genoDBName01", "group": ""}],
+                ["Temp", "tempTraitName01", {}, {"group": ""}],
                 ]:
             db_mock = mock.MagicMock()
             with self.subTest(
                     trait_type=trait_type, trait_name=trait_name,
                     dataset_info=dataset_info):
                 with db_mock.cursor() as cursor:
-                    cursor.execute.return_value = ("riset_name", 0)
+                    cursor.execute.return_value = ("group_name", 0)
                     self.assertEqual(
-                        retrieve_riset_fields(
+                        retrieve_group_fields(
                             trait_type, trait_name, dataset_info, db_mock),
                         expected)
 
-    def test_retrieve_publish_riset_fields(self):
+    def test_retrieve_publish_group_fields(self):
         """
-        Test that the `riset` and `riset_id` fields are retrieved appropriately
+        Test that the `group` and `group_id` fields are retrieved appropriately
         for the 'Publish' trait type.
         """
         for trait_name, expected in [
@@ -100,7 +100,7 @@ class TestDatasetsDBFunctions(TestCase):
                 with db_mock.cursor() as cursor:
                     cursor.execute.return_value = ()
                     self.assertEqual(
-                        retrieve_publish_riset_fields(trait_name, db_mock),
+                        retrieve_publish_group_fields(trait_name, db_mock),
                         expected)
                     cursor.execute.assert_called_once_with(
                         (
@@ -110,9 +110,9 @@ class TestDatasetsDBFunctions(TestCase):
                             " AND PublishFreeze.Name = %(name)s"),
                         {"name": trait_name})
 
-    def test_retrieve_geno_riset_fields(self):
+    def test_retrieve_geno_group_fields(self):
         """
-        Test that the `riset` and `riset_id` fields are retrieved appropriately
+        Test that the `group` and `group_id` fields are retrieved appropriately
         for the 'Geno' trait type.
         """
         for trait_name, expected in [
@@ -122,7 +122,7 @@ class TestDatasetsDBFunctions(TestCase):
                 with db_mock.cursor() as cursor:
                     cursor.execute.return_value = ()
                     self.assertEqual(
-                        retrieve_geno_riset_fields(trait_name, db_mock),
+                        retrieve_geno_group_fields(trait_name, db_mock),
                         expected)
                     cursor.execute.assert_called_once_with(
                         (
-- 
cgit v1.2.3