From 5f56ac39c60b345e1a135c75f4bf35f8e881f4d6 Mon Sep 17 00:00:00 2001
From: Muriithi Frederick Muriuki
Date: Tue, 17 Aug 2021 08:44:20 +0300
Subject: Fix errors: add missing parentheses

Issue:
https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi

* Call the `cursor.fetchone()` function to get results. Without the
  parentheses, the code was trying to use the function itself as the results,
  which was a bug, and would lead to failure.
---
 gn3/db/datasets.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'gn3/db')

diff --git a/gn3/db/datasets.py b/gn3/db/datasets.py
index 53d6811..4a05499 100644
--- a/gn3/db/datasets.py
+++ b/gn3/db/datasets.py
@@ -25,7 +25,7 @@ def retrieve_probeset_trait_dataset_name(
         return dict(zip(
             ["dataset_id", "dataset_name", "dataset_fullname",
              "dataset_shortname", "dataset_datascale"],
-            cursor.fetchone))
+            cursor.fetchone()))
 
 def retrieve_publish_trait_dataset_name(
         threshold: int, name: str, connection: Any):
@@ -49,7 +49,7 @@ def retrieve_publish_trait_dataset_name(
         return dict(zip(
             ["dataset_id", "dataset_name", "dataset_fullname",
              "dataset_shortname"],
-            cursor.fetchone))
+            cursor.fetchone()))
 
 def retrieve_geno_trait_dataset_name(
         threshold: int, name: str, connection: Any):
@@ -73,7 +73,7 @@ def retrieve_geno_trait_dataset_name(
         return dict(zip(
             ["dataset_id", "dataset_name", "dataset_fullname",
              "dataset_shortname"],
-            cursor.fetchone))
+            cursor.fetchone()))
 
 def retrieve_temp_trait_dataset_name(
         threshold: int, name: str, connection: Any):
@@ -97,7 +97,7 @@ def retrieve_temp_trait_dataset_name(
         return dict(zip(
             ["dataset_id", "dataset_name", "dataset_fullname",
              "dataset_shortname"],
-            cursor.fetchone))
+            cursor.fetchone()))
 
 def retrieve_dataset_name(
         trait_type: str, threshold: int, trait_name: str, dataset_name: str,
-- 
cgit v1.2.3


From a2f6406909951a80dc4ead809a09e8de2c15200d Mon Sep 17 00:00:00 2001
From: Muriithi Frederick Muriuki
Date: Tue, 17 Aug 2021 08:49:14 +0300
Subject: Provide top-level `riset` key-value pair

Issue:
https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi

* Provide the expected, top-level `riset` key-value pair and eliminate the
  redundant key-value pair.
---
 gn3/db/traits.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'gn3/db')

diff --git a/gn3/db/traits.py b/gn3/db/traits.py
index 6ea24be..1031e44 100644
--- a/gn3/db/traits.py
+++ b/gn3/db/traits.py
@@ -418,9 +418,9 @@ def retrieve_trait_info(
         conn)
     if trait_info["haveinfo"]:
         return {
-            **trait_post_processing_functions_table[trait_dataset_type](trait_info),
-            "db": {**trait["db"], **trait_dataset},
-            "riset": trait_dataset["riset"]
+            **trait_post_processing_functions_table[trait_dataset_type](
+                {**trait_info, "riset": trait_dataset["riset"]}),
+            "db": {**trait["db"], **trait_dataset}
         }
     return trait_info
 
-- 
cgit v1.2.3


From 6ab866183aeac8553fdcda9217e4445da2b4836b Mon Sep 17 00:00:00 2001
From: Muriithi Frederick Muriuki
Date: Tue, 31 Aug 2021 06:51:18 +0300
Subject: Provide utilities for genotype files

Issue:
https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi

* gn3/db/genotypes.py: New module
* gn3/settings.py: Add new configuration variable
* qtlfilesexport.py: Test out new code

  Add a module containing functions dealing with the genotype files.
  Add a configuration variable to point to the location of the genotype files.
---
 gn3/db/genotypes.py | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 gn3/settings.py     |  4 ++++
 qtlfilesexport.py   | 33 +++----------------------
 3 files changed, 76 insertions(+), 30 deletions(-)
 create mode 100644 gn3/db/genotypes.py

(limited to 'gn3/db')

diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py
new file mode 100644
index 0000000..610ddde
--- /dev/null
+++ b/gn3/db/genotypes.py
@@ -0,0 +1,69 @@
+"""Genotype utilities"""
+
+import os
+import gzip
+from gn3.settings import GENOTYPE_FILES
+
+def build_genotype_file(
+        geno_name: str, base_dir: str = GENOTYPE_FILES,
+        extension: str = "geno"):
+    """Build the absolute path for the genotype file."""
+    return "{}/{}.{}".format(os.path.abspath(base_dir), geno_name, extension)
+
+def load_genotype_samples(genotype_filename: str, file_type: str = "geno"):
+    """
+    Load sample of strains from genotype files.
+
+    DESCRIPTION:
+    Traits can contain a varied number of strains, some of which do not exist in
+    certain genotypes. In order to compute QTLs, GEMMAs, etc, we need to ensure
+    to pick only those strains that exist in the genotype under consideration
+    for the traits used in the computation.
+
+    This function loads a list of samples from the genotype files for use in
+    filtering out unusable strains.
+
+
+    PARAMETERS:
+    genotype_filename: The absolute path to the genotype file to load the
+        samples from.
+    file_type: The type of file. Currently supported values are 'geno' and
+        'plink'.
+    """
+    file_type_fns = {
+        "geno": __load_genotype_samples_from_geno,
+        "plink": __load_genotype_samples_from_plink
+    }
+    return file_type_fns[file_type](genotype_filename)
+
+def __load_genotype_samples_from_geno(genotype_filename: str):
+    """
+    Helper function for `load_genotype_samples` function.
+
+    Loads samples from '.geno' files.
+    """
+    gzipped_filename = "{}.gz".format(genotype_filename)
+    if os.path.isfile(gzipped_filename):
+        genofile = gzip.open(gzipped_filename)
+    else:
+        genofile = open(genotype_filename)
+
+    for row in genofile:
+        line = row.strip()
+        if (not line) or (line.startswith(("#", "@"))):
+            continue
+        break
+
+    headers = line.split("\t")
+    if headers[3] == "Mb":
+        return headers[4:]
+    return headers[3:]
+
+def __load_genotype_samples_from_plink(genotype_filename: str):
+    """
+    Helper function for `load_genotype_samples` function.
+
+    Loads samples from '.plink' files.
+    """
+    genofile = open(genotype_filename)
+    return [line.split(" ")[1] for line in genofile]
diff --git a/gn3/settings.py b/gn3/settings.py
index d137370..a08f846 100644
--- a/gn3/settings.py
+++ b/gn3/settings.py
@@ -27,3 +27,7 @@ BIWEIGHT_RSCRIPT = "~/genenetwork3/scripts/calculate_biweight.R"
 
 # qtlreaper command
 REAPER_COMMAND = "{}/bin/qtlreaper".format(os.environ.get("GUIX_ENVIRONMENT"))
+
+# genotype files
+GENOTYPE_FILES = os.environ.get(
+    "GENOTYPE_FILES", "{}/genotype_files/genotype".format(os.environ.get("HOME")))
diff --git a/qtlfilesexport.py b/qtlfilesexport.py
index adc5e77..1db4ab6 100644
--- a/qtlfilesexport.py
+++ b/qtlfilesexport.py
@@ -11,6 +11,7 @@ from gn3.computations.slink import slink
 from gn3.db_utils import database_connector
 from gn3.computations.heatmap import export_trait_data
 from gn3.db.traits import retrieve_trait_data, retrieve_trait_info
+from gn3.db.genotypes import build_genotype_file, load_genotype_samples
 from gn3.computations.qtlreaper import random_string, generate_traits_file
 from gn3.computations.heatmap import (
     cluster_traits,
@@ -41,36 +42,8 @@ def main():
         retrieve_trait_info(threshold, fullname, conn)
         for fullname in trait_fullnames()]
     traits_data_list = [retrieve_trait_data(t, conn) for t in traits]
-    # strains = list(set([k for td in traits_data_list for k in td["data"].keys()]))
-    strains = [# Use only the strains in the BXD.geno genotype file
-        "BXD1", "BXD2", "BXD5", "BXD6", "BXD8", "BXD9", "BXD11", "BXD12",
-        "BXD13", "BXD14", "BXD15", "BXD16", "BXD18", "BXD19", "BXD20", "BXD21",
-        "BXD22", "BXD23", "BXD24", "BXD24a", "BXD25", "BXD27", "BXD28", "BXD29",
-        "BXD30", "BXD31", "BXD32", "BXD33", "BXD34", "BXD35", "BXD36", "BXD37",
-        "BXD38", "BXD39", "BXD40", "BXD41", "BXD42", "BXD43", "BXD44", "BXD45",
-        "BXD48", "BXD48a", "BXD49", "BXD50", "BXD51", "BXD52", "BXD53", "BXD54",
-        "BXD55", "BXD56", "BXD59", "BXD60", "BXD61", "BXD62", "BXD63", "BXD64",
-        "BXD65", "BXD65a", "BXD65b", "BXD66", "BXD67", "BXD68", "BXD69",
-        "BXD70", "BXD71", "BXD72", "BXD73", "BXD73a", "BXD73b", "BXD74",
-        "BXD75", "BXD76", "BXD77", "BXD78", "BXD79", "BXD81", "BXD83", "BXD84",
-        "BXD85", "BXD86", "BXD87", "BXD88", "BXD89", "BXD90", "BXD91", "BXD93",
-        "BXD94", "BXD95", "BXD98", "BXD99", "BXD100", "BXD101", "BXD102",
-        "BXD104", "BXD105", "BXD106", "BXD107", "BXD108", "BXD109", "BXD110",
-        "BXD111", "BXD112", "BXD113", "BXD114", "BXD115", "BXD116", "BXD117",
-        "BXD119", "BXD120", "BXD121", "BXD122", "BXD123", "BXD124", "BXD125",
-        "BXD126", "BXD127", "BXD128", "BXD128a", "BXD130", "BXD131", "BXD132",
-        "BXD133", "BXD134", "BXD135", "BXD136", "BXD137", "BXD138", "BXD139",
-        "BXD141", "BXD142", "BXD144", "BXD145", "BXD146", "BXD147", "BXD148",
-        "BXD149", "BXD150", "BXD151", "BXD152", "BXD153", "BXD154", "BXD155",
-        "BXD156", "BXD157", "BXD160", "BXD161", "BXD162", "BXD165", "BXD168",
-        "BXD169", "BXD170", "BXD171", "BXD172", "BXD173", "BXD174", "BXD175",
-        "BXD176", "BXD177", "BXD178", "BXD180", "BXD181", "BXD183", "BXD184",
-        "BXD186", "BXD187", "BXD188", "BXD189", "BXD190", "BXD191", "BXD192",
-        "BXD193", "BXD194", "BXD195", "BXD196", "BXD197", "BXD198", "BXD199",
-        "BXD200", "BXD201", "BXD202", "BXD203", "BXD204", "BXD205", "BXD206",
-        "BXD207", "BXD208", "BXD209", "BXD210", "BXD211", "BXD212", "BXD213",
-        "BXD214", "BXD215", "BXD216", "BXD217", "BXD218", "BXD219", "BXD220"
-    ]
+    genotype_filename = build_genotype_file(traits[0]["riset"])
+    strains = load_genotype_samples(genotype_filename)
     exported_traits_data_list = [
         export_trait_data(td, strains) for td in traits_data_list]
     slinked = slink(cluster_traits(exported_traits_data_list))
-- 
cgit v1.2.3


From e441509a59c20a051fd5ab94710513f1968a5e02 Mon Sep 17 00:00:00 2001
From: Muriithi Frederick Muriuki
Date: Tue, 31 Aug 2021 10:50:56 +0300
Subject: Update `heatmap_data` function: remove extraneous data

Issue:
https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi

* gn3/computations/heatmap.py: update function
* gn3/db/traits.py: new function

  Remove extraneous data and arguments from the function.
  - Load the genotype file
  - Generate traits file
  - Provide both raw traits data, and exported traits data in return
---
 gn3/computations/heatmap.py | 42 ++++++++++++++++++++++--------------------
 gn3/db/traits.py            |  5 +++++
 2 files changed, 27 insertions(+), 20 deletions(-)

(limited to 'gn3/db')

diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py
index e0ff05b..92014cf 100644
--- a/gn3/computations/heatmap.py
+++ b/gn3/computations/heatmap.py
@@ -6,8 +6,12 @@ generate various kinds of heatmaps.
 from functools import reduce
 from typing import Any, Dict, Sequence
 from gn3.computations.slink import slink
-from gn3.db.traits import retrieve_trait_data, retrieve_trait_info
 from gn3.computations.correlations2 import compute_correlation
+from gn3.db.genotypes import build_genotype_file, load_genotype_samples
+from gn3.db.traits import (
+    retrieve_trait_data,
+    retrieve_trait_info,
+    generate_traits_filename)
 
 def export_trait_data(
         trait_data: dict, strainlist: Sequence[str], dtype: str = "val",
@@ -125,7 +129,7 @@ def cluster_traits(traits_data_list: Sequence[Dict]):
 
     return tuple(__cluster(tdata_i) for tdata_i in enumerate(traits_data_list))
 
-def heatmap_data(formd, search_result, conn: Any):
+def heatmap_data(traits_names, conn: Any):
     """
     heatmap function
 
@@ -142,39 +146,37 @@ def heatmap_data(formd, search_result, conn: Any):
     TODO: Elaborate on the parameters here...
     """
     threshold = 0 # webqtlConfig.PUBLICTHRESH
-    cluster_checked = formd.formdata.getvalue("clusterCheck", "")
-    strainlist = [
-        strain for strain in formd.strainlist if strain not in formd.parlist]
-    genotype = formd.genotype
-
     def __retrieve_traitlist_and_datalist(threshold, fullname):
         trait = retrieve_trait_info(threshold, fullname, conn)
         return (trait, retrieve_trait_data(trait, conn))
 
     traits_details = [
         __retrieve_traitlist_and_datalist(threshold, fullname)
-        for fullname in search_result]
+        for fullname in traits_names]
     traits_list = tuple(x[0] for x in traits_details)
     traits_data_list = [x[1] for x in traits_details]
     exported_traits_data_list = tuple(
         export_trait_data(td, strainlist) for td in traits_data_list)
+    genotype_filename = build_genotype_file(traits_list[0]["riset"])
+    strainlist = load_genotype_samples(genotype_filename)
+    slink_data = slink(cluster_traits(exported_traits_data_list))
+    ordering_data = compute_heatmap_order(slink_data)
+    strains_and_values = retrieve_strains_and_values(
+        orders, strainlist, exported_traits_data_list)
+    strains_values = strains_and_values[0][1]
+    trait_values = [t[2] for t in strains_and_values]
+    traits_filename = generate_traits_filename()
+    generate_traits_file(strains_values, trait_values, traits_filename)
 
     return {
-        "target_description_checked": formd.formdata.getvalue(
-            "targetDescriptionCheck", ""),
-        "cluster_checked": cluster_checked,
-        "slink_data": (
-            slink(cluster_traits(exported_traits_data_list))
-            if cluster_checked else False),
-        "sessionfile": formd.formdata.getvalue("session"),
-        "genotype": genotype,
-        "nLoci": sum(map(len, genotype)),
+        "slink_data": slink_data,
+        "ordering_data": ordering_data,
         "strainlist": strainlist,
-        "ppolar": formd.ppolar,
-        "mpolar":formd.mpolar,
+        "genotype_filename": genotype_filename,
         "traits_list": traits_list,
         "traits_data_list": traits_data_list,
-        "exported_traits_data_list": exported_traits_data_list
+        "exported_traits_data_list": exported_traits_data_list,
+        "traits_filename": traits_filename
     }
 
 def compute_heatmap_order(
diff --git a/gn3/db/traits.py b/gn3/db/traits.py
index 1031e44..ccb101a 100644
--- a/gn3/db/traits.py
+++ b/gn3/db/traits.py
@@ -1,4 +1,5 @@
 """This class contains functions relating to trait data manipulation"""
+from gn3.settings import TMPDIR
 from typing import Any, Dict, Union, Sequence
 from gn3.function_helpers import compose
 from gn3.db.datasets import retrieve_trait_dataset
@@ -666,3 +667,7 @@ def retrieve_trait_data(trait: dict, conn: Any, strainlist: Sequence[str] = tupl
                     {k:v for k, v in x.items() if x != "strain_name"}),
                 data))}
     return {}
+
+def generate_traits_filename(base_path: str = TMPDIR):
+    return "{}/traits_test_file_{}.txt".format(
+        os.path.abspath(base_path), random_string(10))
-- 
cgit v1.2.3


From b5e1d1176f1bf4f7c0b68b27beb15e99418f1650 Mon Sep 17 00:00:00 2001
From: Muriithi Frederick Muriuki
Date: Tue, 31 Aug 2021 11:16:29 +0300
Subject: Fix linting errors, minor bugs and reorganise code

* Fix some linting errors and some minor bugs caught by the linter.
  Move the `random_string` function to a separate module for use in multiple
  places in the code.
---
 gn3/computations/heatmap.py               |  7 ++++---
 gn3/computations/qtlreaper.py             | 27 ++++++++++++++-------------
 gn3/db/traits.py                          |  5 ++++-
 gn3/heatmaps/heatmaps.py                  | 25 +++++++++++++++++++------
 gn3/random.py                             | 11 +++++++++++
 tests/unit/computations/test_qtlreaper.py |  5 +++--
 6 files changed, 55 insertions(+), 25 deletions(-)
 create mode 100644 gn3/random.py

(limited to 'gn3/db')

diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py
index 92014cf..1143450 100644
--- a/gn3/computations/heatmap.py
+++ b/gn3/computations/heatmap.py
@@ -6,6 +6,7 @@ generate various kinds of heatmaps.
 from functools import reduce
 from typing import Any, Dict, Sequence
 from gn3.computations.slink import slink
+from gn3.computations.qtlreaper import generate_traits_file
 from gn3.computations.correlations2 import compute_correlation
 from gn3.db.genotypes import build_genotype_file, load_genotype_samples
 from gn3.db.traits import (
@@ -155,14 +156,14 @@ def heatmap_data(traits_names, conn: Any):
         for fullname in traits_names]
     traits_list = tuple(x[0] for x in traits_details)
     traits_data_list = [x[1] for x in traits_details]
-    exported_traits_data_list = tuple(
-        export_trait_data(td, strainlist) for td in traits_data_list)
     genotype_filename = build_genotype_file(traits_list[0]["riset"])
     strainlist = load_genotype_samples(genotype_filename)
+    exported_traits_data_list = tuple(
+        export_trait_data(td, strainlist) for td in traits_data_list)
     slink_data = slink(cluster_traits(exported_traits_data_list))
     ordering_data = compute_heatmap_order(slink_data)
     strains_and_values = retrieve_strains_and_values(
-        orders, strainlist, exported_traits_data_list)
+        ordering_data, strainlist, exported_traits_data_list)
     strains_values = strains_and_values[0][1]
     trait_values = [t[2] for t in strains_and_values]
     traits_filename = generate_traits_filename()
diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py
index 3b8e4db..30c7051 100644
--- a/gn3/computations/qtlreaper.py
+++ b/gn3/computations/qtlreaper.py
@@ -3,17 +3,10 @@ This module contains functions to interact with the `qtlreaper` utility for
 computation of QTLs.
 """
 import os
-import random
-import string
 import subprocess
+from gn3.random import random_string
 from gn3.settings import TMPDIR, REAPER_COMMAND
 
-def random_string(length):
-    """Generate a random string of length `length`."""
-    return "".join(
-        random.choices(
-            string.ascii_letters + string.digits, k=length))
-
 def generate_traits_file(strains, trait_values, traits_filename):
     """
     Generate a traits file for use with `qtlreaper`.
@@ -25,11 +18,13 @@ def generate_traits_file(strains, trait_values, traits_filename):
         computation of QTLs.
     """
     header = "Trait\t{}\n".format("\t".join(strains))
-    data = [header] + [
-        "T{}\t{}\n".format(i+1, "\t".join([str(i) for i in t]))
-        for i, t in enumerate(trait_values[:-1])] + [
-        "T{}\t{}".format(len(trait_values), "\t".join([str(i) for i in t]))
-        for t in trait_values[-1:]]
+    data = (
+        [header] +
+        ["T{}\t{}\n".format(i+1, "\t".join([str(i) for i in t]))
+         for i, t in enumerate(trait_values[:-1])] +
+        ["T{}\t{}".format(
+            len(trait_values), "\t".join([str(i) for i in t]))
+         for t in trait_values[-1:]])
     with open(traits_filename, "w") as outfile:
         outfile.writelines(data)
 
@@ -93,6 +88,9 @@ def run_reaper(
 
 
 def parse_reaper_main_results(results_file):
+    """
+    Parse the results file of running QTLReaper into a list of dicts.
+    """
     with open(results_file, "r") as infile:
         lines = infile.readlines()
 
@@ -104,6 +102,9 @@ def parse_reaper_main_results(results_file):
     return [dict(zip(header, __parse_line(line))) for line in lines[1:]]
 
 def parse_reaper_permutation_results(results_file):
+    """
+    Parse the results QTLReaper permutations into a list of values.
+    """
     with open(results_file, "r") as infile:
         lines = infile.readlines()
 
diff --git a/gn3/db/traits.py b/gn3/db/traits.py
index ccb101a..bfe887e 100644
--- a/gn3/db/traits.py
+++ b/gn3/db/traits.py
@@ -1,6 +1,8 @@
 """This class contains functions relating to trait data manipulation"""
-from gn3.settings import TMPDIR
+import os
 from typing import Any, Dict, Union, Sequence
+from gn3.settings import TMPDIR
+from gn3.random import random_string
 from gn3.function_helpers import compose
 from gn3.db.datasets import retrieve_trait_dataset
 
@@ -669,5 +671,6 @@ def retrieve_trait_data(trait: dict, conn: Any, strainlist: Sequence[str] = tupl
     return {}
 
 def generate_traits_filename(base_path: str = TMPDIR):
+    """Generate a unique filename for use with generated traits files."""
     return "{}/traits_test_file_{}.txt".format(
         os.path.abspath(base_path), random_string(10))
diff --git a/gn3/heatmaps/heatmaps.py b/gn3/heatmaps/heatmaps.py
index 3bf7917..88f546d 100644
--- a/gn3/heatmaps/heatmaps.py
+++ b/gn3/heatmaps/heatmaps.py
@@ -14,6 +14,19 @@ def generate_random_data(data_stop: float = 2, width: int = 10, height: int = 30
     return [[random.uniform(0,data_stop) for i in range(0, width)]
             for j in range(0, height)]
 
+def generate_random_data2(data_stop: float = 2, width: int = 10, height: int = 30):
+    """
+    This is mostly a utility function to be used to generate random data, useful
+    for development of the heatmap generation code, without access to the actual
+    database data.
+    """
+    return [
+        [{
+            "value": item,
+            "category": random.choice(["C57BL/6J +", "DBA/2J +"])}
+         for item in axis]
+        for axis in generate_random_data(data_stop, width, height)]
+
 def heatmap_x_axis_names():
     return [
         "UCLA_BXDBXH_CARTILAGE_V2::ILM103710672",
@@ -30,13 +43,14 @@ def heatmap_x_axis_names():
 
 # Grey + Blue + Red
 def generate_heatmap():
-    rows = 20
-    data = generate_random_data(height=rows)
-    y = (["%s"%x for x in range(1, rows+1)][:-1] + ["X"]) #replace last item with x for now
+    cols = 20
+    y_axis = (["%s"%x for x in range(1, cols+1)][:-1] + ["X"]) #replace last item with x for now
+    x_axis = heatmap_x_axis_names()
+    data = generate_random_data(height=cols, width=len(x_axis))
     fig = px.imshow(
         data,
-        x=heatmap_x_axis_names(),
-        y=y,
+        x=x_axis,
+        y=y_axis,
         width=500)
     fig.update_traces(xtype="array")
     fig.update_traces(ytype="array")
@@ -49,6 +63,5 @@ def generate_heatmap():
         coloraxis_colorscale=[
             [0.0, '#3B3B3B'], [0.4999999999999999, '#ABABAB'],
             [0.5, '#F5DE11'], [1.0, '#FF0D00']])
-
     fig.write_html("%s/%s"%(heatmap_dir, "test_image.html"))
     return fig
diff --git a/gn3/random.py b/gn3/random.py
new file mode 100644
index 0000000..f0ba574
--- /dev/null
+++ b/gn3/random.py
@@ -0,0 +1,11 @@
+"""
+Functions to generate complex random data.
+"""
+import random
+import string
+
+def random_string(length):
+    """Generate a random string of length `length`."""
+    return "".join(
+        random.choices(
+            string.ascii_letters + string.digits, k=length))
diff --git a/tests/unit/computations/test_qtlreaper.py b/tests/unit/computations/test_qtlreaper.py
index ec23664..6c3b64d 100644
--- a/tests/unit/computations/test_qtlreaper.py
+++ b/tests/unit/computations/test_qtlreaper.py
@@ -1,5 +1,4 @@
 """Module contains tests for gn3.computations.qtlreaper"""
-import os
 from unittest import TestCase
 from gn3.computations.qtlreaper import (
     parse_reaper_main_results, parse_reaper_permutation_results)
@@ -8,6 +7,7 @@ class TestQTLReaper(TestCase):
     """Class for testing qtlreaper interface functions."""
 
     def test_parse_reaper_main_results(self):
+        """Test that the main results file is parsed correctly."""
         self.assertEqual(
             parse_reaper_main_results(
                 "tests/unit/computations/data/qtlreaper/main_output_sample.txt"),
@@ -65,9 +65,10 @@ class TestQTLReaper(TestCase):
             ])
 
     def test_parse_reaper_permutation_results(self):
+        """Test that the permutations results file is parsed correctly."""
         self.assertEqual(
             parse_reaper_permutation_results(
-            "tests/unit/computations/data/qtlreaper/permu_output_sample.txt"),
+                "tests/unit/computations/data/qtlreaper/permu_output_sample.txt"),
             [4.44174, 5.03825, 5.08167, 5.18119, 5.18578, 5.24563, 5.24619,
              5.24619, 5.27961, 5.28228, 5.43903, 5.50188, 5.51694, 5.56830,
              5.63874, 5.71346, 5.71936, 5.74275, 5.76764, 5.79815, 5.81671,
-- 
cgit v1.2.3


From 221c773daea839ecf0e50c196484bb91e3a6db33 Mon Sep 17 00:00:00 2001
From: Muriithi Frederick Muriuki
Date: Wed, 1 Sep 2021 06:18:20 +0300
Subject: Implement parsing of genotype labels

Issue:
https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi

* gn3/db/genotypes.py: parse genotype labels
* tests/unit/db/test_genotypes.py: test that genotype labels are parsed
  correctly

  As part of parsing the genotype files into usable python data structures,
  this commit adds a function to parse the label lines (beginning with "@")
  into the appropriate values.
---
 gn3/db/genotypes.py             | 20 ++++++++++++++++++++
 tests/unit/db/test_genotypes.py | 17 +++++++++++++++++
 2 files changed, 37 insertions(+)
 create mode 100644 tests/unit/db/test_genotypes.py

(limited to 'gn3/db')

diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py
index 610ddde..2be3e1a 100644
--- a/gn3/db/genotypes.py
+++ b/gn3/db/genotypes.py
@@ -67,3 +67,23 @@ def __load_genotype_samples_from_plink(genotype_filename: str):
     """
     genofile = open(genotype_filename)
     return [line.split(" ")[1] for line in genofile]
+
+def parse_genotype_labels(lines: list):
+    """
+    Parse label lines into usable genotype values
+
+    DESCRIPTION:
+    Reworks
+    https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/utility/gen_geno_ob.py#L75-L93
+    """
+    acceptable_labels = ["name", "filler", "type", "mat", "pat", "het", "unk"]
+    def __parse_label(line):
+        label, value = [l.strip() for l in line[1:].split(":")]
+        if label not in acceptable_labels:
+            return None
+        if label == "name":
+            return ("group", value)
+        return (label, value)
+    return tuple(
+        item for item in (__parse_label(line) for line in lines)
+        if item is not None)
diff --git a/tests/unit/db/test_genotypes.py b/tests/unit/db/test_genotypes.py
new file mode 100644
index 0000000..0264764
--- /dev/null
+++ b/tests/unit/db/test_genotypes.py
@@ -0,0 +1,17 @@
+"""Tests gn3.db.genotypes"""
+from unittest import TestCase
+from gn3.db.genotypes import parse_genotype_labels
+
+class TestGenotypes(TestCase):
+    """Tests for functions in `gn3.db.genotypes`."""
+
+    def test_parse_genotype_labels(self):
+        self.assertEqual(
+            parse_genotype_labels([
+                "@name: test_group\t", "@filler: test_filler    ",
+                "@type:test_type", "@mat:test_mat   \t", "@pat:test_pat ",
+                "@het: test_het ", "@unk: test_unk", "@other: test_other",
+                "@brrr: test_brrr "]),
+        (("group", "test_group"), ("filler", "test_filler"),
+         ("type", "test_type"), ("mat", "test_mat"), ("pat", "test_pat"),
+         ("het", "test_het"), ("unk", "test_unk")))
-- 
cgit v1.2.3


From b975e0cfd1d0adc5f51e66292d29d4651d3f053f Mon Sep 17 00:00:00 2001
From: Muriithi Frederick Muriuki
Date: Wed, 1 Sep 2021 07:35:40 +0300
Subject: Parse the genotype file's data header

* gn3/db/genotypes.py: parse data header
* tests/unit/db/test_genotypes.py: check that the header is parsed correctly.

  Add tests to check that the parser works as expected. Add code to implement
  the parsing and pass the tests.
---
 gn3/db/genotypes.py             | 19 +++++++++++++++++++
 tests/unit/db/test_genotypes.py | 22 +++++++++++++++++++++-
 2 files changed, 40 insertions(+), 1 deletion(-)

(limited to 'gn3/db')

diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py
index 2be3e1a..be0dfc2 100644
--- a/gn3/db/genotypes.py
+++ b/gn3/db/genotypes.py
@@ -87,3 +87,22 @@ def parse_genotype_labels(lines: list):
     return tuple(
         item for item in (__parse_label(line) for line in lines)
         if item is not None)
+
+def parse_genotype_header(line: str, parlist = tuple()):
+    """
+    Parse the genotype file header line
+
+    DESCRIPTION:
+    Reworks
+    https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/utility/gen_geno_ob.py#L94-L114
+    """
+    items = [item.strip() for item in line.split("\t")]
+    Mbmap = "Mb" in items
+    prgy = ((parlist + tuple(items[4:])) if Mbmap
+            else (parlist + tuple(items[3:])))
+    return (
+        ("Mbmap", Mbmap),
+        ("cm_column", items.index("cM")),
+        ("mb_column", None if not Mbmap else items.index("Mb")),
+        ("prgy", prgy),
+        ("nprgy", len(prgy)))
diff --git a/tests/unit/db/test_genotypes.py b/tests/unit/db/test_genotypes.py
index 0264764..4fa8a53 100644
--- a/tests/unit/db/test_genotypes.py
+++ b/tests/unit/db/test_genotypes.py
@@ -1,6 +1,6 @@
 """Tests gn3.db.genotypes"""
 from unittest import TestCase
-from gn3.db.genotypes import parse_genotype_labels
+from gn3.db.genotypes import parse_genotype_labels, parse_genotype_header
 
 class TestGenotypes(TestCase):
     """Tests for functions in `gn3.db.genotypes`."""
@@ -15,3 +15,23 @@ class TestGenotypes(TestCase):
         (("group", "test_group"), ("filler", "test_filler"),
          ("type", "test_type"), ("mat", "test_mat"), ("pat", "test_pat"),
          ("het", "test_het"), ("unk", "test_unk")))
+
+    def test_parse_genotype_header(self):
+        for header, expected in [
+                [("Chr\tLocus\tcM\tMb\tBXD1\tBXD2\tBXD5\tBXD6\tBXD8\tBXD9\t"
+                  "BXD11\tBXD12\tBXD13\tBXD14\tBXD15\tBXD16\tBXD18\tBXD19"),
+                 (("Mbmap", True), ("cm_column", 2), ("mb_column", 3),
+                  ("prgy",
+                   ("BXD1", "BXD2", "BXD5", "BXD6", "BXD8", "BXD9", "BXD11",
+                    "BXD12", "BXD13", "BXD14", "BXD15", "BXD16", "BXD18",
+                    "BXD19")),
+                  ("nprgy", 14))],
+                [("Chr\tLocus\tcM\tBXD1\tBXD2\tBXD5\tBXD6\tBXD8\tBXD9\tBXD11"
+                  "\tBXD12\tBXD13\tBXD14\tBXD15\tBXD16\tBXD18"),
+                 (("Mbmap", False), ("cm_column", 2), ("mb_column", None),
+                  ("prgy",
+                   ("BXD1", "BXD2", "BXD5", "BXD6", "BXD8", "BXD9", "BXD11",
+                    "BXD12", "BXD13", "BXD14", "BXD15", "BXD16", "BXD18")),
+                  ("nprgy", 13))]]:
+            with self.subTest(header=header):
+                self.assertEqual(parse_genotype_header(header), expected)
-- 
cgit v1.2.3


From a1c217cf277feda3815a8435d6c8909f1b5546a1 Mon Sep 17 00:00:00 2001
From: Muriithi Frederick Muriuki
Date: Wed, 1 Sep 2021 09:11:17 +0300
Subject: Parse data lines into markers

Issue:
https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi

* gn3/db/genotypes.py: parse data lines in file to genetic markers.
* tests/unit/db/test_genotypes.py: test that parsing works.

  Add some tests to check that the parsing of the markers works as expected,
  and add the code to actually parse the markers.
---
 gn3/db/genotypes.py             | 37 +++++++++++++++++++++++++++++++++++++
 tests/unit/db/test_genotypes.py | 38 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 74 insertions(+), 1 deletion(-)

(limited to 'gn3/db')

diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py
index be0dfc2..8710d2e 100644
--- a/gn3/db/genotypes.py
+++ b/gn3/db/genotypes.py
@@ -106,3 +106,40 @@ def parse_genotype_header(line: str, parlist = tuple()):
         ("mb_column", None if not Mbmap else items.index("Mb")),
         ("prgy", prgy),
         ("nprgy", len(prgy)))
+
+def parse_genotype_data_line(line: str, geno_obj: dict, parlist: list):
+    """
+    Parse a data line in a genotype file
+
+    DESCRIPTION:
+    Reworks
+    https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/utility/gen_geno_ob.py#L143-L190
+    """
+    marker_row = [item.strip() for item in line.split("\t")]
+    geno_table = {
+        geno_obj["mat"]: -1, geno_obj["pat"]: 1, geno_obj["het"]: 0,
+        geno_obj["unk"]: "U"
+    }
+    start_pos = 4 if geno_obj["Mbmap"] else 3
+    if len(parlist) > 0:
+        start_pos = start_pos + 2
+
+    alleles = marker_row[start_pos:]
+    genotype = tuple(
+        (geno_table[allele] if allele in geno_table.keys() else "U")
+        for allele in alleles)
+    if len(parlist) > 0:
+        genotype = (-1, 1) + genotype
+    try:
+        cM = float(geno_obj["cm_column"])
+    except:
+        if geno_obj["Mbmap"]:
+            cM = float(geno_obj["mb_column"])
+        else:
+            cM = 0
+    return (
+        ("chr", marker_row[0]),
+        ("name", marker_row[1]),
+        ("cM", cM),
+        ("Mb", float(geno_obj["mb_column"]) if geno_obj["Mbmap"] else None),
+        ("genotype", genotype))
diff --git a/tests/unit/db/test_genotypes.py b/tests/unit/db/test_genotypes.py
index 4fa8a53..ba90191 100644
--- a/tests/unit/db/test_genotypes.py
+++ b/tests/unit/db/test_genotypes.py
@@ -1,11 +1,13 @@
 """Tests gn3.db.genotypes"""
 from unittest import TestCase
-from gn3.db.genotypes import parse_genotype_labels, parse_genotype_header
+from gn3.db.genotypes import (
+    parse_genotype_labels, parse_genotype_header, parse_genotype_data_line)
 
 class TestGenotypes(TestCase):
     """Tests for functions in `gn3.db.genotypes`."""
 
     def test_parse_genotype_labels(self):
+        """Test that the genotype labels are parsed correctly."""
         self.assertEqual(
             parse_genotype_labels([
                 "@name: test_group\t", "@filler: test_filler    ",
@@ -17,6 +19,7 @@ class TestGenotypes(TestCase):
          ("het", "test_het"), ("unk", "test_unk")))
 
     def test_parse_genotype_header(self):
+        """Test that the genotype header is parsed correctly."""
         for header, expected in [
                 [("Chr\tLocus\tcM\tMb\tBXD1\tBXD2\tBXD5\tBXD6\tBXD8\tBXD9\t"
                   "BXD11\tBXD12\tBXD13\tBXD14\tBXD15\tBXD16\tBXD18\tBXD19"),
@@ -35,3 +38,36 @@ class TestGenotypes(TestCase):
                   ("nprgy", 13))]]:
             with self.subTest(header=header):
                 self.assertEqual(parse_genotype_header(header), expected)
+
+    def test_parse_genotype_data_line(self):
+        """Test parsing of data lines."""
+        for line, geno_obj, parlist, expected in [
+                ["1\trs31443144\t1.50\t3.010274\tB\tB\tD\tD\tD\tB\tB\tD\tB\tB",
+                 {"mat": "test_mat", "pat": "test_pat", "het": "test_het",
+                  "unk": "test_unk", "cm_column": 2, "Mbmap": True,
+                  "mb_column": 3},
+                 tuple(),
+                 (("chr", "1"), ("name", "rs31443144"), ("cM", 2.0),
+                  ("Mb", 3.0),
+                  ("genotype",
+                   ("U", "U", "U", "U", "U", "U", "U", "U", "U", "U")))],
+                ["1\trs31443144\t1.50\t3.010274\tB\tB\tD\tD\tD\tB\tB\tD\tB\tB",
+                 {"mat": "test_mat", "pat": "test_pat", "het": "test_het",
+                  "unk": "test_unk", "cm_column": 2, "Mbmap": True,
+                  "mb_column": 3},
+                 ("some", "parlist", "content"),
+                 (("chr", "1"), ("name", "rs31443144"), ("cM", 2.0),
+                  ("Mb", 3.0),
+                  ("genotype",
+                   (-1, 1, "U", "U", "U", "U", "U", "U", "U", "U")))],
+                ["1\trs31443144\t1.50\t3.010274\tB\tB\tD\tH\tD\tB\tU\tD\tB\tB",
+                 {"mat": "B", "pat": "D", "het": "H", "unk": "U",
+                  "cm_column": 2, "Mbmap": True, "mb_column": 3},
+                 tuple(),
+                 (("chr", "1"), ("name", "rs31443144"), ("cM", 2.0),
+                  ("Mb", 3.0),
+                  ("genotype", (-1, -1, 1, 0, 1, -1, "U", 1, -1, -1)))]]:
+            with self.subTest(line = line):
+                self.assertEqual(
+                    parse_genotype_data_line(line, geno_obj, parlist),
+                    expected)
-- 
cgit v1.2.3


From abfc0410a2385d8c3d6ee1915fc99b708e1d0dbc Mon Sep 17 00:00:00 2001
From: Muriithi Frederick Muriuki
Date: Wed, 1 Sep 2021 10:49:52 +0300
Subject: Build top-level genotype file parsing function

Issue:
https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi

* gn3/db/genotypes.py: parse genotype files
* tests/unit/db/test_genotypes.py: test parsing is correct

  Add the overall genotype files parsing function and tests to check that the
  parsing works as expected.
---
 gn3/db/genotypes.py             |  38 ++++++++++++++-
 tests/unit/db/test_genotypes.py | 101 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 136 insertions(+), 3 deletions(-)

(limited to 'gn3/db')

diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py
index 8710d2e..b5d14a5 100644
--- a/gn3/db/genotypes.py
+++ b/gn3/db/genotypes.py
@@ -107,7 +107,7 @@ def parse_genotype_header(line: str, parlist = tuple()):
         ("prgy", prgy),
         ("nprgy", len(prgy)))
 
-def parse_genotype_data_line(line: str, geno_obj: dict, parlist: list):
+def parse_genotype_marker(line: str, geno_obj: dict, parlist: list):
     """
     Parse a data line in a genotype file
 
@@ -143,3 +143,39 @@ def parse_genotype_data_line(line: str, geno_obj: dict, parlist: list):
         ("cM", cM),
         ("Mb", float(geno_obj["mb_column"]) if geno_obj["Mbmap"] else None),
         ("genotype", genotype))
+
+def build_genotype_chromosomes(geno_obj, markers):
+    """
+    Build up the chromosomes from the given markers and partially built geno
+    object
+    """
+    mrks = [dict(marker) for marker in markers]
+    chr_names = {marker["chr"] for marker in mrks}
+    return tuple((
+        ("name", chr_name), ("mb_exists", geno_obj["Mbmap"]), ("cm_column", 2),
+        ("mb_column", geno_obj["mb_column"]),
+        ("loci", tuple(marker for marker in mrks if marker["chr"] == chr_name)))
+           for chr_name in sorted(chr_names))
+
+def parse_genotype_file(filename: str, parlist = tuple()):
+    """
+    Parse the provided genotype file into a usable pytho3 data structure.
+    """
+    with open(filename, "r") as infile:
+        contents = infile.readlines()
+
+    lines = tuple(line for line in contents if
+             ((not line.strip().startswith("#")) and
+              (not line.strip() == "")))
+    labels = parse_genotype_labels(
+        line for line in lines if line.startswith("@"))
+    data_lines = tuple(line for line in lines if not line.startswith("@"))
+    header = parse_genotype_header(data_lines[0], parlist)
+    geno_obj = dict(labels + header)
+    markers = tuple(
+        parse_genotype_marker(line, geno_obj, parlist)
+        for line in data_lines[1:])
+    chromosomes = tuple(
+        dict(chromosome) for chromosome in
+        build_genotype_chromosomes(geno_obj, markers))
+    return {**geno_obj, "chromosomes": chromosomes}
diff --git a/tests/unit/db/test_genotypes.py b/tests/unit/db/test_genotypes.py
index ba90191..a05ce48 100644
--- a/tests/unit/db/test_genotypes.py
+++ b/tests/unit/db/test_genotypes.py
@@ -1,7 +1,11 @@
 """Tests gn3.db.genotypes"""
 from unittest import TestCase
 from gn3.db.genotypes import (
-    parse_genotype_labels, parse_genotype_header, parse_genotype_data_line)
+    parse_genotype_file,
+    parse_genotype_labels,
+    parse_genotype_header,
+    parse_genotype_marker,
+    build_genotype_chromosomes)
 
 class TestGenotypes(TestCase):
     """Tests for functions in `gn3.db.genotypes`."""
@@ -69,5 +73,98 @@ class TestGenotypes(TestCase):
                   ("genotype", (-1, -1, 1, 0, 1, -1, "U", 1, -1, -1)))]]:
             with self.subTest(line = line):
                 self.assertEqual(
-                    parse_genotype_data_line(line, geno_obj, parlist),
+                    parse_genotype_marker(line, geno_obj, parlist),
                     expected)
+
+    def test_build_genotype_chromosomes(self):
+        """
+        Given `markers` and `geno_obj`, test that `build_genotype_chromosomes`
+        builds a sequence of chromosomes with the given markers ordered
+        according to the `chr` value."""
+        for markers, geno_obj, expected in [
+                [[(("chr", "1"), ("name", "rs31443144"), ("cM", 2.0),
+                   ("Mb", 3.0),
+                   ("genotype", (-1, -1, 1, 0, 1, -1, "U", 1, -1, -1))),
+                  (("chr", "2"), ("name", "rs31443144"), ("cM", 2.0),
+                   ("Mb", 3.0),
+                   ("genotype", (-1, -1, 1, 0, 1, -1, "U", 1, -1, -1)))],
+                 {"mat": "B", "pat": "D", "het": "H", "unk": "U",
+                  "cm_column": 2, "Mbmap": True, "mb_column": 3},
+                 ((("name", "1"), ("mb_exists", True), ("cm_column", 2),
+                   ("mb_column", 3),
+                   ("loci",
+                    ({"chr": "1", "name": "rs31443144", "cM": 2.0, "Mb": 3.0,
+                      "genotype": (-1, -1, 1, 0, 1, -1, "U", 1, -1, -1)},))),
+                  (("name", "2"), ("mb_exists", True), ("cm_column", 2),
+                   ("mb_column", 3),
+                   ("loci",
+                    ({"chr": "2", "name": "rs31443144", "cM": 2.0, "Mb": 3.0,
+                      "genotype": (-1, -1, 1, 0, 1, -1, "U", 1, -1, -1)},))))],
+                [[(("chr", "1"), ("name", "rs31443144"), ("cM", 2.0),
+                   ("Mb", None),
+                   ("genotype", (-1, 1, 1, 0, 1, -1, "U", 1, -1, -1)))],
+                 {"mat": "B", "pat": "D", "het": "H", "unk": "U",
+                  "cm_column": 2, "Mbmap": False, "mb_column": None},
+                 ((("name", "1"), ("mb_exists", False), ("cm_column", 2),
+                   ("mb_column", None),
+                   ("loci",
+                    ({"chr": "1", "name": "rs31443144", "cM": 2.0, "Mb": None,
+                      "genotype": (-1, 1, 1, 0, 1, -1, "U", 1, -1, -1)},))),)]]:
+            with self.subTest(markers = markers):
+                self.assertEqual(
+                    build_genotype_chromosomes(geno_obj, markers),
+                    expected)
+
+    def test_parse_genotype_file(self):
+        """Test the parsing of genotype files. """
+        self.assertEqual(
+            parse_genotype_file(
+                "tests/unit/db/data/genotypes/genotype_sample1.geno"),
+            {"group": "BXD",
+             "type": "riset",
+             "mat": "B",
+             "pat": "D",
+             "het": "H",
+             "unk": "U",
+             "Mbmap": True,
+             "cm_column": 2,
+             "mb_column": 3,
+             "prgy": ("BXD1", "BXD2", "BXD5", "BXD6", "BXD8", "BXD9"),
+             "nprgy": 6,
+             "chromosomes": (
+                 {"name": "1",
+                  "mb_exists": True,
+                  "cm_column": 2,
+                  "mb_column": 3,
+                  "loci": (
+                      {"chr": "1",
+                       "name": "rs31443144",
+                       "cM": 2.0,
+                       "Mb": 3.0,
+                       "genotype": (-1, -1, 1, 1, 1, -1)
+                       },
+                      {"chr": "1",
+                       "name": "rs6269442",
+                       "cM": 2.0,
+                       "Mb": 3.0,
+                       "genotype": (-1, -1, 1, 1, 0, "U")},
+                      {"chr": "1",
+                       "name": "rs32285189",
+                       "cM": 2.0,
+                       "Mb": 3.0,
+                       "genotype": (-1, "U", 1, 1, 1, -1)})},
+                 {"name": "2",
+                  "mb_exists": True,
+                  "cm_column": 2,
+                  "mb_column": 3,
+                  "loci": (
+                      {"chr": "2",
+                       "name": "rs31443144",
+                       "cM": 2.0,
+                       "Mb": 3.0,
+                       "genotype": (-1, -1, 1, 1, 1, -1)},
+                      {"chr": "2",
+                       "name": "rs6269442",
+                       "cM": 2.0,
+                       "Mb": 3.0,
+                       "genotype": (-1, -1, 1, 1, 0, "U")})})})
-- 
cgit v1.2.3


From 3ded952f40f486d9aa69746eac2afe7f67fef790 Mon Sep 17 00:00:00 2001
From: Muriithi Frederick Muriuki
Date: Wed, 1 Sep 2021 11:08:38 +0300
Subject: Fix linting and typing issues

Issue:
https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi
---
 gn3/db/genotypes.py             | 32 ++++++++++++++++----------------
 tests/unit/db/test_genotypes.py | 10 +++++-----
 2 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'gn3/db')

diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py
index b5d14a5..b03d55c 100644
--- a/gn3/db/genotypes.py
+++ b/gn3/db/genotypes.py
@@ -88,7 +88,7 @@ def parse_genotype_labels(lines: list):
         item for item in (__parse_label(line) for line in lines)
         if item is not None)
 
-def parse_genotype_header(line: str, parlist = tuple()):
+def parse_genotype_header(line: str, parlist: tuple = tuple()):
     """
     Parse the genotype file header line
 
@@ -97,13 +97,13 @@ def parse_genotype_header(line: str, parlist = tuple()):
     https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/utility/gen_geno_ob.py#L94-L114
     """
     items = [item.strip() for item in line.split("\t")]
-    Mbmap = "Mb" in items
-    prgy = ((parlist + tuple(items[4:])) if Mbmap
+    mbmap = "Mb" in items
+    prgy = ((parlist + tuple(items[4:])) if mbmap
             else (parlist + tuple(items[3:])))
     return (
-        ("Mbmap", Mbmap),
+        ("Mbmap", mbmap),
         ("cm_column", items.index("cM")),
-        ("mb_column", None if not Mbmap else items.index("Mb")),
+        ("mb_column", None if not mbmap else items.index("Mb")),
         ("prgy", prgy),
         ("nprgy", len(prgy)))
 
@@ -131,16 +131,16 @@ def parse_genotype_marker(line: str, geno_obj: dict, parlist: list):
     if len(parlist) > 0:
         genotype = (-1, 1) + genotype
     try:
-        cM = float(geno_obj["cm_column"])
+        cm_val = float(geno_obj["cm_column"])
     except:
         if geno_obj["Mbmap"]:
-            cM = float(geno_obj["mb_column"])
+            cm_val = float(geno_obj["mb_column"])
         else:
-            cM = 0
+            cm_val = 0
     return (
         ("chr", marker_row[0]),
         ("name", marker_row[1]),
-        ("cM", cM),
+        ("cM", cm_val),
         ("Mb", float(geno_obj["mb_column"]) if geno_obj["Mbmap"] else None),
         ("genotype", genotype))
 
@@ -155,9 +155,9 @@ def build_genotype_chromosomes(geno_obj, markers):
         ("name", chr_name), ("mb_exists", geno_obj["Mbmap"]), ("cm_column", 2),
         ("mb_column", geno_obj["mb_column"]),
         ("loci", tuple(marker for marker in mrks if marker["chr"] == chr_name)))
-           for chr_name in sorted(chr_names))
+                 for chr_name in sorted(chr_names))
 
-def parse_genotype_file(filename: str, parlist = tuple()):
+def parse_genotype_file(filename: str, parlist: tuple = tuple()):
     """
     Parse the provided genotype file into a usable pytho3 data structure.
     """
@@ -165,16 +165,16 @@ def parse_genotype_file(filename: str, parlist = tuple()):
         contents = infile.readlines()
 
     lines = tuple(line for line in contents if
-             ((not line.strip().startswith("#")) and
-              (not line.strip() == "")))
+                  ((not line.strip().startswith("#")) and
+                   (not line.strip() == "")))
     labels = parse_genotype_labels(
-        line for line in lines if line.startswith("@"))
+        [line for line in lines if line.startswith("@")])
     data_lines = tuple(line for line in lines if not line.startswith("@"))
     header = parse_genotype_header(data_lines[0], parlist)
     geno_obj = dict(labels + header)
     markers = tuple(
-        parse_genotype_marker(line, geno_obj, parlist)
-        for line in data_lines[1:])
+        [parse_genotype_marker(line, geno_obj, parlist)
+        for line in data_lines[1:]])
     chromosomes = tuple(
         dict(chromosome) for chromosome in
         build_genotype_chromosomes(geno_obj, markers))
diff --git a/tests/unit/db/test_genotypes.py b/tests/unit/db/test_genotypes.py
index a05ce48..c125224 100644
--- a/tests/unit/db/test_genotypes.py
+++ b/tests/unit/db/test_genotypes.py
@@ -18,9 +18,9 @@ class TestGenotypes(TestCase):
                 "@type:test_type", "@mat:test_mat   \t", "@pat:test_pat ",
                 "@het: test_het ", "@unk: test_unk", "@other: test_other",
                 "@brrr: test_brrr "]),
-        (("group", "test_group"), ("filler", "test_filler"),
-         ("type", "test_type"), ("mat", "test_mat"), ("pat", "test_pat"),
-         ("het", "test_het"), ("unk", "test_unk")))
+            (("group", "test_group"), ("filler", "test_filler"),
+             ("type", "test_type"), ("mat", "test_mat"), ("pat", "test_pat"),
+             ("het", "test_het"), ("unk", "test_unk")))
 
     def test_parse_genotype_header(self):
         """Test that the genotype header is parsed correctly."""
@@ -71,7 +71,7 @@ class TestGenotypes(TestCase):
                  (("chr", "1"), ("name", "rs31443144"), ("cM", 2.0),
                   ("Mb", 3.0),
                   ("genotype", (-1, -1, 1, 0, 1, -1, "U", 1, -1, -1)))]]:
-            with self.subTest(line = line):
+            with self.subTest(line=line):
                 self.assertEqual(
                     parse_genotype_marker(line, geno_obj, parlist),
                     expected)
@@ -110,7 +110,7 @@ class TestGenotypes(TestCase):
                    ("loci",
                     ({"chr": "1", "name": "rs31443144", "cM": 2.0, "Mb": None,
                       "genotype": (-1, 1, 1, 0, 1, -1, "U", 1, -1, -1)},))),)]]:
-            with self.subTest(markers = markers):
+            with self.subTest(markers=markers):
                 self.assertEqual(
                     build_genotype_chromosomes(geno_obj, markers),
                     expected)
-- 
cgit v1.2.3


From 1e2357049adc72808fbf8eaac3da9411d3c78c66 Mon Sep 17 00:00:00 2001
From: Frederick Muriuki Muriithi
Date: Fri, 17 Sep 2021 11:20:16 +0300
Subject: Fix a number of linting issues

Issue:
https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi
---
 gn3/computations/qtlreaper.py             |  7 ++--
 gn3/db/genotypes.py                       |  2 +-
 gn3/heatmaps.py                           | 54 ++++++++++++-------------------
 tests/unit/computations/test_qtlreaper.py |  3 +-
 tests/unit/test_heatmaps.py               |  6 ++--
 5 files changed, 32 insertions(+), 40 deletions(-)

(limited to 'gn3/db')

diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py
index 5180853..377db9b 100644
--- a/gn3/computations/qtlreaper.py
+++ b/gn3/computations/qtlreaper.py
@@ -110,9 +110,10 @@ def organise_reaper_main_results(parsed_results):
         unique_chromosomes = {item["Chr"] for item in id_items}
         return {
             "ID": identifier,
-            "chromosomes": {_chr["Chr"]: _chr for _chr in [
-                __organise_by_chromosome(chromo, id_items)
-                for chromo in sorted(
+            "chromosomes": {
+                _chr["Chr"]: _chr for _chr in [
+                    __organise_by_chromosome(chromo, id_items)
+                    for chromo in sorted(
                         unique_chromosomes, key=chromosome_sorter_key_fn)]}}
 
     unique_ids = {res["ID"] for res in parsed_results}
diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py
index b03d55c..9d052d9 100644
--- a/gn3/db/genotypes.py
+++ b/gn3/db/genotypes.py
@@ -174,7 +174,7 @@ def parse_genotype_file(filename: str, parlist: tuple = tuple()):
     geno_obj = dict(labels + header)
     markers = tuple(
         [parse_genotype_marker(line, geno_obj, parlist)
-        for line in data_lines[1:]])
+         for line in data_lines[1:]])
     chromosomes = tuple(
         dict(chromosome) for chromosome in
         build_genotype_chromosomes(geno_obj, markers))
diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py
index 2859dde..c4fc67d 100644
--- a/gn3/heatmaps.py
+++ b/gn3/heatmaps.py
@@ -3,13 +3,13 @@ This module will contain functions to be used in computation of the data used to
 generate various kinds of heatmaps.
 """
 
+from typing import Any, Dict, Sequence
 import numpy as np
 from functools import reduce
 from gn3.settings import TMPDIR
 import plotly.graph_objects as go
 import plotly.figure_factory as ff
 from gn3.random import random_string
-from typing import Any, Dict, Sequence
 from gn3.computations.slink import slink
 from plotly.subplots import make_subplots
 from gn3.computations.correlations2 import compute_correlation
@@ -165,7 +165,7 @@ def build_heatmap(traits_names, conn: Any):
         for fullname in traits_names]
     traits_data_list = [retrieve_trait_data(t, conn) for t in traits]
     genotype_filename = build_genotype_file(traits[0]["riset"])
-    genotype = parse_genotype_file(genotype_filename)
+    # genotype = parse_genotype_file(genotype_filename)
     strains = load_genotype_samples(genotype_filename)
     exported_traits_data_list = [
         export_trait_data(td, strains) for td in traits_data_list]
@@ -183,22 +183,21 @@ def build_heatmap(traits_names, conn: Any):
         [t[2] for t in strains_and_values],
         traits_filename)
 
-    main_output, permutations_output = run_reaper(
+    main_output, _permutations_output = run_reaper(
         genotype_filename, traits_filename, separate_nperm_output=True)
 
     qtlresults = parse_reaper_main_results(main_output)
-    permudata = parse_reaper_permutation_results(permutations_output)
+    # permudata = parse_reaper_permutation_results(permutations_output)
     organised = organise_reaper_main_results(qtlresults)
 
     traits_ids = [# sort numerically, but retain the ids as strings
         str(i) for i in sorted({int(row["ID"]) for row in qtlresults})]
     chromosome_names = sorted(
-        {row["Chr"] for row in qtlresults}, key = chromosome_sorter_key_fn)
-    loci_names = sorted({row["Locus"] for row in qtlresults})
-    ordered_traits_names = {
-        res_id: trait for res_id, trait in
+        {row["Chr"] for row in qtlresults}, key=chromosome_sorter_key_fn)
+    # loci_names = sorted({row["Locus"] for row in qtlresults})
+    ordered_traits_names = dict(
         zip(traits_ids,
-            [traits[idx]["trait_fullname"] for idx in traits_order])}
+            [traits[idx]["trait_fullname"] for idx in traits_order]))
 
     return generate_clustered_heatmap(
         process_traits_data_for_heatmap(
@@ -207,22 +206,11 @@ def build_heatmap(traits_names, conn: Any):
         "single_heatmap_{}".format(random_string(10)),
         y_axis=tuple(
             ordered_traits_names[traits_ids[order]]
-                for order in traits_order),
+            for order in traits_order),
         y_label="Traits",
-        x_axis=[chromo for chromo in chromosome_names],
+        x_axis=chromosome_names,
         x_label="Chromosomes")
 
-    return {
-        "slink_data": slink_data,
-        "ordering_data": ordering_data,
-        "strainlist": strainlist,
-        "genotype_filename": genotype_filename,
-        "traits_list": traits_list,
-        "traits_data_list": traits_data_list,
-        "exported_traits_data_list": exported_traits_data_list,
-        "traits_filename": traits_filename
-    }
-
 def compute_traits_order(slink_data, neworder: tuple = tuple()):
     """
     Compute the order of the traits for clustering from `slink_data`.
@@ -314,7 +302,7 @@ def get_nearest_marker(traits_list, genotype):
     https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L419-L438
     """
     if not genotype["Mbmap"]:
-        return [None] * len(trait_list)
+        return [None] * len(traits_list)
 
     marker_finder = nearest_marker_finder(genotype)
     return [marker_finder(trait) for trait in traits_list]
@@ -340,10 +328,10 @@ def process_traits_data_for_heatmap(data, trait_names, chromosome_names):
     return hdata
 
 def generate_clustered_heatmap(
-        data, clustering_data, image_filename_prefix, x_axis = None,
-        x_label: str = "", y_axis = None, y_label: str = "",
+        data, clustering_data, image_filename_prefix, x_axis=None,
+        x_label: str = "", y_axis=None, y_label: str = "",
         output_dir: str = TMPDIR,
-        colorscale = (
+        colorscale=(
             (0.0, '#5D5D5D'), (0.4999999999999999, '#ABABAB'),
             (0.5, '#F5DE11'), (1.0, '#FF0D00'))):
     """
@@ -357,15 +345,15 @@ def generate_clustered_heatmap(
         shared_yaxes="rows",
         horizontal_spacing=0.001,
         subplot_titles=["distance"] + x_axis,
-        figure = ff.create_dendrogram(
+        figure=ff.create_dendrogram(
             np.array(clustering_data), orientation="right", labels=y_axis))
     hms = [go.Heatmap(
         name=chromo,
-        y = y_axis,
-        z = data_array,
+        y=y_axis,
+        z=data_array,
         showscale=False) for chromo, data_array in zip(x_axis, data)]
-    for i, hm in enumerate(hms):
-        fig.add_trace(hm, row=1, col=(i + 2))
+    for i, heatmap in enumerate(hms):
+        fig.add_trace(heatmap, row=1, col=(i + 2))
 
     fig.update_layout(
         {
@@ -380,8 +368,8 @@ def generate_clustered_heatmap(
     x_axes_layouts = {
         "xaxis{}".format(i+1 if i > 0 else ""): {
             "mirror": False,
-            "showticklabels": True if i==0 else False,
-            "ticks": "outside" if i==0 else ""
+            "showticklabels": True if i == 0 else False,
+            "ticks": "outside" if i == 0 else ""
         }
         for i in range(num_cols)}
 
diff --git a/tests/unit/computations/test_qtlreaper.py b/tests/unit/computations/test_qtlreaper.py
index 1d67827..d420470 100644
--- a/tests/unit/computations/test_qtlreaper.py
+++ b/tests/unit/computations/test_qtlreaper.py
@@ -77,6 +77,7 @@ class TestQTLReaper(TestCase):
              5.82775, 5.89659, 5.92117, 5.93396, 5.93396, 5.94957])
 
     def test_organise_reaper_main_results(self):
+        """Check that results are organised correctly."""
         self.assertEqual(
             organise_reaper_main_results([
                 {
@@ -135,7 +136,7 @@ class TestQTLReaper(TestCase):
                         1: {"Chr": 1,
                             "loci": [
                                 {
-                                    "Locus": "rs31443144",  "cM": 1.500, "Mb": 3.010,
+                                    "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010,
                                     "LRS": 0.500, "Additive": -0.074, "pValue": 1.000
                                 },
                                 {
diff --git a/tests/unit/test_heatmaps.py b/tests/unit/test_heatmaps.py
index f3a81c5..c0a496b 100644
--- a/tests/unit/test_heatmaps.py
+++ b/tests/unit/test_heatmaps.py
@@ -189,6 +189,7 @@ class TestHeatmap(TestCase):
                     retrieve_strains_and_values(orders, slist, tdata), expected)
 
     def test_get_lrs_from_chr(self):
+        """Check that function gets correct LRS values"""
         for trait, chromosome, expected in [
                 [{"chromosomes": {}}, 3, [None]],
                 [{"chromosomes": {3: {"loci": [
@@ -202,6 +203,7 @@ class TestHeatmap(TestCase):
                 self.assertEqual(get_lrs_from_chr(trait, chromosome), expected)
 
     def test_process_traits_data_for_heatmap(self):
+        """Check for correct processing of data for heatmap generation."""
         self.assertEqual(
             process_traits_data_for_heatmap(
                 {"1": {
@@ -210,7 +212,7 @@ class TestHeatmap(TestCase):
                         1: {"Chr": 1,
                             "loci": [
                                 {
-                                    "Locus": "rs31443144",  "cM": 1.500, "Mb": 3.010,
+                                    "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010,
                                     "LRS": 0.500, "Additive": -0.074, "pValue": 1.000
                                 },
                                 {
@@ -257,7 +259,7 @@ class TestHeatmap(TestCase):
                          1: {"Chr": 1,
                              "loci": [
                                  {
-                                     "Locus": "rs31443144",  "cM": 1.500, "Mb": 3.010,
+                                     "Locus": "rs31443144", "cM": 1.500, "Mb": 3.010,
                                      "LRS": 0.500, "Additive": -0.074, "pValue": 1.000
                                  },
                                  {
-- 
cgit v1.2.3


From cd7f301688fd9780df1f842f8bd2b7602775ba1f Mon Sep 17 00:00:00 2001
From: Frederick Muriuki Muriithi
Date: Wed, 22 Sep 2021 07:53:53 +0300
Subject: Fix pylint errors

* Add missing function and module docstrings
* Remove unused imports
* Fix import order
* Rework some code sections to fix issues
* Disable some pylint errors.
---
 gn3/api/heatmaps.py           |  8 ++++++++
 gn3/app.py                    |  5 +++--
 gn3/computations/qtlreaper.py |  8 ++++++++
 gn3/db/genotypes.py           |  1 +
 gn3/db/traits.py              |  2 +-
 gn3/heatmaps.py               | 28 ++++++++++++++++------------
 6 files changed, 37 insertions(+), 15 deletions(-)

(limited to 'gn3/db')

diff --git a/gn3/api/heatmaps.py b/gn3/api/heatmaps.py
index 1022a35..fe47aee 100644
--- a/gn3/api/heatmaps.py
+++ b/gn3/api/heatmaps.py
@@ -1,3 +1,7 @@
+"""
+Module to hold the entrypoint functions that generate heatmaps
+"""
+
 import io
 from flask import jsonify
 from flask import request
@@ -9,6 +13,10 @@ heatmaps = Blueprint("heatmaps", __name__)
 
 @heatmaps.route("/clustered", methods=("POST",))
 def clustered_heatmaps():
+    """
+    Parses the incoming data and responds with the JSON-serialized plotly figure
+    representing the clustered heatmap.
+    """
     heatmap_request = request.get_json()
     traits_names = heatmap_request.get("traits_names", tuple())
     if len(traits_names) < 2:
diff --git a/gn3/app.py b/gn3/app.py
index 6b4c57e..8badb65 100644
--- a/gn3/app.py
+++ b/gn3/app.py
@@ -3,7 +3,10 @@ import os
 
 from typing import Dict
 from typing import Union
+
 from flask import Flask
+from flask_cors import CORS
+
 from gn3.api.gemma import gemma
 from gn3.api.rqtl import rqtl
 from gn3.api.general import general
@@ -11,8 +14,6 @@ from gn3.api.heatmaps import heatmaps
 from gn3.api.correlation import correlation
 from gn3.api.data_entry import data_entry
 
-from flask_cors import CORS
-
 def create_app(config: Union[Dict, str, None] = None) -> Flask:
     """Create a new flask object"""
     app = Flask(__name__)
diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py
index 377db9b..5d17fed 100644
--- a/gn3/computations/qtlreaper.py
+++ b/gn3/computations/qtlreaper.py
@@ -87,11 +87,17 @@ def run_reaper(
     return (output_filename, permu_output_filename)
 
 def chromosome_sorter_key_fn(val):
+    """
+    Useful for sorting the chromosomes
+    """
     if isinstance(val, int):
         return val
     return ord(val)
 
 def organise_reaper_main_results(parsed_results):
+    """
+    Provide the results of running reaper in a format that is easier to use.
+    """
     def __organise_by_chromosome(chr_name, items):
         chr_items = [item for item in items if item["Chr"] == chr_name]
         return {
@@ -129,12 +135,14 @@ def parse_reaper_main_results(results_file):
         lines = infile.readlines()
 
     def __parse_column_float_value(value):
+        # pylint: disable=W0702
         try:
             return float(value)
         except:
             return value
 
     def __parse_column_int_value(value):
+        # pylint: disable=W0702
         try:
             return int(value)
         except:
diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py
index 9d052d9..919c539 100644
--- a/gn3/db/genotypes.py
+++ b/gn3/db/genotypes.py
@@ -115,6 +115,7 @@ def parse_genotype_marker(line: str, geno_obj: dict, parlist: list):
     Reworks
     https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/utility/gen_geno_ob.py#L143-L190
     """
+    # pylint: disable=W0702
     marker_row = [item.strip() for item in line.split("\t")]
     geno_table = {
         geno_obj["mat"]: -1, geno_obj["pat"]: 1, geno_obj["het"]: 0,
diff --git a/gn3/db/traits.py b/gn3/db/traits.py
index bfe887e..747ed27 100644
--- a/gn3/db/traits.py
+++ b/gn3/db/traits.py
@@ -46,7 +46,7 @@ def update_sample_data(conn: Any,
                        count: Union[int, str]):
     """Given the right parameters, update sample-data from the relevant
     table."""
-    # pylint: disable=[R0913, R0914]
+    # pylint: disable=[R0913, R0914, C0103]
     STRAIN_ID_SQL: str = "UPDATE Strain SET Name = %s WHERE Id = %s"
     PUBLISH_DATA_SQL: str = ("UPDATE PublishData SET value = %s "
                              "WHERE StrainId = %s AND Id = %s")
diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py
index cd93b3f..9d82fb2 100644
--- a/gn3/heatmaps.py
+++ b/gn3/heatmaps.py
@@ -3,29 +3,28 @@ This module will contain functions to be used in computation of the data used to
 generate various kinds of heatmaps.
 """
 
+from functools import reduce
 from typing import Any, Dict, Sequence
+
 import numpy as np
-from functools import reduce
-from gn3.settings import TMPDIR
 import plotly.graph_objects as go
 import plotly.figure_factory as ff
+from plotly.subplots import make_subplots
+
+from gn3.settings import TMPDIR
 from gn3.random import random_string
 from gn3.computations.slink import slink
-from plotly.subplots import make_subplots
 from gn3.computations.correlations2 import compute_correlation
 from gn3.db.genotypes import (
-    build_genotype_file, load_genotype_samples, parse_genotype_file)
+    build_genotype_file, load_genotype_samples)
 from gn3.db.traits import (
-    retrieve_trait_data,
-    retrieve_trait_info,
-    generate_traits_filename)
+    retrieve_trait_data, retrieve_trait_info)
 from gn3.computations.qtlreaper import (
     run_reaper,
     generate_traits_file,
     chromosome_sorter_key_fn,
     parse_reaper_main_results,
-    organise_reaper_main_results,
-    parse_reaper_permutation_results)
+    organise_reaper_main_results)
 
 def export_trait_data(
         trait_data: dict, strainlist: Sequence[str], dtype: str = "val",
@@ -159,13 +158,13 @@ def build_heatmap(traits_names, conn: Any):
     PARAMETERS:
     TODO: Elaborate on the parameters here...
     """
+    # pylint: disable=[R0914]
     threshold = 0 # webqtlConfig.PUBLICTHRESH
     traits = [
         retrieve_trait_info(threshold, fullname, conn)
         for fullname in traits_names]
     traits_data_list = [retrieve_trait_data(t, conn) for t in traits]
     genotype_filename = build_genotype_file(traits[0]["riset"])
-    # genotype = parse_genotype_file(genotype_filename)
     strains = load_genotype_samples(genotype_filename)
     exported_traits_data_list = [
         export_trait_data(td, strains) for td in traits_data_list]
@@ -336,6 +335,7 @@ def generate_clustered_heatmap(
     Generate a dendrogram, and heatmaps for each chromosome, and put them all
     into one plot.
     """
+    # pylint: disable=[R0913, R0914]
     num_cols = 1 + len(x_axis)
     fig = make_subplots(
         rows=1,
@@ -359,14 +359,18 @@ def generate_clustered_heatmap(
             "height": 800,
             "xaxis": {
                 "mirror": False,
-                "showgrid": True
+                "showgrid": True,
+                "title": x_label
+            },
+            "yaxis": {
+                "title": y_label
             }
         })
 
     x_axes_layouts = {
         "xaxis{}".format(i+1 if i > 0 else ""): {
             "mirror": False,
-            "showticklabels": True if i == 0 else False,
+            "showticklabels": i == 0,
             "ticks": "outside" if i == 0 else ""
         }
         for i in range(num_cols)}
-- 
cgit v1.2.3


From 71cc35e5178904b512b9007e33be17a36f6656f2 Mon Sep 17 00:00:00 2001
From: Frederick Muriuki Muriithi
Date: Wed, 22 Sep 2021 08:36:11 +0300
Subject: Fix typing issues

* Ignore some errors
* Update typing definitions for some portions of code
* Add missing imports
---
 gn3/app.py                    |  2 +-
 gn3/computations/qtlreaper.py |  6 ++++--
 gn3/db/genotypes.py           | 10 ++++++----
 gn3/db/traits.py              |  8 ++++----
 gn3/heatmaps.py               |  8 +++-----
 5 files changed, 18 insertions(+), 16 deletions(-)

(limited to 'gn3/db')

diff --git a/gn3/app.py b/gn3/app.py
index 8badb65..5e852e1 100644
--- a/gn3/app.py
+++ b/gn3/app.py
@@ -5,7 +5,7 @@ from typing import Dict
 from typing import Union
 
 from flask import Flask
-from flask_cors import CORS
+from flask_cors import CORS # type: ignore
 
 from gn3.api.gemma import gemma
 from gn3.api.rqtl import rqtl
diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py
index 5d17fed..5ddea76 100644
--- a/gn3/computations/qtlreaper.py
+++ b/gn3/computations/qtlreaper.py
@@ -4,6 +4,8 @@ computation of QTLs.
 """
 import os
 import subprocess
+from typing import Union
+
 from gn3.random import random_string
 from gn3.settings import TMPDIR, REAPER_COMMAND
 
@@ -70,9 +72,9 @@ def run_reaper(
         output_dir, random_string(10))
     output_list = ["--main_output", output_filename]
     if separate_nperm_output:
-        permu_output_filename = "{}/qtlreaper/permu_output_{}.txt".format(
+        permu_output_filename: Union[None, str] = "{}/qtlreaper/permu_output_{}.txt".format(
             output_dir, random_string(10))
-        output_list = output_list + ["--permu_output", permu_output_filename]
+        output_list = output_list + ["--permu_output", permu_output_filename] # type: ignore[list-item]
     else:
         permu_output_filename = None
 
diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py
index 919c539..9ea9f20 100644
--- a/gn3/db/genotypes.py
+++ b/gn3/db/genotypes.py
@@ -2,6 +2,8 @@
 
 import os
 import gzip
+from typing import Union, TextIO
+
 from gn3.settings import GENOTYPE_FILES
 
 def build_genotype_file(
@@ -44,17 +46,17 @@ def __load_genotype_samples_from_geno(genotype_filename: str):
     """
     gzipped_filename = "{}.gz".format(genotype_filename)
     if os.path.isfile(gzipped_filename):
-        genofile = gzip.open(gzipped_filename)
+        genofile: Union[TextIO, gzip.GzipFile] = gzip.open(gzipped_filename)
     else:
         genofile = open(genotype_filename)
 
     for row in genofile:
         line = row.strip()
-        if (not line) or (line.startswith(("#", "@"))):
+        if (not line) or (line.startswith(("#", "@"))): # type: ignore[arg-type]
             continue
         break
 
-    headers = line.split("\t")
+    headers = line.split("\t" ) # type: ignore[arg-type]
     if headers[3] == "Mb":
         return headers[4:]
     return headers[3:]
@@ -107,7 +109,7 @@ def parse_genotype_header(line: str, parlist: tuple = tuple()):
         ("prgy", prgy),
         ("nprgy", len(prgy)))
 
-def parse_genotype_marker(line: str, geno_obj: dict, parlist: list):
+def parse_genotype_marker(line: str, geno_obj: dict, parlist: tuple):
     """
     Parse a data line in a genotype file
 
diff --git a/gn3/db/traits.py b/gn3/db/traits.py
index 747ed27..4fc47c3 100644
--- a/gn3/db/traits.py
+++ b/gn3/db/traits.py
@@ -63,22 +63,22 @@ def update_sample_data(conn: Any,
     with conn.cursor() as cursor:
         # Update the Strains table
         cursor.execute(STRAIN_ID_SQL, (strain_name, strain_id))
-        updated_strains: int = cursor.rowcount
+        updated_strains = cursor.rowcount
         # Update the PublishData table
         cursor.execute(PUBLISH_DATA_SQL,
                        (None if value == "x" else value,
                         strain_id, publish_data_id))
-        updated_published_data: int = cursor.rowcount
+        updated_published_data = cursor.rowcount
         # Update the PublishSE table
         cursor.execute(PUBLISH_SE_SQL,
                        (None if error == "x" else error,
                         strain_id, publish_data_id))
-        updated_se_data: int = cursor.rowcount
+        updated_se_data = cursor.rowcount
         # Update the NStrain table
         cursor.execute(N_STRAIN_SQL,
                        (None if count == "x" else count,
                         strain_id, publish_data_id))
-        updated_n_strains: int = cursor.rowcount
+        updated_n_strains = cursor.rowcount
     return (updated_strains, updated_published_data,
             updated_se_data, updated_n_strains)
 
diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py
index 9d82fb2..45d0c22 100644
--- a/gn3/heatmaps.py
+++ b/gn3/heatmaps.py
@@ -7,9 +7,9 @@ from functools import reduce
 from typing import Any, Dict, Sequence
 
 import numpy as np
-import plotly.graph_objects as go
-import plotly.figure_factory as ff
-from plotly.subplots import make_subplots
+import plotly.graph_objects as go # type: ignore
+import plotly.figure_factory as ff # type: ignore
+from plotly.subplots import make_subplots # type: ignore
 
 from gn3.settings import TMPDIR
 from gn3.random import random_string
@@ -171,8 +171,6 @@ def build_heatmap(traits_names, conn: Any):
     clustered = cluster_traits(exported_traits_data_list)
     slinked = slink(clustered)
     traits_order = compute_traits_order(slinked)
-    ordered_traits_names = [
-        traits[idx]["trait_fullname"] for idx in traits_order]
     strains_and_values = retrieve_strains_and_values(
         traits_order, strains, exported_traits_data_list)
     traits_filename = "{}/traits_test_file_{}.txt".format(
-- 
cgit v1.2.3


From 56c73324c285d896567268370f3955bbd15754b0 Mon Sep 17 00:00:00 2001
From: Frederick Muriuki Muriithi
Date: Wed, 22 Sep 2021 09:02:46 +0300
Subject: Fix more pylint errors

---
 gn3/computations/qtlreaper.py | 3 ++-
 gn3/db/genotypes.py           | 2 +-
 tests/unit/db/test_traits.py  | 1 +
 3 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'gn3/db')

diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py
index 5ddea76..8b2893e 100644
--- a/gn3/computations/qtlreaper.py
+++ b/gn3/computations/qtlreaper.py
@@ -74,7 +74,8 @@ def run_reaper(
     if separate_nperm_output:
         permu_output_filename: Union[None, str] = "{}/qtlreaper/permu_output_{}.txt".format(
             output_dir, random_string(10))
-        output_list = output_list + ["--permu_output", permu_output_filename] # type: ignore[list-item]
+        output_list = output_list + [
+            "--permu_output", permu_output_filename] # type: ignore[list-item]
     else:
         permu_output_filename = None
 
diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py
index 9ea9f20..9987320 100644
--- a/gn3/db/genotypes.py
+++ b/gn3/db/genotypes.py
@@ -56,7 +56,7 @@ def __load_genotype_samples_from_geno(genotype_filename: str):
             continue
         break
 
-    headers = line.split("\t" ) # type: ignore[arg-type]
+    headers = line.split("\t") # type: ignore[arg-type]
     if headers[3] == "Mb":
         return headers[4:]
     return headers[3:]
diff --git a/tests/unit/db/test_traits.py b/tests/unit/db/test_traits.py
index ee98893..baa2af3 100644
--- a/tests/unit/db/test_traits.py
+++ b/tests/unit/db/test_traits.py
@@ -166,6 +166,7 @@ class TestTraitsDBFunctions(TestCase):
         the right calls.
 
         """
+        # pylint: disable=C0103
         db_mock = mock.MagicMock()
 
         STRAIN_ID_SQL: str = "UPDATE Strain SET Name = %s WHERE Id = %s"
-- 
cgit v1.2.3


From 19783a18c2bc7941fc5980e593f19fb1d18c3623 Mon Sep 17 00:00:00 2001
From: Frederick Muriuki Muriithi
Date: Mon, 27 Sep 2021 04:48:53 +0300
Subject: Update terminology: `strain` to `sample`

Issue:
https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi

* Update the terminology used: use `sample` in place of `strain` according to
  Zachary's direction at
  https://github.com/genenetwork/genenetwork3/pull/37#issuecomment-926043306
---
 gn3/computations/parsers.py             | 10 ++---
 gn3/computations/qtlreaper.py           |  8 ++--
 gn3/db/genotypes.py                     |  8 ++--
 gn3/db/traits.py                        | 44 ++++++++++-----------
 gn3/heatmaps.py                         | 62 ++++++++++++++---------------
 tests/unit/computations/test_parsers.py |  4 +-
 tests/unit/test_heatmaps.py             | 70 ++++++++++++++++-----------------
 7 files changed, 103 insertions(+), 103 deletions(-)

(limited to 'gn3/db')

diff --git a/gn3/computations/parsers.py b/gn3/computations/parsers.py
index 94387ff..1af35d6 100644
--- a/gn3/computations/parsers.py
+++ b/gn3/computations/parsers.py
@@ -14,7 +14,7 @@ def parse_genofile(file_path: str) -> Tuple[List[str],
         'h': 0,
         'u': None,
     }
-    genotypes, strains = [], []
+    genotypes, samples = [], []
     with open(file_path, "r") as _genofile:
         for line in _genofile:
             line = line.strip()
@@ -22,8 +22,8 @@ def parse_genofile(file_path: str) -> Tuple[List[str],
                 continue
             cells = line.split()
             if line.startswith("Chr"):
-                strains = cells[4:]
-                strains = [strain.lower() for strain in strains]
+                samples = cells[4:]
+                samples = [sample.lower() for sample in samples]
                 continue
             values = [__map.get(value.lower(), None) for value in cells[4:]]
             genotype = {
@@ -32,7 +32,7 @@ def parse_genofile(file_path: str) -> Tuple[List[str],
                 "cm": cells[2],
                 "mb": cells[3],
                 "values":  values,
-                "dicvalues": dict(zip(strains, values)),
+                "dicvalues": dict(zip(samples, values)),
             }
             genotypes.append(genotype)
-        return strains, genotypes
+        return samples, genotypes
diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py
index 8b2893e..166d2dd 100644
--- a/gn3/computations/qtlreaper.py
+++ b/gn3/computations/qtlreaper.py
@@ -9,17 +9,17 @@ from typing import Union
 from gn3.random import random_string
 from gn3.settings import TMPDIR, REAPER_COMMAND
 
-def generate_traits_file(strains, trait_values, traits_filename):
+def generate_traits_file(samples, trait_values, traits_filename):
     """
     Generate a traits file for use with `qtlreaper`.
 
     PARAMETERS:
-    strains: A list of strains to use as the headers for the various columns.
-    trait_values: A list of lists of values for each trait and strain.
+    samples: A list of samples to use as the headers for the various columns.
+    trait_values: A list of lists of values for each trait and sample.
     traits_filename: The tab-separated value to put the values in for
         computation of QTLs.
     """
-    header = "Trait\t{}\n".format("\t".join(strains))
+    header = "Trait\t{}\n".format("\t".join(samples))
     data = (
         [header] +
         ["{}\t{}\n".format(i+1, "\t".join([str(i) for i in t]))
diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py
index 9987320..8f18cac 100644
--- a/gn3/db/genotypes.py
+++ b/gn3/db/genotypes.py
@@ -14,16 +14,16 @@ def build_genotype_file(
 
 def load_genotype_samples(genotype_filename: str, file_type: str = "geno"):
     """
-    Load sample of strains from genotype files.
+    Load the list of samples from genotype files.
 
     DESCRIPTION:
-    Traits can contain a varied number of strains, some of which do not exist in
+    Traits can contain a varied number of samples, some of which do not exist in
     certain genotypes. In order to compute QTLs, GEMMAs, etc, we need to ensure
-    to pick only those strains that exist in the genotype under consideration
+    to pick only those samples that exist in the genotype under consideration
     for the traits used in the computation.
 
     This function loads a list of samples from the genotype files for use in
-    filtering out unusable strains.
+    filtering out unusable samples.
 
 
     PARAMETERS:
diff --git a/gn3/db/traits.py b/gn3/db/traits.py
index 4fc47c3..c9d05d7 100644
--- a/gn3/db/traits.py
+++ b/gn3/db/traits.py
@@ -445,7 +445,7 @@ def retrieve_temp_trait_data(trait_info: dict, conn: Any):
             query,
             {"trait_name": trait_info["trait_name"]})
         return [dict(zip(
-            ["strain_name", "value", "se_error", "nstrain", "id"], row))
+            ["sample_name", "value", "se_error", "nstrain", "id"], row))
                 for row in cursor.fetchall()]
     return []
 
@@ -484,7 +484,7 @@ def retrieve_geno_trait_data(trait_info: Dict, conn: Any):
              "species_id": retrieve_species_id(
                  trait_info["db"]["riset"], conn)})
         return [dict(zip(
-            ["strain_name", "value", "se_error", "id"], row))
+            ["sample_name", "value", "se_error", "id"], row))
                 for row in cursor.fetchall()]
     return []
 
@@ -515,7 +515,7 @@ def retrieve_publish_trait_data(trait_info: Dict, conn: Any):
             {"trait_name": trait_info["trait_name"],
              "dataset_id": trait_info["db"]["dataset_id"]})
         return [dict(zip(
-            ["strain_name", "value", "se_error", "nstrain", "id"], row))
+            ["sample_name", "value", "se_error", "nstrain", "id"], row))
                 for row in cursor.fetchall()]
     return []
 
@@ -548,7 +548,7 @@ def retrieve_cellid_trait_data(trait_info: Dict, conn: Any):
              "trait_name": trait_info["trait_name"],
              "dataset_id": trait_info["db"]["dataset_id"]})
         return [dict(zip(
-            ["strain_name", "value", "se_error", "id"], row))
+            ["sample_name", "value", "se_error", "id"], row))
                 for row in cursor.fetchall()]
     return []
 
@@ -577,29 +577,29 @@ def retrieve_probeset_trait_data(trait_info: Dict, conn: Any):
             {"trait_name": trait_info["trait_name"],
              "dataset_name": trait_info["db"]["dataset_name"]})
         return [dict(zip(
-            ["strain_name", "value", "se_error", "id"], row))
+            ["sample_name", "value", "se_error", "id"], row))
                 for row in cursor.fetchall()]
     return []
 
-def with_strainlist_data_setup(strainlist: Sequence[str]):
+def with_samplelist_data_setup(samplelist: Sequence[str]):
     """
-    Build function that computes the trait data from provided list of strains.
+    Build function that computes the trait data from provided list of samples.
 
     PARAMETERS
-    strainlist: (list)
-      A list of strain names
+    samplelist: (list)
+      A list of sample names
 
     RETURNS:
       Returns a function that given some data from the database, computes the
-      strain's value, variance and ndata values, only if the strain is present
-      in the provided `strainlist` variable.
+      sample's value, variance and ndata values, only if the sample is present
+      in the provided `samplelist` variable.
     """
     def setup_fn(tdata):
-        if tdata["strain_name"] in strainlist:
+        if tdata["sample_name"] in samplelist:
             val = tdata["value"]
             if val is not None:
                 return {
-                    "strain_name": tdata["strain_name"],
+                    "sample_name": tdata["sample_name"],
                     "value": val,
                     "variance": tdata["se_error"],
                     "ndata": tdata.get("nstrain", None)
@@ -607,19 +607,19 @@ def with_strainlist_data_setup(strainlist: Sequence[str]):
         return None
     return setup_fn
 
-def without_strainlist_data_setup():
+def without_samplelist_data_setup():
     """
     Build function that computes the trait data.
 
     RETURNS:
       Returns a function that given some data from the database, computes the
-      strain's value, variance and ndata values.
+      sample's value, variance and ndata values.
     """
     def setup_fn(tdata):
         val = tdata["value"]
         if val is not None:
             return {
-                "strain_name": tdata["strain_name"],
+                "sample_name": tdata["sample_name"],
                 "value": val,
                 "variance": tdata["se_error"],
                 "ndata": tdata.get("nstrain", None)
@@ -627,7 +627,7 @@ def without_strainlist_data_setup():
         return None
     return setup_fn
 
-def retrieve_trait_data(trait: dict, conn: Any, strainlist: Sequence[str] = tuple()):
+def retrieve_trait_data(trait: dict, conn: Any, samplelist: Sequence[str] = tuple()):
     """
     Retrieve trait data
 
@@ -650,23 +650,23 @@ def retrieve_trait_data(trait: dict, conn: Any, strainlist: Sequence[str] = tupl
     if results:
         # do something with mysqlid
         mysqlid = results[0]["id"]
-        if strainlist:
+        if samplelist:
             data = [
                 item for item in
-                map(with_strainlist_data_setup(strainlist), results)
+                map(with_samplelist_data_setup(samplelist), results)
                 if item is not None]
         else:
             data = [
                 item for item in
-                map(without_strainlist_data_setup(), results)
+                map(without_samplelist_data_setup(), results)
                 if item is not None]
 
         return {
             "mysqlid": mysqlid,
             "data": dict(map(
                 lambda x: (
-                    x["strain_name"],
-                    {k:v for k, v in x.items() if x != "strain_name"}),
+                    x["sample_name"],
+                    {k:v for k, v in x.items() if x != "sample_name"}),
                 data))}
     return {}
 
diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py
index 45d0c22..b6fc6d3 100644
--- a/gn3/heatmaps.py
+++ b/gn3/heatmaps.py
@@ -27,10 +27,10 @@ from gn3.computations.qtlreaper import (
     organise_reaper_main_results)
 
 def export_trait_data(
-        trait_data: dict, strainlist: Sequence[str], dtype: str = "val",
+        trait_data: dict, samplelist: Sequence[str], dtype: str = "val",
         var_exists: bool = False, n_exists: bool = False):
     """
-    Export data according to `strainlist`. Mostly used in calculating
+    Export data according to `samplelist`. Mostly used in calculating
     correlations.
 
     DESCRIPTION:
@@ -40,8 +40,8 @@ def export_trait_data(
     PARAMETERS
     trait: (dict)
       The dictionary of key-value pairs representing a trait
-    strainlist: (list)
-      A list of strain names
+    samplelist: (list)
+      A list of sample names
     dtype: (str)
       ... verify what this is ...
     var_exists: (bool)
@@ -49,18 +49,18 @@ def export_trait_data(
     n_exists: (bool)
       A flag indicating existence of ndata
     """
-    def __export_all_types(tdata, strain):
+    def __export_all_types(tdata, sample):
         sample_data = []
-        if tdata[strain]["value"]:
-            sample_data.append(tdata[strain]["value"])
+        if tdata[sample]["value"]:
+            sample_data.append(tdata[sample]["value"])
             if var_exists:
-                if tdata[strain]["variance"]:
-                    sample_data.append(tdata[strain]["variance"])
+                if tdata[sample]["variance"]:
+                    sample_data.append(tdata[sample]["variance"])
                 else:
                     sample_data.append(None)
             if n_exists:
-                if tdata[strain]["ndata"]:
-                    sample_data.append(tdata[strain]["ndata"])
+                if tdata[sample]["ndata"]:
+                    sample_data.append(tdata[sample]["ndata"])
                 else:
                     sample_data.append(None)
         else:
@@ -73,17 +73,17 @@ def export_trait_data(
 
         return tuple(sample_data)
 
-    def __exporter(accumulator, strain):
+    def __exporter(accumulator, sample):
         # pylint: disable=[R0911]
-        if strain in trait_data["data"]:
+        if sample in trait_data["data"]:
             if dtype == "val":
-                return accumulator + (trait_data["data"][strain]["value"], )
+                return accumulator + (trait_data["data"][sample]["value"], )
             if dtype == "var":
-                return accumulator + (trait_data["data"][strain]["variance"], )
+                return accumulator + (trait_data["data"][sample]["variance"], )
             if dtype == "N":
-                return accumulator + (trait_data["data"][strain]["ndata"], )
+                return accumulator + (trait_data["data"][sample]["ndata"], )
             if dtype == "all":
-                return accumulator + __export_all_types(trait_data["data"], strain)
+                return accumulator + __export_all_types(trait_data["data"], sample)
             raise KeyError("Type `%s` is incorrect" % dtype)
         if var_exists and n_exists:
             return accumulator + (None, None, None)
@@ -91,7 +91,7 @@ def export_trait_data(
             return accumulator + (None, None)
         return accumulator + (None,)
 
-    return reduce(__exporter, strainlist, tuple())
+    return reduce(__exporter, samplelist, tuple())
 
 def trait_display_name(trait: Dict):
     """
@@ -165,19 +165,19 @@ def build_heatmap(traits_names, conn: Any):
         for fullname in traits_names]
     traits_data_list = [retrieve_trait_data(t, conn) for t in traits]
     genotype_filename = build_genotype_file(traits[0]["riset"])
-    strains = load_genotype_samples(genotype_filename)
+    samples = load_genotype_samples(genotype_filename)
     exported_traits_data_list = [
-        export_trait_data(td, strains) for td in traits_data_list]
+        export_trait_data(td, samples) for td in traits_data_list]
     clustered = cluster_traits(exported_traits_data_list)
     slinked = slink(clustered)
     traits_order = compute_traits_order(slinked)
-    strains_and_values = retrieve_strains_and_values(
-        traits_order, strains, exported_traits_data_list)
+    samples_and_values = retrieve_samples_and_values(
+        traits_order, samples, exported_traits_data_list)
     traits_filename = "{}/traits_test_file_{}.txt".format(
         TMPDIR, random_string(10))
     generate_traits_file(
-        strains_and_values[0][1],
-        [t[2] for t in strains_and_values],
+        samples_and_values[0][1],
+        [t[2] for t in samples_and_values],
         traits_filename)
 
     main_output, _permutations_output = run_reaper(
@@ -229,9 +229,9 @@ def compute_traits_order(slink_data, neworder: tuple = tuple()):
 
     return __order_maker(neworder, slink_data)
 
-def retrieve_strains_and_values(orders, strainlist, traits_data_list):
+def retrieve_samples_and_values(orders, samplelist, traits_data_list):
     """
-    Get the strains and their corresponding values from `strainlist` and
+    Get the samples and their corresponding values from `samplelist` and
     `traits_data_list`.
 
     This migrates the code in
@@ -240,17 +240,17 @@ def retrieve_strains_and_values(orders, strainlist, traits_data_list):
     # This feels nasty! There's a lot of mutation of values here, that might
     # indicate something untoward in the design of this function and its
     # dependents  ==>  Review
-    strains = []
+    samples = []
     values = []
     rets = []
     for order in orders:
         temp_val = traits_data_list[order]
-        for i, strain in enumerate(strainlist):
+        for i, sample in enumerate(samplelist):
             if temp_val[i] is not None:
-                strains.append(strain)
+                samples.append(sample)
                 values.append(temp_val[i])
-        rets.append([order, strains[:], values[:]])
-        strains = []
+        rets.append([order, samples[:], values[:]])
+        samples = []
         values = []
 
     return rets
diff --git a/tests/unit/computations/test_parsers.py b/tests/unit/computations/test_parsers.py
index 19c3067..b51b0bf 100644
--- a/tests/unit/computations/test_parsers.py
+++ b/tests/unit/computations/test_parsers.py
@@ -15,7 +15,7 @@ class TestParsers(unittest.TestCase):
 
     def test_parse_genofile_with_existing_file(self):
         """Test that a genotype file is parsed correctly"""
-        strains = ["bxd1", "bxd2"]
+        samples = ["bxd1", "bxd2"]
         genotypes = [
             {"chr": "1", "locus": "rs31443144",
              "cm": "1.50", "mb": "3.010274",
@@ -51,4 +51,4 @@ class TestParsers(unittest.TestCase):
             "../test_data/genotype.txt"
         ))
         self.assertEqual(parse_genofile(
-            test_genotype_file), (strains, genotypes))
+            test_genotype_file), (samples, genotypes))
diff --git a/tests/unit/test_heatmaps.py b/tests/unit/test_heatmaps.py
index fd91cf9..b54e2f3 100644
--- a/tests/unit/test_heatmaps.py
+++ b/tests/unit/test_heatmaps.py
@@ -5,41 +5,41 @@ from gn3.heatmaps import (
     get_lrs_from_chr,
     export_trait_data,
     compute_traits_order,
-    retrieve_strains_and_values,
+    retrieve_samples_and_values,
     process_traits_data_for_heatmap)
 from tests.unit.sample_test_data import organised_trait_1, organised_trait_2
 
-strainlist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"]
+samplelist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"]
 trait_data = {
     "mysqlid": 36688172,
     "data": {
-        "B6cC3-1": {"strain_name": "B6cC3-1", "value": 7.51879, "variance": None, "ndata": None},
-        "BXD1": {"strain_name": "BXD1", "value": 7.77141, "variance": None, "ndata": None},
-        "BXD12": {"strain_name": "BXD12", "value": 8.39265, "variance": None, "ndata": None},
-        "BXD16": {"strain_name": "BXD16", "value": 8.17443, "variance": None, "ndata": None},
-        "BXD19": {"strain_name": "BXD19", "value": 8.30401, "variance": None, "ndata": None},
-        "BXD2": {"strain_name": "BXD2", "value": 7.80944, "variance": None, "ndata": None},
-        "BXD21": {"strain_name": "BXD21", "value": 8.93809, "variance": None, "ndata": None},
-        "BXD24": {"strain_name": "BXD24", "value": 7.99415, "variance": None, "ndata": None},
-        "BXD27": {"strain_name": "BXD27", "value": 8.12177, "variance": None, "ndata": None},
-        "BXD28": {"strain_name": "BXD28", "value": 7.67688, "variance": None, "ndata": None},
-        "BXD32": {"strain_name": "BXD32", "value": 7.79062, "variance": None, "ndata": None},
-        "BXD39": {"strain_name": "BXD39", "value": 8.27641, "variance": None, "ndata": None},
-        "BXD40": {"strain_name": "BXD40", "value": 8.18012, "variance": None, "ndata": None},
-        "BXD42": {"strain_name": "BXD42", "value": 7.82433, "variance": None, "ndata": None},
-        "BXD6": {"strain_name": "BXD6", "value": 8.09718, "variance": None, "ndata": None},
-        "BXH14": {"strain_name": "BXH14", "value": 7.97475, "variance": None, "ndata": None},
-        "BXH19": {"strain_name": "BXH19", "value": 7.67223, "variance": None, "ndata": None},
-        "BXH2": {"strain_name": "BXH2", "value": 7.93622, "variance": None, "ndata": None},
-        "BXH22": {"strain_name": "BXH22", "value": 7.43692, "variance": None, "ndata": None},
-        "BXH4": {"strain_name": "BXH4", "value": 7.96336, "variance": None, "ndata": None},
-        "BXH6": {"strain_name": "BXH6", "value": 7.75132, "variance": None, "ndata": None},
-        "BXH7": {"strain_name": "BXH7", "value": 8.12927, "variance": None, "ndata": None},
-        "BXH8": {"strain_name": "BXH8", "value": 6.77338, "variance": None, "ndata": None},
-        "BXH9": {"strain_name": "BXH9", "value": 8.03836, "variance": None, "ndata": None},
-        "C3H/HeJ": {"strain_name": "C3H/HeJ", "value": 7.42795, "variance": None, "ndata": None},
-        "C57BL/6J": {"strain_name": "C57BL/6J", "value": 7.50606, "variance": None, "ndata": None},
-        "DBA/2J": {"strain_name": "DBA/2J", "value": 7.72588, "variance": None, "ndata": None}}}
+        "B6cC3-1": {"sample_name": "B6cC3-1", "value": 7.51879, "variance": None, "ndata": None},
+        "BXD1": {"sample_name": "BXD1", "value": 7.77141, "variance": None, "ndata": None},
+        "BXD12": {"sample_name": "BXD12", "value": 8.39265, "variance": None, "ndata": None},
+        "BXD16": {"sample_name": "BXD16", "value": 8.17443, "variance": None, "ndata": None},
+        "BXD19": {"sample_name": "BXD19", "value": 8.30401, "variance": None, "ndata": None},
+        "BXD2": {"sample_name": "BXD2", "value": 7.80944, "variance": None, "ndata": None},
+        "BXD21": {"sample_name": "BXD21", "value": 8.93809, "variance": None, "ndata": None},
+        "BXD24": {"sample_name": "BXD24", "value": 7.99415, "variance": None, "ndata": None},
+        "BXD27": {"sample_name": "BXD27", "value": 8.12177, "variance": None, "ndata": None},
+        "BXD28": {"sample_name": "BXD28", "value": 7.67688, "variance": None, "ndata": None},
+        "BXD32": {"sample_name": "BXD32", "value": 7.79062, "variance": None, "ndata": None},
+        "BXD39": {"sample_name": "BXD39", "value": 8.27641, "variance": None, "ndata": None},
+        "BXD40": {"sample_name": "BXD40", "value": 8.18012, "variance": None, "ndata": None},
+        "BXD42": {"sample_name": "BXD42", "value": 7.82433, "variance": None, "ndata": None},
+        "BXD6": {"sample_name": "BXD6", "value": 8.09718, "variance": None, "ndata": None},
+        "BXH14": {"sample_name": "BXH14", "value": 7.97475, "variance": None, "ndata": None},
+        "BXH19": {"sample_name": "BXH19", "value": 7.67223, "variance": None, "ndata": None},
+        "BXH2": {"sample_name": "BXH2", "value": 7.93622, "variance": None, "ndata": None},
+        "BXH22": {"sample_name": "BXH22", "value": 7.43692, "variance": None, "ndata": None},
+        "BXH4": {"sample_name": "BXH4", "value": 7.96336, "variance": None, "ndata": None},
+        "BXH6": {"sample_name": "BXH6", "value": 7.75132, "variance": None, "ndata": None},
+        "BXH7": {"sample_name": "BXH7", "value": 8.12927, "variance": None, "ndata": None},
+        "BXH8": {"sample_name": "BXH8", "value": 6.77338, "variance": None, "ndata": None},
+        "BXH9": {"sample_name": "BXH9", "value": 8.03836, "variance": None, "ndata": None},
+        "C3H/HeJ": {"sample_name": "C3H/HeJ", "value": 7.42795, "variance": None, "ndata": None},
+        "C57BL/6J": {"sample_name": "C57BL/6J", "value": 7.50606, "variance": None, "ndata": None},
+        "DBA/2J": {"sample_name": "DBA/2J", "value": 7.72588, "variance": None, "ndata": None}}}
 
 slinked = (
     (((0, 2, 0.16381088984330505),
@@ -66,7 +66,7 @@ class TestHeatmap(TestCase):
                 ["all", (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)]]:
             with self.subTest(dtype=dtype):
                 self.assertEqual(
-                    export_trait_data(trait_data, strainlist, dtype=dtype),
+                    export_trait_data(trait_data, samplelist, dtype=dtype),
                     expected)
 
     def test_export_trait_data_dtype_all_flags(self):
@@ -106,7 +106,7 @@ class TestHeatmap(TestCase):
             with self.subTest(dtype=dtype, vflag=vflag, nflag=nflag):
                 self.assertEqual(
                     export_trait_data(
-                        trait_data, strainlist, dtype=dtype, var_exists=vflag,
+                        trait_data, samplelist, dtype=dtype, var_exists=vflag,
                         n_exists=nflag),
                     expected)
 
@@ -164,8 +164,8 @@ class TestHeatmap(TestCase):
         self.assertEqual(
             compute_traits_order(slinked), (0, 2, 1, 7, 5, 9, 3, 6, 8, 4))
 
-    def test_retrieve_strains_and_values(self):
-        """Test retrieval of strains and values."""
+    def test_retrieve_samples_and_values(self):
+        """Test retrieval of samples and values."""
         for orders, slist, tdata, expected in [
                 [
                     [2],
@@ -185,9 +185,9 @@ class TestHeatmap(TestCase):
                      [6, None, None, 4, None]],
                     [[3, ["s1", "s4"], [6, 4]]]
                 ]]:
-            with self.subTest(strainlist=slist, traitdata=tdata):
+            with self.subTest(samplelist=slist, traitdata=tdata):
                 self.assertEqual(
-                    retrieve_strains_and_values(orders, slist, tdata), expected)
+                    retrieve_samples_and_values(orders, slist, tdata), expected)
 
     def test_get_lrs_from_chr(self):
         """Check that function gets correct LRS values"""
-- 
cgit v1.2.3


From 1d09a9222f8c661da3abd6d61c09ae19eeb5d793 Mon Sep 17 00:00:00 2001
From: Frederick Muriuki Muriithi
Date: Mon, 27 Sep 2021 05:02:09 +0300
Subject: Update terminology: `riset` to `group`

Issue:
https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi

* Replace the term `riset` with `group`, the appropriate domain term,
  following Zachary's direction at
  https://github.com/genenetwork/genenetwork3/pull/37#issuecomment-926041744
---
 gn3/db/datasets.py             | 52 +++++++++++++++++++++---------------------
 gn3/db/traits.py               | 16 ++++++-------
 gn3/heatmaps.py                |  2 +-
 tests/unit/db/test_datasets.py | 42 +++++++++++++++++-----------------
 4 files changed, 56 insertions(+), 56 deletions(-)

(limited to 'gn3/db')

diff --git a/gn3/db/datasets.py b/gn3/db/datasets.py
index 4a05499..6c328f5 100644
--- a/gn3/db/datasets.py
+++ b/gn3/db/datasets.py
@@ -119,9 +119,9 @@ def retrieve_dataset_name(
     return fn_map[trait_type](threshold, dataset_name, conn)
 
 
-def retrieve_geno_riset_fields(name, conn):
+def retrieve_geno_group_fields(name, conn):
     """
-    Retrieve the RISet, and RISetID values for various Geno trait types.
+    Retrieve the Group, and GroupID values for various Geno trait types.
     """
     query = (
         "SELECT InbredSet.Name, InbredSet.Id "
@@ -130,12 +130,12 @@ def retrieve_geno_riset_fields(name, conn):
         "AND GenoFreeze.Name = %(name)s")
     with conn.cursor() as cursor:
         cursor.execute(query, {"name": name})
-        return dict(zip(["riset", "risetid"], cursor.fetchone()))
+        return dict(zip(["group", "groupid"], cursor.fetchone()))
     return {}
 
-def retrieve_publish_riset_fields(name, conn):
+def retrieve_publish_group_fields(name, conn):
     """
-    Retrieve the RISet, and RISetID values for various Publish trait types.
+    Retrieve the Group, and GroupID values for various Publish trait types.
     """
     query = (
         "SELECT InbredSet.Name, InbredSet.Id "
@@ -144,12 +144,12 @@ def retrieve_publish_riset_fields(name, conn):
         "AND PublishFreeze.Name = %(name)s")
     with conn.cursor() as cursor:
         cursor.execute(query, {"name": name})
-        return dict(zip(["riset", "risetid"], cursor.fetchone()))
+        return dict(zip(["group", "groupid"], cursor.fetchone()))
     return {}
 
-def retrieve_probeset_riset_fields(name, conn):
+def retrieve_probeset_group_fields(name, conn):
     """
-    Retrieve the RISet, and RISetID values for various ProbeSet trait types.
+    Retrieve the Group, and GroupID values for various ProbeSet trait types.
     """
     query = (
         "SELECT InbredSet.Name, InbredSet.Id "
@@ -159,12 +159,12 @@ def retrieve_probeset_riset_fields(name, conn):
         "AND ProbeSetFreeze.Name = %(name)s")
     with conn.cursor() as cursor:
         cursor.execute(query, {"name": name})
-        return dict(zip(["riset", "risetid"], cursor.fetchone()))
+        return dict(zip(["group", "groupid"], cursor.fetchone()))
     return {}
 
-def retrieve_temp_riset_fields(name, conn):
+def retrieve_temp_group_fields(name, conn):
     """
-    Retrieve the RISet, and RISetID values for `Temp` trait types.
+    Retrieve the Group, and GroupID values for `Temp` trait types.
     """
     query = (
         "SELECT InbredSet.Name, InbredSet.Id "
@@ -173,30 +173,30 @@ def retrieve_temp_riset_fields(name, conn):
         "AND Temp.Name = %(name)s")
     with conn.cursor() as cursor:
         cursor.execute(query, {"name": name})
-        return dict(zip(["riset", "risetid"], cursor.fetchone()))
+        return dict(zip(["group", "groupid"], cursor.fetchone()))
     return {}
 
-def retrieve_riset_fields(trait_type, trait_name, dataset_info, conn):
+def retrieve_group_fields(trait_type, trait_name, dataset_info, conn):
     """
-    Retrieve the RISet, and RISetID values for various trait types.
+    Retrieve the Group, and GroupID values for various trait types.
     """
-    riset_fns_map = {
-        "Geno": retrieve_geno_riset_fields,
-        "Publish": retrieve_publish_riset_fields,
-        "ProbeSet": retrieve_probeset_riset_fields
+    group_fns_map = {
+        "Geno": retrieve_geno_group_fields,
+        "Publish": retrieve_publish_group_fields,
+        "ProbeSet": retrieve_probeset_group_fields
     }
 
     if trait_type == "Temp":
-        riset_info = retrieve_temp_riset_fields(trait_name, conn)
+        group_info = retrieve_temp_group_fields(trait_name, conn)
     else:
-        riset_info = riset_fns_map[trait_type](dataset_info["dataset_name"], conn)
+        group_info = group_fns_map[trait_type](dataset_info["dataset_name"], conn)
 
     return {
         **dataset_info,
-        **riset_info,
-        "riset": (
-            "BXD" if riset_info.get("riset") == "BXD300"
-            else riset_info.get("riset", ""))
+        **group_info,
+        "group": (
+            "BXD" if group_info.get("group") == "BXD300"
+            else group_info.get("group", ""))
     }
 
 def retrieve_temp_trait_dataset():
@@ -281,11 +281,11 @@ def retrieve_trait_dataset(trait_type, trait, threshold, conn):
             trait_type, threshold, trait["trait_name"],
             trait["db"]["dataset_name"], conn)
     }
-    riset = retrieve_riset_fields(
+    group = retrieve_group_fields(
         trait_type, trait["trait_name"], dataset_name_info, conn)
     return {
         "display_name": dataset_name_info["dataset_name"],
         **dataset_name_info,
         **dataset_fns[trait_type](),
-        **riset
+        **group
     }
diff --git a/gn3/db/traits.py b/gn3/db/traits.py
index c9d05d7..f2673c8 100644
--- a/gn3/db/traits.py
+++ b/gn3/db/traits.py
@@ -226,7 +226,7 @@ def set_homologene_id_field_probeset(trait_info, conn):
     """
     query = (
         "SELECT HomologeneId FROM Homologene, Species, InbredSet"
-        " WHERE Homologene.GeneId = %(geneid)s AND InbredSet.Name = %(riset)s"
+        " WHERE Homologene.GeneId = %(geneid)s AND InbredSet.Name = %(group)s"
         " AND InbredSet.SpeciesId = Species.Id AND"
         " Species.TaxonomyId = Homologene.TaxonomyId")
     with conn.cursor() as cursor:
@@ -234,7 +234,7 @@ def set_homologene_id_field_probeset(trait_info, conn):
             query,
             {
                 k:v for k, v in trait_info.items()
-                if k in ["geneid", "riset"]
+                if k in ["geneid", "group"]
             })
         res = cursor.fetchone()
         if res:
@@ -422,7 +422,7 @@ def retrieve_trait_info(
     if trait_info["haveinfo"]:
         return {
             **trait_post_processing_functions_table[trait_dataset_type](
-                {**trait_info, "riset": trait_dataset["riset"]}),
+                {**trait_info, "group": trait_dataset["group"]}),
             "db": {**trait["db"], **trait_dataset}
         }
     return trait_info
@@ -449,14 +449,14 @@ def retrieve_temp_trait_data(trait_info: dict, conn: Any):
                 for row in cursor.fetchall()]
     return []
 
-def retrieve_species_id(riset, conn: Any):
+def retrieve_species_id(group, conn: Any):
     """
-    Retrieve a species id given the RISet value
+    Retrieve a species id given the Group value
     """
     with conn.cursor as cursor:
         cursor.execute(
-            "SELECT SpeciesId from InbredSet WHERE Name = %(riset)s",
-            {"riset": riset})
+            "SELECT SpeciesId from InbredSet WHERE Name = %(group)s",
+            {"group": group})
         return cursor.fetchone()[0]
     return None
 
@@ -482,7 +482,7 @@ def retrieve_geno_trait_data(trait_info: Dict, conn: Any):
             {"trait_name": trait_info["trait_name"],
              "dataset_name": trait_info["db"]["dataset_name"],
              "species_id": retrieve_species_id(
-                 trait_info["db"]["riset"], conn)})
+                 trait_info["db"]["group"], conn)})
         return [dict(zip(
             ["sample_name", "value", "se_error", "id"], row))
                 for row in cursor.fetchall()]
diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py
index b6fc6d3..a36940d 100644
--- a/gn3/heatmaps.py
+++ b/gn3/heatmaps.py
@@ -164,7 +164,7 @@ def build_heatmap(traits_names, conn: Any):
         retrieve_trait_info(threshold, fullname, conn)
         for fullname in traits_names]
     traits_data_list = [retrieve_trait_data(t, conn) for t in traits]
-    genotype_filename = build_genotype_file(traits[0]["riset"])
+    genotype_filename = build_genotype_file(traits[0]["group"])
     samples = load_genotype_samples(genotype_filename)
     exported_traits_data_list = [
         export_trait_data(td, samples) for td in traits_data_list]
diff --git a/tests/unit/db/test_datasets.py b/tests/unit/db/test_datasets.py
index 38de0e2..39f4af9 100644
--- a/tests/unit/db/test_datasets.py
+++ b/tests/unit/db/test_datasets.py
@@ -3,10 +3,10 @@
 from unittest import mock, TestCase
 from gn3.db.datasets import (
     retrieve_dataset_name,
-    retrieve_riset_fields,
-    retrieve_geno_riset_fields,
-    retrieve_publish_riset_fields,
-    retrieve_probeset_riset_fields)
+    retrieve_group_fields,
+    retrieve_geno_group_fields,
+    retrieve_publish_group_fields,
+    retrieve_probeset_group_fields)
 
 class TestDatasetsDBFunctions(TestCase):
     """Test cases for datasets functions."""
@@ -40,9 +40,9 @@ class TestDatasetsDBFunctions(TestCase):
                             table=table, cols=columns),
                         {"threshold": thresh, "name": dataset_name})
 
-    def test_retrieve_probeset_riset_fields(self):
+    def test_retrieve_probeset_group_fields(self):
         """
-        Test that the `riset` and `riset_id` fields are retrieved appropriately
+        Test that the `group` and `group_id` fields are retrieved appropriately
         for the 'ProbeSet' trait type.
         """
         for trait_name, expected in [
@@ -52,7 +52,7 @@ class TestDatasetsDBFunctions(TestCase):
                 with db_mock.cursor() as cursor:
                     cursor.execute.return_value = ()
                     self.assertEqual(
-                        retrieve_probeset_riset_fields(trait_name, db_mock),
+                        retrieve_probeset_group_fields(trait_name, db_mock),
                         expected)
                     cursor.execute.assert_called_once_with(
                         (
@@ -63,34 +63,34 @@ class TestDatasetsDBFunctions(TestCase):
                             " AND ProbeSetFreeze.Name = %(name)s"),
                         {"name": trait_name})
 
-    def test_retrieve_riset_fields(self):
+    def test_retrieve_group_fields(self):
         """
-        Test that the riset fields are set up correctly for the different trait
+        Test that the group fields are set up correctly for the different trait
         types.
         """
         for trait_type, trait_name, dataset_info, expected in [
                 ["Publish", "pubTraitName01", {"dataset_name": "pubDBName01"},
-                 {"dataset_name": "pubDBName01", "riset": ""}],
+                 {"dataset_name": "pubDBName01", "group": ""}],
                 ["ProbeSet", "prbTraitName01", {"dataset_name": "prbDBName01"},
-                 {"dataset_name": "prbDBName01", "riset": ""}],
+                 {"dataset_name": "prbDBName01", "group": ""}],
                 ["Geno", "genoTraitName01", {"dataset_name": "genoDBName01"},
-                 {"dataset_name": "genoDBName01", "riset": ""}],
-                ["Temp", "tempTraitName01", {}, {"riset": ""}],
+                 {"dataset_name": "genoDBName01", "group": ""}],
+                ["Temp", "tempTraitName01", {}, {"group": ""}],
                 ]:
             db_mock = mock.MagicMock()
             with self.subTest(
                     trait_type=trait_type, trait_name=trait_name,
                     dataset_info=dataset_info):
                 with db_mock.cursor() as cursor:
-                    cursor.execute.return_value = ("riset_name", 0)
+                    cursor.execute.return_value = ("group_name", 0)
                     self.assertEqual(
-                        retrieve_riset_fields(
+                        retrieve_group_fields(
                             trait_type, trait_name, dataset_info, db_mock),
                         expected)
 
-    def test_retrieve_publish_riset_fields(self):
+    def test_retrieve_publish_group_fields(self):
         """
-        Test that the `riset` and `riset_id` fields are retrieved appropriately
+        Test that the `group` and `group_id` fields are retrieved appropriately
         for the 'Publish' trait type.
         """
         for trait_name, expected in [
@@ -100,7 +100,7 @@ class TestDatasetsDBFunctions(TestCase):
                 with db_mock.cursor() as cursor:
                     cursor.execute.return_value = ()
                     self.assertEqual(
-                        retrieve_publish_riset_fields(trait_name, db_mock),
+                        retrieve_publish_group_fields(trait_name, db_mock),
                         expected)
                     cursor.execute.assert_called_once_with(
                         (
@@ -110,9 +110,9 @@ class TestDatasetsDBFunctions(TestCase):
                             " AND PublishFreeze.Name = %(name)s"),
                         {"name": trait_name})
 
-    def test_retrieve_geno_riset_fields(self):
+    def test_retrieve_geno_group_fields(self):
         """
-        Test that the `riset` and `riset_id` fields are retrieved appropriately
+        Test that the `group` and `group_id` fields are retrieved appropriately
         for the 'Geno' trait type.
         """
         for trait_name, expected in [
@@ -122,7 +122,7 @@ class TestDatasetsDBFunctions(TestCase):
                 with db_mock.cursor() as cursor:
                     cursor.execute.return_value = ()
                     self.assertEqual(
-                        retrieve_geno_riset_fields(trait_name, db_mock),
+                        retrieve_geno_group_fields(trait_name, db_mock),
                         expected)
                     cursor.execute.assert_called_once_with(
                         (
-- 
cgit v1.2.3