Merge branch 'main' of https://github.com/genenetwork/genenetwork3 into bug/fix_rqtl_covariates

author: zsloan 2021-10-12 20:56:31 +0000
committer: zsloan 2021-10-12 20:56:31 +0000
commit: 6e211182354fb4d6941e3a44ec1ec9d378b0e4ef (patch)
tree: 60d9aaf382eefbb47cdbab9c74d98481cf0983de /gn3/db
parent: b815236123ff8e144bd84f349357a1852df95651 (diff)
parent: 77c274b79c3ec01de60e90db3299763cb58f715b (diff)
download: genenetwork3-6e211182354fb4d6941e3a44ec1ec9d378b0e4ef.tar.gz
3 files changed, 253 insertions, 61 deletions
diff --git a/gn3/db/datasets.py b/gn3/db/datasets.py
index 4a05499..6c328f5 100644
--- a/gn3/db/datasets.py
+++ b/gn3/db/datasets.py
@@ -119,9 +119,9 @@ def retrieve_dataset_name(
     return fn_map[trait_type](threshold, dataset_name, conn)
 
 
-def retrieve_geno_riset_fields(name, conn):
+def retrieve_geno_group_fields(name, conn):
     """
-    Retrieve the RISet, and RISetID values for various Geno trait types.
+    Retrieve the Group, and GroupID values for various Geno trait types.
     """
     query = (
         "SELECT InbredSet.Name, InbredSet.Id "
@@ -130,12 +130,12 @@ def retrieve_geno_riset_fields(name, conn):
         "AND GenoFreeze.Name = %(name)s")
     with conn.cursor() as cursor:
         cursor.execute(query, {"name": name})
-        return dict(zip(["riset", "risetid"], cursor.fetchone()))
+        return dict(zip(["group", "groupid"], cursor.fetchone()))
     return {}
 
-def retrieve_publish_riset_fields(name, conn):
+def retrieve_publish_group_fields(name, conn):
     """
-    Retrieve the RISet, and RISetID values for various Publish trait types.
+    Retrieve the Group, and GroupID values for various Publish trait types.
     """
     query = (
         "SELECT InbredSet.Name, InbredSet.Id "
@@ -144,12 +144,12 @@ def retrieve_publish_riset_fields(name, conn):
         "AND PublishFreeze.Name = %(name)s")
     with conn.cursor() as cursor:
         cursor.execute(query, {"name": name})
-        return dict(zip(["riset", "risetid"], cursor.fetchone()))
+        return dict(zip(["group", "groupid"], cursor.fetchone()))
     return {}
 
-def retrieve_probeset_riset_fields(name, conn):
+def retrieve_probeset_group_fields(name, conn):
     """
-    Retrieve the RISet, and RISetID values for various ProbeSet trait types.
+    Retrieve the Group, and GroupID values for various ProbeSet trait types.
     """
     query = (
         "SELECT InbredSet.Name, InbredSet.Id "
@@ -159,12 +159,12 @@ def retrieve_probeset_riset_fields(name, conn):
         "AND ProbeSetFreeze.Name = %(name)s")
     with conn.cursor() as cursor:
         cursor.execute(query, {"name": name})
-        return dict(zip(["riset", "risetid"], cursor.fetchone()))
+        return dict(zip(["group", "groupid"], cursor.fetchone()))
     return {}
 
-def retrieve_temp_riset_fields(name, conn):
+def retrieve_temp_group_fields(name, conn):
     """
-    Retrieve the RISet, and RISetID values for `Temp` trait types.
+    Retrieve the Group, and GroupID values for `Temp` trait types.
     """
     query = (
         "SELECT InbredSet.Name, InbredSet.Id "
@@ -173,30 +173,30 @@ def retrieve_temp_riset_fields(name, conn):
         "AND Temp.Name = %(name)s")
     with conn.cursor() as cursor:
         cursor.execute(query, {"name": name})
-        return dict(zip(["riset", "risetid"], cursor.fetchone()))
+        return dict(zip(["group", "groupid"], cursor.fetchone()))
     return {}
 
-def retrieve_riset_fields(trait_type, trait_name, dataset_info, conn):
+def retrieve_group_fields(trait_type, trait_name, dataset_info, conn):
     """
-    Retrieve the RISet, and RISetID values for various trait types.
+    Retrieve the Group, and GroupID values for various trait types.
     """
-    riset_fns_map = {
-        "Geno": retrieve_geno_riset_fields,
-        "Publish": retrieve_publish_riset_fields,
-        "ProbeSet": retrieve_probeset_riset_fields
+    group_fns_map = {
+        "Geno": retrieve_geno_group_fields,
+        "Publish": retrieve_publish_group_fields,
+        "ProbeSet": retrieve_probeset_group_fields
     }
 
     if trait_type == "Temp":
-        riset_info = retrieve_temp_riset_fields(trait_name, conn)
+        group_info = retrieve_temp_group_fields(trait_name, conn)
     else:
-        riset_info = riset_fns_map[trait_type](dataset_info["dataset_name"], conn)
+        group_info = group_fns_map[trait_type](dataset_info["dataset_name"], conn)
 
     return {
         **dataset_info,
-        **riset_info,
-        "riset": (
-            "BXD" if riset_info.get("riset") == "BXD300"
-            else riset_info.get("riset", ""))
+        **group_info,
+        "group": (
+            "BXD" if group_info.get("group") == "BXD300"
+            else group_info.get("group", ""))
     }
 
 def retrieve_temp_trait_dataset():
@@ -281,11 +281,11 @@ def retrieve_trait_dataset(trait_type, trait, threshold, conn):
             trait_type, threshold, trait["trait_name"],
             trait["db"]["dataset_name"], conn)
     }
-    riset = retrieve_riset_fields(
+    group = retrieve_group_fields(
         trait_type, trait["trait_name"], dataset_name_info, conn)
     return {
         "display_name": dataset_name_info["dataset_name"],
         **dataset_name_info,
         **dataset_fns[trait_type](),
-        **riset
+        **group
     }
diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py
new file mode 100644
index 0000000..8f18cac
--- /dev/null
+++ b/gn3/db/genotypes.py
@@ -0,0 +1,184 @@
+"""Genotype utilities"""
+
+import os
+import gzip
+from typing import Union, TextIO
+
+from gn3.settings import GENOTYPE_FILES
+
+def build_genotype_file(
+        geno_name: str, base_dir: str = GENOTYPE_FILES,
+        extension: str = "geno"):
+    """Build the absolute path for the genotype file."""
+    return "{}/{}.{}".format(os.path.abspath(base_dir), geno_name, extension)
+
+def load_genotype_samples(genotype_filename: str, file_type: str = "geno"):
+    """
+    Load sample of samples from genotype files.
+
+    DESCRIPTION:
+    Traits can contain a varied number of samples, some of which do not exist in
+    certain genotypes. In order to compute QTLs, GEMMAs, etc, we need to ensure
+    to pick only those samples that exist in the genotype under consideration
+    for the traits used in the computation.
+
+    This function loads a list of samples from the genotype files for use in
+    filtering out unusable samples.
+
+
+    PARAMETERS:
+    genotype_filename: The absolute path to the genotype file to load the
+        samples from.
+    file_type: The type of file. Currently supported values are 'geno' and
+        'plink'.
+    """
+    file_type_fns = {
+        "geno": __load_genotype_samples_from_geno,
+        "plink": __load_genotype_samples_from_plink
+    }
+    return file_type_fns[file_type](genotype_filename)
+
+def __load_genotype_samples_from_geno(genotype_filename: str):
+    """
+    Helper function for `load_genotype_samples` function.
+
+    Loads samples from '.geno' files.
+    """
+    gzipped_filename = "{}.gz".format(genotype_filename)
+    if os.path.isfile(gzipped_filename):
+        genofile: Union[TextIO, gzip.GzipFile] = gzip.open(gzipped_filename)
+    else:
+        genofile = open(genotype_filename)
+
+    for row in genofile:
+        line = row.strip()
+        if (not line) or (line.startswith(("#", "@"))): # type: ignore[arg-type]
+            continue
+        break
+
+    headers = line.split("\t") # type: ignore[arg-type]
+    if headers[3] == "Mb":
+        return headers[4:]
+    return headers[3:]
+
+def __load_genotype_samples_from_plink(genotype_filename: str):
+    """
+    Helper function for `load_genotype_samples` function.
+
+    Loads samples from '.plink' files.
+    """
+    genofile = open(genotype_filename)
+    return [line.split(" ")[1] for line in genofile]
+
+def parse_genotype_labels(lines: list):
+    """
+    Parse label lines into usable genotype values
+
+    DESCRIPTION:
+    Reworks
+    https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/utility/gen_geno_ob.py#L75-L93
+    """
+    acceptable_labels = ["name", "filler", "type", "mat", "pat", "het", "unk"]
+    def __parse_label(line):
+        label, value = [l.strip() for l in line[1:].split(":")]
+        if label not in acceptable_labels:
+            return None
+        if label == "name":
+            return ("group", value)
+        return (label, value)
+    return tuple(
+        item for item in (__parse_label(line) for line in lines)
+        if item is not None)
+
+def parse_genotype_header(line: str, parlist: tuple = tuple()):
+    """
+    Parse the genotype file header line
+
+    DESCRIPTION:
+    Reworks
+    https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/utility/gen_geno_ob.py#L94-L114
+    """
+    items = [item.strip() for item in line.split("\t")]
+    mbmap = "Mb" in items
+    prgy = ((parlist + tuple(items[4:])) if mbmap
+            else (parlist + tuple(items[3:])))
+    return (
+        ("Mbmap", mbmap),
+        ("cm_column", items.index("cM")),
+        ("mb_column", None if not mbmap else items.index("Mb")),
+        ("prgy", prgy),
+        ("nprgy", len(prgy)))
+
+def parse_genotype_marker(line: str, geno_obj: dict, parlist: tuple):
+    """
+    Parse a data line in a genotype file
+
+    DESCRIPTION:
+    Reworks
+    https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/utility/gen_geno_ob.py#L143-L190
+    """
+    # pylint: disable=W0702
+    marker_row = [item.strip() for item in line.split("\t")]
+    geno_table = {
+        geno_obj["mat"]: -1, geno_obj["pat"]: 1, geno_obj["het"]: 0,
+        geno_obj["unk"]: "U"
+    }
+    start_pos = 4 if geno_obj["Mbmap"] else 3
+    if len(parlist) > 0:
+        start_pos = start_pos + 2
+
+    alleles = marker_row[start_pos:]
+    genotype = tuple(
+        (geno_table[allele] if allele in geno_table.keys() else "U")
+        for allele in alleles)
+    if len(parlist) > 0:
+        genotype = (-1, 1) + genotype
+    try:
+        cm_val = float(geno_obj["cm_column"])
+    except:
+        if geno_obj["Mbmap"]:
+            cm_val = float(geno_obj["mb_column"])
+        else:
+            cm_val = 0
+    return (
+        ("chr", marker_row[0]),
+        ("name", marker_row[1]),
+        ("cM", cm_val),
+        ("Mb", float(geno_obj["mb_column"]) if geno_obj["Mbmap"] else None),
+        ("genotype", genotype))
+
+def build_genotype_chromosomes(geno_obj, markers):
+    """
+    Build up the chromosomes from the given markers and partially built geno
+    object
+    """
+    mrks = [dict(marker) for marker in markers]
+    chr_names = {marker["chr"] for marker in mrks}
+    return tuple((
+        ("name", chr_name), ("mb_exists", geno_obj["Mbmap"]), ("cm_column", 2),
+        ("mb_column", geno_obj["mb_column"]),
+        ("loci", tuple(marker for marker in mrks if marker["chr"] == chr_name)))
+                 for chr_name in sorted(chr_names))
+
+def parse_genotype_file(filename: str, parlist: tuple = tuple()):
+    """
+    Parse the provided genotype file into a usable pytho3 data structure.
+    """
+    with open(filename, "r") as infile:
+        contents = infile.readlines()
+
+    lines = tuple(line for line in contents if
+                  ((not line.strip().startswith("#")) and
+                   (not line.strip() == "")))
+    labels = parse_genotype_labels(
+        [line for line in lines if line.startswith("@")])
+    data_lines = tuple(line for line in lines if not line.startswith("@"))
+    header = parse_genotype_header(data_lines[0], parlist)
+    geno_obj = dict(labels + header)
+    markers = tuple(
+        [parse_genotype_marker(line, geno_obj, parlist)
+         for line in data_lines[1:]])
+    chromosomes = tuple(
+        dict(chromosome) for chromosome in
+        build_genotype_chromosomes(geno_obj, markers))
+    return {**geno_obj, "chromosomes": chromosomes}
diff --git a/gn3/db/traits.py b/gn3/db/traits.py
index 1031e44..f2673c8 100644
--- a/gn3/db/traits.py
+++ b/gn3/db/traits.py
@@ -1,5 +1,8 @@
 """This class contains functions relating to trait data manipulation"""
+import os
 from typing import Any, Dict, Union, Sequence
+from gn3.settings import TMPDIR
+from gn3.random import random_string
 from gn3.function_helpers import compose
 from gn3.db.datasets import retrieve_trait_dataset
 
@@ -43,7 +46,7 @@ def update_sample_data(conn: Any,
                        count: Union[int, str]):
     """Given the right parameters, update sample-data from the relevant
     table."""
-    # pylint: disable=[R0913, R0914]
+    # pylint: disable=[R0913, R0914, C0103]
     STRAIN_ID_SQL: str = "UPDATE Strain SET Name = %s WHERE Id = %s"
     PUBLISH_DATA_SQL: str = ("UPDATE PublishData SET value = %s "
                              "WHERE StrainId = %s AND Id = %s")
@@ -60,22 +63,22 @@ def update_sample_data(conn: Any,
     with conn.cursor() as cursor:
         # Update the Strains table
         cursor.execute(STRAIN_ID_SQL, (strain_name, strain_id))
-        updated_strains: int = cursor.rowcount
+        updated_strains = cursor.rowcount
         # Update the PublishData table
         cursor.execute(PUBLISH_DATA_SQL,
                        (None if value == "x" else value,
                         strain_id, publish_data_id))
-        updated_published_data: int = cursor.rowcount
+        updated_published_data = cursor.rowcount
         # Update the PublishSE table
         cursor.execute(PUBLISH_SE_SQL,
                        (None if error == "x" else error,
                         strain_id, publish_data_id))
-        updated_se_data: int = cursor.rowcount
+        updated_se_data = cursor.rowcount
         # Update the NStrain table
         cursor.execute(N_STRAIN_SQL,
                        (None if count == "x" else count,
                         strain_id, publish_data_id))
-        updated_n_strains: int = cursor.rowcount
+        updated_n_strains = cursor.rowcount
     return (updated_strains, updated_published_data,
             updated_se_data, updated_n_strains)
 
@@ -223,7 +226,7 @@ def set_homologene_id_field_probeset(trait_info, conn):
     """
     query = (
         "SELECT HomologeneId FROM Homologene, Species, InbredSet"
-        " WHERE Homologene.GeneId = %(geneid)s AND InbredSet.Name = %(riset)s"
+        " WHERE Homologene.GeneId = %(geneid)s AND InbredSet.Name = %(group)s"
         " AND InbredSet.SpeciesId = Species.Id AND"
         " Species.TaxonomyId = Homologene.TaxonomyId")
     with conn.cursor() as cursor:
@@ -231,7 +234,7 @@ def set_homologene_id_field_probeset(trait_info, conn):
             query,
             {
                 k:v for k, v in trait_info.items()
-                if k in ["geneid", "riset"]
+                if k in ["geneid", "group"]
             })
         res = cursor.fetchone()
         if res:
@@ -419,7 +422,7 @@ def retrieve_trait_info(
     if trait_info["haveinfo"]:
         return {
             **trait_post_processing_functions_table[trait_dataset_type](
-                {**trait_info, "riset": trait_dataset["riset"]}),
+                {**trait_info, "group": trait_dataset["group"]}),
             "db": {**trait["db"], **trait_dataset}
         }
     return trait_info
@@ -442,18 +445,18 @@ def retrieve_temp_trait_data(trait_info: dict, conn: Any):
             query,
             {"trait_name": trait_info["trait_name"]})
         return [dict(zip(
-            ["strain_name", "value", "se_error", "nstrain", "id"], row))
+            ["sample_name", "value", "se_error", "nstrain", "id"], row))
                 for row in cursor.fetchall()]
     return []
 
-def retrieve_species_id(riset, conn: Any):
+def retrieve_species_id(group, conn: Any):
     """
-    Retrieve a species id given the RISet value
+    Retrieve a species id given the Group value
     """
     with conn.cursor as cursor:
         cursor.execute(
-            "SELECT SpeciesId from InbredSet WHERE Name = %(riset)s",
-            {"riset": riset})
+            "SELECT SpeciesId from InbredSet WHERE Name = %(group)s",
+            {"group": group})
         return cursor.fetchone()[0]
     return None
 
@@ -479,9 +482,9 @@ def retrieve_geno_trait_data(trait_info: Dict, conn: Any):
             {"trait_name": trait_info["trait_name"],
              "dataset_name": trait_info["db"]["dataset_name"],
              "species_id": retrieve_species_id(
-                 trait_info["db"]["riset"], conn)})
+                 trait_info["db"]["group"], conn)})
         return [dict(zip(
-            ["strain_name", "value", "se_error", "id"], row))
+            ["sample_name", "value", "se_error", "id"], row))
                 for row in cursor.fetchall()]
     return []
 
@@ -512,7 +515,7 @@ def retrieve_publish_trait_data(trait_info: Dict, conn: Any):
             {"trait_name": trait_info["trait_name"],
              "dataset_id": trait_info["db"]["dataset_id"]})
         return [dict(zip(
-            ["strain_name", "value", "se_error", "nstrain", "id"], row))
+            ["sample_name", "value", "se_error", "nstrain", "id"], row))
                 for row in cursor.fetchall()]
     return []
 
@@ -545,7 +548,7 @@ def retrieve_cellid_trait_data(trait_info: Dict, conn: Any):
              "trait_name": trait_info["trait_name"],
              "dataset_id": trait_info["db"]["dataset_id"]})
         return [dict(zip(
-            ["strain_name", "value", "se_error", "id"], row))
+            ["sample_name", "value", "se_error", "id"], row))
                 for row in cursor.fetchall()]
     return []
 
@@ -574,29 +577,29 @@ def retrieve_probeset_trait_data(trait_info: Dict, conn: Any):
             {"trait_name": trait_info["trait_name"],
              "dataset_name": trait_info["db"]["dataset_name"]})
         return [dict(zip(
-            ["strain_name", "value", "se_error", "id"], row))
+            ["sample_name", "value", "se_error", "id"], row))
                 for row in cursor.fetchall()]
     return []
 
-def with_strainlist_data_setup(strainlist: Sequence[str]):
+def with_samplelist_data_setup(samplelist: Sequence[str]):
     """
-    Build function that computes the trait data from provided list of strains.
+    Build function that computes the trait data from provided list of samples.
 
     PARAMETERS
-    strainlist: (list)
-      A list of strain names
+    samplelist: (list)
+      A list of sample names
 
     RETURNS:
       Returns a function that given some data from the database, computes the
-      strain's value, variance and ndata values, only if the strain is present
-      in the provided `strainlist` variable.
+      sample's value, variance and ndata values, only if the sample is present
+      in the provided `samplelist` variable.
     """
     def setup_fn(tdata):
-        if tdata["strain_name"] in strainlist:
+        if tdata["sample_name"] in samplelist:
             val = tdata["value"]
             if val is not None:
                 return {
-                    "strain_name": tdata["strain_name"],
+                    "sample_name": tdata["sample_name"],
                     "value": val,
                     "variance": tdata["se_error"],
                     "ndata": tdata.get("nstrain", None)
@@ -604,19 +607,19 @@ def with_strainlist_data_setup(strainlist: Sequence[str]):
         return None
     return setup_fn
 
-def without_strainlist_data_setup():
+def without_samplelist_data_setup():
     """
     Build function that computes the trait data.
 
     RETURNS:
       Returns a function that given some data from the database, computes the
-      strain's value, variance and ndata values.
+      sample's value, variance and ndata values.
     """
     def setup_fn(tdata):
         val = tdata["value"]
         if val is not None:
             return {
-                "strain_name": tdata["strain_name"],
+                "sample_name": tdata["sample_name"],
                 "value": val,
                 "variance": tdata["se_error"],
                 "ndata": tdata.get("nstrain", None)
@@ -624,7 +627,7 @@ def without_strainlist_data_setup():
         return None
     return setup_fn
 
-def retrieve_trait_data(trait: dict, conn: Any, strainlist: Sequence[str] = tuple()):
+def retrieve_trait_data(trait: dict, conn: Any, samplelist: Sequence[str] = tuple()):
     """
     Retrieve trait data
 
@@ -647,22 +650,27 @@ def retrieve_trait_data(trait: dict, conn: Any, strainlist: Sequence[str] = tupl
     if results:
         # do something with mysqlid
         mysqlid = results[0]["id"]
-        if strainlist:
+        if samplelist:
             data = [
                 item for item in
-                map(with_strainlist_data_setup(strainlist), results)
+                map(with_samplelist_data_setup(samplelist), results)
                 if item is not None]
         else:
             data = [
                 item for item in
-                map(without_strainlist_data_setup(), results)
+                map(without_samplelist_data_setup(), results)
                 if item is not None]
 
         return {
             "mysqlid": mysqlid,
             "data": dict(map(
                 lambda x: (
-                    x["strain_name"],
-                    {k:v for k, v in x.items() if x != "strain_name"}),
+                    x["sample_name"],
+                    {k:v for k, v in x.items() if x != "sample_name"}),
                 data))}
     return {}
+
+def generate_traits_filename(base_path: str = TMPDIR):
+    """Generate a unique filename for use with generated traits files."""
+    return "{}/traits_test_file_{}.txt".format(
+        os.path.abspath(base_path), random_string(10))
author	zsloan	2021-10-12 20:56:31 +0000
committer	zsloan	2021-10-12 20:56:31 +0000
commit	6e211182354fb4d6941e3a44ec1ec9d378b0e4ef (patch)
tree	60d9aaf382eefbb47cdbab9c74d98481cf0983de /gn3/db
parent	b815236123ff8e144bd84f349357a1852df95651 (diff)
parent	77c274b79c3ec01de60e90db3299763cb58f715b (diff)
download	genenetwork3-6e211182354fb4d6941e3a44ec1ec9d378b0e4ef.tar.gz