wqflask/base/data_set/datasetgroup.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193

"Dataset Group class ..."

import os
import json


from base import webqtlConfig
from utility import webqtlUtil
from utility import gen_geno_ob
from db import webqtlDatabaseFunction
from maintenance import get_group_samplelists
from wqflask.database import database_connection
from utility.tools import (
    locate,
    USE_REDIS,
    flat_files,
    flat_file_exists,
    locate_ignore_error)

class DatasetGroup:
    """
    Each group has multiple datasets; each species has multiple groups.

    For example, Mouse has multiple groups (BXD, BXA, etc), and each group
    has multiple datasets associated with it.

    """

    def __init__(self, dataset, name=None):
        """This sets self.group and self.group_id"""
        with database_connection() as conn, conn.cursor() as cursor:
            if not name:
                cursor.execute(dataset.query_for_group,
                               (dataset.name,))
            else:
                cursor.execute(
                    "SELECT InbredSet.Name, "
                    "InbredSet.Id, "
                    "InbredSet.GeneticType, "
                    "InbredSet.InbredSetCode "
                    "FROM InbredSet WHERE Name = %s",
                    (dataset.name,))
            results = cursor.fetchone()
            if results:
                (self.name, self.id, self.genetic_type, self.code) = results
            else:
                self.name = name or dataset.name
        if self.name == 'BXD300':
            self.name = "BXD"

        self.f1list = None
        self.parlist = None
        self.get_f1_parent_strains()

        self.mapping_id, self.mapping_names = self.get_mapping_methods()

        self.species = webqtlDatabaseFunction.retrieve_species(self.name)

        self.incparentsf1 = False
        self.allsamples = None
        self._datasets = None
        self.genofile = None

    def get_mapping_methods(self):
        mapping_id = ()
        with database_connection() as conn, conn.cursor() as cursor:
            cursor.execute(
                "SELECT MappingMethodId FROM "
                "InbredSet WHERE Name= %s",
                (self.name,))
            results = cursor.fetchone()
            if results and results[0]:
                mapping_id = results[0]
        if mapping_id == "1":
            mapping_names = ["GEMMA", "QTLReaper", "R/qtl"]
        elif mapping_id == "2":
            mapping_names = ["GEMMA"]
        elif mapping_id == "3":
            mapping_names = ["R/qtl"]
        elif mapping_id == "4":
            mapping_names = ["GEMMA", "PLINK"]
        else:
            mapping_names = []

        return mapping_id, mapping_names

    def get_markers(self):
        def check_plink_gemma():
            if flat_file_exists("mapping"):
                MAPPING_PATH = flat_files("mapping") + "/"
                if os.path.isfile(MAPPING_PATH + self.name + ".bed"):
                    return True
            return False

        if check_plink_gemma():
            marker_class = HumanMarkers
        else:
            marker_class = Markers

        if self.genofile:
            self.markers = marker_class(self.genofile[:-5])
        else:
            self.markers = marker_class(self.name)

    def get_f1_parent_strains(self):
        try:
            # NL, 07/27/2010. ParInfo has been moved from webqtlForm.py to webqtlUtil.py;
            f1, f12, maternal, paternal = webqtlUtil.ParInfo[self.name]
        except KeyError:
            f1 = f12 = maternal = paternal = None

        if f1 and f12:
            self.f1list = [f1, f12]
        if maternal and paternal:
            self.parlist = [maternal, paternal]

    def get_study_samplelists(self):
        study_sample_file = locate_ignore_error(
            self.name + ".json", 'study_sample_lists')
        try:
            f = open(study_sample_file)
        except:
            return []
        study_samples = json.load(f)
        return study_samples

    def get_genofiles(self):
        jsonfile = "%s/%s.json" % (webqtlConfig.GENODIR, self.name)
        try:
            f = open(jsonfile)
        except:
            return None
        jsondata = json.load(f)
        return jsondata['genofile']

    def get_samplelist(self, redis_conn):
        result = None
        key = "samplelist:v3:" + self.name
        if USE_REDIS:
            result = redis_conn.get(key)

        if result is not None:
            self.samplelist = json.loads(result)
        else:
            genotype_fn = locate_ignore_error(self.name + ".geno", 'genotype')
            if genotype_fn:
                self.samplelist = get_group_samplelists.get_samplelist(
                    "geno", genotype_fn)
            else:
                self.samplelist = None

            if USE_REDIS:
                redis_conn.set(key, json.dumps(self.samplelist))
                redis_conn.expire(key, 60 * 5)

    def all_samples_ordered(self):
        result = []
        lists = (self.parlist, self.f1list, self.samplelist)
        [result.extend(l) for l in lists if l]
        return result

    def read_genotype_file(self, use_reaper=False):
        '''Read genotype from .geno file instead of database'''
        # genotype_1 is Dataset Object without parents and f1
        # genotype_2 is Dataset Object with parents and f1 (not for intercross)

        # reaper barfs on unicode filenames, so here we ensure it's a string
        if self.genofile:
            if "RData" in self.genofile:  # ZS: This is a temporary fix; I need to change the way the JSON files that point to multiple genotype files are structured to point to other file types like RData
                full_filename = str(
                    locate(self.genofile.split(".")[0] + ".geno", 'genotype'))
            else:
                full_filename = str(locate(self.genofile, 'genotype'))
        else:
            full_filename = str(locate(self.name + '.geno', 'genotype'))
        genotype_1 = gen_geno_ob.genotype(full_filename)

        if genotype_1.type == "group" and self.parlist:
            genotype_2 = genotype_1.add(
                Mat=self.parlist[0], Pat=self.parlist[1])  # , F1=_f1)
        else:
            genotype_2 = genotype_1

        # determine default genotype object
        if self.incparentsf1 and genotype_1.type != "intercross":
            genotype = genotype_2
        else:
            self.incparentsf1 = 0
            genotype = genotype_1

        self.samplelist = list(genotype.prgy)

        return genotype