scripts/maintenance/QTL_Reaper_v6.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148

####### To run this program do: python QTL_Reaper_v2.py 235
#Where 163 is the ProbeSetFreeze Id of the database that we want to calculate #the LRS
#!/usr/bin/python
import reaper
import MySQLdb
import contextlib
from argparse import ArgumentParser


def run_qtl_reaper(cursor, genotypeDir, ProbeSetFreezeIds):
    genotype_1 = reaper.Dataset()
    #####get all of the genotypes
    cursor.execute('select Id, Name from InbredSet')
    results = cursor.fetchall()
    InbredSets = {}
    for item in results:
            InbredSets[item[0]] = genotypeDir+str(item[1])+'.geno'

    if ProbeSetFreezeIds:
        #####convert the Ids to integer
        ProbeSetFreezeIds=list(map(int, ProbeSetFreezeIds))
    else:
        #####get all of the dataset that need be updated
        cursor.execute('select distinct(ProbeSetFreezeId) from ProbeSetXRef where pValue is NULL order by ProbeSetFreezeId desc')
        results = cursor.fetchall()
        ProbeSetFreezeIds = []
        for item in results:
            ProbeSetFreezeIds.append(item[0])

    ####human dataset can NOT use this program calculate the LRS, ignore it
    if ProbeSetFreezeIds.__contains__(215):
        ProbeSetFreezeIds.remove(215)

    #output_file = open ('/home/xzhou/work/DatabaseTools/cal_LRS_Additive_result.txt', 'w')

    #####update
    for ProbeSetFreezeId in ProbeSetFreezeIds:
        cursor.execute("""
            select InbredSetId
            from ProbeFreeze, ProbeSetFreeze
            where ProbeFreeze.Id=ProbeSetFreeze.ProbeFreezeId and ProbeSetFreeze.Id=%d
        """%ProbeSetFreezeId);

        InbredSetId = cursor.fetchone()[0]
        if InbredSetId==3:
            InbredSetId=1
        #if InbredSetId==12:
        #	InbredSetId=2

        print((ProbeSetFreezeId, InbredSets[InbredSetId]))

        genotype_1.read(InbredSets[InbredSetId])
        locuses = []
        for geno in genotype_1:
            for locus in geno:
                locuses.append(locus.name)

        cursor.execute('select ProbeSetId, Locus, DataId from ProbeSetXRef where ProbeSetFreezeId=%s'%ProbeSetFreezeId)
        ProbeSetXRefInfos = cursor.fetchall()

        kj=0
        for aProbeSetXRef in ProbeSetXRefInfos:
            ProbeSetId, Locus, DataId = aProbeSetXRef
            prgy = genotype_1.prgy

            cursor.execute("select Strain.Name, ProbeSetData.value from Strain, ProbeSetData where Strain.Id = ProbeSetData.StrainId and ProbeSetData.Id = %d" % DataId)
            results = cursor.fetchall()
            if not results:
                continue
            _strains = []
            _values = []
            for item2 in results:
                strain, value = item2
                if strain in prgy:
                    _strains.append(strain)
                    _values.append(value)
            if not _strains or not _values:
                continue

            if len(_strains) < 8:
                continue
            qtlresults = genotype_1.regression(strains = _strains, trait = _values)
            _max = max(qtlresults)
            _locus = _max.locus.name
            _additive = _max.additive
            _max = _max.lrs

            #output_file.write('%s\t%s\t%s\t%s\t%s\n' % (ProbeSetFreezeId, ProbeSetId, _locus, _max, _additive))

            # _max(LRS) maybe is infinite sometimes, so define it as a very big number
            if _max == float('inf'):
                _max = 10000

            cursor.execute('update ProbeSetXRef set Locus=%s, LRS=%s, additive=%s where ProbeSetId=%s and ProbeSetFreezeId=%s',
                           (_locus, _max, _additive, ProbeSetId, ProbeSetFreezeId))

            kj += 1
            if kj%1000==0:
                print((ProbeSetFreezeId, InbredSets[InbredSetId], kj))


        print(ProbeSetFreezeIds)


@contextlib.contextmanager
def dbconnection(db, user, passwd, host, port):
    """Connect to the database in a contextmanager."""
    conn = MySQLdb.Connect(db=db, user=user, passwd=passwd, host=host, port=port)
    try:
        yield conn
    except MySQLdb.Error as _err:
        conn.rollback()
    finally:
        conn.commit()
        conn.close()


def read_db_config(path):
    with open(path, "r") as infile:
        lines = tuple(line.strip() for line in infile.readlines())
        return {
            "user": lines[0],
            "passwd": lines[1],
            "host": lines[2],
            "port": int(lines[3] or "3306"),
            "db": lines[4]
        }


def main():
    parser = ArgumentParser(prog="QTLReaper", description="Python-2 QTLReaper.")
    parser.add_argument("db_config_file",
                        type=str,
                        default="dbconfig",
                        help="File with database connection configuration")
    parser.add_argument("genotype_dir",
                        type=str,
                        default="/gnshare/gn/web/genotypes/",
                        help="Directory with the genotype CSV files.")
    parser.add_argument("ProbeSetFreezeIds",
                        nargs="*",
                        help="The ProbeSetFreezeIds to act on.")
    args = parser.parse_args()
    with dbconnection(**read_db_config(args.db_config_file)) as conn:
        run_qtl_reaper(conn.cursor(), args.genotype_dir, args.ProbeSetFreezeIds)

if __name__ == "__main__":
    main()