diff options
Diffstat (limited to 'wqflask')
-rw-r--r-- | wqflask/maintenance/dataset/load_genotypes.py | 207 |
1 files changed, 207 insertions, 0 deletions
diff --git a/wqflask/maintenance/dataset/load_genotypes.py b/wqflask/maintenance/dataset/load_genotypes.py new file mode 100644 index 00000000..e4988446 --- /dev/null +++ b/wqflask/maintenance/dataset/load_genotypes.py @@ -0,0 +1,207 @@ +# GeneNetwork maintenance script +# Load genotypes from geno file into database + +# Author: Lei Yan +# Create Date: 2014-01-08 +# Last Update Date: 2014-01-10 + +# import +import sys +import os +import re +import MySQLdb +import ConfigParser + +def main(argv): + + # load configuration from configuration file + config = ConfigParser.ConfigParser() + config.read(argv[1]) + speciesid = config.get('configuration', 'speciesId') + inbredsetid = config.get('configuration', 'inbredsetid') + genofreezeid = config.get('configuration', 'genofreezeid') + genofile = config.get('configuration', 'genofile') + print "[configuration]\nspeciesid: %s\ninbredsetid: %s\ngenofreezeid: %s\ngenofile: %s\n" % (speciesid, inbredsetid, genofreezeid, genofile) + + # variables + metadic = {} + + # parse genofile + file_geno = open(genofile, 'r') + for line in file_geno: + line = line.strip() + if line.startswith('#'): + continue + if line.startswith('@'): + line = line.strip('@') + items = line.split(';') + for item in items: + kv = re.split(':|=', item) + metadic[kv[0].strip()] = kv[1].strip() + continue + if line.startswith("Chr"): + print "[meta dictionary]" + for k,v in metadic.items(): + print "%s: %s" % (k, v) + print + continue + cells = line.split() + Chr = cells[0] + Locus = cells[1] + cM = cells[2] + Mb = cells[3] + print len(cells) + ? + return + + # open db + host = 'localhost' + user = 'webqtl' + passwd = 'webqtl' + db = 'db_webqtl' + con = MySQLdb.Connect(db=db, user=user, passwd=passwd, host=host) + cursor = con.cursor() + # var + speciesid = int(argv[2]) + inbredsetid = int(argv[3]) + genofreezeid = int(argv[4]) + sql = """ + SELECT Id + FROM GenoData + ORDER BY Id DESC + LIMIT 1 + """ + cursor.execute(sql) + results = cursor.fetchall() + dataid = results[0][0] + print "speciesid: %s" % (speciesid) + print "inbredsetid: %s" % (inbredsetid) + print "genofreezeid: %s" % (genofreezeid) + print "dataid start: %s" % (dataid+1) + # samples + line = file_geno.readline() + sample_names = line.split()[4:] + sample_ids = [] + print "get %d samples from file:\n%s" % (len(sample_names), sample_names) + for sample_name in sample_names: + sql = """ + select Id + from Strain + where SpeciesId=%s + and Name like %s + """ + cursor.execute(sql, (speciesid, sample_name)) + results = cursor.fetchall() + if results: + sample_ids.append(results[0][0]) + else: + print "insert sample %s" % (sample_name) + sql = """ + INSERT INTO Strain + SET + SpeciesId=%s, + Name=%s, + Name2=%s + """ + cursor.execute(sql, (speciesid, sample_name, sample_name)) + sampleid = con.insert_id() + sample_ids.append(sampleid) + # + sql = """ + SELECT OrderId + FROM StrainXRef + where InbredSetId=%s + ORDER BY OrderId DESC + LIMIT 1 + """ + cursor.execute(sql, (inbredsetid)) + results = cursor.fetchall() + orderid = results[0][0] + 1 + # + sql = """ + INSERT INTO StrainXRef + SET + InbredSetId=%s, + StrainId=%s, + OrderId=%s, + Used_for_mapping=%s + """ + cursor.execute(sql, (inbredsetid, sampleid, orderid, "N")) + print "load %d samples from DB:" % (len(sample_names)) + for i in range(len(sample_names)): + print "%s\t%s" % (sample_names[i], sample_ids[i]) + # parse geno file + index = 0 + for line in file_geno: + index += 1 + if index % 1000 == 0: + print index + items = line.split() + chr = items[0] + name = items[1] + cm = items[2] + mb = items[3] + values = items[4:] + # geno + sql = """ + SELECT Id + FROM Geno + WHERE SpeciesId=%s + AND Name like %s + """ + cursor.execute(sql, (speciesid, name)) + results = cursor.fetchall() + if results: + genoid = results[0][0] + else: + print "insert geno %s" % (name) + sql = """ + INSERT INTO Geno + SET + SpeciesId=%s, + Name=%s, + Marker_Name=%s, + Chr=%s, + Mb=%s + """ + cursor.execute(sql, (speciesid, name, name, chr, mb)) + genoid = con.insert_id() + # genodata + dataid += 1 + for i in range(len(values)): + sample_id = sample_ids[i] + try: + value = int(values[i]) + except ValueError: + continue + if not value in [-1, 0, 1]: + print sample_id, value + continue + sql = """ + INSERT INTO GenoData + SET + Id=%s, + StrainId=%s, + value=%s + """ + cursor.execute(sql, (dataid, sample_id, value)) + # genoxref + sql = """ + INSERT INTO GenoXRef + SET + GenoFreezeId=%s, + GenoId=%s, + DataId=%s, + cM=%s, + Used_for_mapping=%s + """ + cursor.execute(sql, (genofreezeid, genoid, dataid, cm, 'N')) + print "Insert %d genoxref" % (index) + # close + file_geno.close() + con.close() + +# main +if __name__ == "__main__": + main(sys.argv) + print "exit successfully" |