From ff6224c23ff0894012297dc569de0bfdbb0da1c3 Mon Sep 17 00:00:00 2001 From: Lei Yan Date: Fri, 21 Feb 2014 15:09:10 -0600 Subject: On branch master --- .../datasampledir/load_genotypes/config.ini | 4 + .../datasampledir/load_genotypes/sample.geno | 14 + wqflask/maintenance/dataset/load_genotypes.py | 386 ++++++++++----------- 3 files changed, 202 insertions(+), 202 deletions(-) create mode 100644 wqflask/maintenance/dataset/datasampledir/load_genotypes/config.ini create mode 100644 wqflask/maintenance/dataset/datasampledir/load_genotypes/sample.geno diff --git a/wqflask/maintenance/dataset/datasampledir/load_genotypes/config.ini b/wqflask/maintenance/dataset/datasampledir/load_genotypes/config.ini new file mode 100644 index 00000000..638c3bd8 --- /dev/null +++ b/wqflask/maintenance/dataset/datasampledir/load_genotypes/config.ini @@ -0,0 +1,4 @@ +[config] +inbredsetid = 1 +datafile = datasampledir/load_phenotypes/sample_data.txt +metafile = datasampledir/load_phenotypes/sample_meta.txt diff --git a/wqflask/maintenance/dataset/datasampledir/load_genotypes/sample.geno b/wqflask/maintenance/dataset/datasampledir/load_genotypes/sample.geno new file mode 100644 index 00000000..6dad9ace --- /dev/null +++ b/wqflask/maintenance/dataset/datasampledir/load_genotypes/sample.geno @@ -0,0 +1,14 @@ +@name:BXD +@type:riset +@mat:B +@pat:D +@het:H +@unk:U +Chr Locus cM Mb BXD1 BXD2 BXD5 BXD6 BXD8 +1 rs6269442 0 3.482275 B B D D D +1 rs6365999 0 4.811062 B B D D D +1 rs6376963 0.895 5.008089 B B D D D +1 rs3677817 1.185 5.176058 B B D D D +1 rs8236463 2.081 5.579193 B B D D D +1 rs6333200 2.081 6.217921 B B D D D +1 rs6298633 2.367 6.820241 B B D D D diff --git a/wqflask/maintenance/dataset/load_genotypes.py b/wqflask/maintenance/dataset/load_genotypes.py index e4988446..d309a903 100644 --- a/wqflask/maintenance/dataset/load_genotypes.py +++ b/wqflask/maintenance/dataset/load_genotypes.py @@ -1,207 +1,189 @@ -# GeneNetwork maintenance script -# Load genotypes from geno file into database - -# Author: Lei Yan -# Create Date: 2014-01-08 -# Last Update Date: 2014-01-10 - -# import -import sys -import os -import re -import MySQLdb -import ConfigParser - def main(argv): + # config + config = utilities.get_config(argv[1]) + print "config:" + for item in config.items('config'): + print "\t%s" % (str(item)) + # variables + genofile = open(config.get('config', 'genofile'), 'r') + metadic = {} + # parse genofile + for line in genofile: + line = line.strip() + if len(line) == 0: + continue + if line.startswith('#'): + continue + if line.startswith('@'): + line = line.strip('@') + items = line.split(';') + for item in items: + kv = re.split(':|=', item) + metadic[kv[0].strip()] = kv[1].strip() + continue + if line.startswith("Chr"): + print "[meta dictionary]" + for k,v in metadic.items(): + print "%s: %s" % (k, v) + print + continue + cells = line.split() + Chr = cells[0] + Locus = cells[1] + cM = cells[2] + Mb = cells[3] + print len(cells) + ? + return - # load configuration from configuration file - config = ConfigParser.ConfigParser() - config.read(argv[1]) - speciesid = config.get('configuration', 'speciesId') - inbredsetid = config.get('configuration', 'inbredsetid') - genofreezeid = config.get('configuration', 'genofreezeid') - genofile = config.get('configuration', 'genofile') - print "[configuration]\nspeciesid: %s\ninbredsetid: %s\ngenofreezeid: %s\ngenofile: %s\n" % (speciesid, inbredsetid, genofreezeid, genofile) - - # variables - metadic = {} - - # parse genofile - file_geno = open(genofile, 'r') - for line in file_geno: - line = line.strip() - if line.startswith('#'): - continue - if line.startswith('@'): - line = line.strip('@') - items = line.split(';') - for item in items: - kv = re.split(':|=', item) - metadic[kv[0].strip()] = kv[1].strip() - continue - if line.startswith("Chr"): - print "[meta dictionary]" - for k,v in metadic.items(): - print "%s: %s" % (k, v) - print - continue - cells = line.split() - Chr = cells[0] - Locus = cells[1] - cM = cells[2] - Mb = cells[3] - print len(cells) - ? - return - - # open db - host = 'localhost' - user = 'webqtl' - passwd = 'webqtl' - db = 'db_webqtl' - con = MySQLdb.Connect(db=db, user=user, passwd=passwd, host=host) - cursor = con.cursor() - # var - speciesid = int(argv[2]) - inbredsetid = int(argv[3]) - genofreezeid = int(argv[4]) - sql = """ - SELECT Id - FROM GenoData - ORDER BY Id DESC - LIMIT 1 - """ - cursor.execute(sql) - results = cursor.fetchall() - dataid = results[0][0] - print "speciesid: %s" % (speciesid) - print "inbredsetid: %s" % (inbredsetid) - print "genofreezeid: %s" % (genofreezeid) - print "dataid start: %s" % (dataid+1) - # samples - line = file_geno.readline() - sample_names = line.split()[4:] - sample_ids = [] - print "get %d samples from file:\n%s" % (len(sample_names), sample_names) - for sample_name in sample_names: - sql = """ - select Id - from Strain - where SpeciesId=%s - and Name like %s - """ - cursor.execute(sql, (speciesid, sample_name)) - results = cursor.fetchall() - if results: - sample_ids.append(results[0][0]) - else: - print "insert sample %s" % (sample_name) - sql = """ - INSERT INTO Strain - SET - SpeciesId=%s, - Name=%s, - Name2=%s - """ - cursor.execute(sql, (speciesid, sample_name, sample_name)) - sampleid = con.insert_id() - sample_ids.append(sampleid) - # - sql = """ - SELECT OrderId - FROM StrainXRef - where InbredSetId=%s - ORDER BY OrderId DESC - LIMIT 1 - """ - cursor.execute(sql, (inbredsetid)) - results = cursor.fetchall() - orderid = results[0][0] + 1 - # - sql = """ - INSERT INTO StrainXRef - SET - InbredSetId=%s, - StrainId=%s, - OrderId=%s, - Used_for_mapping=%s - """ - cursor.execute(sql, (inbredsetid, sampleid, orderid, "N")) - print "load %d samples from DB:" % (len(sample_names)) - for i in range(len(sample_names)): - print "%s\t%s" % (sample_names[i], sample_ids[i]) - # parse geno file - index = 0 - for line in file_geno: - index += 1 - if index % 1000 == 0: - print index - items = line.split() - chr = items[0] - name = items[1] - cm = items[2] - mb = items[3] - values = items[4:] - # geno - sql = """ - SELECT Id - FROM Geno - WHERE SpeciesId=%s - AND Name like %s - """ - cursor.execute(sql, (speciesid, name)) - results = cursor.fetchall() - if results: - genoid = results[0][0] - else: - print "insert geno %s" % (name) - sql = """ - INSERT INTO Geno - SET - SpeciesId=%s, - Name=%s, - Marker_Name=%s, - Chr=%s, - Mb=%s - """ - cursor.execute(sql, (speciesid, name, name, chr, mb)) - genoid = con.insert_id() - # genodata - dataid += 1 - for i in range(len(values)): - sample_id = sample_ids[i] - try: - value = int(values[i]) - except ValueError: - continue - if not value in [-1, 0, 1]: - print sample_id, value - continue - sql = """ - INSERT INTO GenoData - SET - Id=%s, - StrainId=%s, - value=%s - """ - cursor.execute(sql, (dataid, sample_id, value)) - # genoxref - sql = """ - INSERT INTO GenoXRef - SET - GenoFreezeId=%s, - GenoId=%s, - DataId=%s, - cM=%s, - Used_for_mapping=%s - """ - cursor.execute(sql, (genofreezeid, genoid, dataid, cm, 'N')) - print "Insert %d genoxref" % (index) - # close - file_geno.close() - con.close() + # open db + host = 'localhost' + user = 'webqtl' + passwd = 'webqtl' + db = 'db_webqtl' + con = MySQLdb.Connect(db=db, user=user, passwd=passwd, host=host) + cursor = con.cursor() + # var + speciesid = int(argv[2]) + inbredsetid = int(argv[3]) + genofreezeid = int(argv[4]) + sql = """ + SELECT Id + FROM GenoData + ORDER BY Id DESC + LIMIT 1 + """ + cursor.execute(sql) + results = cursor.fetchall() + dataid = results[0][0] + print "speciesid: %s" % (speciesid) + print "inbredsetid: %s" % (inbredsetid) + print "genofreezeid: %s" % (genofreezeid) + print "dataid start: %s" % (dataid+1) + # samples + line = file_geno.readline() + sample_names = line.split()[4:] + sample_ids = [] + print "get %d samples from file:\n%s" % (len(sample_names), sample_names) + for sample_name in sample_names: + sql = """ + select Id + from Strain + where SpeciesId=%s + and Name like %s + """ + cursor.execute(sql, (speciesid, sample_name)) + results = cursor.fetchall() + if results: + sample_ids.append(results[0][0]) + else: + print "insert sample %s" % (sample_name) + sql = """ + INSERT INTO Strain + SET + SpeciesId=%s, + Name=%s, + Name2=%s + """ + cursor.execute(sql, (speciesid, sample_name, sample_name)) + sampleid = con.insert_id() + sample_ids.append(sampleid) + # + sql = """ + SELECT OrderId + FROM StrainXRef + where InbredSetId=%s + ORDER BY OrderId DESC + LIMIT 1 + """ + cursor.execute(sql, (inbredsetid)) + results = cursor.fetchall() + orderid = results[0][0] + 1 + # + sql = """ + INSERT INTO StrainXRef + SET + InbredSetId=%s, + StrainId=%s, + OrderId=%s, + Used_for_mapping=%s + """ + cursor.execute(sql, (inbredsetid, sampleid, orderid, "N")) + print "load %d samples from DB:" % (len(sample_names)) + for i in range(len(sample_names)): + print "%s\t%s" % (sample_names[i], sample_ids[i]) + # parse geno file + index = 0 + for line in file_geno: + index += 1 + if index % 1000 == 0: + print index + items = line.split() + chr = items[0] + name = items[1] + cm = items[2] + mb = items[3] + values = items[4:] + # geno + sql = """ + SELECT Id + FROM Geno + WHERE SpeciesId=%s + AND Name like %s + """ + cursor.execute(sql, (speciesid, name)) + results = cursor.fetchall() + if results: + genoid = results[0][0] + else: + print "insert geno %s" % (name) + sql = """ + INSERT INTO Geno + SET + SpeciesId=%s, + Name=%s, + Marker_Name=%s, + Chr=%s, + Mb=%s + """ + cursor.execute(sql, (speciesid, name, name, chr, mb)) + genoid = con.insert_id() + # genodata + dataid += 1 + for i in range(len(values)): + sample_id = sample_ids[i] + try: + value = int(values[i]) + except ValueError: + continue + if not value in [-1, 0, 1]: + print sample_id, value + continue + sql = """ + INSERT INTO GenoData + SET + Id=%s, + StrainId=%s, + value=%s + """ + cursor.execute(sql, (dataid, sample_id, value)) + # genoxref + sql = """ + INSERT INTO GenoXRef + SET + GenoFreezeId=%s, + GenoId=%s, + DataId=%s, + cM=%s, + Used_for_mapping=%s + """ + cursor.execute(sql, (genofreezeid, genoid, dataid, cm, 'N')) + print "Insert %d genoxref" % (index) + # close + file_geno.close() + con.close() # main if __name__ == "__main__": - main(sys.argv) - print "exit successfully" + main(sys.argv) + print "exit successfully" -- cgit v1.2.3