From 8de6fec18cd98a10c58702c448a1e01e147dc5f7 Mon Sep 17 00:00:00 2001 From: Lei Yan Date: Wed, 5 Mar 2014 21:33:31 +0000 Subject: Improved load_genotypes.py Committer: Lei Yan On branch master --- .../datasampledir/load_genotypes/sample.geno | 2 +- wqflask/maintenance/dataset/genotypes_load.py | 207 --------------------- wqflask/maintenance/dataset/load_genotypes.py | 127 ++++++------- 3 files changed, 58 insertions(+), 278 deletions(-) delete mode 100644 wqflask/maintenance/dataset/genotypes_load.py (limited to 'wqflask/maintenance/dataset') diff --git a/wqflask/maintenance/dataset/datasampledir/load_genotypes/sample.geno b/wqflask/maintenance/dataset/datasampledir/load_genotypes/sample.geno index a28d31fc..0024ffd1 100644 --- a/wqflask/maintenance/dataset/datasampledir/load_genotypes/sample.geno +++ b/wqflask/maintenance/dataset/datasampledir/load_genotypes/sample.geno @@ -9,4 +9,4 @@ Chr Locus cM Mb BXD1 BXD2 BXD5 BXD6 BXD8 2 rs6365999 0.3 4.811062 B B D D D 3 rs6376963 0.895 5.008089 B B D D D 4 rs3677817 1.185 5.176058 B B D D D -5 rstest8236463 2.081 5.579193 B B D D D +5 rs8236463 2.081 5.579193 B B D D D diff --git a/wqflask/maintenance/dataset/genotypes_load.py b/wqflask/maintenance/dataset/genotypes_load.py deleted file mode 100644 index e4988446..00000000 --- a/wqflask/maintenance/dataset/genotypes_load.py +++ /dev/null @@ -1,207 +0,0 @@ -# GeneNetwork maintenance script -# Load genotypes from geno file into database - -# Author: Lei Yan -# Create Date: 2014-01-08 -# Last Update Date: 2014-01-10 - -# import -import sys -import os -import re -import MySQLdb -import ConfigParser - -def main(argv): - - # load configuration from configuration file - config = ConfigParser.ConfigParser() - config.read(argv[1]) - speciesid = config.get('configuration', 'speciesId') - inbredsetid = config.get('configuration', 'inbredsetid') - genofreezeid = config.get('configuration', 'genofreezeid') - genofile = config.get('configuration', 'genofile') - print "[configuration]\nspeciesid: %s\ninbredsetid: %s\ngenofreezeid: %s\ngenofile: %s\n" % (speciesid, inbredsetid, genofreezeid, genofile) - - # variables - metadic = {} - - # parse genofile - file_geno = open(genofile, 'r') - for line in file_geno: - line = line.strip() - if line.startswith('#'): - continue - if line.startswith('@'): - line = line.strip('@') - items = line.split(';') - for item in items: - kv = re.split(':|=', item) - metadic[kv[0].strip()] = kv[1].strip() - continue - if line.startswith("Chr"): - print "[meta dictionary]" - for k,v in metadic.items(): - print "%s: %s" % (k, v) - print - continue - cells = line.split() - Chr = cells[0] - Locus = cells[1] - cM = cells[2] - Mb = cells[3] - print len(cells) - ? - return - - # open db - host = 'localhost' - user = 'webqtl' - passwd = 'webqtl' - db = 'db_webqtl' - con = MySQLdb.Connect(db=db, user=user, passwd=passwd, host=host) - cursor = con.cursor() - # var - speciesid = int(argv[2]) - inbredsetid = int(argv[3]) - genofreezeid = int(argv[4]) - sql = """ - SELECT Id - FROM GenoData - ORDER BY Id DESC - LIMIT 1 - """ - cursor.execute(sql) - results = cursor.fetchall() - dataid = results[0][0] - print "speciesid: %s" % (speciesid) - print "inbredsetid: %s" % (inbredsetid) - print "genofreezeid: %s" % (genofreezeid) - print "dataid start: %s" % (dataid+1) - # samples - line = file_geno.readline() - sample_names = line.split()[4:] - sample_ids = [] - print "get %d samples from file:\n%s" % (len(sample_names), sample_names) - for sample_name in sample_names: - sql = """ - select Id - from Strain - where SpeciesId=%s - and Name like %s - """ - cursor.execute(sql, (speciesid, sample_name)) - results = cursor.fetchall() - if results: - sample_ids.append(results[0][0]) - else: - print "insert sample %s" % (sample_name) - sql = """ - INSERT INTO Strain - SET - SpeciesId=%s, - Name=%s, - Name2=%s - """ - cursor.execute(sql, (speciesid, sample_name, sample_name)) - sampleid = con.insert_id() - sample_ids.append(sampleid) - # - sql = """ - SELECT OrderId - FROM StrainXRef - where InbredSetId=%s - ORDER BY OrderId DESC - LIMIT 1 - """ - cursor.execute(sql, (inbredsetid)) - results = cursor.fetchall() - orderid = results[0][0] + 1 - # - sql = """ - INSERT INTO StrainXRef - SET - InbredSetId=%s, - StrainId=%s, - OrderId=%s, - Used_for_mapping=%s - """ - cursor.execute(sql, (inbredsetid, sampleid, orderid, "N")) - print "load %d samples from DB:" % (len(sample_names)) - for i in range(len(sample_names)): - print "%s\t%s" % (sample_names[i], sample_ids[i]) - # parse geno file - index = 0 - for line in file_geno: - index += 1 - if index % 1000 == 0: - print index - items = line.split() - chr = items[0] - name = items[1] - cm = items[2] - mb = items[3] - values = items[4:] - # geno - sql = """ - SELECT Id - FROM Geno - WHERE SpeciesId=%s - AND Name like %s - """ - cursor.execute(sql, (speciesid, name)) - results = cursor.fetchall() - if results: - genoid = results[0][0] - else: - print "insert geno %s" % (name) - sql = """ - INSERT INTO Geno - SET - SpeciesId=%s, - Name=%s, - Marker_Name=%s, - Chr=%s, - Mb=%s - """ - cursor.execute(sql, (speciesid, name, name, chr, mb)) - genoid = con.insert_id() - # genodata - dataid += 1 - for i in range(len(values)): - sample_id = sample_ids[i] - try: - value = int(values[i]) - except ValueError: - continue - if not value in [-1, 0, 1]: - print sample_id, value - continue - sql = """ - INSERT INTO GenoData - SET - Id=%s, - StrainId=%s, - value=%s - """ - cursor.execute(sql, (dataid, sample_id, value)) - # genoxref - sql = """ - INSERT INTO GenoXRef - SET - GenoFreezeId=%s, - GenoId=%s, - DataId=%s, - cM=%s, - Used_for_mapping=%s - """ - cursor.execute(sql, (genofreezeid, genoid, dataid, cm, 'N')) - print "Insert %d genoxref" % (index) - # close - file_geno.close() - con.close() - -# main -if __name__ == "__main__": - main(sys.argv) - print "exit successfully" diff --git a/wqflask/maintenance/dataset/load_genotypes.py b/wqflask/maintenance/dataset/load_genotypes.py index fbf6484f..4697382b 100644 --- a/wqflask/maintenance/dataset/load_genotypes.py +++ b/wqflask/maintenance/dataset/load_genotypes.py @@ -1,10 +1,3 @@ -#Do whatever else is needed with the Marker object -#Probably create Genofile object as well -#Make sure rest of code works with params object (though -#everything in the params object should probably just be the parameters of -#the Genofile object) - - from __future__ import absolute_import, print_function, division import sys @@ -14,36 +7,28 @@ import argparse import utilities import datastructure -def main(): - parser = argparse.ArgumentParser(description='Load Genotypes') - parser.add_argument('-c', '--config') - opts = parser.parse_args() - config = opts.config - # config - config = utilities.get_config(config) - print("config:") +def main(argv): + config = utilities.get_config(argv[1]) + print("config file:") for item in config.items('config'): print("\t", str(item)) - parse_genofile(fetch_parameters(config)) + parse_genofile(config, fetch_parameters(config)) def fetch_parameters(config): - # variables - params = {} - params['inbredsetid'] = config.get('config', 'inbredsetid') - species = datastructure.get_species(params['inbredsetid']) - params["speciesid"] = species[0] - genofreeze = datastructure.get_genofreeze_byinbredsetid(params['inbredsetid']) - params['genofreezeid'] = genofreeze[0] - params['dataid'] = datastructure.get_nextdataid_genotype() - params['genofile'] = config.get('config', 'genofile') - return params - -def parse_genofile(params): - # genofile - genofile = open(params['genofile'], 'r') - metadic = {} - print() - # parse genofile + config_dic = {} + config_dic['inbredsetid'] = config.get('config', 'inbredsetid') + config_dic["speciesid"] = datastructure.get_species(config_dic['inbredsetid'])[0] + config_dic['genofreezeid'] = datastructure.get_genofreeze_byinbredsetid(config_dic['inbredsetid'])[0] + config_dic['dataid'] = datastructure.get_nextdataid_genotype() + config_dic['genofile'] = config.get('config', 'genofile') + print("config dictionary:") + for k, v in config_dic.items(): + print("\t%s: %s" % (k, v)) + return config_dic + +def parse_genofile(config, config_dic): + genofile = open(config_dic['genofile'], 'r') + meta_dic = {} for line in genofile: line = line.strip() if len(line) == 0: @@ -55,40 +40,39 @@ def parse_genofile(params): items = line.split(';') for item in items: kv = re.split(':|=', item) - metadic[kv[0].strip()] = kv[1].strip() + meta_dic[kv[0].strip()] = kv[1].strip() continue if line.lower().startswith("chr"): # - print("geno file meta:") - for k, v in metadic.items(): - print("\t{}: {}".format(k, v)) + print("geno file meta dictionary:") + for k, v in meta_dic.items(): + print("\t%s: %s" % (k, v)) # - print("geno file head:\n\t{}\n".format(line)) + print("geno file head:\n\t%s" % line) strainnames = line.split()[4:] - strains = datastructure.get_strains_bynames(inbredsetid=inbredsetid, strainnames=strainnames, updatestrainxref="yes") + config_dic['strains'] = datastructure.get_strains_bynames(inbredsetid=config_dic['inbredsetid'], strainnames=strainnames, updatestrainxref="yes") continue - # geno file line - marker = Marker(line) - # - genoid = check_or_insert_geno(params, marker) - if check_genoxref(params): + # geno file line, marker + marker_dic = parse_marker(line) + marker_dic['genoid'] = check_or_insert_geno(config_dic, marker_dic) + if check_genoxref(config_dic, marker_dic): continue - insert_genodata(params) - insert_genoxref(params) - dataid += 1 + insert_genodata(config, config_dic, marker_dic) + insert_genoxref(config_dic, marker_dic) + config_dic['dataid'] += 1 genofile.close() - - -class Marker(object): - def __init__(self, line): - self.cells = line.split() - self.chromosome = cells[0] - self.locus = cells[1] - self.cm = cells[2] - self.mb = cells[3] - self.values = cells[4:] + +def parse_marker(line): + marker_dic = {} + cells = line.split() + marker_dic['chromosome'] = cells[0] + marker_dic['locus'] = cells[1] + marker_dic['cm'] = cells[2] + marker_dic['mb'] = cells[3] + marker_dic['values'] = cells[4:] + return marker_dic -def check_or_insert_geno(params, marker): +def check_or_insert_geno(config_dic, marker_dic): cursor, con = utilities.get_cursor() sql = """ SELECT Geno.`Id` @@ -96,7 +80,7 @@ def check_or_insert_geno(params, marker): WHERE Geno.`SpeciesId`=%s AND Geno.`Name` like %s """ - cursor.execute(sql, (speciesid, locus)) + cursor.execute(sql, (config_dic["speciesid"], marker_dic['locus'])) result = cursor.fetchone() if result: genoid = result[0] @@ -111,27 +95,29 @@ def check_or_insert_geno(params, marker): Geno.`Chr`=%s, Geno.`Mb`=%s """ - cursor.execute(sql, (speciesid, locus, locus, chr, mb)) + cursor.execute(sql, (config_dic['speciesid'], marker_dic['locus'], marker_dic['locus'], marker_dic['chromosome'], marker_dic['mb'])) rowcount = cursor.rowcount genoid = con.insert_id() print("INSERT INTO Geno: %d record: %d" % (rowcount, genoid)) return genoid -def check_GenoXRef(): +def check_genoxref(config_dic, marker_dic): + cursor, con = utilities.get_cursor() sql = """ select GenoXRef.* from GenoXRef where GenoXRef.`GenoFreezeId`=%s AND GenoXRef.`GenoId`=%s """ - cursor.execute(sql, (genofreezeid, genoid)) + cursor.execute(sql, (config_dic['genofreezeid'], marker_dic['genoid'])) rowcount = cursor.rowcount return rowcount - -def insert_genodata(): - for index, strain in enumerate(strains): + +def insert_genodata(config, config_dic, marker_dic): + cursor, con = utilities.get_cursor() + for index, strain in enumerate(config_dic['strains']): strainid = strain[0] - value = utilities.to_db_string(values[index], None) + value = utilities.to_db_string(marker_dic['values'][index], None) if not value: continue value = config.get('config', "genovalue_" + value) @@ -148,9 +134,10 @@ def insert_genodata(): GenoData.`StrainId`=%s, GenoData.`value`=%s """ - cursor.execute(sql, (dataid, strainid, number)) + cursor.execute(sql, (config_dic['dataid'], strainid, number)) -def insert_genoxref(): +def insert_genoxref(config_dic, marker_dic): + cursor, con = utilities.get_cursor() sql = """ INSERT INTO GenoXRef SET @@ -160,11 +147,11 @@ def insert_genoxref(): GenoXRef.`cM`=%s, GenoXRef.`Used_for_mapping`=%s """ - cursor.execute(sql, (genofreezeid, genoid, dataid, cm, 'N')) + cursor.execute(sql, (config_dic['genofreezeid'], marker_dic['genoid'], config_dic['dataid'], marker_dic['cm'], 'N')) rowcount = cursor.rowcount print("INSERT INTO GenoXRef: %d record" % (rowcount)) if __name__ == "__main__": print("command line arguments:\n\t%s" % sys.argv) - main() + main(sys.argv) print("exit successfully") -- cgit v1.2.3