about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- wqflask/maintenance/dataset/datasampledir/load_genotypes/sample.geno 2
-rw-r--r-- wqflask/maintenance/dataset/genotypes_load.py 207
-rw-r--r-- wqflask/maintenance/dataset/load_genotypes.py 127
3 files changed, 58 insertions, 278 deletions
diff --git a/wqflask/maintenance/dataset/datasampledir/load_genotypes/sample.geno b/wqflask/maintenance/dataset/datasampledir/load_genotypes/sample.geno
index a28d31fc..0024ffd1 100644
--- a/wqflask/maintenance/dataset/datasampledir/load_genotypes/sample.geno
+++ b/wqflask/maintenance/dataset/datasampledir/load_genotypes/sample.geno
@@ -9,4 +9,4 @@ Chr	Locus	cM	Mb	BXD1	BXD2	BXD5	BXD6	BXD8
 2	rs6365999	0.3	4.811062	B	B	D	D	D
 3	rs6376963	0.895	5.008089	B	B	D	D	D
 4	rs3677817	1.185	5.176058	B	B	D	D	D
-5	rstest8236463	2.081	5.579193	B	B	D	D	D
+5	rs8236463	2.081	5.579193	B	B	D	D	D
diff --git a/wqflask/maintenance/dataset/genotypes_load.py b/wqflask/maintenance/dataset/genotypes_load.py
deleted file mode 100644
index e4988446..00000000
--- a/wqflask/maintenance/dataset/genotypes_load.py
+++ /dev/null
@@ -1,207 +0,0 @@
-# GeneNetwork maintenance script
-# Load genotypes from geno file into database
-
-# Author:			Lei Yan
-# Create Date:		2014-01-08
-# Last Update Date:	2014-01-10
-
-# import
-import sys
-import os
-import re
-import MySQLdb
-import ConfigParser
-
-def main(argv):
-
-	# load configuration from configuration file
-	config = ConfigParser.ConfigParser()
-	config.read(argv[1])
-	speciesid = config.get('configuration', 'speciesId')
-	inbredsetid = config.get('configuration', 'inbredsetid')
-	genofreezeid = config.get('configuration', 'genofreezeid')
-	genofile = config.get('configuration', 'genofile')
-	print "[configuration]\nspeciesid: %s\ninbredsetid: %s\ngenofreezeid: %s\ngenofile: %s\n" % (speciesid, inbredsetid, genofreezeid, genofile)
-	
-	# variables
-	metadic = {}
-	
-	# parse genofile
-	file_geno = open(genofile, 'r')
-	for line in file_geno:
-		line = line.strip()
-		if line.startswith('#'):
-			continue
-		if line.startswith('@'):
-			line = line.strip('@')
-			items = line.split(';')
-			for item in items:
-				kv = re.split(':|=', item)
-				metadic[kv[0].strip()] = kv[1].strip()
-			continue
-		if line.startswith("Chr"):
-			print "[meta dictionary]"
-			for k,v in metadic.items():
-				print "%s: %s" % (k, v)
-			print
-			continue
-		cells = line.split()
-		Chr = cells[0]
-		Locus = cells[1]
-		cM = cells[2]
-		Mb = cells[3]
-		print len(cells)
-		?
-	return
-
-	# open db
-	host = 'localhost'
-	user = 'webqtl'
-	passwd = 'webqtl'
-	db = 'db_webqtl'
-	con = MySQLdb.Connect(db=db, user=user, passwd=passwd, host=host)
-	cursor = con.cursor()
-	# var
-	speciesid = int(argv[2])
-	inbredsetid = int(argv[3])
-	genofreezeid = int(argv[4])
-	sql = """
-		SELECT Id
-		FROM GenoData
-		ORDER BY Id DESC
-		LIMIT 1
-		"""
-	cursor.execute(sql)
-	results = cursor.fetchall()
-	dataid = results[0][0]
-	print "speciesid: %s"		% (speciesid)
-	print "inbredsetid: %s"		% (inbredsetid)
-	print "genofreezeid: %s"	% (genofreezeid)
-	print "dataid start: %s"		% (dataid+1)
-	# samples
-	line = file_geno.readline()
-	sample_names = line.split()[4:]
-	sample_ids = []
-	print "get %d samples from file:\n%s" % (len(sample_names), sample_names)
-	for sample_name in sample_names:
-		sql = """
-			select Id
-			from Strain
-			where SpeciesId=%s
-			and Name like %s
-			"""
-		cursor.execute(sql, (speciesid, sample_name))
-		results = cursor.fetchall()
-		if results:
-			sample_ids.append(results[0][0])
-		else:
-			print "insert sample %s" % (sample_name)
-			sql = """
-				INSERT INTO Strain
-				SET
-					SpeciesId=%s,
-					Name=%s,
-					Name2=%s
-				"""
-			cursor.execute(sql, (speciesid, sample_name, sample_name))
-			sampleid = con.insert_id()
-			sample_ids.append(sampleid)
-			#
-			sql = """
-				SELECT OrderId
-				FROM StrainXRef
-				where InbredSetId=%s
-				ORDER BY OrderId DESC
-				LIMIT 1 
-				"""
-			cursor.execute(sql, (inbredsetid))
-			results = cursor.fetchall()
-			orderid = results[0][0] + 1
-			#
-			sql = """
-				INSERT INTO StrainXRef
-				SET
-					InbredSetId=%s,
-					StrainId=%s,
-					OrderId=%s,
-					Used_for_mapping=%s
-				"""
-			cursor.execute(sql, (inbredsetid, sampleid, orderid, "N"))
-	print "load %d samples from DB:" % (len(sample_names))
-	for i in range(len(sample_names)):
-		print "%s\t%s" % (sample_names[i], sample_ids[i])
-	# parse geno file
-	index = 0
-	for line in file_geno:
-		index += 1
-		if index % 1000 == 0:
-			print index
-		items = line.split()
-		chr = items[0]
-		name = items[1]
-		cm = items[2]
-		mb = items[3]
-		values = items[4:]
-		# geno
-		sql = """
-			SELECT Id
-			FROM Geno
-			WHERE SpeciesId=%s
-			AND Name like %s
-			"""
-		cursor.execute(sql, (speciesid, name))
-		results = cursor.fetchall()
-		if results:
-			genoid = results[0][0]
-		else:
-			print "insert geno %s" % (name)
-			sql = """
-				INSERT INTO Geno
-				SET
-					SpeciesId=%s,
-					Name=%s,
-					Marker_Name=%s,
-					Chr=%s,
-					Mb=%s
-				"""
-			cursor.execute(sql, (speciesid, name, name, chr, mb))
-			genoid = con.insert_id()
-		# genodata
-		dataid += 1
-		for i in range(len(values)):
-			sample_id = sample_ids[i]
-			try:
-				value = int(values[i])
-			except ValueError:
-				continue
-			if not value in [-1, 0, 1]:
-				print sample_id, value
-				continue
-			sql = """
-				INSERT INTO GenoData
-				SET
-					Id=%s,
-					StrainId=%s,
-					value=%s
-				"""
-			cursor.execute(sql, (dataid, sample_id, value))
-		# genoxref
-		sql = """
-			INSERT INTO GenoXRef
-			SET
-				GenoFreezeId=%s,
-				GenoId=%s,
-				DataId=%s,
-				cM=%s,
-				Used_for_mapping=%s
-			"""
-		cursor.execute(sql, (genofreezeid, genoid, dataid, cm, 'N'))
-	print "Insert %d genoxref" % (index)
-	# close
-	file_geno.close()
-	con.close()
-
-# main
-if __name__ == "__main__":
-	main(sys.argv)
-	print "exit successfully"
diff --git a/wqflask/maintenance/dataset/load_genotypes.py b/wqflask/maintenance/dataset/load_genotypes.py
index fbf6484f..4697382b 100644
--- a/wqflask/maintenance/dataset/load_genotypes.py
+++ b/wqflask/maintenance/dataset/load_genotypes.py
@@ -1,10 +1,3 @@
-#Do whatever else is needed with the Marker object
-#Probably create Genofile object as well
-#Make sure rest of code works with params object (though
-#everything in the params object should probably just be the parameters of
-#the Genofile object)
-
-
 from __future__ import absolute_import, print_function, division
 
 import sys
@@ -14,36 +7,28 @@ import argparse
 import utilities
 import datastructure
 
-def main():
-    parser = argparse.ArgumentParser(description='Load Genotypes')
-    parser.add_argument('-c', '--config')
-    opts = parser.parse_args()
-    config = opts.config
-    # config
-    config = utilities.get_config(config)
-    print("config:")
+def main(argv):
+    config = utilities.get_config(argv[1])
+    print("config file:")
     for item in config.items('config'):
         print("\t", str(item))
-    parse_genofile(fetch_parameters(config))
+    parse_genofile(config, fetch_parameters(config))
 
 def fetch_parameters(config):
-    # variables
-    params = {}
-    params['inbredsetid'] = config.get('config', 'inbredsetid')
-    species = datastructure.get_species(params['inbredsetid'])
-    params["speciesid"] = species[0]
-    genofreeze = datastructure.get_genofreeze_byinbredsetid(params['inbredsetid'])
-    params['genofreezeid'] = genofreeze[0]
-    params['dataid'] = datastructure.get_nextdataid_genotype()
-    params['genofile'] = config.get('config', 'genofile')
-    return params
-    
-def parse_genofile(params):
-    # genofile
-    genofile = open(params['genofile'], 'r')
-    metadic = {}
-    print()
-    # parse genofile
+    config_dic = {}
+    config_dic['inbredsetid'] = config.get('config', 'inbredsetid')
+    config_dic["speciesid"] = datastructure.get_species(config_dic['inbredsetid'])[0]
+    config_dic['genofreezeid'] = datastructure.get_genofreeze_byinbredsetid(config_dic['inbredsetid'])[0]
+    config_dic['dataid'] = datastructure.get_nextdataid_genotype()
+    config_dic['genofile'] = config.get('config', 'genofile')
+    print("config dictionary:")
+    for k, v in config_dic.items():
+        print("\t%s: %s" % (k, v))
+    return config_dic
+
+def parse_genofile(config, config_dic):
+    genofile = open(config_dic['genofile'], 'r')
+    meta_dic = {}
     for line in genofile:
         line = line.strip()
         if len(line) == 0:
@@ -55,40 +40,39 @@ def parse_genofile(params):
             items = line.split(';')
             for item in items:
                 kv = re.split(':|=', item)
-                metadic[kv[0].strip()] = kv[1].strip()
+                meta_dic[kv[0].strip()] = kv[1].strip()
             continue
         if line.lower().startswith("chr"):
             #
-            print("geno file meta:")
-            for k, v in metadic.items():
-                print("\t{}: {}".format(k, v))
+            print("geno file meta dictionary:")
+            for k, v in meta_dic.items():
+                print("\t%s: %s" % (k, v))
             #
-            print("geno file head:\n\t{}\n".format(line))
+            print("geno file head:\n\t%s" % line)
             strainnames = line.split()[4:]
-            strains = datastructure.get_strains_bynames(inbredsetid=inbredsetid, strainnames=strainnames, updatestrainxref="yes")
+            config_dic['strains'] = datastructure.get_strains_bynames(inbredsetid=config_dic['inbredsetid'], strainnames=strainnames, updatestrainxref="yes")
             continue
-        # geno file line
-        marker = Marker(line)
-        #
-        genoid = check_or_insert_geno(params, marker)
-        if check_genoxref(params):
+        # geno file line, marker
+        marker_dic = parse_marker(line)
+        marker_dic['genoid'] = check_or_insert_geno(config_dic, marker_dic)
+        if check_genoxref(config_dic, marker_dic):
             continue
-        insert_genodata(params)
-        insert_genoxref(params)
-        dataid += 1
+        insert_genodata(config, config_dic, marker_dic)
+        insert_genoxref(config_dic, marker_dic)
+        config_dic['dataid'] += 1
     genofile.close()
-    
-    
-class Marker(object):
-    def __init__(self, line):
-        self.cells = line.split()
-        self.chromosome = cells[0]
-        self.locus = cells[1]
-        self.cm = cells[2]
-        self.mb = cells[3]
-        self.values = cells[4:]
+
+def parse_marker(line):
+    marker_dic = {}
+    cells = line.split()
+    marker_dic['chromosome'] = cells[0]
+    marker_dic['locus'] = cells[1]
+    marker_dic['cm'] = cells[2]
+    marker_dic['mb'] = cells[3]
+    marker_dic['values'] = cells[4:]
+    return marker_dic
         
-def check_or_insert_geno(params, marker):
+def check_or_insert_geno(config_dic, marker_dic):
     cursor, con = utilities.get_cursor()
     sql = """
         SELECT Geno.`Id`
@@ -96,7 +80,7 @@ def check_or_insert_geno(params, marker):
         WHERE Geno.`SpeciesId`=%s
         AND Geno.`Name` like %s
         """
-    cursor.execute(sql, (speciesid, locus))
+    cursor.execute(sql, (config_dic["speciesid"], marker_dic['locus']))
     result = cursor.fetchone()
     if result:
         genoid = result[0]
@@ -111,27 +95,29 @@ def check_or_insert_geno(params, marker):
             Geno.`Chr`=%s,
             Geno.`Mb`=%s
             """
-        cursor.execute(sql, (speciesid, locus, locus, chr, mb))
+        cursor.execute(sql, (config_dic['speciesid'], marker_dic['locus'], marker_dic['locus'], marker_dic['chromosome'], marker_dic['mb']))
         rowcount = cursor.rowcount
         genoid = con.insert_id()
         print("INSERT INTO Geno: %d record: %d" % (rowcount, genoid))
     return genoid
 
-def check_GenoXRef():
+def check_genoxref(config_dic, marker_dic):
+    cursor, con = utilities.get_cursor()
     sql = """
         select GenoXRef.*
         from GenoXRef
         where GenoXRef.`GenoFreezeId`=%s
         AND GenoXRef.`GenoId`=%s
         """
-    cursor.execute(sql, (genofreezeid, genoid))
+    cursor.execute(sql, (config_dic['genofreezeid'], marker_dic['genoid']))
     rowcount = cursor.rowcount
     return rowcount
-    
-def insert_genodata():
-    for index, strain in enumerate(strains):
+
+def insert_genodata(config, config_dic, marker_dic):
+    cursor, con = utilities.get_cursor()
+    for index, strain in enumerate(config_dic['strains']):
         strainid = strain[0]
-        value = utilities.to_db_string(values[index], None)
+        value = utilities.to_db_string(marker_dic['values'][index], None)
         if not value:
             continue
         value = config.get('config', "genovalue_" + value)
@@ -148,9 +134,10 @@ def insert_genodata():
             GenoData.`StrainId`=%s,
             GenoData.`value`=%s
             """
-        cursor.execute(sql, (dataid, strainid, number))
+        cursor.execute(sql, (config_dic['dataid'], strainid, number))
 
-def insert_genoxref():
+def insert_genoxref(config_dic, marker_dic):
+    cursor, con = utilities.get_cursor()
     sql = """
         INSERT INTO GenoXRef
         SET
@@ -160,11 +147,11 @@ def insert_genoxref():
         GenoXRef.`cM`=%s,
         GenoXRef.`Used_for_mapping`=%s
         """
-    cursor.execute(sql, (genofreezeid, genoid, dataid, cm, 'N'))
+    cursor.execute(sql, (config_dic['genofreezeid'], marker_dic['genoid'], config_dic['dataid'], marker_dic['cm'], 'N'))
     rowcount = cursor.rowcount
     print("INSERT INTO GenoXRef: %d record" % (rowcount))
 
 if __name__ == "__main__":
     print("command line arguments:\n\t%s" % sys.argv)
-    main()
+    main(sys.argv)
     print("exit successfully")