about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--wqflask/maintenance/dataset/datasampledir/load_genotypes/config.ini4
-rw-r--r--wqflask/maintenance/dataset/datasampledir/load_genotypes/sample.geno14
-rw-r--r--wqflask/maintenance/dataset/load_genotypes.py386
3 files changed, 202 insertions, 202 deletions
diff --git a/wqflask/maintenance/dataset/datasampledir/load_genotypes/config.ini b/wqflask/maintenance/dataset/datasampledir/load_genotypes/config.ini
new file mode 100644
index 00000000..638c3bd8
--- /dev/null
+++ b/wqflask/maintenance/dataset/datasampledir/load_genotypes/config.ini
@@ -0,0 +1,4 @@
+[config]
+inbredsetid = 1
+datafile = datasampledir/load_phenotypes/sample_data.txt
+metafile = datasampledir/load_phenotypes/sample_meta.txt
diff --git a/wqflask/maintenance/dataset/datasampledir/load_genotypes/sample.geno b/wqflask/maintenance/dataset/datasampledir/load_genotypes/sample.geno
new file mode 100644
index 00000000..6dad9ace
--- /dev/null
+++ b/wqflask/maintenance/dataset/datasampledir/load_genotypes/sample.geno
@@ -0,0 +1,14 @@
+@name:BXD

+@type:riset

+@mat:B

+@pat:D

+@het:H

+@unk:U

+Chr	Locus	cM	Mb	BXD1	BXD2	BXD5	BXD6	BXD8

+1	rs6269442	0	3.482275	B	B	D	D	D

+1	rs6365999	0	4.811062	B	B	D	D	D

+1	rs6376963	0.895	5.008089	B	B	D	D	D

+1	rs3677817	1.185	5.176058	B	B	D	D	D

+1	rs8236463	2.081	5.579193	B	B	D	D	D

+1	rs6333200	2.081	6.217921	B	B	D	D	D

+1	rs6298633	2.367	6.820241	B	B	D	D	D

diff --git a/wqflask/maintenance/dataset/load_genotypes.py b/wqflask/maintenance/dataset/load_genotypes.py
index e4988446..d309a903 100644
--- a/wqflask/maintenance/dataset/load_genotypes.py
+++ b/wqflask/maintenance/dataset/load_genotypes.py
@@ -1,207 +1,189 @@
-# GeneNetwork maintenance script
-# Load genotypes from geno file into database
-
-# Author:			Lei Yan
-# Create Date:		2014-01-08
-# Last Update Date:	2014-01-10
-
-# import
-import sys
-import os
-import re
-import MySQLdb
-import ConfigParser
-
 def main(argv):
+    # config
+    config = utilities.get_config(argv[1])
+    print "config:"
+    for item in config.items('config'):
+        print "\t%s" % (str(item))
+    # variables
+    genofile = open(config.get('config', 'genofile'), 'r')
+    metadic = {}
+    # parse genofile
+    for line in genofile:
+        line = line.strip()
+        if len(line) == 0:
+            continue
+        if line.startswith('#'):
+            continue
+        if line.startswith('@'):
+            line = line.strip('@')
+            items = line.split(';')
+            for item in items:
+                kv = re.split(':|=', item)
+                metadic[kv[0].strip()] = kv[1].strip()
+            continue
+        if line.startswith("Chr"):
+            print "[meta dictionary]"
+            for k,v in metadic.items():
+                print "%s: %s" % (k, v)
+            print
+            continue
+        cells = line.split()
+        Chr = cells[0]
+        Locus = cells[1]
+        cM = cells[2]
+        Mb = cells[3]
+        print len(cells)
+        ?
+    return
 
-	# load configuration from configuration file
-	config = ConfigParser.ConfigParser()
-	config.read(argv[1])
-	speciesid = config.get('configuration', 'speciesId')
-	inbredsetid = config.get('configuration', 'inbredsetid')
-	genofreezeid = config.get('configuration', 'genofreezeid')
-	genofile = config.get('configuration', 'genofile')
-	print "[configuration]\nspeciesid: %s\ninbredsetid: %s\ngenofreezeid: %s\ngenofile: %s\n" % (speciesid, inbredsetid, genofreezeid, genofile)
-	
-	# variables
-	metadic = {}
-	
-	# parse genofile
-	file_geno = open(genofile, 'r')
-	for line in file_geno:
-		line = line.strip()
-		if line.startswith('#'):
-			continue
-		if line.startswith('@'):
-			line = line.strip('@')
-			items = line.split(';')
-			for item in items:
-				kv = re.split(':|=', item)
-				metadic[kv[0].strip()] = kv[1].strip()
-			continue
-		if line.startswith("Chr"):
-			print "[meta dictionary]"
-			for k,v in metadic.items():
-				print "%s: %s" % (k, v)
-			print
-			continue
-		cells = line.split()
-		Chr = cells[0]
-		Locus = cells[1]
-		cM = cells[2]
-		Mb = cells[3]
-		print len(cells)
-		?
-	return
-
-	# open db
-	host = 'localhost'
-	user = 'webqtl'
-	passwd = 'webqtl'
-	db = 'db_webqtl'
-	con = MySQLdb.Connect(db=db, user=user, passwd=passwd, host=host)
-	cursor = con.cursor()
-	# var
-	speciesid = int(argv[2])
-	inbredsetid = int(argv[3])
-	genofreezeid = int(argv[4])
-	sql = """
-		SELECT Id
-		FROM GenoData
-		ORDER BY Id DESC
-		LIMIT 1
-		"""
-	cursor.execute(sql)
-	results = cursor.fetchall()
-	dataid = results[0][0]
-	print "speciesid: %s"		% (speciesid)
-	print "inbredsetid: %s"		% (inbredsetid)
-	print "genofreezeid: %s"	% (genofreezeid)
-	print "dataid start: %s"		% (dataid+1)
-	# samples
-	line = file_geno.readline()
-	sample_names = line.split()[4:]
-	sample_ids = []
-	print "get %d samples from file:\n%s" % (len(sample_names), sample_names)
-	for sample_name in sample_names:
-		sql = """
-			select Id
-			from Strain
-			where SpeciesId=%s
-			and Name like %s
-			"""
-		cursor.execute(sql, (speciesid, sample_name))
-		results = cursor.fetchall()
-		if results:
-			sample_ids.append(results[0][0])
-		else:
-			print "insert sample %s" % (sample_name)
-			sql = """
-				INSERT INTO Strain
-				SET
-					SpeciesId=%s,
-					Name=%s,
-					Name2=%s
-				"""
-			cursor.execute(sql, (speciesid, sample_name, sample_name))
-			sampleid = con.insert_id()
-			sample_ids.append(sampleid)
-			#
-			sql = """
-				SELECT OrderId
-				FROM StrainXRef
-				where InbredSetId=%s
-				ORDER BY OrderId DESC
-				LIMIT 1 
-				"""
-			cursor.execute(sql, (inbredsetid))
-			results = cursor.fetchall()
-			orderid = results[0][0] + 1
-			#
-			sql = """
-				INSERT INTO StrainXRef
-				SET
-					InbredSetId=%s,
-					StrainId=%s,
-					OrderId=%s,
-					Used_for_mapping=%s
-				"""
-			cursor.execute(sql, (inbredsetid, sampleid, orderid, "N"))
-	print "load %d samples from DB:" % (len(sample_names))
-	for i in range(len(sample_names)):
-		print "%s\t%s" % (sample_names[i], sample_ids[i])
-	# parse geno file
-	index = 0
-	for line in file_geno:
-		index += 1
-		if index % 1000 == 0:
-			print index
-		items = line.split()
-		chr = items[0]
-		name = items[1]
-		cm = items[2]
-		mb = items[3]
-		values = items[4:]
-		# geno
-		sql = """
-			SELECT Id
-			FROM Geno
-			WHERE SpeciesId=%s
-			AND Name like %s
-			"""
-		cursor.execute(sql, (speciesid, name))
-		results = cursor.fetchall()
-		if results:
-			genoid = results[0][0]
-		else:
-			print "insert geno %s" % (name)
-			sql = """
-				INSERT INTO Geno
-				SET
-					SpeciesId=%s,
-					Name=%s,
-					Marker_Name=%s,
-					Chr=%s,
-					Mb=%s
-				"""
-			cursor.execute(sql, (speciesid, name, name, chr, mb))
-			genoid = con.insert_id()
-		# genodata
-		dataid += 1
-		for i in range(len(values)):
-			sample_id = sample_ids[i]
-			try:
-				value = int(values[i])
-			except ValueError:
-				continue
-			if not value in [-1, 0, 1]:
-				print sample_id, value
-				continue
-			sql = """
-				INSERT INTO GenoData
-				SET
-					Id=%s,
-					StrainId=%s,
-					value=%s
-				"""
-			cursor.execute(sql, (dataid, sample_id, value))
-		# genoxref
-		sql = """
-			INSERT INTO GenoXRef
-			SET
-				GenoFreezeId=%s,
-				GenoId=%s,
-				DataId=%s,
-				cM=%s,
-				Used_for_mapping=%s
-			"""
-		cursor.execute(sql, (genofreezeid, genoid, dataid, cm, 'N'))
-	print "Insert %d genoxref" % (index)
-	# close
-	file_geno.close()
-	con.close()
+    # open db
+    host = 'localhost'
+    user = 'webqtl'
+    passwd = 'webqtl'
+    db = 'db_webqtl'
+    con = MySQLdb.Connect(db=db, user=user, passwd=passwd, host=host)
+    cursor = con.cursor()
+    # var
+    speciesid = int(argv[2])
+    inbredsetid = int(argv[3])
+    genofreezeid = int(argv[4])
+    sql = """
+        SELECT Id
+        FROM GenoData
+        ORDER BY Id DESC
+        LIMIT 1
+        """
+    cursor.execute(sql)
+    results = cursor.fetchall()
+    dataid = results[0][0]
+    print "speciesid: %s"        % (speciesid)
+    print "inbredsetid: %s"        % (inbredsetid)
+    print "genofreezeid: %s"    % (genofreezeid)
+    print "dataid start: %s"        % (dataid+1)
+    # samples
+    line = file_geno.readline()
+    sample_names = line.split()[4:]
+    sample_ids = []
+    print "get %d samples from file:\n%s" % (len(sample_names), sample_names)
+    for sample_name in sample_names:
+        sql = """
+            select Id
+            from Strain
+            where SpeciesId=%s
+            and Name like %s
+            """
+        cursor.execute(sql, (speciesid, sample_name))
+        results = cursor.fetchall()
+        if results:
+            sample_ids.append(results[0][0])
+        else:
+            print "insert sample %s" % (sample_name)
+            sql = """
+                INSERT INTO Strain
+                SET
+                    SpeciesId=%s,
+                    Name=%s,
+                    Name2=%s
+                """
+            cursor.execute(sql, (speciesid, sample_name, sample_name))
+            sampleid = con.insert_id()
+            sample_ids.append(sampleid)
+            #
+            sql = """
+                SELECT OrderId
+                FROM StrainXRef
+                where InbredSetId=%s
+                ORDER BY OrderId DESC
+                LIMIT 1 
+                """
+            cursor.execute(sql, (inbredsetid))
+            results = cursor.fetchall()
+            orderid = results[0][0] + 1
+            #
+            sql = """
+                INSERT INTO StrainXRef
+                SET
+                    InbredSetId=%s,
+                    StrainId=%s,
+                    OrderId=%s,
+                    Used_for_mapping=%s
+                """
+            cursor.execute(sql, (inbredsetid, sampleid, orderid, "N"))
+    print "load %d samples from DB:" % (len(sample_names))
+    for i in range(len(sample_names)):
+        print "%s\t%s" % (sample_names[i], sample_ids[i])
+    # parse geno file
+    index = 0
+    for line in file_geno:
+        index += 1
+        if index % 1000 == 0:
+            print index
+        items = line.split()
+        chr = items[0]
+        name = items[1]
+        cm = items[2]
+        mb = items[3]
+        values = items[4:]
+        # geno
+        sql = """
+            SELECT Id
+            FROM Geno
+            WHERE SpeciesId=%s
+            AND Name like %s
+            """
+        cursor.execute(sql, (speciesid, name))
+        results = cursor.fetchall()
+        if results:
+            genoid = results[0][0]
+        else:
+            print "insert geno %s" % (name)
+            sql = """
+                INSERT INTO Geno
+                SET
+                    SpeciesId=%s,
+                    Name=%s,
+                    Marker_Name=%s,
+                    Chr=%s,
+                    Mb=%s
+                """
+            cursor.execute(sql, (speciesid, name, name, chr, mb))
+            genoid = con.insert_id()
+        # genodata
+        dataid += 1
+        for i in range(len(values)):
+            sample_id = sample_ids[i]
+            try:
+                value = int(values[i])
+            except ValueError:
+                continue
+            if not value in [-1, 0, 1]:
+                print sample_id, value
+                continue
+            sql = """
+                INSERT INTO GenoData
+                SET
+                    Id=%s,
+                    StrainId=%s,
+                    value=%s
+                """
+            cursor.execute(sql, (dataid, sample_id, value))
+        # genoxref
+        sql = """
+            INSERT INTO GenoXRef
+            SET
+                GenoFreezeId=%s,
+                GenoId=%s,
+                DataId=%s,
+                cM=%s,
+                Used_for_mapping=%s
+            """
+        cursor.execute(sql, (genofreezeid, genoid, dataid, cm, 'N'))
+    print "Insert %d genoxref" % (index)
+    # close
+    file_geno.close()
+    con.close()
 
 # main
 if __name__ == "__main__":
-	main(sys.argv)
-	print "exit successfully"
+    main(sys.argv)
+    print "exit successfully"