6 files changed, 271 insertions, 0 deletions
diff --git a/wqflask/maintenance/dataset/genotypes/code/genotypes_load.py b/wqflask/maintenance/dataset/genotypes/code/genotypes_load.py
new file mode 100644
index 00000000..e4988446
--- /dev/null
+++ b/wqflask/maintenance/dataset/genotypes/code/genotypes_load.py
@@ -0,0 +1,207 @@
+# GeneNetwork maintenance script
+# Load genotypes from geno file into database
+
+# Author:			Lei Yan
+# Create Date:		2014-01-08
+# Last Update Date:	2014-01-10
+
+# import
+import sys
+import os
+import re
+import MySQLdb
+import ConfigParser
+
+def main(argv):
+	# Script entry point: argv[1] is the path of an INI file providing a [configuration] section.
+	# read speciesId, inbredsetid, genofreezeid and genofile from that section and echo them
+	config = ConfigParser.ConfigParser()
+	config.read(argv[1])
+	speciesid = config.get('configuration', 'speciesId')
+	inbredsetid = config.get('configuration', 'inbredsetid')
+	genofreezeid = config.get('configuration', 'genofreezeid')
+	genofile = config.get('configuration', 'genofile')
+	print "[configuration]\nspeciesid: %s\ninbredsetid: %s\ngenofreezeid: %s\ngenofile: %s\n" % (speciesid, inbredsetid, genofreezeid, genofile)
+	
+	# metadic collects the key/value pairs found on the '@' lines of the geno file
+	metadic = {}
+	
+	# parse genofile line by line
+	file_geno = open(genofile, 'r')
+	for line in file_geno:
+		line = line.strip()
+		if line.startswith('#'):	# '#' lines are skipped
+			continue
+		if line.startswith('@'):	# '@' lines hold "key:value" or "key=value" items separated by ';'
+			line = line.strip('@')
+			items = line.split(';')
+			for item in items:
+				kv = re.split(':|=', item)
+				metadic[kv[0].strip()] = kv[1].strip()	# kv[1] raises IndexError for an item with neither ':' nor '='
+			continue
+		if line.startswith("Chr"):	# column-header line: dump the metadata collected so far
+			print "[meta dictionary]"
+			for k,v in metadic.items():
+				print "%s: %s" % (k, v)
+			print
+			continue
+		cells = line.split()	# data line: whitespace-separated columns, the first four are named below
+		Chr = cells[0]
+		Locus = cells[1]
+		cM = cells[2]
+		Mb = cells[3]
+		print len(cells)
+		?	# NOTE(review): a bare '?' is not valid Python syntax; looks like an unfinished placeholder - confirm intent
+	return	# NOTE(review): unconditional return - nothing below this line in main() is ever executed
+
+	# open db (connection parameters are hard-coded)
+	host = 'localhost'
+	user = 'webqtl'
+	passwd = 'webqtl'
+	db = 'db_webqtl'
+	con = MySQLdb.Connect(db=db, user=user, passwd=passwd, host=host)
+	cursor = con.cursor()
+	# var: the ids are re-read from argv[2..4] as ints, replacing the values taken from the config file above
+	speciesid = int(argv[2])
+	inbredsetid = int(argv[3])
+	genofreezeid = int(argv[4])
+	sql = """
+		SELECT Id
+		FROM GenoData
+		ORDER BY Id DESC
+		LIMIT 1
+		"""
+	cursor.execute(sql)
+	results = cursor.fetchall()
+	dataid = results[0][0]	# highest existing GenoData.Id (ORDER BY Id DESC LIMIT 1); results[0] assumes at least one row
+	print "speciesid: %s"		% (speciesid)
+	print "inbredsetid: %s"		% (inbredsetid)
+	print "genofreezeid: %s"	% (genofreezeid)
+	print "dataid start: %s"		% (dataid+1)
+	# samples: columns after the first four of the next line read are the sample names (NOTE(review): the loop above already iterates file_geno - confirm)
+	line = file_geno.readline()
+	sample_names = line.split()[4:]
+	sample_ids = []
+	print "get %d samples from file:\n%s" % (len(sample_names), sample_names)
+	for sample_name in sample_names:
+		sql = """
+			select Id
+			from Strain
+			where SpeciesId=%s
+			and Name like %s
+			"""
+		cursor.execute(sql, (speciesid, sample_name))	# the name is passed to LIKE unchanged; this code adds no wildcard
+		results = cursor.fetchall()
+		if results:
+			sample_ids.append(results[0][0])	# strain already known: reuse the first matching Id
+		else:
+			print "insert sample %s" % (sample_name)
+			sql = """
+				INSERT INTO Strain
+				SET
+					SpeciesId=%s,
+					Name=%s,
+					Name2=%s
+				"""
+			cursor.execute(sql, (speciesid, sample_name, sample_name))
+			sampleid = con.insert_id()	# presumably the Id generated by the INSERT above (MySQLdb insert_id) - verify
+			sample_ids.append(sampleid)
+			# next OrderId = current highest OrderId of this inbred set + 1
+			sql = """
+				SELECT OrderId
+				FROM StrainXRef
+				where InbredSetId=%s
+				ORDER BY OrderId DESC
+				LIMIT 1 
+				"""
+			cursor.execute(sql, (inbredsetid))	# NOTE(review): (inbredsetid) is a plain value, not a 1-tuple
+			results = cursor.fetchall()
+			orderid = results[0][0] + 1	# results[0] assumes the inbred set already has a StrainXRef row
+			# link the new strain to the inbred set
+			sql = """
+				INSERT INTO StrainXRef
+				SET
+					InbredSetId=%s,
+					StrainId=%s,
+					OrderId=%s,
+					Used_for_mapping=%s
+				"""
+			cursor.execute(sql, (inbredsetid, sampleid, orderid, "N"))
+	print "load %d samples from DB:" % (len(sample_names))
+	for i in range(len(sample_names)):
+		print "%s\t%s" % (sample_names[i], sample_ids[i])
+	# parse geno file: each remaining line is handled as one marker (chr, name, cM, Mb, then one value per sample)
+	index = 0
+	for line in file_geno:
+		index += 1
+		if index % 1000 == 0:	# progress output every 1000 lines
+			print index
+		items = line.split()
+		chr = items[0]
+		name = items[1]
+		cm = items[2]
+		mb = items[3]
+		values = items[4:]
+		# geno: look the marker up by name within the species, insert it if no row matches
+		sql = """
+			SELECT Id
+			FROM Geno
+			WHERE SpeciesId=%s
+			AND Name like %s
+			"""
+		cursor.execute(sql, (speciesid, name))
+		results = cursor.fetchall()
+		if results:
+			genoid = results[0][0]
+		else:
+			print "insert geno %s" % (name)
+			sql = """
+				INSERT INTO Geno
+				SET
+					SpeciesId=%s,
+					Name=%s,
+					Marker_Name=%s,
+					Chr=%s,
+					Mb=%s
+				"""
+			cursor.execute(sql, (speciesid, name, name, chr, mb))
+			genoid = con.insert_id()
+		# genodata: one new Id per marker line, shared by the rows inserted for all of its samples
+		dataid += 1
+		for i in range(len(values)):
+			sample_id = sample_ids[i]	# values are matched to samples by column position
+			try:
+				value = int(values[i])
+			except ValueError:	# cells that are not integers are skipped
+				continue
+			if not value in [-1, 0, 1]:	# only -1, 0 and 1 are stored; other integers are printed and skipped
+				print sample_id, value
+				continue
+			sql = """
+				INSERT INTO GenoData
+				SET
+					Id=%s,
+					StrainId=%s,
+					value=%s
+				"""
+			cursor.execute(sql, (dataid, sample_id, value))
+		# genoxref: tie the marker, its data Id and its cM value to the geno freeze
+		sql = """
+			INSERT INTO GenoXRef
+			SET
+				GenoFreezeId=%s,
+				GenoId=%s,
+				DataId=%s,
+				cM=%s,
+				Used_for_mapping=%s
+			"""
+		cursor.execute(sql, (genofreezeid, genoid, dataid, cm, 'N'))
+	print "Insert %d genoxref" % (index)
+	# close (NOTE(review): no con.commit() call is visible before closing - confirm the inserts are persisted)
+	file_geno.close()
+	con.close()
+
+# main: runs only when the file is executed as a script; sys.argv is passed through unchanged
+if __name__ == "__main__":
+	main(sys.argv)
+	print "exit successfully"	# printed whenever main() returns without raising
diff --git a/wqflask/maintenance/dataset/special/correlation/about.txt b/wqflask/maintenance/dataset/special/correlation/about.txt
new file mode 100644
index 00000000..a12f8c47
--- /dev/null
+++ b/wqflask/maintenance/dataset/special/correlation/about.txt
@@ -0,0 +1,3 @@
+BXD
+genotype, phenotype, mRNA expression
+correlation
+\ No newline at end of file
diff --git a/wqflask/maintenance/dataset/special/correlation/conf.ini b/wqflask/maintenance/dataset/special/correlation/conf.ini
new file mode 100644
index 00000000..9c23bb45
--- /dev/null
+++ b/wqflask/maintenance/dataset/special/correlation/conf.ini
@@ -0,0 +1,2 @@
+[configuration]
+genofile = /home/leiyan/gn/web/genotypes/BXD.geno
diff --git a/wqflask/maintenance/dataset/special/correlation/correlations.py b/wqflask/maintenance/dataset/special/correlation/correlations.py
new file mode 100644
index 00000000..b089e446
--- /dev/null
+++ b/wqflask/maintenance/dataset/special/correlation/correlations.py
@@ -0,0 +1,47 @@
+# Author:               Lei Yan
+# Create Date:          2014-01-21
+# Last Update Date:     2014-01-24
+
+# import
+import sys
+import os
+import re
+import MySQLdb
+import ConfigParser
+
+def main(argv):
+
+    # load configuration from configuration file
+    config = ConfigParser.ConfigParser()
+    config.read(argv[1])
+    genofile = config.get('configuration', 'genofile')
+
+    # parse genofile
+    genotypes = []
+    file_geno = open(genofile, 'r')
+    for line in file_geno:
+        line = line.strip()
+        if line.startswith('#'):
+            continue
+        if line.startswith('@'):
+            continue
+        cells = line.split()
+        if line.startswith("Chr"):
+            strains = cells[4:]
+            continue
+        genotype = {}
+        genotype['chr'] = cells[0]
+        genotype['locus'] = cells[1]
+        genotype['cm'] = cells[2]
+        genotype['mb'] = cells[3]
+        genotype['values'] = cells[4:]
+        genotypes.append(genotype)
+    print "get %d strains:\t%s" % (len(strains), strains)
+    print "load %d genotypes" % len(genotypes)
+
+    # phenotypes
+
+# main: runs only when the file is executed as a script; sys.argv is passed through unchanged
+if __name__ == "__main__":
+    main(sys.argv)
+    print "exit successfully"  # printed whenever main() returns without raising
diff --git a/wqflask/maintenance/dataset/special/correlation/run.sh b/wqflask/maintenance/dataset/special/correlation/run.sh
new file mode 100644
index 00000000..eccfa507
--- /dev/null
+++ b/wqflask/maintenance/dataset/special/correlation/run.sh
@@ -0,0 +1 @@
+/usr/bin/python correlations.py conf.ini # both paths are relative: run from this directory
+\ No newline at end of file
diff --git a/wqflask/maintenance/dataset/utilities/fetch.py b/wqflask/maintenance/dataset/utilities/fetch.py
new file mode 100644
index 00000000..fcb2d2d8
--- /dev/null
+++ b/wqflask/maintenance/dataset/utilities/fetch.py
@@ -0,0 +1,11 @@
+import sys
+
+inputfile = open(sys.argv[1], 'r')
+
+for line in inputfile:
+	cells = line.split()
+	#print cells[int(sys.argv[2])]
+	i = len(cells)
+	print i
+
+inputfile.close()