aboutsummaryrefslogtreecommitdiff
path: root/wqflask/maintenance
diff options
context:
space:
mode:
Diffstat (limited to 'wqflask/maintenance')
-rw-r--r--wqflask/maintenance/dataset/datasampledir/load_genotypes/config.ini4
-rw-r--r--wqflask/maintenance/dataset/datasampledir/load_genotypes/sample.geno14
-rw-r--r--wqflask/maintenance/dataset/load_genotypes.py386
3 files changed, 202 insertions, 202 deletions
diff --git a/wqflask/maintenance/dataset/datasampledir/load_genotypes/config.ini b/wqflask/maintenance/dataset/datasampledir/load_genotypes/config.ini
new file mode 100644
index 00000000..638c3bd8
--- /dev/null
+++ b/wqflask/maintenance/dataset/datasampledir/load_genotypes/config.ini
@@ -0,0 +1,4 @@
+[config]
+inbredsetid = 1
+datafile = datasampledir/load_phenotypes/sample_data.txt
+metafile = datasampledir/load_phenotypes/sample_meta.txt
diff --git a/wqflask/maintenance/dataset/datasampledir/load_genotypes/sample.geno b/wqflask/maintenance/dataset/datasampledir/load_genotypes/sample.geno
new file mode 100644
index 00000000..6dad9ace
--- /dev/null
+++ b/wqflask/maintenance/dataset/datasampledir/load_genotypes/sample.geno
@@ -0,0 +1,14 @@
+@name:BXD
+@type:riset
+@mat:B
+@pat:D
+@het:H
+@unk:U
+Chr Locus cM Mb BXD1 BXD2 BXD5 BXD6 BXD8
+1 rs6269442 0 3.482275 B B D D D
+1 rs6365999 0 4.811062 B B D D D
+1 rs6376963 0.895 5.008089 B B D D D
+1 rs3677817 1.185 5.176058 B B D D D
+1 rs8236463 2.081 5.579193 B B D D D
+1 rs6333200 2.081 6.217921 B B D D D
+1 rs6298633 2.367 6.820241 B B D D D
diff --git a/wqflask/maintenance/dataset/load_genotypes.py b/wqflask/maintenance/dataset/load_genotypes.py
index e4988446..d309a903 100644
--- a/wqflask/maintenance/dataset/load_genotypes.py
+++ b/wqflask/maintenance/dataset/load_genotypes.py
@@ -1,207 +1,189 @@
-# GeneNetwork maintenance script
-# Load genotypes from geno file into database
-
-# Author: Lei Yan
-# Create Date: 2014-01-08
-# Last Update Date: 2014-01-10
-
-# import
-import sys
-import os
-import re
-import MySQLdb
-import ConfigParser
-
def main(argv):
+ # config
+ config = utilities.get_config(argv[1])
+ print "config:"
+ for item in config.items('config'):
+ print "\t%s" % (str(item))
+ # variables
+ genofile = open(config.get('config', 'genofile'), 'r')
+ metadic = {}
+ # parse genofile
+ for line in genofile:
+ line = line.strip()
+ if len(line) == 0:
+ continue
+ if line.startswith('#'):
+ continue
+ if line.startswith('@'):
+ line = line.strip('@')
+ items = line.split(';')
+ for item in items:
+ kv = re.split(':|=', item)
+ metadic[kv[0].strip()] = kv[1].strip()
+ continue
+ if line.startswith("Chr"):
+ print "[meta dictionary]"
+ for k,v in metadic.items():
+ print "%s: %s" % (k, v)
+ print
+ continue
+ cells = line.split()
+ Chr = cells[0]
+ Locus = cells[1]
+ cM = cells[2]
+ Mb = cells[3]
+ print len(cells)
+ ?
+ return
- # load configuration from configuration file
- config = ConfigParser.ConfigParser()
- config.read(argv[1])
- speciesid = config.get('configuration', 'speciesId')
- inbredsetid = config.get('configuration', 'inbredsetid')
- genofreezeid = config.get('configuration', 'genofreezeid')
- genofile = config.get('configuration', 'genofile')
- print "[configuration]\nspeciesid: %s\ninbredsetid: %s\ngenofreezeid: %s\ngenofile: %s\n" % (speciesid, inbredsetid, genofreezeid, genofile)
-
- # variables
- metadic = {}
-
- # parse genofile
- file_geno = open(genofile, 'r')
- for line in file_geno:
- line = line.strip()
- if line.startswith('#'):
- continue
- if line.startswith('@'):
- line = line.strip('@')
- items = line.split(';')
- for item in items:
- kv = re.split(':|=', item)
- metadic[kv[0].strip()] = kv[1].strip()
- continue
- if line.startswith("Chr"):
- print "[meta dictionary]"
- for k,v in metadic.items():
- print "%s: %s" % (k, v)
- print
- continue
- cells = line.split()
- Chr = cells[0]
- Locus = cells[1]
- cM = cells[2]
- Mb = cells[3]
- print len(cells)
- ?
- return
-
- # open db
- host = 'localhost'
- user = 'webqtl'
- passwd = 'webqtl'
- db = 'db_webqtl'
- con = MySQLdb.Connect(db=db, user=user, passwd=passwd, host=host)
- cursor = con.cursor()
- # var
- speciesid = int(argv[2])
- inbredsetid = int(argv[3])
- genofreezeid = int(argv[4])
- sql = """
- SELECT Id
- FROM GenoData
- ORDER BY Id DESC
- LIMIT 1
- """
- cursor.execute(sql)
- results = cursor.fetchall()
- dataid = results[0][0]
- print "speciesid: %s" % (speciesid)
- print "inbredsetid: %s" % (inbredsetid)
- print "genofreezeid: %s" % (genofreezeid)
- print "dataid start: %s" % (dataid+1)
- # samples
- line = file_geno.readline()
- sample_names = line.split()[4:]
- sample_ids = []
- print "get %d samples from file:\n%s" % (len(sample_names), sample_names)
- for sample_name in sample_names:
- sql = """
- select Id
- from Strain
- where SpeciesId=%s
- and Name like %s
- """
- cursor.execute(sql, (speciesid, sample_name))
- results = cursor.fetchall()
- if results:
- sample_ids.append(results[0][0])
- else:
- print "insert sample %s" % (sample_name)
- sql = """
- INSERT INTO Strain
- SET
- SpeciesId=%s,
- Name=%s,
- Name2=%s
- """
- cursor.execute(sql, (speciesid, sample_name, sample_name))
- sampleid = con.insert_id()
- sample_ids.append(sampleid)
- #
- sql = """
- SELECT OrderId
- FROM StrainXRef
- where InbredSetId=%s
- ORDER BY OrderId DESC
- LIMIT 1
- """
- cursor.execute(sql, (inbredsetid))
- results = cursor.fetchall()
- orderid = results[0][0] + 1
- #
- sql = """
- INSERT INTO StrainXRef
- SET
- InbredSetId=%s,
- StrainId=%s,
- OrderId=%s,
- Used_for_mapping=%s
- """
- cursor.execute(sql, (inbredsetid, sampleid, orderid, "N"))
- print "load %d samples from DB:" % (len(sample_names))
- for i in range(len(sample_names)):
- print "%s\t%s" % (sample_names[i], sample_ids[i])
- # parse geno file
- index = 0
- for line in file_geno:
- index += 1
- if index % 1000 == 0:
- print index
- items = line.split()
- chr = items[0]
- name = items[1]
- cm = items[2]
- mb = items[3]
- values = items[4:]
- # geno
- sql = """
- SELECT Id
- FROM Geno
- WHERE SpeciesId=%s
- AND Name like %s
- """
- cursor.execute(sql, (speciesid, name))
- results = cursor.fetchall()
- if results:
- genoid = results[0][0]
- else:
- print "insert geno %s" % (name)
- sql = """
- INSERT INTO Geno
- SET
- SpeciesId=%s,
- Name=%s,
- Marker_Name=%s,
- Chr=%s,
- Mb=%s
- """
- cursor.execute(sql, (speciesid, name, name, chr, mb))
- genoid = con.insert_id()
- # genodata
- dataid += 1
- for i in range(len(values)):
- sample_id = sample_ids[i]
- try:
- value = int(values[i])
- except ValueError:
- continue
- if not value in [-1, 0, 1]:
- print sample_id, value
- continue
- sql = """
- INSERT INTO GenoData
- SET
- Id=%s,
- StrainId=%s,
- value=%s
- """
- cursor.execute(sql, (dataid, sample_id, value))
- # genoxref
- sql = """
- INSERT INTO GenoXRef
- SET
- GenoFreezeId=%s,
- GenoId=%s,
- DataId=%s,
- cM=%s,
- Used_for_mapping=%s
- """
- cursor.execute(sql, (genofreezeid, genoid, dataid, cm, 'N'))
- print "Insert %d genoxref" % (index)
- # close
- file_geno.close()
- con.close()
+ # open db
+ host = 'localhost'
+ user = 'webqtl'
+ passwd = 'webqtl'
+ db = 'db_webqtl'
+ con = MySQLdb.Connect(db=db, user=user, passwd=passwd, host=host)
+ cursor = con.cursor()
+ # var
+ speciesid = int(argv[2])
+ inbredsetid = int(argv[3])
+ genofreezeid = int(argv[4])
+ sql = """
+ SELECT Id
+ FROM GenoData
+ ORDER BY Id DESC
+ LIMIT 1
+ """
+ cursor.execute(sql)
+ results = cursor.fetchall()
+ dataid = results[0][0]
+ print "speciesid: %s" % (speciesid)
+ print "inbredsetid: %s" % (inbredsetid)
+ print "genofreezeid: %s" % (genofreezeid)
+ print "dataid start: %s" % (dataid+1)
+ # samples
+ line = file_geno.readline()
+ sample_names = line.split()[4:]
+ sample_ids = []
+ print "get %d samples from file:\n%s" % (len(sample_names), sample_names)
+ for sample_name in sample_names:
+ sql = """
+ select Id
+ from Strain
+ where SpeciesId=%s
+ and Name like %s
+ """
+ cursor.execute(sql, (speciesid, sample_name))
+ results = cursor.fetchall()
+ if results:
+ sample_ids.append(results[0][0])
+ else:
+ print "insert sample %s" % (sample_name)
+ sql = """
+ INSERT INTO Strain
+ SET
+ SpeciesId=%s,
+ Name=%s,
+ Name2=%s
+ """
+ cursor.execute(sql, (speciesid, sample_name, sample_name))
+ sampleid = con.insert_id()
+ sample_ids.append(sampleid)
+ #
+ sql = """
+ SELECT OrderId
+ FROM StrainXRef
+ where InbredSetId=%s
+ ORDER BY OrderId DESC
+ LIMIT 1
+ """
+ cursor.execute(sql, (inbredsetid))
+ results = cursor.fetchall()
+ orderid = results[0][0] + 1
+ #
+ sql = """
+ INSERT INTO StrainXRef
+ SET
+ InbredSetId=%s,
+ StrainId=%s,
+ OrderId=%s,
+ Used_for_mapping=%s
+ """
+ cursor.execute(sql, (inbredsetid, sampleid, orderid, "N"))
+ print "load %d samples from DB:" % (len(sample_names))
+ for i in range(len(sample_names)):
+ print "%s\t%s" % (sample_names[i], sample_ids[i])
+ # parse geno file
+ index = 0
+ for line in file_geno:
+ index += 1
+ if index % 1000 == 0:
+ print index
+ items = line.split()
+ chr = items[0]
+ name = items[1]
+ cm = items[2]
+ mb = items[3]
+ values = items[4:]
+ # geno
+ sql = """
+ SELECT Id
+ FROM Geno
+ WHERE SpeciesId=%s
+ AND Name like %s
+ """
+ cursor.execute(sql, (speciesid, name))
+ results = cursor.fetchall()
+ if results:
+ genoid = results[0][0]
+ else:
+ print "insert geno %s" % (name)
+ sql = """
+ INSERT INTO Geno
+ SET
+ SpeciesId=%s,
+ Name=%s,
+ Marker_Name=%s,
+ Chr=%s,
+ Mb=%s
+ """
+ cursor.execute(sql, (speciesid, name, name, chr, mb))
+ genoid = con.insert_id()
+ # genodata
+ dataid += 1
+ for i in range(len(values)):
+ sample_id = sample_ids[i]
+ try:
+ value = int(values[i])
+ except ValueError:
+ continue
+ if not value in [-1, 0, 1]:
+ print sample_id, value
+ continue
+ sql = """
+ INSERT INTO GenoData
+ SET
+ Id=%s,
+ StrainId=%s,
+ value=%s
+ """
+ cursor.execute(sql, (dataid, sample_id, value))
+ # genoxref
+ sql = """
+ INSERT INTO GenoXRef
+ SET
+ GenoFreezeId=%s,
+ GenoId=%s,
+ DataId=%s,
+ cM=%s,
+ Used_for_mapping=%s
+ """
+ cursor.execute(sql, (genofreezeid, genoid, dataid, cm, 'N'))
+ print "Insert %d genoxref" % (index)
+ # close
+ file_geno.close()
+ con.close()
# main
if __name__ == "__main__":
- main(sys.argv)
- print "exit successfully"
+ main(sys.argv)
+ print "exit successfully"