about | summary | refs | log | tree | commit | diff
path: root/wqflask/maintenance/dataset
diff options
context:
space:
mode:
author Lei Yan 2014-03-05 21:33:31 +0000
committer Lei Yan 2014-03-05 21:33:31 +0000
commit 8de6fec18cd98a10c58702c448a1e01e147dc5f7 (patch)
tree c2f144d0dcf99c33c80d5231cbd4e725bc83b876 /wqflask/maintenance/dataset
parent 07dceea5c550891c00b33f4d665f1de2ec936fea (diff)
download genenetwork2-8de6fec18cd98a10c58702c448a1e01e147dc5f7.tar.gz
Improved load_genotypes.py
Committer: Lei Yan <lei@penguin.uthsc.edu> On branch master
Diffstat (limited to 'wqflask/maintenance/dataset')
-rw-r--r-- wqflask/maintenance/dataset/datasampledir/load_genotypes/sample.geno 2
-rw-r--r-- wqflask/maintenance/dataset/genotypes_load.py 207
-rw-r--r-- wqflask/maintenance/dataset/load_genotypes.py 127
3 files changed, 58 insertions, 278 deletions
diff --git a/wqflask/maintenance/dataset/datasampledir/load_genotypes/sample.geno b/wqflask/maintenance/dataset/datasampledir/load_genotypes/sample.geno
index a28d31fc..0024ffd1 100644
--- a/wqflask/maintenance/dataset/datasampledir/load_genotypes/sample.geno
+++ b/wqflask/maintenance/dataset/datasampledir/load_genotypes/sample.geno
@@ -9,4 +9,4 @@ Chr Locus cM Mb BXD1 BXD2 BXD5 BXD6 BXD8
2 rs6365999 0.3 4.811062 B B D D D
3 rs6376963 0.895 5.008089 B B D D D
4 rs3677817 1.185 5.176058 B B D D D
-5 rstest8236463 2.081 5.579193 B B D D D
+5 rs8236463 2.081 5.579193 B B D D D
diff --git a/wqflask/maintenance/dataset/genotypes_load.py b/wqflask/maintenance/dataset/genotypes_load.py
deleted file mode 100644
index e4988446..00000000
--- a/wqflask/maintenance/dataset/genotypes_load.py
+++ /dev/null
@@ -1,207 +0,0 @@
-# GeneNetwork maintenance script
-# Load genotypes from geno file into database
-
-# Author: Lei Yan
-# Create Date: 2014-01-08
-# Last Update Date: 2014-01-10
-
-# import
-import sys
-import os
-import re
-import MySQLdb
-import ConfigParser
-
-def main(argv):
-
- # load configuration from configuration file
- config = ConfigParser.ConfigParser()
- config.read(argv[1])
- speciesid = config.get('configuration', 'speciesId')
- inbredsetid = config.get('configuration', 'inbredsetid')
- genofreezeid = config.get('configuration', 'genofreezeid')
- genofile = config.get('configuration', 'genofile')
- print "[configuration]\nspeciesid: %s\ninbredsetid: %s\ngenofreezeid: %s\ngenofile: %s\n" % (speciesid, inbredsetid, genofreezeid, genofile)
-
- # variables
- metadic = {}
-
- # parse genofile
- file_geno = open(genofile, 'r')
- for line in file_geno:
- line = line.strip()
- if line.startswith('#'):
- continue
- if line.startswith('@'):
- line = line.strip('@')
- items = line.split(';')
- for item in items:
- kv = re.split(':|=', item)
- metadic[kv[0].strip()] = kv[1].strip()
- continue
- if line.startswith("Chr"):
- print "[meta dictionary]"
- for k,v in metadic.items():
- print "%s: %s" % (k, v)
- print
- continue
- cells = line.split()
- Chr = cells[0]
- Locus = cells[1]
- cM = cells[2]
- Mb = cells[3]
- print len(cells)
- ?
- return
-
- # open db
- host = 'localhost'
- user = 'webqtl'
- passwd = 'webqtl'
- db = 'db_webqtl'
- con = MySQLdb.Connect(db=db, user=user, passwd=passwd, host=host)
- cursor = con.cursor()
- # var
- speciesid = int(argv[2])
- inbredsetid = int(argv[3])
- genofreezeid = int(argv[4])
- sql = """
- SELECT Id
- FROM GenoData
- ORDER BY Id DESC
- LIMIT 1
- """
- cursor.execute(sql)
- results = cursor.fetchall()
- dataid = results[0][0]
- print "speciesid: %s" % (speciesid)
- print "inbredsetid: %s" % (inbredsetid)
- print "genofreezeid: %s" % (genofreezeid)
- print "dataid start: %s" % (dataid+1)
- # samples
- line = file_geno.readline()
- sample_names = line.split()[4:]
- sample_ids = []
- print "get %d samples from file:\n%s" % (len(sample_names), sample_names)
- for sample_name in sample_names:
- sql = """
- select Id
- from Strain
- where SpeciesId=%s
- and Name like %s
- """
- cursor.execute(sql, (speciesid, sample_name))
- results = cursor.fetchall()
- if results:
- sample_ids.append(results[0][0])
- else:
- print "insert sample %s" % (sample_name)
- sql = """
- INSERT INTO Strain
- SET
- SpeciesId=%s,
- Name=%s,
- Name2=%s
- """
- cursor.execute(sql, (speciesid, sample_name, sample_name))
- sampleid = con.insert_id()
- sample_ids.append(sampleid)
- #
- sql = """
- SELECT OrderId
- FROM StrainXRef
- where InbredSetId=%s
- ORDER BY OrderId DESC
- LIMIT 1
- """
- cursor.execute(sql, (inbredsetid))
- results = cursor.fetchall()
- orderid = results[0][0] + 1
- #
- sql = """
- INSERT INTO StrainXRef
- SET
- InbredSetId=%s,
- StrainId=%s,
- OrderId=%s,
- Used_for_mapping=%s
- """
- cursor.execute(sql, (inbredsetid, sampleid, orderid, "N"))
- print "load %d samples from DB:" % (len(sample_names))
- for i in range(len(sample_names)):
- print "%s\t%s" % (sample_names[i], sample_ids[i])
- # parse geno file
- index = 0
- for line in file_geno:
- index += 1
- if index % 1000 == 0:
- print index
- items = line.split()
- chr = items[0]
- name = items[1]
- cm = items[2]
- mb = items[3]
- values = items[4:]
- # geno
- sql = """
- SELECT Id
- FROM Geno
- WHERE SpeciesId=%s
- AND Name like %s
- """
- cursor.execute(sql, (speciesid, name))
- results = cursor.fetchall()
- if results:
- genoid = results[0][0]
- else:
- print "insert geno %s" % (name)
- sql = """
- INSERT INTO Geno
- SET
- SpeciesId=%s,
- Name=%s,
- Marker_Name=%s,
- Chr=%s,
- Mb=%s
- """
- cursor.execute(sql, (speciesid, name, name, chr, mb))
- genoid = con.insert_id()
- # genodata
- dataid += 1
- for i in range(len(values)):
- sample_id = sample_ids[i]
- try:
- value = int(values[i])
- except ValueError:
- continue
- if not value in [-1, 0, 1]:
- print sample_id, value
- continue
- sql = """
- INSERT INTO GenoData
- SET
- Id=%s,
- StrainId=%s,
- value=%s
- """
- cursor.execute(sql, (dataid, sample_id, value))
- # genoxref
- sql = """
- INSERT INTO GenoXRef
- SET
- GenoFreezeId=%s,
- GenoId=%s,
- DataId=%s,
- cM=%s,
- Used_for_mapping=%s
- """
- cursor.execute(sql, (genofreezeid, genoid, dataid, cm, 'N'))
- print "Insert %d genoxref" % (index)
- # close
- file_geno.close()
- con.close()
-
-# main
-if __name__ == "__main__":
- main(sys.argv)
- print "exit successfully"
diff --git a/wqflask/maintenance/dataset/load_genotypes.py b/wqflask/maintenance/dataset/load_genotypes.py
index fbf6484f..4697382b 100644
--- a/wqflask/maintenance/dataset/load_genotypes.py
+++ b/wqflask/maintenance/dataset/load_genotypes.py
@@ -1,10 +1,3 @@
-#Do whatever else is needed with the Marker object
-#Probably create Genofile object as well
-#Make sure rest of code works with params object (though
-#everything in the params object should probably just be the parameters of
-#the Genofile object)
-
-
from __future__ import absolute_import, print_function, division
import sys
@@ -14,36 +7,28 @@ import argparse
import utilities
import datastructure
-def main():
- parser = argparse.ArgumentParser(description='Load Genotypes')
- parser.add_argument('-c', '--config')
- opts = parser.parse_args()
- config = opts.config
- # config
- config = utilities.get_config(config)
- print("config:")
+def main(argv):
+ config = utilities.get_config(argv[1])
+ print("config file:")
for item in config.items('config'):
print("\t", str(item))
- parse_genofile(fetch_parameters(config))
+ parse_genofile(config, fetch_parameters(config))
def fetch_parameters(config):
- # variables
- params = {}
- params['inbredsetid'] = config.get('config', 'inbredsetid')
- species = datastructure.get_species(params['inbredsetid'])
- params["speciesid"] = species[0]
- genofreeze = datastructure.get_genofreeze_byinbredsetid(params['inbredsetid'])
- params['genofreezeid'] = genofreeze[0]
- params['dataid'] = datastructure.get_nextdataid_genotype()
- params['genofile'] = config.get('config', 'genofile')
- return params
-
-def parse_genofile(params):
- # genofile
- genofile = open(params['genofile'], 'r')
- metadic = {}
- print()
- # parse genofile
+ config_dic = {}
+ config_dic['inbredsetid'] = config.get('config', 'inbredsetid')
+ config_dic["speciesid"] = datastructure.get_species(config_dic['inbredsetid'])[0]
+ config_dic['genofreezeid'] = datastructure.get_genofreeze_byinbredsetid(config_dic['inbredsetid'])[0]
+ config_dic['dataid'] = datastructure.get_nextdataid_genotype()
+ config_dic['genofile'] = config.get('config', 'genofile')
+ print("config dictionary:")
+ for k, v in config_dic.items():
+ print("\t%s: %s" % (k, v))
+ return config_dic
+
+def parse_genofile(config, config_dic):
+ genofile = open(config_dic['genofile'], 'r')
+ meta_dic = {}
for line in genofile:
line = line.strip()
if len(line) == 0:
@@ -55,40 +40,39 @@ def parse_genofile(params):
items = line.split(';')
for item in items:
kv = re.split(':|=', item)
- metadic[kv[0].strip()] = kv[1].strip()
+ meta_dic[kv[0].strip()] = kv[1].strip()
continue
if line.lower().startswith("chr"):
#
- print("geno file meta:")
- for k, v in metadic.items():
- print("\t{}: {}".format(k, v))
+ print("geno file meta dictionary:")
+ for k, v in meta_dic.items():
+ print("\t%s: %s" % (k, v))
#
- print("geno file head:\n\t{}\n".format(line))
+ print("geno file head:\n\t%s" % line)
strainnames = line.split()[4:]
- strains = datastructure.get_strains_bynames(inbredsetid=inbredsetid, strainnames=strainnames, updatestrainxref="yes")
+ config_dic['strains'] = datastructure.get_strains_bynames(inbredsetid=config_dic['inbredsetid'], strainnames=strainnames, updatestrainxref="yes")
continue
- # geno file line
- marker = Marker(line)
- #
- genoid = check_or_insert_geno(params, marker)
- if check_genoxref(params):
+ # geno file line, marker
+ marker_dic = parse_marker(line)
+ marker_dic['genoid'] = check_or_insert_geno(config_dic, marker_dic)
+ if check_genoxref(config_dic, marker_dic):
continue
- insert_genodata(params)
- insert_genoxref(params)
- dataid += 1
+ insert_genodata(config, config_dic, marker_dic)
+ insert_genoxref(config_dic, marker_dic)
+ config_dic['dataid'] += 1
genofile.close()
-
-
-class Marker(object):
- def __init__(self, line):
- self.cells = line.split()
- self.chromosome = cells[0]
- self.locus = cells[1]
- self.cm = cells[2]
- self.mb = cells[3]
- self.values = cells[4:]
+
+def parse_marker(line):
+ marker_dic = {}
+ cells = line.split()
+ marker_dic['chromosome'] = cells[0]
+ marker_dic['locus'] = cells[1]
+ marker_dic['cm'] = cells[2]
+ marker_dic['mb'] = cells[3]
+ marker_dic['values'] = cells[4:]
+ return marker_dic
-def check_or_insert_geno(params, marker):
+def check_or_insert_geno(config_dic, marker_dic):
cursor, con = utilities.get_cursor()
sql = """
SELECT Geno.`Id`
@@ -96,7 +80,7 @@ def check_or_insert_geno(params, marker):
WHERE Geno.`SpeciesId`=%s
AND Geno.`Name` like %s
"""
- cursor.execute(sql, (speciesid, locus))
+ cursor.execute(sql, (config_dic["speciesid"], marker_dic['locus']))
result = cursor.fetchone()
if result:
genoid = result[0]
@@ -111,27 +95,29 @@ def check_or_insert_geno(params, marker):
Geno.`Chr`=%s,
Geno.`Mb`=%s
"""
- cursor.execute(sql, (speciesid, locus, locus, chr, mb))
+ cursor.execute(sql, (config_dic['speciesid'], marker_dic['locus'], marker_dic['locus'], marker_dic['chromosome'], marker_dic['mb']))
rowcount = cursor.rowcount
genoid = con.insert_id()
print("INSERT INTO Geno: %d record: %d" % (rowcount, genoid))
return genoid
-def check_GenoXRef():
+def check_genoxref(config_dic, marker_dic):
+ cursor, con = utilities.get_cursor()
sql = """
select GenoXRef.*
from GenoXRef
where GenoXRef.`GenoFreezeId`=%s
AND GenoXRef.`GenoId`=%s
"""
- cursor.execute(sql, (genofreezeid, genoid))
+ cursor.execute(sql, (config_dic['genofreezeid'], marker_dic['genoid']))
rowcount = cursor.rowcount
return rowcount
-
-def insert_genodata():
- for index, strain in enumerate(strains):
+
+def insert_genodata(config, config_dic, marker_dic):
+ cursor, con = utilities.get_cursor()
+ for index, strain in enumerate(config_dic['strains']):
strainid = strain[0]
- value = utilities.to_db_string(values[index], None)
+ value = utilities.to_db_string(marker_dic['values'][index], None)
if not value:
continue
value = config.get('config', "genovalue_" + value)
@@ -148,9 +134,10 @@ def insert_genodata():
GenoData.`StrainId`=%s,
GenoData.`value`=%s
"""
- cursor.execute(sql, (dataid, strainid, number))
+ cursor.execute(sql, (config_dic['dataid'], strainid, number))
-def insert_genoxref():
+def insert_genoxref(config_dic, marker_dic):
+ cursor, con = utilities.get_cursor()
sql = """
INSERT INTO GenoXRef
SET
@@ -160,11 +147,11 @@ def insert_genoxref():
GenoXRef.`cM`=%s,
GenoXRef.`Used_for_mapping`=%s
"""
- cursor.execute(sql, (genofreezeid, genoid, dataid, cm, 'N'))
+ cursor.execute(sql, (config_dic['genofreezeid'], marker_dic['genoid'], config_dic['dataid'], marker_dic['cm'], 'N'))
rowcount = cursor.rowcount
print("INSERT INTO GenoXRef: %d record" % (rowcount))
if __name__ == "__main__":
print("command line arguments:\n\t%s" % sys.argv)
- main()
+ main(sys.argv)
print("exit successfully")