From d6543befcdaf4786f42c10c1cdd0cb473a8d8453 Mon Sep 17 00:00:00 2001 From: Lei Yan Date: Fri, 31 Jan 2014 16:12:36 -0600 Subject: On branch master --- wqflask/maintenance/dataset/__init__.py | 0 .../maintenance/dataset/calculation/__init__.py | 0 .../dataset/calculation/correlation/__init__.py | 0 .../dataset/calculation/correlation/about.txt | 3 + .../dataset/calculation/correlation/conf.ini | 2 + .../calculation/correlation/correlations.py | 47 +++++ .../dataset/calculation/correlation/run.sh | 1 + .../maintenance/dataset/datastructure/__init__.py | 0 .../dataset/datastructure/datastructure.py | 20 ++ wqflask/maintenance/dataset/genotypes/__init__.py | 0 .../dataset/genotypes/code/genotypes_load.py | 207 --------------------- .../dataset/genotypes/genotypes_load.py | 207 +++++++++++++++++++++ wqflask/maintenance/dataset/phenotypes/__init__.py | 0 wqflask/maintenance/dataset/probesets/__init__.py | 0 .../dataset/special/correlation/about.txt | 3 - .../dataset/special/correlation/conf.ini | 2 - .../dataset/special/correlation/correlations.py | 47 ----- .../maintenance/dataset/special/correlation/run.sh | 1 - wqflask/maintenance/dataset/utilities/__init__.py | 0 wqflask/maintenance/dataset/utilities/db.py | 10 + wqflask/maintenance/dataset/utilities/test.py | 3 + 21 files changed, 293 insertions(+), 260 deletions(-) create mode 100644 wqflask/maintenance/dataset/__init__.py create mode 100644 wqflask/maintenance/dataset/calculation/__init__.py create mode 100644 wqflask/maintenance/dataset/calculation/correlation/__init__.py create mode 100644 wqflask/maintenance/dataset/calculation/correlation/about.txt create mode 100644 wqflask/maintenance/dataset/calculation/correlation/conf.ini create mode 100644 wqflask/maintenance/dataset/calculation/correlation/correlations.py create mode 100644 wqflask/maintenance/dataset/calculation/correlation/run.sh create mode 100644 wqflask/maintenance/dataset/datastructure/__init__.py create mode 100644 wqflask/maintenance/dataset/datastructure/datastructure.py create mode 100644 wqflask/maintenance/dataset/genotypes/__init__.py delete mode 100644 wqflask/maintenance/dataset/genotypes/code/genotypes_load.py create mode 100644 wqflask/maintenance/dataset/genotypes/genotypes_load.py create mode 100644 wqflask/maintenance/dataset/phenotypes/__init__.py create mode 100644 wqflask/maintenance/dataset/probesets/__init__.py delete mode 100644 wqflask/maintenance/dataset/special/correlation/about.txt delete mode 100644 wqflask/maintenance/dataset/special/correlation/conf.ini delete mode 100644 wqflask/maintenance/dataset/special/correlation/correlations.py delete mode 100644 wqflask/maintenance/dataset/special/correlation/run.sh create mode 100644 wqflask/maintenance/dataset/utilities/__init__.py create mode 100644 wqflask/maintenance/dataset/utilities/db.py create mode 100644 wqflask/maintenance/dataset/utilities/test.py diff --git a/wqflask/maintenance/dataset/__init__.py b/wqflask/maintenance/dataset/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/wqflask/maintenance/dataset/calculation/__init__.py b/wqflask/maintenance/dataset/calculation/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/wqflask/maintenance/dataset/calculation/correlation/__init__.py b/wqflask/maintenance/dataset/calculation/correlation/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/wqflask/maintenance/dataset/calculation/correlation/about.txt b/wqflask/maintenance/dataset/calculation/correlation/about.txt new file mode 100644 index 00000000..a12f8c47 --- /dev/null +++ b/wqflask/maintenance/dataset/calculation/correlation/about.txt @@ -0,0 +1,3 @@ +BXD +genotype, phenotype, mRNA expression +correlation \ No newline at end of file diff --git a/wqflask/maintenance/dataset/calculation/correlation/conf.ini b/wqflask/maintenance/dataset/calculation/correlation/conf.ini new file mode 100644 index 00000000..9c23bb45 --- /dev/null +++ b/wqflask/maintenance/dataset/calculation/correlation/conf.ini @@ -0,0 +1,2 @@ +[configuration] +genofile = /home/leiyan/gn/web/genotypes/BXD.geno diff --git a/wqflask/maintenance/dataset/calculation/correlation/correlations.py b/wqflask/maintenance/dataset/calculation/correlation/correlations.py new file mode 100644 index 00000000..b089e446 --- /dev/null +++ b/wqflask/maintenance/dataset/calculation/correlation/correlations.py @@ -0,0 +1,47 @@ +# Author: Lei Yan +# Create Date: 2014-01-21 +# Last Update Date: 2014-01-24 + +# import +import sys +import os +import re +import MySQLdb +import ConfigParser + +def main(argv): + + # load configuration from configuration file + config = ConfigParser.ConfigParser() + config.read(argv[1]) + genofile = config.get('configuration', 'genofile') + + # parse genofile + genotypes = [] + file_geno = open(genofile, 'r') + for line in file_geno: + line = line.strip() + if line.startswith('#'): + continue + if line.startswith('@'): + continue + cells = line.split() + if line.startswith("Chr"): + strains = cells[4:] + continue + genotype = {} + genotype['chr'] = cells[0] + genotype['locus'] = cells[1] + genotype['cm'] = cells[2] + genotype['mb'] = cells[3] + genotype['values'] = cells[4:] + genotypes.append(genotype) + print "get %d strains:\t%s" % (len(strains), strains) + print "load %d genotypes" % len(genotypes) + + # phenotypes + +# main +if __name__ == "__main__": + main(sys.argv) + print "exit successfully" diff --git a/wqflask/maintenance/dataset/calculation/correlation/run.sh b/wqflask/maintenance/dataset/calculation/correlation/run.sh new file mode 100644 index 00000000..eccfa507 --- /dev/null +++ b/wqflask/maintenance/dataset/calculation/correlation/run.sh @@ -0,0 +1 @@ +/usr/bin/python correlations.py conf.ini \ No newline at end of file diff --git a/wqflask/maintenance/dataset/datastructure/__init__.py b/wqflask/maintenance/dataset/datastructure/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/wqflask/maintenance/dataset/datastructure/datastructure.py b/wqflask/maintenance/dataset/datastructure/datastructure.py new file mode 100644 index 00000000..ed429cb4 --- /dev/null +++ b/wqflask/maintenance/dataset/datastructure/datastructure.py @@ -0,0 +1,20 @@ +import sys +sys.path.append('.') +sys.path.append('..') + +from utilities import db + +def get_type(inbredsetid): + cursor=db.get_cursor() + sql = """ + SELECT ProbeSetFreeze.`Id`, ProbeSetFreeze.`Name`, ProbeSetFreeze.`FullName` + FROM ProbeSetFreeze, ProbeFreeze + WHERE ProbeSetFreeze.`ProbeFreezeId`=ProbeFreeze.`Id` + AND ProbeFreeze.`InbredSetId`=%s + """ + cursor.execute(sql, (inbredsetid)) + return cursor.fetchall() + + +print get_type() + \ No newline at end of file diff --git a/wqflask/maintenance/dataset/genotypes/__init__.py b/wqflask/maintenance/dataset/genotypes/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/wqflask/maintenance/dataset/genotypes/code/genotypes_load.py b/wqflask/maintenance/dataset/genotypes/code/genotypes_load.py deleted file mode 100644 index e4988446..00000000 --- a/wqflask/maintenance/dataset/genotypes/code/genotypes_load.py +++ /dev/null @@ -1,207 +0,0 @@ -# GeneNetwork maintenance script -# Load genotypes from geno file into database - -# Author: Lei Yan -# Create Date: 2014-01-08 -# Last Update Date: 2014-01-10 - -# import -import sys -import os -import re -import MySQLdb -import ConfigParser - -def main(argv): - - # load configuration from configuration file - config = ConfigParser.ConfigParser() - config.read(argv[1]) - speciesid = config.get('configuration', 'speciesId') - inbredsetid = config.get('configuration', 'inbredsetid') - genofreezeid = config.get('configuration', 'genofreezeid') - genofile = config.get('configuration', 'genofile') - print "[configuration]\nspeciesid: %s\ninbredsetid: %s\ngenofreezeid: %s\ngenofile: %s\n" % (speciesid, inbredsetid, genofreezeid, genofile) - - # variables - metadic = {} - - # parse genofile - file_geno = open(genofile, 'r') - for line in file_geno: - line = line.strip() - if line.startswith('#'): - continue - if line.startswith('@'): - line = line.strip('@') - items = line.split(';') - for item in items: - kv = re.split(':|=', item) - metadic[kv[0].strip()] = kv[1].strip() - continue - if line.startswith("Chr"): - print "[meta dictionary]" - for k,v in metadic.items(): - print "%s: %s" % (k, v) - print - continue - cells = line.split() - Chr = cells[0] - Locus = cells[1] - cM = cells[2] - Mb = cells[3] - print len(cells) - ? - return - - # open db - host = 'localhost' - user = 'webqtl' - passwd = 'webqtl' - db = 'db_webqtl' - con = MySQLdb.Connect(db=db, user=user, passwd=passwd, host=host) - cursor = con.cursor() - # var - speciesid = int(argv[2]) - inbredsetid = int(argv[3]) - genofreezeid = int(argv[4]) - sql = """ - SELECT Id - FROM GenoData - ORDER BY Id DESC - LIMIT 1 - """ - cursor.execute(sql) - results = cursor.fetchall() - dataid = results[0][0] - print "speciesid: %s" % (speciesid) - print "inbredsetid: %s" % (inbredsetid) - print "genofreezeid: %s" % (genofreezeid) - print "dataid start: %s" % (dataid+1) - # samples - line = file_geno.readline() - sample_names = line.split()[4:] - sample_ids = [] - print "get %d samples from file:\n%s" % (len(sample_names), sample_names) - for sample_name in sample_names: - sql = """ - select Id - from Strain - where SpeciesId=%s - and Name like %s - """ - cursor.execute(sql, (speciesid, sample_name)) - results = cursor.fetchall() - if results: - sample_ids.append(results[0][0]) - else: - print "insert sample %s" % (sample_name) - sql = """ - INSERT INTO Strain - SET - SpeciesId=%s, - Name=%s, - Name2=%s - """ - cursor.execute(sql, (speciesid, sample_name, sample_name)) - sampleid = con.insert_id() - sample_ids.append(sampleid) - # - sql = """ - SELECT OrderId - FROM StrainXRef - where InbredSetId=%s - ORDER BY OrderId DESC - LIMIT 1 - """ - cursor.execute(sql, (inbredsetid)) - results = cursor.fetchall() - orderid = results[0][0] + 1 - # - sql = """ - INSERT INTO StrainXRef - SET - InbredSetId=%s, - StrainId=%s, - OrderId=%s, - Used_for_mapping=%s - """ - cursor.execute(sql, (inbredsetid, sampleid, orderid, "N")) - print "load %d samples from DB:" % (len(sample_names)) - for i in range(len(sample_names)): - print "%s\t%s" % (sample_names[i], sample_ids[i]) - # parse geno file - index = 0 - for line in file_geno: - index += 1 - if index % 1000 == 0: - print index - items = line.split() - chr = items[0] - name = items[1] - cm = items[2] - mb = items[3] - values = items[4:] - # geno - sql = """ - SELECT Id - FROM Geno - WHERE SpeciesId=%s - AND Name like %s - """ - cursor.execute(sql, (speciesid, name)) - results = cursor.fetchall() - if results: - genoid = results[0][0] - else: - print "insert geno %s" % (name) - sql = """ - INSERT INTO Geno - SET - SpeciesId=%s, - Name=%s, - Marker_Name=%s, - Chr=%s, - Mb=%s - """ - cursor.execute(sql, (speciesid, name, name, chr, mb)) - genoid = con.insert_id() - # genodata - dataid += 1 - for i in range(len(values)): - sample_id = sample_ids[i] - try: - value = int(values[i]) - except ValueError: - continue - if not value in [-1, 0, 1]: - print sample_id, value - continue - sql = """ - INSERT INTO GenoData - SET - Id=%s, - StrainId=%s, - value=%s - """ - cursor.execute(sql, (dataid, sample_id, value)) - # genoxref - sql = """ - INSERT INTO GenoXRef - SET - GenoFreezeId=%s, - GenoId=%s, - DataId=%s, - cM=%s, - Used_for_mapping=%s - """ - cursor.execute(sql, (genofreezeid, genoid, dataid, cm, 'N')) - print "Insert %d genoxref" % (index) - # close - file_geno.close() - con.close() - -# main -if __name__ == "__main__": - main(sys.argv) - print "exit successfully" diff --git a/wqflask/maintenance/dataset/genotypes/genotypes_load.py b/wqflask/maintenance/dataset/genotypes/genotypes_load.py new file mode 100644 index 00000000..e4988446 --- /dev/null +++ b/wqflask/maintenance/dataset/genotypes/genotypes_load.py @@ -0,0 +1,207 @@ +# GeneNetwork maintenance script +# Load genotypes from geno file into database + +# Author: Lei Yan +# Create Date: 2014-01-08 +# Last Update Date: 2014-01-10 + +# import +import sys +import os +import re +import MySQLdb +import ConfigParser + +def main(argv): + + # load configuration from configuration file + config = ConfigParser.ConfigParser() + config.read(argv[1]) + speciesid = config.get('configuration', 'speciesId') + inbredsetid = config.get('configuration', 'inbredsetid') + genofreezeid = config.get('configuration', 'genofreezeid') + genofile = config.get('configuration', 'genofile') + print "[configuration]\nspeciesid: %s\ninbredsetid: %s\ngenofreezeid: %s\ngenofile: %s\n" % (speciesid, inbredsetid, genofreezeid, genofile) + + # variables + metadic = {} + + # parse genofile + file_geno = open(genofile, 'r') + for line in file_geno: + line = line.strip() + if line.startswith('#'): + continue + if line.startswith('@'): + line = line.strip('@') + items = line.split(';') + for item in items: + kv = re.split(':|=', item) + metadic[kv[0].strip()] = kv[1].strip() + continue + if line.startswith("Chr"): + print "[meta dictionary]" + for k,v in metadic.items(): + print "%s: %s" % (k, v) + print + continue + cells = line.split() + Chr = cells[0] + Locus = cells[1] + cM = cells[2] + Mb = cells[3] + print len(cells) + ? + return + + # open db + host = 'localhost' + user = 'webqtl' + passwd = 'webqtl' + db = 'db_webqtl' + con = MySQLdb.Connect(db=db, user=user, passwd=passwd, host=host) + cursor = con.cursor() + # var + speciesid = int(argv[2]) + inbredsetid = int(argv[3]) + genofreezeid = int(argv[4]) + sql = """ + SELECT Id + FROM GenoData + ORDER BY Id DESC + LIMIT 1 + """ + cursor.execute(sql) + results = cursor.fetchall() + dataid = results[0][0] + print "speciesid: %s" % (speciesid) + print "inbredsetid: %s" % (inbredsetid) + print "genofreezeid: %s" % (genofreezeid) + print "dataid start: %s" % (dataid+1) + # samples + line = file_geno.readline() + sample_names = line.split()[4:] + sample_ids = [] + print "get %d samples from file:\n%s" % (len(sample_names), sample_names) + for sample_name in sample_names: + sql = """ + select Id + from Strain + where SpeciesId=%s + and Name like %s + """ + cursor.execute(sql, (speciesid, sample_name)) + results = cursor.fetchall() + if results: + sample_ids.append(results[0][0]) + else: + print "insert sample %s" % (sample_name) + sql = """ + INSERT INTO Strain + SET + SpeciesId=%s, + Name=%s, + Name2=%s + """ + cursor.execute(sql, (speciesid, sample_name, sample_name)) + sampleid = con.insert_id() + sample_ids.append(sampleid) + # + sql = """ + SELECT OrderId + FROM StrainXRef + where InbredSetId=%s + ORDER BY OrderId DESC + LIMIT 1 + """ + cursor.execute(sql, (inbredsetid)) + results = cursor.fetchall() + orderid = results[0][0] + 1 + # + sql = """ + INSERT INTO StrainXRef + SET + InbredSetId=%s, + StrainId=%s, + OrderId=%s, + Used_for_mapping=%s + """ + cursor.execute(sql, (inbredsetid, sampleid, orderid, "N")) + print "load %d samples from DB:" % (len(sample_names)) + for i in range(len(sample_names)): + print "%s\t%s" % (sample_names[i], sample_ids[i]) + # parse geno file + index = 0 + for line in file_geno: + index += 1 + if index % 1000 == 0: + print index + items = line.split() + chr = items[0] + name = items[1] + cm = items[2] + mb = items[3] + values = items[4:] + # geno + sql = """ + SELECT Id + FROM Geno + WHERE SpeciesId=%s + AND Name like %s + """ + cursor.execute(sql, (speciesid, name)) + results = cursor.fetchall() + if results: + genoid = results[0][0] + else: + print "insert geno %s" % (name) + sql = """ + INSERT INTO Geno + SET + SpeciesId=%s, + Name=%s, + Marker_Name=%s, + Chr=%s, + Mb=%s + """ + cursor.execute(sql, (speciesid, name, name, chr, mb)) + genoid = con.insert_id() + # genodata + dataid += 1 + for i in range(len(values)): + sample_id = sample_ids[i] + try: + value = int(values[i]) + except ValueError: + continue + if not value in [-1, 0, 1]: + print sample_id, value + continue + sql = """ + INSERT INTO GenoData + SET + Id=%s, + StrainId=%s, + value=%s + """ + cursor.execute(sql, (dataid, sample_id, value)) + # genoxref + sql = """ + INSERT INTO GenoXRef + SET + GenoFreezeId=%s, + GenoId=%s, + DataId=%s, + cM=%s, + Used_for_mapping=%s + """ + cursor.execute(sql, (genofreezeid, genoid, dataid, cm, 'N')) + print "Insert %d genoxref" % (index) + # close + file_geno.close() + con.close() + +# main +if __name__ == "__main__": + main(sys.argv) + print "exit successfully" diff --git a/wqflask/maintenance/dataset/phenotypes/__init__.py b/wqflask/maintenance/dataset/phenotypes/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/wqflask/maintenance/dataset/probesets/__init__.py b/wqflask/maintenance/dataset/probesets/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/wqflask/maintenance/dataset/special/correlation/about.txt b/wqflask/maintenance/dataset/special/correlation/about.txt deleted file mode 100644 index a12f8c47..00000000 --- a/wqflask/maintenance/dataset/special/correlation/about.txt +++ /dev/null @@ -1,3 +0,0 @@ -BXD -genotype, phenotype, mRNA expression -correlation \ No newline at end of file diff --git a/wqflask/maintenance/dataset/special/correlation/conf.ini b/wqflask/maintenance/dataset/special/correlation/conf.ini deleted file mode 100644 index 9c23bb45..00000000 --- a/wqflask/maintenance/dataset/special/correlation/conf.ini +++ /dev/null @@ -1,2 +0,0 @@ -[configuration] -genofile = /home/leiyan/gn/web/genotypes/BXD.geno diff --git a/wqflask/maintenance/dataset/special/correlation/correlations.py b/wqflask/maintenance/dataset/special/correlation/correlations.py deleted file mode 100644 index b089e446..00000000 --- a/wqflask/maintenance/dataset/special/correlation/correlations.py +++ /dev/null @@ -1,47 +0,0 @@ -# Author: Lei Yan -# Create Date: 2014-01-21 -# Last Update Date: 2014-01-24 - -# import -import sys -import os -import re -import MySQLdb -import ConfigParser - -def main(argv): - - # load configuration from configuration file - config = ConfigParser.ConfigParser() - config.read(argv[1]) - genofile = config.get('configuration', 'genofile') - - # parse genofile - genotypes = [] - file_geno = open(genofile, 'r') - for line in file_geno: - line = line.strip() - if line.startswith('#'): - continue - if line.startswith('@'): - continue - cells = line.split() - if line.startswith("Chr"): - strains = cells[4:] - continue - genotype = {} - genotype['chr'] = cells[0] - genotype['locus'] = cells[1] - genotype['cm'] = cells[2] - genotype['mb'] = cells[3] - genotype['values'] = cells[4:] - genotypes.append(genotype) - print "get %d strains:\t%s" % (len(strains), strains) - print "load %d genotypes" % len(genotypes) - - # phenotypes - -# main -if __name__ == "__main__": - main(sys.argv) - print "exit successfully" diff --git a/wqflask/maintenance/dataset/special/correlation/run.sh b/wqflask/maintenance/dataset/special/correlation/run.sh deleted file mode 100644 index eccfa507..00000000 --- a/wqflask/maintenance/dataset/special/correlation/run.sh +++ /dev/null @@ -1 +0,0 @@ -/usr/bin/python correlations.py conf.ini \ No newline at end of file diff --git a/wqflask/maintenance/dataset/utilities/__init__.py b/wqflask/maintenance/dataset/utilities/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/wqflask/maintenance/dataset/utilities/db.py b/wqflask/maintenance/dataset/utilities/db.py new file mode 100644 index 00000000..453ee707 --- /dev/null +++ b/wqflask/maintenance/dataset/utilities/db.py @@ -0,0 +1,10 @@ +import MySQLdb + +def get_cursor(): + host = 'localhost' + user = 'webqtl' + passwd = 'webqtl' + db = 'db_webqtl' + con = MySQLdb.Connect(db=db, host=host, user=user, passwd=passwd) + cursor = con.cursor() + return cursor \ No newline at end of file diff --git a/wqflask/maintenance/dataset/utilities/test.py b/wqflask/maintenance/dataset/utilities/test.py new file mode 100644 index 00000000..340377b3 --- /dev/null +++ b/wqflask/maintenance/dataset/utilities/test.py @@ -0,0 +1,3 @@ +import db + +print db.get_cursor() \ No newline at end of file -- cgit v1.2.3