diff options
Diffstat (limited to 'wqflask/maintenance/dataset')
6 files changed, 271 insertions, 0 deletions
diff --git a/wqflask/maintenance/dataset/genotypes/code/genotypes_load.py b/wqflask/maintenance/dataset/genotypes/code/genotypes_load.py new file mode 100644 index 00000000..e4988446 --- /dev/null +++ b/wqflask/maintenance/dataset/genotypes/code/genotypes_load.py @@ -0,0 +1,207 @@ +# GeneNetwork maintenance script +# Load genotypes from geno file into database + +# Author: Lei Yan +# Create Date: 2014-01-08 +# Last Update Date: 2014-01-10 + +# import +import sys +import os +import re +import MySQLdb +import ConfigParser + +def main(argv): + + # load configuration from configuration file + config = ConfigParser.ConfigParser() + config.read(argv[1]) + speciesid = config.get('configuration', 'speciesId') + inbredsetid = config.get('configuration', 'inbredsetid') + genofreezeid = config.get('configuration', 'genofreezeid') + genofile = config.get('configuration', 'genofile') + print "[configuration]\nspeciesid: %s\ninbredsetid: %s\ngenofreezeid: %s\ngenofile: %s\n" % (speciesid, inbredsetid, genofreezeid, genofile) + + # variables + metadic = {} + + # parse genofile + file_geno = open(genofile, 'r') + for line in file_geno: + line = line.strip() + if line.startswith('#'): + continue + if line.startswith('@'): + line = line.strip('@') + items = line.split(';') + for item in items: + kv = re.split(':|=', item) + metadic[kv[0].strip()] = kv[1].strip() + continue + if line.startswith("Chr"): + print "[meta dictionary]" + for k,v in metadic.items(): + print "%s: %s" % (k, v) + print + continue + cells = line.split() + Chr = cells[0] + Locus = cells[1] + cM = cells[2] + Mb = cells[3] + print len(cells) + ? + return + + # open db + host = 'localhost' + user = 'webqtl' + passwd = 'webqtl' + db = 'db_webqtl' + con = MySQLdb.Connect(db=db, user=user, passwd=passwd, host=host) + cursor = con.cursor() + # var + speciesid = int(argv[2]) + inbredsetid = int(argv[3]) + genofreezeid = int(argv[4]) + sql = """ + SELECT Id + FROM GenoData + ORDER BY Id DESC + LIMIT 1 + """ + cursor.execute(sql) + results = cursor.fetchall() + dataid = results[0][0] + print "speciesid: %s" % (speciesid) + print "inbredsetid: %s" % (inbredsetid) + print "genofreezeid: %s" % (genofreezeid) + print "dataid start: %s" % (dataid+1) + # samples + line = file_geno.readline() + sample_names = line.split()[4:] + sample_ids = [] + print "get %d samples from file:\n%s" % (len(sample_names), sample_names) + for sample_name in sample_names: + sql = """ + select Id + from Strain + where SpeciesId=%s + and Name like %s + """ + cursor.execute(sql, (speciesid, sample_name)) + results = cursor.fetchall() + if results: + sample_ids.append(results[0][0]) + else: + print "insert sample %s" % (sample_name) + sql = """ + INSERT INTO Strain + SET + SpeciesId=%s, + Name=%s, + Name2=%s + """ + cursor.execute(sql, (speciesid, sample_name, sample_name)) + sampleid = con.insert_id() + sample_ids.append(sampleid) + # + sql = """ + SELECT OrderId + FROM StrainXRef + where InbredSetId=%s + ORDER BY OrderId DESC + LIMIT 1 + """ + cursor.execute(sql, (inbredsetid)) + results = cursor.fetchall() + orderid = results[0][0] + 1 + # + sql = """ + INSERT INTO StrainXRef + SET + InbredSetId=%s, + StrainId=%s, + OrderId=%s, + Used_for_mapping=%s + """ + cursor.execute(sql, (inbredsetid, sampleid, orderid, "N")) + print "load %d samples from DB:" % (len(sample_names)) + for i in range(len(sample_names)): + print "%s\t%s" % (sample_names[i], sample_ids[i]) + # parse geno file + index = 0 + for line in file_geno: + index += 1 + if index % 1000 == 0: + print index + items = line.split() + chr = items[0] + name = items[1] + cm = items[2] + mb = items[3] + values = items[4:] + # geno + sql = """ + SELECT Id + FROM Geno + WHERE SpeciesId=%s + AND Name like %s + """ + cursor.execute(sql, (speciesid, name)) + results = cursor.fetchall() + if results: + genoid = results[0][0] + else: + print "insert geno %s" % (name) + sql = """ + INSERT INTO Geno + SET + SpeciesId=%s, + Name=%s, + Marker_Name=%s, + Chr=%s, + Mb=%s + """ + cursor.execute(sql, (speciesid, name, name, chr, mb)) + genoid = con.insert_id() + # genodata + dataid += 1 + for i in range(len(values)): + sample_id = sample_ids[i] + try: + value = int(values[i]) + except ValueError: + continue + if not value in [-1, 0, 1]: + print sample_id, value + continue + sql = """ + INSERT INTO GenoData + SET + Id=%s, + StrainId=%s, + value=%s + """ + cursor.execute(sql, (dataid, sample_id, value)) + # genoxref + sql = """ + INSERT INTO GenoXRef + SET + GenoFreezeId=%s, + GenoId=%s, + DataId=%s, + cM=%s, + Used_for_mapping=%s + """ + cursor.execute(sql, (genofreezeid, genoid, dataid, cm, 'N')) + print "Insert %d genoxref" % (index) + # close + file_geno.close() + con.close() + +# main +if __name__ == "__main__": + main(sys.argv) + print "exit successfully" diff --git a/wqflask/maintenance/dataset/special/correlation/about.txt b/wqflask/maintenance/dataset/special/correlation/about.txt new file mode 100644 index 00000000..a12f8c47 --- /dev/null +++ b/wqflask/maintenance/dataset/special/correlation/about.txt @@ -0,0 +1,3 @@ +BXD +genotype, phenotype, mRNA expression +correlation
\ No newline at end of file diff --git a/wqflask/maintenance/dataset/special/correlation/conf.ini b/wqflask/maintenance/dataset/special/correlation/conf.ini new file mode 100644 index 00000000..9c23bb45 --- /dev/null +++ b/wqflask/maintenance/dataset/special/correlation/conf.ini @@ -0,0 +1,2 @@ +[configuration] +genofile = /home/leiyan/gn/web/genotypes/BXD.geno diff --git a/wqflask/maintenance/dataset/special/correlation/correlations.py b/wqflask/maintenance/dataset/special/correlation/correlations.py new file mode 100644 index 00000000..b089e446 --- /dev/null +++ b/wqflask/maintenance/dataset/special/correlation/correlations.py @@ -0,0 +1,47 @@ +# Author: Lei Yan +# Create Date: 2014-01-21 +# Last Update Date: 2014-01-24 + +# import +import sys +import os +import re +import MySQLdb +import ConfigParser + +def main(argv): + + # load configuration from configuration file + config = ConfigParser.ConfigParser() + config.read(argv[1]) + genofile = config.get('configuration', 'genofile') + + # parse genofile + genotypes = [] + file_geno = open(genofile, 'r') + for line in file_geno: + line = line.strip() + if line.startswith('#'): + continue + if line.startswith('@'): + continue + cells = line.split() + if line.startswith("Chr"): + strains = cells[4:] + continue + genotype = {} + genotype['chr'] = cells[0] + genotype['locus'] = cells[1] + genotype['cm'] = cells[2] + genotype['mb'] = cells[3] + genotype['values'] = cells[4:] + genotypes.append(genotype) + print "get %d strains:\t%s" % (len(strains), strains) + print "load %d genotypes" % len(genotypes) + + # phenotypes + +# main +if __name__ == "__main__": + main(sys.argv) + print "exit successfully" diff --git a/wqflask/maintenance/dataset/special/correlation/run.sh b/wqflask/maintenance/dataset/special/correlation/run.sh new file mode 100644 index 00000000..eccfa507 --- /dev/null +++ b/wqflask/maintenance/dataset/special/correlation/run.sh @@ -0,0 +1 @@ +/usr/bin/python correlations.py conf.ini
\ No newline at end of file diff --git a/wqflask/maintenance/dataset/utilities/fetch.py b/wqflask/maintenance/dataset/utilities/fetch.py new file mode 100644 index 00000000..fcb2d2d8 --- /dev/null +++ b/wqflask/maintenance/dataset/utilities/fetch.py @@ -0,0 +1,11 @@ +import sys + +inputfile = open(sys.argv[1], 'r') + +for line in inputfile: + cells = line.split() + #print cells[int(sys.argv[2])] + i = len(cells) + print i + +inputfile.close() |