diff options
author | Zachary Sloan | 2013-03-02 00:52:30 +0000 |
---|---|---|
committer | Zachary Sloan | 2013-03-02 00:52:30 +0000 |
commit | ed1dd7777b6dc49ed3496668bfd3b3df0d0a0612 (patch) | |
tree | 856ab810e04dc64e70a6556125a6e1491692f7c8 /wqflask/maintenance | |
parent | f65a92de7c65e6528c5d01117c202401f7681ab0 (diff) | |
download | genenetwork2-ed1dd7777b6dc49ed3496668bfd3b3df0d0a0612.tar.gz |
Created generate_probesetfreeze_file to create the
"probesetfreeze data matrix" file corresponding with the
muscle dataset Evan was having trouble with
Diffstat (limited to 'wqflask/maintenance')
-rw-r--r-- | wqflask/maintenance/generate_probesetfreeze_file.py | 123 |
1 files changed, 123 insertions, 0 deletions
diff --git a/wqflask/maintenance/generate_probesetfreeze_file.py b/wqflask/maintenance/generate_probesetfreeze_file.py new file mode 100644 index 00000000..95515cea --- /dev/null +++ b/wqflask/maintenance/generate_probesetfreeze_file.py @@ -0,0 +1,123 @@ +#!/usr/bin/python + +from __future__ import absolute_import, print_function, division + +import sys + +sys.path.insert(0, "..") + +import os +import collections +import csv + +import MySQLdb + +from base import webqtlConfig + +from pprint import pformat as pf + + +def get_cursor(): + con = MySQLdb.Connect(db=webqtlConfig.DB_UPDNAME, + host=webqtlConfig.MYSQL_UPDSERVER, + user=webqtlConfig.DB_UPDUSER, + passwd=webqtlConfig.DB_UPDPASSWD) + cursor = con.cursor() + return cursor + +def show_progress(process, counter): + if counter % 1000 == 0: + print("{}: {}".format(process, counter)) + +def get_strains(cursor): + cursor.execute("""select Strain.Name + from Strain, StrainXRef, InbredSet + where Strain.Id = StrainXRef.StrainId and + StrainXRef.InbredSetId = InbredSet.Id + and InbredSet.Name=%s; + """, "BXD") + + strains = [strain[0] for strain in cursor.fetchall()] + print("strains:", pf(strains)) + for strain in strains: + print(" -", strain) + + return strains + +def get_probeset_vals(cursor, dataset_name): + cursor.execute(""" select ProbeSet.Id, ProbeSet.Name + from ProbeSetXRef, + ProbeSetFreeze, + ProbeSet + where ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id and + ProbeSetFreeze.Name = %s and + ProbeSetXRef.ProbeSetId = ProbeSet.Id; + """, dataset_name) + + probesets = cursor.fetchall() + + print("Fetched probesets") + + probeset_vals = collections.OrderedDict() + + for counter, probeset in enumerate(probesets): + cursor.execute(""" select Strain.Name, ProbeSetData.value + from ProbeSetData, ProbeSetXRef, ProbeSetFreeze, Strain + where ProbeSetData.Id = ProbeSetXRef.DataId + and ProbeSetData.StrainId = Strain.Id + and ProbeSetXRef.ProbeSetId = %s + and ProbeSetFreeze.Id = ProbeSetXRef.ProbeSetFreezeId + and ProbeSetFreeze.Name = %s; + """, (probeset[0], dataset_name)) + val_dic = collections.OrderedDict() + vals = cursor.fetchall() + for val in vals: + val_dic[val[0]] = val[1] + + probeset_vals[probeset[1]] = val_dic + show_progress("Querying DB", counter) + + return probeset_vals + +def trim_strains(strains, probeset_vals): + trimmed_strains = [] + #print("probeset_vals is:", pf(probeset_vals)) + first_probeset = list(probeset_vals.itervalues())[0] + for strain in strains: + print("\n**** strain is:", pf(strain)) + print("\n**** first_probeset is:", pf(first_probeset)) + if strain in first_probeset: + trimmed_strains.append(strain) + print("trimmed_strains:", pf(trimmed_strains)) + return trimmed_strains + +def write_data_matrix_file(strains, probeset_vals, filename): + with open(filename, "wb") as fh: + csv_writer = csv.writer(fh, delimiter=",", quoting=csv.QUOTE_ALL) + #print("strains is:", pf(strains)) + csv_writer.writerow(['ID'] + strains) + for counter, probeset in enumerate(probeset_vals): + row_data = [probeset] + for strain in strains: + #print("probeset is: ", pf(probeset_vals[probeset])) + row_data.append(probeset_vals[probeset][strain]) + #print("row_data is: ", pf(row_data)) + csv_writer.writerow(row_data) + show_progress("Writing", counter) + +def main(): + filename = os.path.expanduser("~/gene/wqflask/maintenance/" + + "ProbeSetFreezeId_379_FullName_EPFL_LISP_BXD_CD_Muscle_Affy_Mouse_Gene_1.0_ST_" + + "(Dec11)_RMA_**.txt") + dataset_name = "EPFLMouseMuscleCDRMA1211" + + cursor = get_cursor() + strains = get_strains(cursor) + print("Getting probset_vals") + probeset_vals = get_probeset_vals(cursor, dataset_name) + print("Finished getting probeset_vals") + trimmed_strains = trim_strains(strains, probeset_vals) + write_data_matrix_file(trimmed_strains, probeset_vals, filename) + +if __name__ == '__main__': + main() |