#!/usr/bin/python from __future__ import absolute_import, print_function, division import sys sys.path.insert(0, "..") import os import collections import csv import MySQLdb from base import webqtlConfig from pprint import pformat as pf def get_cursor(): con = MySQLdb.Connect(db=webqtlConfig.DB_UPDNAME, host=webqtlConfig.MYSQL_UPDSERVER, user=webqtlConfig.DB_UPDUSER, passwd=webqtlConfig.DB_UPDPASSWD) cursor = con.cursor() return cursor def show_progress(process, counter): if counter % 1000 == 0: print("{}: {}".format(process, counter)) def get_strains(cursor): cursor.execute("""select Strain.Name from Strain, StrainXRef, InbredSet where Strain.Id = StrainXRef.StrainId and StrainXRef.InbredSetId = InbredSet.Id and InbredSet.Name=%s; """, "BXD") strains = [strain[0] for strain in cursor.fetchall()] print("strains:", pf(strains)) for strain in strains: print(" -", strain) return strains def get_probeset_vals(cursor, dataset_name): cursor.execute(""" select ProbeSet.Id, ProbeSet.Name from ProbeSetXRef, ProbeSetFreeze, ProbeSet where ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id and ProbeSetFreeze.Name = %s and ProbeSetXRef.ProbeSetId = ProbeSet.Id; """, dataset_name) probesets = cursor.fetchall() print("Fetched probesets") probeset_vals = collections.OrderedDict() for counter, probeset in enumerate(probesets): cursor.execute(""" select Strain.Name, ProbeSetData.value from ProbeSetData, ProbeSetXRef, ProbeSetFreeze, Strain where ProbeSetData.Id = ProbeSetXRef.DataId and ProbeSetData.StrainId = Strain.Id and ProbeSetXRef.ProbeSetId = %s and ProbeSetFreeze.Id = ProbeSetXRef.ProbeSetFreezeId and ProbeSetFreeze.Name = %s; """, (probeset[0], dataset_name)) val_dic = collections.OrderedDict() vals = cursor.fetchall() for val in vals: val_dic[val[0]] = val[1] probeset_vals[probeset[1]] = val_dic show_progress("Querying DB", counter) return probeset_vals def trim_strains(strains, probeset_vals): trimmed_strains = [] #print("probeset_vals is:", pf(probeset_vals)) first_probeset = list(probeset_vals.itervalues())[0] for strain in strains: print("\n**** strain is:", pf(strain)) print("\n**** first_probeset is:", pf(first_probeset)) if strain in first_probeset: trimmed_strains.append(strain) print("trimmed_strains:", pf(trimmed_strains)) return trimmed_strains def write_data_matrix_file(strains, probeset_vals, filename): with open(filename, "wb") as fh: csv_writer = csv.writer(fh, delimiter=",", quoting=csv.QUOTE_ALL) #print("strains is:", pf(strains)) csv_writer.writerow(['ID'] + strains) for counter, probeset in enumerate(probeset_vals): row_data = [probeset] for strain in strains: #print("probeset is: ", pf(probeset_vals[probeset])) row_data.append(probeset_vals[probeset][strain]) #print("row_data is: ", pf(row_data)) csv_writer.writerow(row_data) show_progress("Writing", counter) def main(): filename = os.path.expanduser("~/gene/wqflask/maintenance/" + "ProbeSetFreezeId_379_FullName_EPFL_LISP_BXD_CD_Muscle_Affy_Mouse_Gene_1.0_ST_" + "(Dec11)_RMA_**.txt") dataset_name = "EPFLMouseMuscleCDRMA1211" cursor = get_cursor() strains = get_strains(cursor) print("Getting probset_vals") probeset_vals = get_probeset_vals(cursor, dataset_name) print("Finished getting probeset_vals") trimmed_strains = trim_strains(strains, probeset_vals) write_data_matrix_file(trimmed_strains, probeset_vals, filename) if __name__ == '__main__': main()