# !/usr/bin/python3 """This script use the nearest marker to the transcript as control, increasing permutation rounds according to the p-value""" ######################################################################## # Last Updated 3/11/2022 by Zach ######################################################################## import csv import string import sys import MySQLdb import getpass import time ######################################################################## def translate_alias(str): if str == "B6": return "C57BL/6J" elif str == "D2": return "DBA/2J" else: return str ######################################################################## # # Indicate Data Start Position, ProbeFreezeId, gene_chip_id, DataFile # ######################################################################## data_start = 1 gene_chip_id = int(input("Enter GeneChipId:")) probeset_freeze_id = int(input("Enter ProbeSetFreezeId:")) input_file_name = input("Enter file name with suffix:") try: passwd = getpass.getpass('Please enter mysql password here : ') conn = MySQLdb.Connect(db='db_webqtl', host='localhost', user='webqtlout', passwd=passwd) db = conn.cursor() print "You have successfully connected to mysql.\n" except: print "You entered incorrect password.\n" sys.exit(0) time0 = time.time() ######################################################################### # # Check if each line have same number of members # generate the gene list of expression data here # ######################################################################### print 'Checking if each line have same number of members' gene_list = [] strain_list = [] trait_data = [] with open(input_file_name, "r") as csvfile: reader = csv.DictReader(csvfile, delimiter="\t") kj = 0 for line in reader: trait_data.append(line) # Get the strain list; only need to get it once if kj == 0: strain_list = [item for item in line.keys() if item != "ProbeSetID"] print("STRAIN LIST:", strain_list) gene_list.append(line['ProbeSetID']) if kj % 100000 == 0: print(f"checked {kj} lines") kj += 1 gene_list.sort() print(f"used {time.time() - time0} seconds") ######################################################################### # # Check if each strain exist in database # generate the string id list of expression data here # ######################################################################### print('Checking if each strain exists in database') strain_list = map(translate_alias, strain_list) strain_ids = {} for item in strain_list: try: db.execute(f'select Id from Strain where Name = "{item}" AND SpeciesId=1') strain_ids[item] = db.fetchone()[0] except: print(f"{item} does not exist, check the if the strain name is correct") sys.exit(0) print(f"Used {time.time() - time0} seconds") ######################################################################## # # Check if each ProbeSet exist in database # ######################################################################## print("Check if each ProbeSet exists in database") # Check whether ProbeSetIDs are Name or TargetId (if not Name, assume to be TargetId) id_type = "TargetId" db.execute(f"select Id from ProbeSet where Name='{gene_list[0]}' and ChipId={gene_chip_id}") if len(db.fetchall()): id_type = "Name" ## Get Name/TargetId + ID list from database db.execute(f"select {id_type}, Id from ProbeSet where ChipId={gene_chip_id} order by {id_type}") records_from_db = db.fetchall() record_names = [item[0] for item in records_from_db] record_names.sort() # Compare gene_list with gene_names invalid_records = [] lowercase_records = [name2.lower() for name2 in record_names] for name in gene_list: if name.lower() not in lowercase_records: invalid_records.append(name) if len(invalid_records): with open("ProbeSetError.txt", "wb") as error_fh: for item in invalid_records: error_fh.write(f"{item} doesn't exist, cheeck if the ProbeSet name is correct \n") sys.exit(0) print(f"used {time.time() - time0} seconds") ######################################################################### # # Insert data into database # ######################################################################### print("getting ProbeSet Name + Id") record_ids = {} for record in records_from_db: record_ids[record[0]] = record[1] print(f"used {time.time() - time0} seconds") print("inserting data") # Get old max dataId db.execute('select max(Id) from ProbeSetData') latest_data_id = int(db.fetchone()[0]) print(f"Latest DataId = {latest_data_id}") # Insert data probeset_data_values = [] probeset_xref_values = [] for i, item in enumerate(trait_data): latest_data_id += 1 probeset_id = item['ProbeSetID'] item.pop('ProbeSetID') sample_data = item for strain in sample_data: probeset_data_values.append(f"({latest_data_id},{strain_ids[strain]},{float(sample_data[strain])})") probeset_xref_values.append(f"({probeset_freeze_id},{record_ids[probeset_id]},{latest_data_id})") # Insert into tables for every 100 traits if i % 100 == 0: data_query = f"INSERT INTO ProbeSetData VALUES {','.join(probeset_data_values)}" db.execute(data_query) xref_query = ( "INSERT INTO ProbeSetXRef(ProbeSetFreezeId, ProbeSetId, DataId) " f"VALUES {','.join(probeset_xref_values)}") db.execute(xref_query) probeset_data_values = [] probeset_xref_values = [] print(f"Inserted {i} lines") print(f"Used {time.time() - time0} seconds") # Insert the remainder (since the loop above only inserts every 100 traits) if len(probeset_data_values): data_query = f"INSERT INTO ProbeSetData VALUES {','.join(probeset_data_values)}" db.execute(data_query) xref_query = ( "INSERT INTO ProbeSetXRef(ProbeSetFreezeId, ProbeSetId, DataId) " f"VALUES {','.join(probeset_xref_values)}") db.execute(xref_query) conn.commit() conn.close()