#!/usr/bin/env python
# Created by Luis Del Mar for GeneNetwork.org
# Email: ladelmar99@gmail.com
# LinkedIn: https://www.linkedin.com/in/luis-del-mar/
# GitHub: https://github.com/ladm99
#
# This script performs the following normalizations on a tab-delimited dataset:
#   1. Average   2. Log2 normalize   3. ZScore normalize   4. Outlier detection

import math
import os
import traceback


def fileSelector():
    """Open a native file-picker dialog and return the selected path ('' if cancelled)."""
    # tkinter is imported lazily so the module can be imported on headless systems.
    # Dialog code from:
    # https://stackoverflow.com/questions/3579568/choosing-a-file-in-python-with-simple-dialog
    from tkinter import Tk
    from tkinter.filedialog import askopenfilename
    Tk().withdraw()  # we don't want a full GUI, so keep the root window from appearing
    return askopenfilename()


def skips():
    """Prompt for the number of leading rows/columns to skip.

    Returns:
        (skipRow, skipCol): ints, both defaulting to 0 on empty input.
        Re-prompts until both entries parse as integers.
    """
    while True:
        skipRow = 0
        skipCol = 0
        r = input('Enter the number of rows that you wish skip (default is 0): ').strip()
        c = input('Enter the number of columns that you wish skip (default is 0): ').strip()
        try:
            if r != '':
                skipRow = int(r)
            if c != '':
                skipCol = int(c)
            return skipRow, skipCol
        except ValueError:
            input('Enter a valid value')


def getMean(values):
    """Return the arithmetic mean of a non-empty sequence of numbers."""
    return sum(values) / len(values)


def getSTD(values):
    """Return the sample standard deviation (n - 1 denominator) of *values*.

    Raises ZeroDivisionError when len(values) < 2.
    """
    mean = getMean(values)
    sq_dev = sum((v - mean) ** 2 for v in values)
    return math.sqrt(sq_dev / (len(values) - 1))


def log2Normalize(inputFile):
    """Write <name>_log2.txt with every data cell replaced by log2(value + offset).

    A first pass finds the global minimum expression value; the offset is 1.0,
    or (-min + 1.0) when the minimum is negative, so the log argument is always
    positive.  Cells that cannot be parsed as numbers are written as 'x'.
    The first skipRow + 1 lines and the first skipCol + 1 columns are copied
    through unchanged.
    """
    try:
        skipRow, skipCol = skips()
        print('Log2 normalize processing...looking for the minimal expression value')
        minimum = float('inf')  # renamed from 'min' to avoid shadowing the builtin
        with open(inputFile, 'r') as f:
            for _ in range(skipRow + 1):
                f.readline()
            for data in f:
                s = data.split('\t')
                for j in range(skipCol + 1, len(s)):
                    try:
                        value = float(s[j])
                        if value < minimum:
                            minimum = value
                    except ValueError:
                        pass  # non-numeric cell: ignore while scanning for the minimum
        print('Log2 normalize processing...calculating')
        offset = 1.0
        if minimum < 0.0:
            offset = -minimum + 1.0
        outputFile = os.path.split(inputFile)[1].replace('.txt', '_log2.txt')
        with open(inputFile, 'r') as f, open(outputFile, 'w') as out:
            for _ in range(skipRow + 1):
                out.write(f.readline())
            linenum = 0
            for data in f:
                s = data.split('\t')
                for j in range(skipCol + 1):
                    out.write(s[j] + '\t')
                for j in range(skipCol + 1, len(s)):
                    try:
                        value = math.log(float(s[j]) + offset) / math.log(2.0)
                        out.write(str(round(value, 3)) + '\t')
                    except ValueError:
                        # unparseable cell: emit the 'x' placeholder
                        out.write('x')
                        out.write('\t')
                out.write('\n')
                linenum += 1
                if linenum % 2500 == 0:
                    print('Log2 normalize processing...finished' + str(linenum) + ' lines')
        print('Log2 normalize finished')
    except Exception:
        print(traceback.format_exc())


def ZScoreNormalize(inputFile):
    """Write <name>_Z.txt with each data cell z-scored and rescaled to 2*z + 8.

    Three passes: (1) per-column means, (2) per-column sample standard
    deviations, (3) output.  Unparseable cells (or columns with zero
    deviation) are written as 'x'.
    """
    try:
        skipRow, skipCol = skips()
        print('ZScore normalize processing...calculating means')
        with open(inputFile, 'r') as f:
            for _ in range(skipRow):
                f.readline()
            # the next row is the column-title header; it fixes the column count
            s = f.readline().split('\t')
            col = len(s) - skipCol - 1
            mean = [0.00] * col
            phi = [0.00] * col
            row = 0
            for data in f:
                s = data.split('\t')
                for m in range(skipCol + 1, len(s)):
                    try:
                        mean[m - skipCol - 1] += float(s[m])
                    except (ValueError, IndexError):
                        pass  # bad cell or ragged row: skip, as the column sum must go on
                row += 1
        for m in range(col):
            mean[m] = mean[m] / row
        print('ZScore normalize processing...calculating standard divisions')
        with open(inputFile, 'r') as f:
            for _ in range(skipRow + 1):  # skip headers
                f.readline()
            for data in f:
                s = data.split('\t')
                for i1 in range(skipCol + 1, len(s)):
                    try:
                        value = float(s[i1])
                        phi[i1 - skipCol - 1] += (value - mean[i1 - skipCol - 1]) ** 2
                    except (ValueError, IndexError):
                        pass
        for j in range(col):
            phi[j] = math.sqrt(phi[j] / (row - 1))
        outputFile = os.path.split(inputFile)[1].replace('.txt', '_Z.txt')
        with open(inputFile, 'r') as f, open(outputFile, 'w') as out:
            for _ in range(skipRow + 1):
                out.write(f.readline())
            row = 0
            for data in f:
                s = data.split('\t')
                # BUG FIX: the original iterated range(skipRow + 1) here, but the
                # annotation-column prefix is governed by skipCol (as in every
                # other function in this file).
                for i1 in range(skipCol + 1):
                    out.write(s[i1] + '\t')
                for i1 in range(skipCol + 1, len(s)):
                    try:
                        value = float(s[i1])
                        value = 2.0 * (value - mean[i1 - skipCol - 1]) / phi[i1 - skipCol - 1] + 8.0
                        out.write(str(round(value, 3)) + '\t')
                    except (ValueError, ZeroDivisionError, IndexError):
                        out.write('x' + '\t')
                out.write('\n')
                row += 1
                if row % 2500 == 0:
                    print('ZScore normalize processing...finished ' + str(row) + ' lines')
        print('ZScore normalize finished')
    except Exception:
        print(traceback.format_exc())


def average(inputFile, needSE):
    """Average replicate columns (columns sharing a header title) into one column each.

    Writes <name>_Avg.txt.  A cell is written as 'x' when no replicate parses
    as a number.  When needSE is true, getSE() is also run to produce the
    companion standard-error file.
    """
    try:
        skipRow, skipCol = skips()
        outputFile = os.path.split(inputFile)[1].replace('.txt', '_Avg.txt')
        with open(inputFile, 'r') as f, open(outputFile, 'w') as out:
            titleList = []  # every header title, in column order (for index lookup)
            itemList = []   # unique data-column titles, in first-seen order
            for i in range(skipRow):
                # BUG FIX: readline() keeps the trailing '\n'; the original
                # appended another '\n', injecting blank lines into the output.
                out.write(f.readline())
            s = f.readline().split('\t')
            for i in range(len(s)):
                s[i] = s[i].strip()
                titleList.append(s[i])
                if s[i] not in itemList and i > skipCol:
                    itemList.append(s[i])
            for i in range(skipCol + 1):
                out.write(titleList[i])
                out.write('\t')
            for i in range(len(itemList)):
                out.write(itemList[i])
                if i < len(itemList) - 1:
                    out.write('\t')
            out.write('\n')
            # Hoisted out of the row loop: all column positions for each unique
            # title (the original re-scanned titleList.index per row per title).
            replicateCols = [[i for i, t in enumerate(titleList) if t == item]
                             for item in itemList]
            line = 0
            for data in f:
                s = data.split('\t')
                for j in range(skipCol + 1):
                    out.write(str(s[j]) + '\t')
                for j in range(len(itemList)):
                    avgItemValue = 0.0
                    n = 0
                    for at in replicateCols[j]:
                        try:
                            avgItemValue += float(s[at])
                            n += 1
                        except (ValueError, IndexError):
                            # matches original behavior: first bad replicate
                            # stops accumulation for this title
                            break
                    if n == 0:
                        out.write('x\t')
                    else:
                        avgItemValue /= n
                        out.write(str(round(avgItemValue, 4)))
                        if j < len(itemList) - 1:
                            out.write('\t')
                out.write('\n')
                line += 1
                if line % 1000 == 0:
                    print('Processing average...' + str(line) + ' lines\n')
    except Exception:
        print(traceback.format_exc())
    if needSE:
        getSE(inputFile, skipRow, skipCol)


def getSE(inputFile, skipRow, skipCol):
    """Write <name>_Avg_SE.txt: the standard error of each averaged replicate group.

    Mirrors average()'s column grouping.  'x' is written when no replicate
    parses; an empty cell when only one replicate exists (SE undefined).
    """
    try:
        outputFile = os.path.split(inputFile)[1].replace('.txt', '_Avg_SE.txt')
        with open(inputFile, 'r') as f, open(outputFile, 'w') as out:
            titleList = []  # every header title, in column order
            itemList = []   # unique data-column titles
            for i in range(skipRow):
                # BUG FIX: no extra '\n' — readline() already keeps it
                out.write(f.readline())
            s = f.readline().split('\t')
            for i in range(len(s)):
                s[i] = s[i].strip()
                titleList.append(s[i])
                if s[i] not in itemList and i > skipCol:
                    itemList.append(s[i])
            for i in range(skipCol + 1):
                out.write(titleList[i])
                out.write('\t')
            for i in range(len(itemList)):
                out.write(itemList[i])
                if i < len(itemList) - 1:
                    out.write('\t')
            out.write('\n')
            replicateCols = [[i for i, t in enumerate(titleList) if t == item]
                             for item in itemList]
            line = 0
            for data in f:
                s = data.split('\t')
                for j in range(skipCol + 1):
                    out.write(str(s[j]) + '\t')
                for j in range(len(itemList)):
                    avgItemValue = 0.0
                    n = 0
                    for at in replicateCols[j]:
                        try:
                            avgItemValue += float(s[at])
                            n += 1
                        except (ValueError, IndexError):
                            break
                    if n == 0:
                        out.write('x\t')
                    else:
                        avgItemValue /= n
                        SE = 0.0
                        n = 0
                        for at in replicateCols[j]:
                            try:
                                SE += (avgItemValue - float(s[at])) * (avgItemValue - float(s[at]))
                                n += 1
                            except (ValueError, IndexError):
                                break
                        if n > 1:
                            SE = math.sqrt(SE / (n - 1))
                            # NOTE(review): standard error is usually std/sqrt(n),
                            # not std/sqrt(n-1); preserved as-is to keep output
                            # identical — confirm intent with the data owner.
                            SE /= math.sqrt((n - 1))
                            out.write(str(round(SE, 8)))
                            out.write('\t')
                        else:
                            out.write('\t')  # single replicate: SE undefined
                out.write('\n')
                line += 1
                if line % 1000 == 0:
                    print('Processing average... Standard Error... ' + str(line) + ' lines\n')
    except Exception:
        print(traceback.format_exc())


def outlier(inputFile):
    """Write <name>_Outlier.txt: per sample, how many genes fall beyond 2 SD of the row mean.

    Output columns: sample title, outlier count, outlier fraction (3 d.p.).
    """
    try:
        skipRow, skipCol = skips()
        print('Outlier running')
        outputFile = os.path.split(inputFile)[1].replace('.txt', '_Outlier.txt')
        with open(inputFile, 'r') as f, open(outputFile, 'w') as out:
            for i in range(skipRow):
                f.readline()
            data = f.readline()
            # BUG FIX: the original sliced data[data.index('\t'):], which keeps
            # the tab so later iterations were no-ops and only ONE prefix column
            # was ever stripped; split-and-slice drops all skipCol + 1 of them.
            sampleTitle = data.strip().split('\t')[skipCol + 1:]
            sampleNum = len(sampleTitle)
            values = [0.0] * sampleNum
            marks = [0] * sampleNum
            geneNum = 0
            for data in f:
                s = data.split('\t')
                for k in range(sampleNum):
                    try:
                        values[k] = round(float(s[k + skipCol + 1]), 3)
                    except (ValueError, IndexError):
                        # NOTE(review): on a bad cell the previous row's value is
                        # reused (original behavior, preserved) — confirm desired.
                        pass
                mean = getMean(values)
                phi = getSTD(values)
                for k in range(sampleNum):
                    if values[k] < (mean - 2.0 * phi) or values[k] > (mean + 2.0 * phi):
                        marks[k] += 1
                geneNum += 1
                if geneNum % 1000 == 0:
                    print('Outlier running, finished...' + str(geneNum) + ' lines')
            for j in range(sampleNum):
                out.write(sampleTitle[j] + '\t')
                out.write(str(marks[j]) + '\t')
                out.write(str(round(marks[j] * 1.0 / geneNum, 3)))
                out.write('\n')
        # BUG FIX: the original had 'out.close' without parentheses, so the
        # output file was never explicitly closed; the with-block handles it.
        print('Outlier finished')
    except Exception:
        print(traceback.format_exc())


def dashes(string):
    """Return a newline plus a dashed rule sized to the menu header (25 + len(string))."""
    return '\n' + '-' * (25 + len(string))


def main():
    """Interactive menu loop: pick a file, then dispatch to a normalization."""
    input('Press Enter to select file')
    filename = fileSelector()
    while True:
        file = os.path.split(filename)[1]
        print('\nEnter Selection for file ' + file + dashes(file))
        print('1. Average\n2. Log2 Normalize\n3. ZScore Normalize\n4. Outlier\n5. Select a new file\n6. Exit')
        select = input('Enter: ')
        if select == '1':
            needSE = False
            SE = input('Do you want to compute the standard error [Y/N] (default is no)').lower()
            if SE == 'y':
                needSE = True
            average(filename, needSE)
        elif select == '2':
            log2Normalize(filename)
        elif select == '3':
            ZScoreNormalize(filename)
        elif select == '4':
            outlier(filename)
        elif select == '5':
            filename = fileSelector()
        elif select == '6':
            exit()


# BUG FIX: guarded so importing this module no longer launches the interactive loop
if __name__ == '__main__':
    main()