# scripts/maintenance/DataAnalyzer.py - genenetwork2

# Created by Luis Del Mar for GeneNetwork.org
# Email: ladelmar99@gmail.com
# LinkedIn: https://www.linkedin.com/in/luis-del-mar/
# GitHub: https://github.com/ladm99
# This script performs the following operations on a tab-delimited dataset:
# 1. Average 2. Log2 normalize 3. ZScore normalize 4. Outlier detection
#!/usr/bin/env python

from tkinter import Tk
from tkinter.filedialog import askopenfilename
import math
import traceback
import os

# method for file selection
def fileSelector():
	"""Show an "Open" dialog and return the path chosen by the user.

	Returns:
		Whatever ``askopenfilename`` returns: the selected path, or an empty
		value when the dialog is dismissed (per the tkinter documentation).
	"""
	# Code from: https://stackoverflow.com/questions/3579568/choosing-a-file-in-python-with-simple-dialog
	root = Tk()
	root.withdraw() # we don't want a full GUI, so keep the root window from appearing
	try:
		return askopenfilename() # show an "Open" dialog box and return the path to the selected file
	finally:
		# A fresh hidden root is created on every call; destroy it so repeated
		# selections do not accumulate Tk interpreters.
		root.destroy()

# method for getting skipRow and skipCol
def skips():
	"""Ask how many header rows and leading columns should be skipped.

	A blank answer keeps the default of 0. If either answer is not an
	integer, a message is printed and both questions are asked again.

	Returns:
		A ``(skipRow, skipCol)`` tuple of ints.
	"""
	while True:
		r = input('Enter the number of rows that you wish skip (default is 0): ').strip()
		c = input('Enter the number of columns that you wish skip (default is 0): ').strip()
		try:
			skipRow = int(r) if r else 0
			skipCol = int(c) if c else 0
		except ValueError:
			# Only report the problem here. Prompting with input() would read
			# and discard the user's next answer before the loop asks again.
			print('Enter a valid value')
			continue
		return skipRow, skipCol

# gets the mean of an array
def getMean(values):
	"""Return the arithmetic mean of ``values``.

	The items are added one by one to a float accumulator and the total is
	divided by ``len(values)``; an empty input raises ZeroDivisionError.
	"""
	total = 0.0
	for sample in values:
		total = total + sample
	return total / len(values)

# gets the standard deviation of an array
def getSTD(values):
	"""Return the sample standard deviation of ``values``.

	The squared distances from the mean (obtained through ``getMean``) are
	summed, divided by ``len(values) - 1`` and square-rooted.
	"""
	center = getMean(values)
	squaredDistance = 0.0
	for sample in values:
		distance = abs(sample - center)
		squaredDistance = squaredDistance + distance ** 2
	variance = squaredDistance / (len(values) - 1)
	return math.sqrt(variance)


def log2Normalize(inputFile):
	"""Write a log2-transformed copy of a tab-delimited file.

	The user is asked (via ``skips``) how many rows precede the header row
	and how many columns precede the label column. The header row and the
	rows before it are copied unchanged, as are the leading columns of each
	data row. Every other cell becomes ``log2(value + offset)`` rounded to 3
	decimals, or ``x`` when it cannot be converted. ``offset`` is 1.0, or
	``1.0 - minimum`` when the smallest value in the file is negative, so the
	logarithm's argument never drops below 1.

	The result is written to the current directory as
	``<name>_log2<extension>``. Any error is reported as a traceback instead
	of being raised, so the interactive menu keeps running.

	Args:
		inputFile: path of the tab-delimited file to read.
	"""
	try:
		skipRow, skipCol = skips()
		firstData = max(skipCol + 1, 0)

		# Build the name from the extension rather than replacing '.txt': for a
		# file without that suffix the old name equalled the input's own name,
		# so the input could be truncated while it was still being read.
		base, ext = os.path.splitext(os.path.basename(inputFile))
		outputFile = base + '_log2' + (ext or '.txt')

		print(':Log2 normalize processing...looking for the minimal expression value')
		lowest = float('inf')
		with open(inputFile, 'r') as f:
			for _ in range(skipRow + 1):
				f.readline()
			for data in f:
				for field in data.split('\t')[firstData:]:
					try:
						value = float(field)
					except ValueError:
						continue # non-numeric cells do not take part in the minimum
					if value < lowest:
						lowest = value

		offset = 1.0
		if lowest < 0.0:
			offset = 1.0 - lowest

		print('Log2 normalize processing...calculating')
		with open(inputFile, 'r') as f, open(outputFile, 'w') as out:
			for _ in range(skipRow + 1):
				out.write(f.readline())
			linenum = 0
			for data in f:
				if not data.strip():
					continue # ignore blank lines (typically at the end of the file)
				s = data.rstrip('\n').split('\t')
				cells = s[:firstData]
				for field in s[firstData:]:
					try:
						value = math.log(float(field) + offset) / math.log(2.0)
						cells.append(str(round(value, 3)))
					except ValueError:
						cells.append('x')
				# join() keeps the rows free of the trailing tab the header does not have
				out.write('\t'.join(cells) + '\n')
				linenum += 1
				if linenum % 2500 == 0:
					print('Log2 normalize processing...finished ' + str(linenum) + ' lines')
		print('Log2 normalize finished')
	except Exception:
		print(traceback.format_exc())

def ZScoreNormalize(inputFile):
	"""Write a per-column Z-score normalized copy of a tab-delimited file.

	The user is asked (via ``skips``) how many rows precede the header row
	and how many columns precede the label column. For every data column the
	mean and the sample standard deviation are computed over the cells that
	parse as numbers. Each cell is then rewritten as
	``2 * (value - mean) / sd + 8`` rounded to 3 decimals. Cells that cannot
	be converted, lie beyond the header's width, or belong to a column with a
	standard deviation of zero are written as ``x``.

	The header row, the rows before it and the leading columns are copied
	unchanged. The result is written to the current directory as
	``<name>_Z<extension>``. Any error is reported as a traceback instead of
	being raised, so the interactive menu keeps running.

	Args:
		inputFile: path of the tab-delimited file to read.
	"""
	try:
		skipRow, skipCol = skips()
		firstData = max(skipCol + 1, 0)

		# Derive the name from the extension so an input without '.txt' is
		# never opened for writing under its own name.
		base, ext = os.path.splitext(os.path.basename(inputFile))
		outputFile = base + '_Z' + (ext or '.txt')

		print('ZScore normalize processing...calculating means')
		with open(inputFile, 'r') as f:
			for _ in range(skipRow):
				f.readline()
			# the header row fixes how many data columns are normalized
			col = max(len(f.readline().split('\t')) - firstData, 0)
			total = [0.0] * col
			count = [0] * col
			for data in f:
				for m, field in enumerate(data.split('\t')[firstData:firstData + col]):
					try:
						value = float(field)
					except ValueError:
						continue
					total[m] += value
					count[m] += 1
		# Divide by the number of values actually present in each column;
		# dividing by the row count would treat missing cells as zeros.
		mean = [total[m] / count[m] if count[m] else 0.0 for m in range(col)]

		print('ZScore normalize processing...calculating standard deviations')
		squares = [0.0] * col
		with open(inputFile, 'r') as f:
			for _ in range(skipRow + 1):
				f.readline()
			for data in f:
				for m, field in enumerate(data.split('\t')[firstData:firstData + col]):
					try:
						value = float(field)
					except ValueError:
						continue
					squares[m] += (value - mean[m]) ** 2
		# columns with fewer than two values get 0.0, which yields 'x' below
		phi = [math.sqrt(squares[m] / (count[m] - 1)) if count[m] > 1 else 0.0 for m in range(col)]

		with open(inputFile, 'r') as f, open(outputFile, 'w') as out:
			for _ in range(skipRow + 1):
				out.write(f.readline())

			row = 0
			for data in f:
				if not data.strip():
					continue # ignore blank lines (typically at the end of the file)
				s = data.rstrip('\n').split('\t')
				# leading columns are selected by skipCol (the old loop used skipRow)
				cells = s[:firstData]
				for m, field in enumerate(s[firstData:]):
					try:
						value = 2.0 * (float(field) - mean[m]) / phi[m] + 8.0
						cells.append(str(round(value, 3)))
					except (ValueError, IndexError, ZeroDivisionError):
						cells.append('x')
				out.write('\t'.join(cells) + '\n')
				row += 1
				if row % 2500 == 0:
					print('ZScore normalize processing...finished ' + str(row) + ' lines')

		print('ZScore normalize finished')
	except Exception:
		print(traceback.format_exc())

# find the average of values with column names that are the same
def average(inputFile, needSE):
	"""Average the columns of a tab-delimited file that share a title.

	The user is asked (via ``skips``) how many rows precede the header row
	and how many columns precede the label column. Rows before the header
	are copied unchanged. The header is rewritten with the leading titles
	followed by each distinct data-column title once, in order of first
	appearance. Each data row keeps its leading columns; for every distinct
	title the mean of the parsable cells under that title is written,
	rounded to 4 decimals, or ``x`` when none of them is a number.

	The result is written to the current directory as
	``<name>_Avg<extension>``. Errors are reported as tracebacks instead of
	being raised.

	Args:
		inputFile: path of the tab-delimited file to read.
		needSE: when true, ``getSE`` is run afterwards with the same skip settings.
	"""
	try:
		skipRow, skipCol = skips()
	except Exception:
		# without the skip settings neither the averaging nor getSE can run
		print(traceback.format_exc())
		return

	try:
		lead = max(skipCol + 1, 0)
		# Derive the name from the extension so an input without '.txt' is
		# never opened for writing under its own name.
		base, ext = os.path.splitext(os.path.basename(inputFile))
		outputFile = base + '_Avg' + (ext or '.txt')

		with open(inputFile, 'r') as f, open(outputFile, 'w') as out:
			# readline() keeps the newline, so the skipped rows are copied as they are
			for _ in range(skipRow):
				out.write(f.readline())

			titleList = [title.strip() for title in f.readline().split('\t')]
			# unique title -> indices of all columns carrying it; dict insertion
			# order keeps the titles in order of first appearance
			groups = {}
			for i, title in enumerate(titleList):
				if i > skipCol:
					groups.setdefault(title, []).append(i)
			out.write('\t'.join(titleList[:lead] + list(groups)) + '\n')

			line = 0
			for data in f:
				if not data.strip():
					continue # ignore blank lines (typically at the end of the file)
				s = data.rstrip('\n').split('\t')
				cells = s[:lead]
				for indices in groups.values():
					found = []
					for at in indices:
						try:
							found.append(float(s[at]))
						except (ValueError, IndexError):
							# a missing replicate must not hide the remaining ones
							continue
					if found:
						cells.append(str(round(sum(found) / len(found), 4)))
					else:
						cells.append('x')
				out.write('\t'.join(cells) + '\n')
				line += 1

				if line % 1000 == 0:
					print('Processing average...' + str(line) + ' lines\n')
	except Exception:
		print(traceback.format_exc())

	if needSE:
		getSE(inputFile, skipRow, skipCol)

# find the SE of the average of the values
def getSE(inputFile, skipRow, skipCol):
	"""Write the standard error of the mean for columns sharing a title.

	Rows before the header are copied unchanged. The header is rewritten with
	the leading titles followed by each distinct data-column title once, in
	order of first appearance. Each data row keeps its leading columns; for
	every distinct title the cell is

	* ``x`` when none of the cells under that title is a number,
	* empty when exactly one is (no spread can be estimated),
	* otherwise ``sample_sd / sqrt(n)`` rounded to 8 decimals, where
	  ``sample_sd`` uses the ``n - 1`` denominator.

	The result is written to the current directory as
	``<name>_Avg_SE<extension>``. Errors are reported as tracebacks instead
	of being raised.

	Args:
		inputFile: path of the tab-delimited file to read.
		skipRow: number of rows that precede the header row.
		skipCol: number of columns that precede the label column.
	"""
	try:
		lead = max(skipCol + 1, 0)
		# Derive the name from the extension so an input without '.txt' is
		# never opened for writing under its own name.
		base, ext = os.path.splitext(os.path.basename(inputFile))
		outputFile = base + '_Avg_SE' + (ext or '.txt')

		with open(inputFile, 'r') as f, open(outputFile, 'w') as out:
			# readline() keeps the newline, so the skipped rows are copied as they are
			for _ in range(skipRow):
				out.write(f.readline())

			titleList = [title.strip() for title in f.readline().split('\t')]
			# unique title -> indices of all columns carrying it; dict insertion
			# order keeps the titles in order of first appearance
			groups = {}
			for i, title in enumerate(titleList):
				if i > skipCol:
					groups.setdefault(title, []).append(i)
			out.write('\t'.join(titleList[:lead] + list(groups)) + '\n')

			line = 0
			for data in f:
				if not data.strip():
					continue # ignore blank lines (typically at the end of the file)
				s = data.rstrip('\n').split('\t')
				cells = s[:lead]
				for indices in groups.values():
					found = []
					for at in indices:
						try:
							found.append(float(s[at]))
						except (ValueError, IndexError):
							# a missing replicate must not hide the remaining ones
							continue
					n = len(found)
					if n == 0:
						cells.append('x')
					elif n == 1:
						cells.append('')
					else:
						avgItemValue = sum(found) / n
						squares = sum((avgItemValue - value) ** 2 for value in found)
						# standard error of the mean: sample SD divided by sqrt(n)
						SE = math.sqrt(squares / (n - 1)) / math.sqrt(n)
						cells.append(str(round(SE, 8)))
				# join() gives every row the same number of fields as the header
				out.write('\t'.join(cells) + '\n')
				line += 1

				if line % 1000 == 0:
					print('Processing average... Standard Error... ' + str(line) + ' lines\n')
	except Exception:
		print(traceback.format_exc())

# find outliers
def outlier(inputFile):
	"""Count, per sample column, how often its value is a row-wise outlier.

	The user is asked (via ``skips``) how many rows precede the header row
	and how many columns precede the label column. For every data row the
	parsable cells are rounded to 3 decimals, and their mean and sample
	standard deviation are computed with ``getMean`` / ``getSTD``. A sample
	is marked when its value lies more than two standard deviations from the
	row mean. Cells that are missing or not numeric are left out of that
	row's statistics and are never marked.

	One line per sample is written to ``<name>_Outlier<extension>`` in the
	current directory: title, number of marks, and marks divided by the
	number of data rows (rounded to 3 decimals). Any error is reported as a
	traceback instead of being raised.

	Args:
		inputFile: path of the tab-delimited file to read.
	"""
	try:
		skipRow, skipCol = skips()
		print('Outlier running')
		firstData = max(skipCol + 1, 0)

		# Derive the name from the extension so an input without '.txt' is
		# never opened for writing under its own name.
		base, ext = os.path.splitext(os.path.basename(inputFile))
		outputFile = base + '_Outlier' + (ext or '.txt')

		with open(inputFile, 'r') as f:
			for _ in range(skipRow):
				f.readline()
			# drop trailing whitespace first so a trailing tab does not create
			# an empty extra sample
			header = f.readline().rstrip()
			sampleTitle = [title.strip() for title in header.split('\t')[firstData:]]
			sampleNum = len(sampleTitle)
			marks = [0] * sampleNum
			geneNum = 0

			for data in f:
				if not data.strip():
					continue # ignore blank lines (typically at the end of the file)
				s = data.split('\t')
				# Collect this row's values afresh; reusing one list across rows
				# would let a missing cell inherit the previous row's value.
				present = []
				for k in range(sampleNum):
					try:
						present.append((k, round(float(s[k + firstData]), 3)))
					except (ValueError, IndexError):
						continue

				# the sample standard deviation needs at least two values
				if len(present) > 1:
					values = [value for _, value in present]
					mean = getMean(values)
					phi = getSTD(values)
					for k, value in present:
						if value < (mean - 2.0 * phi) or value > (mean + 2.0 * phi):
							marks[k] += 1

				geneNum += 1
				if geneNum % 1000 == 0:
					print('Outlier running, finished...' + str(geneNum) + ' lines')

		# the with-block closes the file (the old code only referenced out.close)
		with open(outputFile, 'w') as out:
			for title, mark in zip(sampleTitle, marks):
				ratio = round(mark * 1.0 / geneNum, 3) if geneNum else 0.0
				out.write(title + '\t' + str(mark) + '\t' + str(ratio) + '\n')
		print('Outlier finished')

	except Exception:
		print(traceback.format_exc())

# adds dashes based on how long the filename is, just here to make the selection a little bit nicer
def dashes(string):
	"""Build the underline shown beneath the menu heading.

	Returns a newline followed by a fixed run of dashes plus one more dash
	for every character of ``string``.
	"""
	fixedPart = '-------------------------'
	return '\n' + fixedPart + '-' * len(string)

def main():
	"""Run the interactive menu: pick a file, then apply operations to it.

	The user first selects a file through ``fileSelector``. The menu then
	repeats until option 6 is chosen; option 5 swaps the current file. A
	dismissed file dialog never replaces the current file with an empty
	path: at start-up the user may retry or quit, and in option 5 the
	previous file is kept.
	"""
	input('Press Enter to select file')
	filename = fileSelector()
	while not filename:
		# nothing was selected, so there is nothing the menu could work on
		retry = input('No file selected. Press Enter to try again or type q to quit: ').strip().lower()
		if retry == 'q':
			return
		filename = fileSelector()

	while True:
		shortName = os.path.split(filename)[1]
		print('\nEnter Selection for file ' +  shortName + dashes(shortName))
		print('1. Average\n2. Log2 Normalize\n3. ZScore Normalize\n4. Outlier\n5. Select a new file\n6. Exit')
		select = input('Enter: ').strip()

		if select == '1':
			SE = input('Do you want to compute the standard error [Y/N] (default is no)').strip().lower()
			average(filename, SE == 'y')
		elif select == '2':
			log2Normalize(filename)
		elif select == '3':
			ZScoreNormalize(filename)
		elif select == '4':
			outlier(filename)
		elif select == '5':
			chosen = fileSelector()
			if chosen:
				filename = chosen
			else:
				print('No file selected, keeping ' + shortName)
		elif select == '6':
			# leaving the loop ends the script; main() is the last statement of the file
			return
		else:
			print('Invalid selection')




# Start the interactive menu only when the file is executed as a script, so
# importing the module (e.g. to reuse getMean or getSTD) has no side effects.
if __name__ == '__main__':
	main()