Add DataAnalyzer.py script to maintenance scripts

author: zsloan 2023-03-08 20:35:53 +0000
committer: zsloan 2023-03-08 20:35:53 +0000
commit: 638ee60bc2930ebe3b73814dab6aa3b124211345 (patch)
tree: a32a93e6c0cbe89beddc68d276a950b9962c959d
parent: 82f6bd3acd87c42275cb93bd8d9bc2cbdc8e9508 (diff)
download: genenetwork2-638ee60bc2930ebe3b73814dab6aa3b124211345.tar.gz
1 files changed, 470 insertions, 0 deletions
diff --git a/scripts/maintenance/DataAnalyzer.py b/scripts/maintenance/DataAnalyzer.py
new file mode 100644
index 00000000..6d14c8da
--- /dev/null
+++ b/scripts/maintenance/DataAnalyzer.py
@@ -0,0 +1,470 @@
+# Created by Luis Del Mar for GeneNetwork.org
+# Email: ladelmar99@gmail.com
+# LinkedIn: https://www.linkedin.com/in/luis-del-mar/
+# GitHub: https://github.com/ladm99
+# This script performs the following normalizations to a specific tab delimited dataset:
+# 1. Average 2. Log2 normalize 3. ZScore normalize 4. Outlier detection
+#!/usr/bin/env python
+
+from tkinter import Tk
+from tkinter.filedialog import askopenfilename
+import math
+import traceback
+import os
+
+# method for file selection
+def fileSelector():
+	"""Show an "Open" dialog and return whatever path the user picked.
+
+	Returns:
+		The value returned by askopenfilename(); it is empty when the dialog
+		is cancelled.
+	"""
+	# Code from: https://stackoverflow.com/questions/3579568/choosing-a-file-in-python-with-simple-dialog
+	root = Tk()
+	try:
+		root.withdraw() # we don't want a full GUI, so keep the root window from appearing
+		return askopenfilename() # show an "Open" dialog box and return the path to the selected file
+	finally:
+		# Every call builds its own root window; destroy it so that choosing
+		# files repeatedly does not pile up hidden Tk instances.
+		root.destroy()
+
+# method for getting skipRow and skipCol
+def skips():
+	"""Ask the user how many leading rows and columns to skip.
+
+	An empty answer keeps the default of 0. Both prompts are repeated until
+	both answers are non-negative integers.
+
+	Returns:
+		A (skipRow, skipCol) tuple of ints.
+	"""
+	while True:
+		r = input('Enter the number of rows that you wish skip (default is 0): ').strip()
+		c = input('Enter the number of columns that you wish skip (default is 0): ').strip()
+		try:
+			skipRow = int(r) if r else 0
+			skipCol = int(c) if c else 0
+		except ValueError:
+			# print() rather than input(): an input() call here would block
+			# waiting for a line that is then thrown away.
+			print('Enter a valid value')
+			continue
+		if skipRow < 0 or skipCol < 0:
+			# a negative count is meaningless and would shift the column
+			# offsets the callers derive from it
+			print('Enter a valid value')
+			continue
+		return skipRow, skipCol
+
+# gets the mean of an array
+def getMean(values):
+	"""Return the arithmetic mean of the numbers in `values`.
+
+	The running total starts at the float 0.0 and the items are added in
+	order; the total is then divided by len(values), so an empty sequence
+	raises ZeroDivisionError.
+	"""
+	total = 0.0
+	for item in values:
+		total = total + item
+	count = len(values)
+	return total / count
+
+# gets the standard deviation of an array
+def getSTD(values):
+	"""Return the sample standard deviation (n - 1 denominator) of `values`.
+
+	With fewer than two values no spread can be estimated, so 0.0 is
+	returned instead of dividing by zero.
+	"""
+	count = len(values)
+	if count < 2:
+		return 0.0
+
+	mean = getMean(values)
+	squares = 0.0
+	for item in values:
+		deviation = item - mean
+		squares += deviation * deviation
+	return math.sqrt(squares / (count - 1))
+
+
+def log2Normalize(inputFile):
+	"""Write a log2-transformed copy of a tab-delimited file.
+
+	skips() asks how many leading rows and columns to skip; one further row
+	(the header) is copied through unchanged and one further column (the
+	identifier) is copied as-is. Every remaining cell that parses as a number
+	becomes log2(value + offset) rounded to 3 decimals. The offset is 1.0, or
+	1.0 - minimum when the smallest value found in the file is negative, so
+	the logarithm's argument never drops below 1. Cells that cannot be
+	converted are written as 'x'.
+
+	The result is written to the current working directory as
+	<input base name>_log2<input extension> ('.txt' when there is none).
+
+	Args:
+		inputFile: path of the tab-delimited text file to read.
+
+	Any error is reported by printing its traceback; nothing is raised.
+	"""
+	try:
+		skipRow, skipCol = skips()
+		firstValue = skipCol + 1 # index of the first value column
+		print(':Log2 normalize processing...looking for the minimal expression value')
+		lowest = float('inf')
+		with open(inputFile, 'r') as f:
+			for _ in range(skipRow + 1):
+				f.readline()
+			for data in f:
+				for cell in data.split('\t')[firstValue:]:
+					try:
+						value = float(cell)
+					except ValueError:
+						continue # non-numeric cells take no part in the minimum
+					if value < lowest:
+						lowest = value
+
+		print('Log2 normalize processing...calculating')
+		# Build the name with splitext(): str.replace('.txt', ...) leaves a
+		# name without '.txt' untouched, and opening that for writing could
+		# truncate the very file being read.
+		base, ext = os.path.splitext(os.path.split(inputFile)[1])
+		outputFile = base + '_log2' + (ext or '.txt')
+
+		offset = 1.0
+		if lowest < 0.0:
+			offset = -lowest + 1.0
+
+		with open(inputFile, 'r') as f, open(outputFile, 'w') as out:
+			for _ in range(skipRow + 1):
+				out.write(f.readline())
+			linenum = 0
+			for data in f:
+				s = data.rstrip('\n').split('\t')
+				# slicing tolerates rows shorter than the identifier block
+				cells = s[:firstValue]
+				for cell in s[firstValue:]:
+					try:
+						value = math.log2(float(cell) + offset)
+						cells.append(str(round(value, 3)))
+					except ValueError:
+						cells.append('x')
+				# join() keeps each row at the header's column count (no
+				# dangling tab after the last value)
+				out.write('\t'.join(cells) + '\n')
+				linenum += 1
+				if linenum % 2500 == 0:
+					print('Log2 normalize processing...finished ' + str(linenum) + ' lines')
+		print('Log2 normalize finished')
+	except Exception:
+		print(traceback.format_exc())
+
+def ZScoreNormalize(inputFile):
+	"""Write a per-column Z-score normalized copy of a tab-delimited file.
+
+	skips() asks how many leading rows and columns to skip; the next row is
+	the header (copied through unchanged) and the next column the identifier
+	(copied as-is). For every remaining column the mean and the sample
+	standard deviation (n - 1 denominator) are computed over the cells that
+	parse as numbers, and each such cell is rewritten as
+	2 * (value - mean) / sd + 8, rounded to 3 decimals. Cells that cannot be
+	normalized (non-numeric, beyond the header width, or in a column whose
+	standard deviation is 0) are written as 'x'.
+
+	The result is written to the current working directory as
+	<input base name>_Z<input extension> ('.txt' when there is none).
+
+	Args:
+		inputFile: path of the tab-delimited text file to read.
+
+	Any error is reported by printing its traceback; nothing is raised.
+	"""
+	try:
+		skipRow, skipCol = skips()
+		firstValue = skipCol + 1 # index of the first value column
+		print('ZScore normalize processing...calculating means')
+		with open(inputFile, 'r') as f:
+			for _ in range(skipRow):
+				f.readline()
+			# the header row fixes how many value columns there are
+			col = max(len(f.readline().split('\t')) - skipCol - 1, 0)
+			total = [0.0] * col
+			count = [0] * col
+			for data in f:
+				for m, cell in enumerate(data.split('\t')[firstValue:firstValue + col]):
+					try:
+						total[m] += float(cell)
+						count[m] += 1
+					except ValueError:
+						pass # missing / non-numeric cell: leave it out of the mean
+
+		# Divide by the number of usable cells of each column rather than by
+		# the row count, so missing cells are not averaged in as zeros.
+		mean = [t / n if n else 0.0 for t, n in zip(total, count)]
+
+		print('ZScore normalize processing...calculating standard deviations')
+		phi = [0.0] * col
+		with open(inputFile, 'r') as f:
+			for _ in range(skipRow + 1):
+				# skip headers
+				f.readline()
+			for data in f:
+				for m, cell in enumerate(data.split('\t')[firstValue:firstValue + col]):
+					try:
+						phi[m] += (float(cell) - mean[m]) ** 2
+					except ValueError:
+						pass
+		for m in range(col):
+			# with fewer than two usable cells no spread can be estimated
+			phi[m] = math.sqrt(phi[m] / (count[m] - 1)) if count[m] > 1 else 0.0
+
+		# splitext() instead of str.replace('.txt', ...): a name without
+		# '.txt' would stay unchanged and the output could truncate the input
+		base, ext = os.path.splitext(os.path.split(inputFile)[1])
+		outputFile = base + '_Z' + (ext or '.txt')
+
+		with open(inputFile, 'r') as f, open(outputFile, 'w') as out:
+			for _ in range(skipRow + 1):
+				out.write(f.readline())
+
+			row = 0
+			for data in f:
+				s = data.rstrip('\n').split('\t')
+				# the identifier block is bounded by the column count
+				# (skipCol), not by the row count
+				cells = s[:firstValue]
+				for m, cell in enumerate(s[firstValue:]):
+					try:
+						value = 2.0 * (float(cell) - mean[m]) / phi[m] + 8.0
+						cells.append(str(round(value, 3)))
+					except (ValueError, IndexError, ZeroDivisionError):
+						cells.append('x')
+				# join() keeps each row at the header's column count
+				out.write('\t'.join(cells) + '\n')
+				row += 1
+				if row % 2500 == 0:
+					print('ZScore normalize processing...finished ' + str(row) + ' lines')
+
+		print('ZScore normalize finished')
+	except Exception:
+		print(traceback.format_exc())
+
+# find the average of values with column names that are the same
+def average(inputFile, needSE):
+	"""Average the value columns of a tab-delimited file that share a title.
+
+	skips() asks how many leading rows and columns to skip. The skipped rows
+	are copied through unchanged, the next row holds the column titles, and
+	the first skipCol + 1 columns are identifier columns copied as-is. The
+	remaining columns are grouped by their (stripped) title, in order of
+	first appearance. For every data row each group is replaced by the mean
+	of its numeric cells rounded to 4 decimals, or by 'x' when the group has
+	no numeric cell in that row.
+
+	The result is written to the current working directory as
+	<input base name>_Avg<input extension> ('.txt' when there is none).
+
+	Args:
+		inputFile: path of the tab-delimited text file to read.
+		needSE: when true, getSE() is run afterwards with the same skip
+			counts.
+
+	Any error is reported by printing its traceback; nothing is raised.
+	"""
+	try:
+		skipRow, skipCol = skips()
+		firstValue = skipCol + 1 # index of the first value column
+		# splitext() instead of str.replace('.txt', ...): a name without
+		# '.txt' would stay unchanged and the output could truncate the input
+		base, ext = os.path.splitext(os.path.split(inputFile)[1])
+		outputFile = base + '_Avg' + (ext or '.txt')
+
+		with open(inputFile, 'r') as f, open(outputFile, 'w') as out:
+			# readline() keeps the line ending, so the skipped rows are
+			# copied without appending another newline
+			for _ in range(skipRow):
+				out.write(f.readline())
+
+			titleList = [title.strip() for title in f.readline().split('\t')]
+			itemList = [] # unique value-column titles, in order of first appearance
+			columns = {} # title -> indices of every value column carrying it
+			for i in range(firstValue, len(titleList)):
+				if titleList[i] not in columns:
+					columns[titleList[i]] = []
+					itemList.append(titleList[i])
+				columns[titleList[i]].append(i)
+
+			out.write('\t'.join(titleList[:firstValue] + itemList) + '\n')
+
+			line = 0
+			for data in f:
+				s = data.rstrip('\n').split('\t')
+				cells = s[:firstValue]
+				for title in itemList:
+					avgItemValue = 0.0
+					n = 0
+					for at in columns[title]:
+						try:
+							avgItemValue += float(s[at])
+							n += 1
+						except (ValueError, IndexError):
+							# an unusable replicate is skipped; the rest of
+							# the group still contributes to the mean
+							continue
+					if n == 0:
+						cells.append('x')
+					else:
+						cells.append(str(round(avgItemValue / n, 4)))
+				out.write('\t'.join(cells) + '\n')
+				line += 1
+
+				if line % 1000 == 0:
+					print('Processing average...' + str(line) + ' lines\n')
+	except Exception:
+		print(traceback.format_exc())
+		# without a finished average (or without skip counts) there is
+		# nothing sensible to compute a standard error for
+		return
+
+	if needSE:
+		getSE(inputFile, skipRow, skipCol)
+
+# find the SE of the average of the values
+def getSE(inputFile, skipRow, skipCol):
+	"""Write the standard error of the mean for each group of same-titled columns.
+
+	The first `skipRow` rows are copied through unchanged, the next row holds
+	the column titles, and the first `skipCol` + 1 columns are identifier
+	columns copied as-is. The remaining columns are grouped by their
+	(stripped) title, in order of first appearance. For every data row and
+	group, with n the number of numeric cells of the group in that row, the
+	output cell is:
+		- sd / sqrt(n) rounded to 8 decimals when n > 1, where sd is the
+		  sample standard deviation (n - 1 denominator);
+		- empty when n == 1;
+		- 'x' when n == 0.
+
+	The result is written to the current working directory as
+	<input base name>_Avg_SE<input extension> ('.txt' when there is none).
+
+	Args:
+		inputFile: path of the tab-delimited text file to read.
+		skipRow: number of leading rows copied through before the title row.
+		skipCol: number of leading columns skipped before the identifier column.
+
+	Any error is reported by printing its traceback; nothing is raised.
+	"""
+	try:
+		firstValue = skipCol + 1 # index of the first value column
+		# splitext() instead of str.replace('.txt', ...): a name without
+		# '.txt' would stay unchanged and the output could truncate the input
+		base, ext = os.path.splitext(os.path.split(inputFile)[1])
+		outputFile = base + '_Avg_SE' + (ext or '.txt')
+
+		with open(inputFile, 'r') as f, open(outputFile, 'w') as out:
+			# readline() keeps the line ending, so the skipped rows are
+			# copied without appending another newline
+			for _ in range(skipRow):
+				out.write(f.readline())
+
+			titleList = [title.strip() for title in f.readline().split('\t')]
+			itemList = [] # unique value-column titles, in order of first appearance
+			columns = {} # title -> indices of every value column carrying it
+			for i in range(firstValue, len(titleList)):
+				if titleList[i] not in columns:
+					columns[titleList[i]] = []
+					itemList.append(titleList[i])
+				columns[titleList[i]].append(i)
+
+			out.write('\t'.join(titleList[:firstValue] + itemList) + '\n')
+
+			line = 0
+			for data in f:
+				s = data.rstrip('\n').split('\t')
+				cells = s[:firstValue]
+				for title in itemList:
+					numbers = []
+					for at in columns[title]:
+						try:
+							numbers.append(float(s[at]))
+						except (ValueError, IndexError):
+							# an unusable replicate is skipped; the rest of
+							# the group is still used
+							continue
+					n = len(numbers)
+					if n == 0:
+						cells.append('x')
+					elif n == 1:
+						cells.append('') # a single value has no standard error
+					else:
+						avgItemValue = sum(numbers) / n
+						squares = sum((avgItemValue - v) ** 2 for v in numbers)
+						# standard error of the mean: sample SD over sqrt(n)
+						SE = math.sqrt(squares / (n - 1)) / math.sqrt(n)
+						cells.append(str(round(SE, 8)))
+				out.write('\t'.join(cells) + '\n')
+				line += 1
+
+				if line % 1000 == 0:
+					print('Processing average... Standard Error... ' + str(line) + ' lines\n')
+	except Exception:
+		print(traceback.format_exc())
+
+# find outliers
+def outlier(inputFile):
+	"""Count, per sample column, how often its value is an outlier within a row.
+
+	skips() asks how many leading rows and columns to skip; the next row
+	supplies the sample titles (everything after the identifier column). For
+	every data row the numeric cells are rounded to 3 decimals and their mean
+	and sample standard deviation are taken with getMean() / getSTD(); a cell
+	further than 2 standard deviations from the mean adds one mark to its
+	column. Cells that are missing or not numeric are ignored for that row.
+
+	The result is written to the current working directory as
+	<input base name>_Outlier<input extension> ('.txt' when there is none),
+	one line per sample: title, number of marks, and marks divided by the
+	number of data rows (rounded to 3 decimals), separated by tabs.
+
+	Args:
+		inputFile: path of the tab-delimited text file to read.
+
+	Any error is reported by printing its traceback; nothing is raised.
+	"""
+	try:
+		skipRow, skipCol = skips()
+		firstValue = skipCol + 1 # index of the first sample column
+		print('Outlier running')
+		# splitext() instead of str.replace('.txt', ...): a name without
+		# '.txt' would stay unchanged and the output could truncate the input
+		base, ext = os.path.splitext(os.path.split(inputFile)[1])
+		outputFile = base + '_Outlier' + (ext or '.txt')
+
+		# the with-block closes both files, also when an error occurs
+		with open(inputFile, 'r') as f, open(outputFile, 'w') as out:
+			for _ in range(skipRow):
+				f.readline()
+			# drop the identifier columns by position; a header without any
+			# tab simply yields no samples
+			sampleTitle = f.readline().rstrip().split('\t')[firstValue:]
+			sampleNum = len(sampleTitle)
+			marks = [0] * sampleNum
+			geneNum = 0
+
+			for data in f:
+				s = data.split('\t')
+				# Parse this row only: an unusable cell stays None so that it
+				# never carries over a number from an earlier row.
+				values = [None] * sampleNum
+				for k in range(sampleNum):
+					try:
+						values[k] = round(float(s[k + firstValue]), 3)
+					except (ValueError, IndexError):
+						pass
+
+				present = [v for v in values if v is not None]
+				# a spread needs at least two numbers
+				if len(present) > 1:
+					mean = getMean(present)
+					phi = getSTD(present)
+					for k, v in enumerate(values):
+						if v is not None and (v < (mean - 2.0 * phi) or v > (mean + 2.0 * phi)):
+							marks[k] += 1
+
+				geneNum += 1
+				if geneNum % 1000 == 0:
+					print('Outlier running, finished...' + str(geneNum) + ' lines')
+
+			for j in range(sampleNum):
+				# no data rows: report a fraction of 0 instead of dividing by zero
+				fraction = marks[j] * 1.0 / geneNum if geneNum else 0.0
+				out.write(sampleTitle[j] + '\t')
+				out.write(str(marks[j]) + '\t')
+				out.write(str(round(fraction, 3)))
+				out.write('\n')
+		print('Outlier finished')
+
+	except Exception:
+		print(traceback.format_exc())
+
+# adds dashes based on how long the filename is, just here to make the selection a little bit nicer
+def dashes(string):
+	"""Return a newline followed by a dash rule sized to `string`.
+
+	The rule is a fixed run of dashes plus one more dash for every character
+	of `string`.
+	"""
+	fixed = '-------------------------'
+	return '\n' + fixed + '-' * len(string)
+
+def main():
+	"""Run the interactive menu: pick a file, then apply operations to it.
+
+	The menu is shown again after every action until option 6 is entered.
+	Options 1-4 run average(), log2Normalize(), ZScoreNormalize() and
+	outlier() on the current file; option 5 picks another file.
+	"""
+	input('Press Enter to select file')
+	filename = fileSelector()
+	if not filename:
+		# the dialog was cancelled, so there is nothing to work on
+		print('No file selected')
+		return
+
+	while True:
+		file = os.path.split(filename)[1]
+		print('\nEnter Selection for file ' +  file + dashes(file))
+		print('1. Average\n2. Log2 Normalize\n3. ZScore Normalize\n4. Outlier\n5. Select a new file\n6. Exit')
+		select = input('Enter: ').strip()
+
+		if select == '1':
+			SE = input('Do you want to compute the standard error [Y/N] (default is no)').strip().lower()
+			average(filename, SE == 'y')
+		elif select == '2':
+			log2Normalize(filename)
+		elif select == '3':
+			ZScoreNormalize(filename)
+		elif select == '4':
+			outlier(filename)
+		elif select == '5':
+			chosen = fileSelector()
+			# a cancelled dialog keeps the file that is currently selected
+			if chosen:
+				filename = chosen
+		elif select == '6':
+			return
+
+
+
+
+# Only start the interactive menu when the file is executed as a script, so
+# that importing it does not block on input().
+if __name__ == '__main__':
+	main()
\ No newline at end of file
author	zsloan	2023-03-08 20:35:53 +0000
committer	zsloan	2023-03-08 20:35:53 +0000
commit	638ee60bc2930ebe3b73814dab6aa3b124211345 (patch)
tree	a32a93e6c0cbe89beddc68d276a950b9962c959d
parent	82f6bd3acd87c42275cb93bd8d9bc2cbdc8e9508 (diff)
download	genenetwork2-638ee60bc2930ebe3b73814dab6aa3b124211345.tar.gz