about summary refs log tree commit diff
diff options
context:
space:
mode:
authorzsloan2023-03-08 20:35:53 +0000
committerzsloan2023-03-08 20:35:53 +0000
commit638ee60bc2930ebe3b73814dab6aa3b124211345 (patch)
treea32a93e6c0cbe89beddc68d276a950b9962c959d
parent82f6bd3acd87c42275cb93bd8d9bc2cbdc8e9508 (diff)
downloadgenenetwork2-638ee60bc2930ebe3b73814dab6aa3b124211345.tar.gz
Add DataAnalyzer.py script to maintenance scripts
-rw-r--r--scripts/maintenance/DataAnalyzer.py470
1 files changed, 470 insertions, 0 deletions
diff --git a/scripts/maintenance/DataAnalyzer.py b/scripts/maintenance/DataAnalyzer.py
new file mode 100644
index 00000000..6d14c8da
--- /dev/null
+++ b/scripts/maintenance/DataAnalyzer.py
@@ -0,0 +1,470 @@
+# Created by Luis Del Mar for GeneNetwork.org

+# Email: ladelmar99@gmail.com

+# LinkedIn: https://www.linkedin.com/in/luis-del-mar/

+# GitHub: https://github.com/ladm99

+# This script performs the following normalizations to a specific tab delimited dataset:

+# 1. Average 2. Log2 normalize 3. ZScore Normaliza 4. Outlier detection 

+#!/usr/bin/env python

+

+from tkinter import Tk

+from tkinter.filedialog import askopenfilename

+import math

+import traceback

+import os

+

+# method for file selection

+def fileSelector():

+	# Code from: https://stackoverflow.com/questions/3579568/choosing-a-file-in-python-with-simple-dialog

+	Tk().withdraw() # we don't want a full GUI, so keep the root window from appearing

+	filename = askopenfilename() # show an "Open" dialog box and return the path to the selected file

+	return filename

+

+# method for getting skipRow and skipCol

+def skips():

+	while True:

+		skipRow = 0

+		skipCol = 0

+		r = input('Enter the number of rows that you wish skip (default is 0): ').strip()

+		c = input('Enter the number of columns that you wish skip (default is 0): ').strip()

+		try:

+			if r != '':

+				skipRow = int(r)

+			if c != '':

+				skipCol = int(c)

+			return skipRow, skipCol

+		except Exception as e:

+			input('Enter a valid value')

+

+# gets the mean of an array

+def getMean(values):

+	mean = 0.0

+	for i in values:

+		mean+=i

+	mean /= len(values)

+	return mean

+

+# gets the standard deviation of an array

+def getSTD(values):

+	mean = getMean(values)

+	phi = 0.0

+

+	for i in values:

+		phi += abs((i - mean)) ** 2

+	phi = math.sqrt((phi/(len(values) - 1)))

+	return phi

+

+

+def log2Normalize(inputFile):

+	try:

+		skipRow, skipCol = skips()

+		print(':Log2 normalize processing...looking for the minimal expression value')

+		f = open('%s' % inputFile, 'r')

+		# set min to max value

+		min = float('inf')

+

+		for i in range(skipRow + 1):

+			f.readline()

+		while True:

+			data =f.readline()

+			if not data:

+				break

+			else:

+				s = data.split('\t')

+				for j in range(skipCol + 1, len(s)):

+					value = 0.0

+					try:

+						value = float(s[j])

+						if min > value:

+							min = value

+					except Exception as e:

+						pass

+		f.close()

+		print('Log2 normalize processing...calculating')

+		f = open('%s' % inputFile, 'r')

+		outputFile = os.path.split(inputFile)[1].replace('.txt', '_log2.txt')

+		out = open(outputFile,'w')

+

+		offset = 1.0

+		if min < 0.0:

+			offset = -min + 1.0

+		for i in range(skipRow + 1):

+			out.write(f.readline())

+		linenum = 0

+		while True:

+			data = f.readline()

+			if not data:

+				break

+			else:

+				s = data.split('\t')

+				for j in range(skipCol + 1):

+					out.write(s[j] + '\t')

+				for j in range(skipCol + 1, len(s)):

+					try:

+						value = math.log(float(s[j]) + offset) / math.log(2.0)

+						out.write(str(round(value,3)) + '\t')

+					except Exception as e:

+						out.write('x')

+						out.write('\t')

+				out.write('\n')

+				linenum +=1

+				if linenum % 2500 == 0:

+					print('Log2 normalize processing...finished' + str(linenum) + ' lines')

+		f.close()

+		out.close()

+		print('Log2 normalize finished')

+	except Exception as e:

+		print(traceback.format_exc())

+

+def ZScoreNormalize(inputFile):

+	try:

+		skipRow, skipCol = skips()

+		print('ZScore normalize processing...calculating means')

+		f = open('%s' % inputFile, 'r')

+		for n in range(skipRow):

+			f.readline()

+		# skip first row which is just headers

+		s = f.readline().split('\t')	

+		col = len(s) - skipCol - 1

+		# mean and phi lists are filled with 0.00

+		mean = [0.00] * col

+		phi = [0.00] * col

+

+		row = 0

+

+		# read to the end of the file

+		while True:

+			data = 	f.readline()

+			if not data:

+				break

+			else:

+				# put values into a list

+				s = data.split('\t')

+				for m in range(skipCol + 1, len(s)):

+					try:

+						mean[m - skipCol - 1] = mean[m - skipCol - 1] + float(s[m])

+					except Exception as e:

+						pass

+				row+=1

+

+		f.close()

+

+		for m in range(col):

+			mean[m] = mean[m] / row

+		print('ZScore normalize processing...calculating standard divisions')

+		f = open('%s' % inputFile, 'r')

+		for k in range(skipRow + 1):

+			# skip headers

+			f.readline()

+		while True:

+			data = f.readline()

+			if not data:

+				break

+			else:

+				s = data.split('\t')

+				for i1 in range(skipCol + 1, len(s)):

+					value = 0.0

+					try:

+						value = float(s[i1])

+						phi[i1 - skipCol - 1] = phi[i1 - skipCol - 1] + (value - mean[i1 - skipCol - 1]) ** 2

+					except Exception as e:

+						pass

+					

+		f.close()

+		for j in range(col):

+			phi[j] = math.sqrt(phi[j] / (row - 1))

+			# print(str(mean[j]) + '\t' + str(phi[j]))

+

+		outputFile = os.path.split(inputFile)[1].replace('.txt', '_Z.txt')

+		f = open('%s' % inputFile, 'r')

+		out = open(outputFile,'w')

+

+		for i in range(skipRow + 1):

+			out.write(f.readline())

+

+		row = 0

+		while True:

+			data = f.readline()

+			if not data:

+				break

+			else:

+				s = data.split('\t')

+				for i1 in range(skipRow + 1):

+					out.write(s[i1] + '\t')

+				for i1 in range(skipCol + 1, len(s)):

+					try:

+						value = float(s[i1])

+						value = 2.0 * (value - mean[i1 - skipCol - 1]) / phi[i1 - skipCol - 1] + 8.0

+						out.write(str(round(value, 3)) + '\t')

+					except Exception as e:

+						out.write('x' + '\t')

+					

+

+				out.write('\n')

+				row+=1

+				if row % 2500 == 0:

+					print('ZScore normalize processing...finished ' + str(row) + ' lines')

+

+		f.close()

+		out.close()

+		print('ZScore normalize finished')

+	except Exception as e:

+		print(e)

+

+# find the average of values with column names that are the same

+def average(inputFile, needSE):

+	try:

+		skipRow, skipCol = skips()

+		f = open('%s' % inputFile, 'r')

+		outputFile = os.path.split(inputFile)[1].replace('.txt', '_Avg.txt')

+		out = open(outputFile,'w')

+

+		titleList = [] # will hold all of the titles, used for getting indices for values

+		itemList = [] # will hold all the unique titles

+

+		# skips the appropriate amount of rows

+		for i in range(skipRow):

+			s = f.readline()

+			out.write(s + '\n')

+

+		s = f.readline().split('\t')

+		for i in range(len(s)):

+			s[i] = s[i].strip()

+			titleList.append(s[i])

+			if s[i] not in itemList and i > skipCol:

+				itemList.append(s[i])

+

+		for i in range(skipCol + 1):

+			out.write(titleList[i])

+			out.write('\t')

+

+		for i in range(len(itemList)):

+			out.write(itemList[i])

+			if i < len(itemList) - 1:

+				out.write('\t')

+		out.write('\n')

+

+		line = 0

+		while True:

+			data = f.readline()

+			if not data:

+				break

+			else:

+				s = data.split('\t')

+				for j in range(skipCol + 1):

+					out.write(str(s[j]) + '\t')

+

+				for j in range(len(itemList)):

+					avgItemValue = 0.0

+					n = 0

+					at = titleList.index(itemList[j])

+					while(at >= 0):

+						try:

+							avgItemValue += float(s[at])

+							n+=1

+							at = titleList.index(itemList[j], at + 1)

+						except Exception as e:

+							at = -1

+					if n == 0:

+						out.write('x\t')

+					else:

+						avgItemValue /= n

+						out.write(str(round(avgItemValue, 4)))

+					if j < len(itemList) - 1:

+						out.write('\t')

+

+				out.write('\n')

+				line +=1

+

+				if line % 1000 == 0:

+					print('Processing average...' + str(line) + ' lines\n')

+

+		f.close()

+		out.close()

+	except Exception as e:

+		print(traceback.format_exc())

+

+	if needSE:

+		getSE(inputFile, skipRow, skipCol)

+

+# find the SE of the average of the values

+def getSE(inputFile, skipRow, skipCol):

+	try:

+		f = open('%s' % inputFile, 'r')

+		outputFile = os.path.split(inputFile)[1].replace('.txt', '_Avg_SE.txt')

+		out = open(outputFile,'w')

+

+		titleList = [] # will hold all of the titles, used for getting indices for values

+		itemList = [] # will hold all the unique titles

+

+		# skips the appropriate amount of rows

+		for i in range(skipRow):

+			s = f.readline()

+			out.write(s + '\n')

+

+		s = f.readline().split('\t')

+		for i in range(len(s)):

+			s[i] = s[i].strip()

+			titleList.append(s[i])

+			if s[i] not in itemList and i > skipCol:

+				itemList.append(s[i])

+

+		for i in range(skipCol + 1):

+			out.write(titleList[i])

+			out.write('\t')

+

+		for i in range(len(itemList)):

+			out.write(itemList[i])

+			if i < len(itemList) - 1:

+				out.write('\t')

+		out.write('\n')

+

+		line = 0

+		while True:

+			data = f.readline()

+			if not data:

+				break

+			else:

+				s = data.split('\t')

+				for j in range(skipCol + 1):

+					out.write(str(s[j]) + '\t')

+

+				for j in range(len(itemList)):

+					avgItemValue = 0.0

+					n = 0

+					at = titleList.index(itemList[j])

+					while(at >= 0):

+						try:

+							avgItemValue += float(s[at])

+							n+=1

+							at = titleList.index(itemList[j], at + 1)

+						except Exception as e:

+							at = -1

+					if n == 0:

+						out.write('x\t')

+					else:

+						avgItemValue /= n

+						SE = 0.0

+						n = 0

+						at = titleList.index(itemList[j])

+						while at >= 0:

+							try:

+								SE += (avgItemValue - float(s[at])) * (avgItemValue - float(s[at]))

+								n +=1

+								at = titleList.index(itemList[j], at + 1)

+							except Exception as e:

+								at = -1

+

+						if n > 1:

+							SE = math.sqrt(SE / (n - 1))

+							SE /= math.sqrt((n-1))

+							out.write(str(round(SE, 8)))

+							out.write('\t')

+						else:

+							out.write('\t')

+

+				out.write('\n')

+				line +=1

+

+				if line % 1000 == 0:

+					print('Processing average... Standard Error... ' + str(line) + ' lines\n')

+

+		f.close()

+		out.close()

+	except Exception as e:

+		print(traceback.format_exc())

+

+# find outliers

+def outlier(inputFile):

+	try:

+		skipRow, skipCol = skips()

+		print('Outlier running')

+		f = open('%s' % inputFile, 'r')

+		outputFile = os.path.split(inputFile)[1].replace('.txt', '_Outlier.txt')

+		out = open(outputFile,'w')

+

+		for i in range(skipRow):

+			f.readline()

+		data = f.readline()

+		for i in range(skipCol + 1):

+			if data.index('\t') >= 0:

+				data = data[data.index('\t'):]

+			data = data.strip()

+

+		sampleTitle = data.split('\t')

+		sampleNum = len(sampleTitle)

+		values = [0.0] * sampleNum

+		marks = [0] * sampleNum

+		geneNum = 0

+

+		while True:

+			data = f.readline()

+			if not data:

+				break

+			else:

+				s = data.split('\t')

+				for k in range(sampleNum):

+					try:

+						values[k] = round(float(s[k + skipCol + 1]), 3)

+

+					except Exception as e:

+						pass

+

+				mean = getMean(values)

+				phi = getSTD(values)

+				for k in range(sampleNum):

+					if values[k] < (mean - 2.0 * phi) or values[k] > (mean + 2.0 * phi):

+						marks[k] += 1

+				

+				geneNum += 1	

+				if geneNum % 1000 == 0:

+					print('Outlier running, finished...' + str(geneNum) + ' lines')

+

+		for j in range(sampleNum):

+			out.write(sampleTitle[j] + '\t')

+			out.write(str(marks[j]) + '\t')

+			out.write(str(round(marks[j] * 1.0 / geneNum, 3)))

+			out.write('\n')

+		out.close

+		print('Outlier finished')

+

+	except Exception as e:

+		print(traceback.format_exc())

+

+# adds dashes based on how long the filename is, just here to make the selection a little bit nicer

+def dashes(string):

+	dash = '-------------------------'

+	for i in range(len(string)):

+		dash+='-'

+	return '\n' + dash

+

+def main():

+	input('Press Enter to select file')

+	filename = fileSelector()

+

+	while True:

+		file = os.path.split(filename)[1]

+		print('\nEnter Selection for file ' +  file + dashes(file))

+		print('1. Average\n2. Log2 Normalize\n3. ZScore Normalize\n4. Outlier\n5. Select a new file\n6. Exit')

+		select = input('Enter: ')

+

+		if select == '1':

+			needSE = False

+			SE = input('Do you want to compute the standard error [Y/N] (default is no)').lower()

+			if SE == 'y':

+				needSE = True

+			average(filename, needSE)

+		elif select == '2':

+			log2Normalize(filename)

+		elif select == '3':

+			ZScoreNormalize(filename)

+		elif select == '4':

+			outlier(filename)

+		elif select == '5':

+			filename = fileSelector()

+		elif select == '6':

+			exit()

+

+

+

+

+main()
\ No newline at end of file