aboutsummaryrefslogtreecommitdiff
path: root/scripts
diff options
context:
space:
mode:
Diffstat (limited to 'scripts')
-rw-r--r--scripts/maintenance/DataAnalyzer.py470
1 files changed, 470 insertions, 0 deletions
diff --git a/scripts/maintenance/DataAnalyzer.py b/scripts/maintenance/DataAnalyzer.py
new file mode 100644
index 00000000..6d14c8da
--- /dev/null
+++ b/scripts/maintenance/DataAnalyzer.py
@@ -0,0 +1,470 @@
+# Created by Luis Del Mar for GeneNetwork.org
+# Email: ladelmar99@gmail.com
+# LinkedIn: https://www.linkedin.com/in/luis-del-mar/
+# GitHub: https://github.com/ladm99
+# This script performs the following normalizations to a specific tab delimited dataset:
+# 1. Average 2. Log2 normalize 3. ZScore Normalize 4. Outlier detection
+#!/usr/bin/env python
+
+from tkinter import Tk
+from tkinter.filedialog import askopenfilename
+import math
+import traceback
+import os
+
def fileSelector():
    """Show an "Open" file dialog and return the path the user picked.

    Returns:
        The selected path as returned by ``askopenfilename()``; tkinter
        returns an empty value when the dialog is cancelled.
    """
    # Code adapted from:
    # https://stackoverflow.com/questions/3579568/choosing-a-file-in-python-with-simple-dialog
    root = Tk()
    root.withdraw()  # we don't want a full GUI, so keep the root window from appearing
    try:
        filename = askopenfilename()  # show an "Open" dialog box
    finally:
        # A new root is created on every call; destroy it so hidden
        # windows do not pile up when several files are selected.
        root.destroy()
    return filename
+
def skips():
    """Ask the user how many leading rows and columns to skip.

    An empty answer means 0. Both prompts are repeated until both answers
    are valid, non-negative integers.

    Returns:
        A ``(skipRow, skipCol)`` tuple of ints.
    """
    while True:
        r = input('Enter the number of rows that you wish skip (default is 0): ').strip()
        c = input('Enter the number of columns that you wish skip (default is 0): ').strip()
        try:
            skipRow = int(r) if r else 0
            skipCol = int(c) if c else 0
        except ValueError:
            # Report the problem without blocking: the old input() call here
            # waited for a keypress and silently consumed the next answer.
            print('Enter a valid value')
            continue
        if skipRow < 0 or skipCol < 0:
            # Negative counts would produce negative column indices downstream.
            print('Enter a valid value')
            continue
        return skipRow, skipCol
+
def getMean(values):
    """Return the arithmetic mean of the numbers in *values*.

    Args:
        values: a sized collection of numbers.

    Raises:
        ZeroDivisionError: if *values* is empty.
    """
    # fsum avoids the rounding drift of a naive running float total.
    return math.fsum(values) / len(values)
+
def getSTD(values):
    """Return the sample standard deviation (n - 1 denominator) of *values*.

    Args:
        values: a sized collection of numbers.

    Returns:
        The standard deviation as a float. With fewer than two values the
        spread is undefined, so 0.0 is returned instead of dividing by zero.
    """
    n = len(values)
    if n < 2:
        return 0.0
    mean = math.fsum(values) / n
    squared = math.fsum((v - mean) ** 2 for v in values)
    return math.sqrt(squared / (n - 1))
+
+
def log2Normalize(inputFile):
    """Write a log2-transformed copy of a tab-delimited data file.

    The user is asked (via ``skips()``) how many leading rows and columns to
    skip. ``skipRow + 1`` lines are copied unchanged, and the first
    ``skipCol + 1`` fields of every data row are kept as labels. Every
    remaining numeric field ``v`` becomes ``round(log2(v + offset), 3)``,
    where ``offset`` is 1, or ``1 - min`` when the smallest value in the file
    is negative. Non-numeric fields are written as ``x``.

    The result is written to the current working directory as
    ``<name>_log2<ext>``. Fields are joined by single tabs with no trailing
    tab, and empty lines are skipped.

    Args:
        inputFile: path of the tab-delimited file to read.
    """
    try:
        skipRow, skipCol = skips()
        firstValue = skipCol + 1  # index of the first data column
        print(':Log2 normalize processing...looking for the minimal expression value')

        # Pass 1: find the smallest numeric value so the offset can be chosen.
        smallest = float('inf')
        with open(inputFile, 'r') as f:
            for _ in range(skipRow + 1):
                f.readline()
            for data in f:
                for field in data.split('\t')[firstValue:]:
                    try:
                        value = float(field)
                    except ValueError:
                        continue  # non-numeric cell
                    if value < smallest:
                        smallest = value

        print('Log2 normalize processing...calculating')
        # splitext keeps the output name distinct from the input even when the
        # input does not end in ".txt"; otherwise the input could be truncated.
        root, ext = os.path.splitext(os.path.split(inputFile)[1])
        outputFile = root + '_log2' + (ext or '.txt')

        offset = 1.0
        if smallest < 0.0:
            offset = -smallest + 1.0

        # Pass 2: write the transformed rows.
        with open(inputFile, 'r') as f, open(outputFile, 'w') as out:
            for _ in range(skipRow + 1):
                out.write(f.readline())
            linenum = 0
            for data in f:
                text = data.rstrip('\r\n')
                if not text:
                    continue
                fields = text.split('\t')
                row = fields[:firstValue]
                for field in fields[firstValue:]:
                    try:
                        value = math.log2(float(field) + offset)
                        row.append(str(round(value, 3)))
                    except ValueError:
                        # not a number, or outside the domain of log
                        row.append('x')
                out.write('\t'.join(row) + '\n')
                linenum += 1
                if linenum % 2500 == 0:
                    print('Log2 normalize processing...finished ' + str(linenum) + ' lines')
        print('Log2 normalize finished')
    except Exception:
        # Interactive tool: report the failure and return to the menu.
        print(traceback.format_exc())
+
def ZScoreNormalize(inputFile):
    """Write a column-wise Z-score-normalized copy of a tab-delimited file.

    The user is asked (via ``skips()``) how many leading rows and columns to
    skip. For every data column the mean and sample standard deviation are
    computed over the numeric cells of that column only. Each numeric value
    ``v`` is then written as ``round(2 * (v - mean) / sd + 8, 3)``.
    Cells that are non-numeric, lie beyond the header width, or belong to a
    column with zero deviation are written as ``x``.

    The result is written to the current working directory as
    ``<name>_Z<ext>``. ``skipRow + 1`` lines are copied unchanged, and the
    first ``skipCol + 1`` fields of every data row are kept as labels.

    Args:
        inputFile: path of the tab-delimited file to read.
    """
    try:
        skipRow, skipCol = skips()
        firstValue = skipCol + 1  # index of the first data column
        print('ZScore normalize processing...calculating means')

        # Pass 1: per-column sums and counts of numeric cells.
        with open(inputFile, 'r') as f:
            for _ in range(skipRow):
                f.readline()
            header = f.readline().split('\t')
            col = max(len(header) - firstValue, 0)
            total = [0.0] * col
            count = [0] * col
            for data in f:
                cells = data.split('\t')[firstValue:firstValue + col]
                for idx, field in enumerate(cells):
                    try:
                        value = float(field)
                    except ValueError:
                        continue
                    total[idx] += value
                    count[idx] += 1
        # Divide by the number of numeric cells, not by the number of rows,
        # so missing values do not drag the mean towards zero.
        mean = [total[j] / count[j] if count[j] else 0.0 for j in range(col)]

        print('ZScore normalize processing...calculating standard deviations')
        # Pass 2: per-column sums of squared deviations.
        squares = [0.0] * col
        with open(inputFile, 'r') as f:
            for _ in range(skipRow + 1):
                f.readline()
            for data in f:
                cells = data.split('\t')[firstValue:firstValue + col]
                for idx, field in enumerate(cells):
                    try:
                        value = float(field)
                    except ValueError:
                        continue
                    squares[idx] += (value - mean[idx]) ** 2
        phi = [
            math.sqrt(squares[j] / (count[j] - 1)) if count[j] > 1 else 0.0
            for j in range(col)
        ]

        root, ext = os.path.splitext(os.path.split(inputFile)[1])
        outputFile = root + '_Z' + (ext or '.txt')

        # Pass 3: write the normalized rows.
        with open(inputFile, 'r') as f, open(outputFile, 'w') as out:
            for _ in range(skipRow + 1):
                out.write(f.readline())
            row = 0
            for data in f:
                text = data.rstrip('\r\n')
                if not text:
                    continue
                fields = text.split('\t')
                # Label columns are governed by skipCol (not skipRow).
                cells = fields[:firstValue]
                for idx, field in enumerate(fields[firstValue:]):
                    try:
                        value = float(field)
                        value = 2.0 * (value - mean[idx]) / phi[idx] + 8.0
                        cells.append(str(round(value, 3)))
                    except (ValueError, IndexError, ZeroDivisionError):
                        cells.append('x')
                out.write('\t'.join(cells) + '\n')
                row += 1
                if row % 2500 == 0:
                    print('ZScore normalize processing...finished ' + str(row) + ' lines')
        print('ZScore normalize finished')
    except Exception:
        # Interactive tool: report the failure and return to the menu.
        print(traceback.format_exc())
+
def average(inputFile, needSE):
    """Average the data columns that share the same header title.

    The user is asked (via ``skips()``) how many leading rows and columns to
    skip. The first ``skipRow`` lines are copied unchanged. The next line is
    the header: its first ``skipCol + 1`` titles are label columns, and the
    remaining titles are grouped by name. For every data row, each group is
    replaced by the mean of its numeric cells, rounded to 4 places, or by
    ``x`` if the group has no numeric cell in that row.

    The result is written to the current working directory as
    ``<name>_Avg<ext>``.

    Args:
        inputFile: path of the tab-delimited file to read.
        needSE: when true, ``getSE`` is run afterwards with the same skip
            settings (only if averaging itself succeeded).
    """
    try:
        skipRow, skipCol = skips()
        firstValue = skipCol + 1  # index of the first data column
        root, ext = os.path.splitext(os.path.split(inputFile)[1])
        outputFile = root + '_Avg' + (ext or '.txt')

        with open(inputFile, 'r') as f, open(outputFile, 'w') as out:
            # Lines read from the file already end in a newline.
            for _ in range(skipRow):
                out.write(f.readline())

            titles = [title.strip() for title in f.readline().split('\t')]
            # Unique title -> indices of the data columns carrying it,
            # in order of first appearance.
            groups = {}
            for idx, title in enumerate(titles):
                if idx > skipCol:
                    groups.setdefault(title, []).append(idx)
            out.write('\t'.join(titles[:firstValue] + list(groups)) + '\n')

            line = 0
            for data in f:
                text = data.rstrip('\r\n')
                if not text:
                    continue
                fields = text.split('\t')
                row = fields[:firstValue]
                for columns in groups.values():
                    total = 0.0
                    n = 0
                    for idx in columns:
                        try:
                            total += float(fields[idx])
                        except (ValueError, IndexError):
                            # Skip this replicate only; keep scanning the rest.
                            continue
                        n += 1
                    row.append(str(round(total / n, 4)) if n else 'x')
                out.write('\t'.join(row) + '\n')
                line += 1

                if line % 1000 == 0:
                    print('Processing average...' + str(line) + ' lines\n')
    except Exception:
        print(traceback.format_exc())
        return  # skip settings may be undefined; do not attempt the SE pass

    if needSE:
        getSE(inputFile, skipRow, skipCol)
+
def getSE(inputFile, skipRow, skipCol):
    """Write the standard error of the mean for columns sharing a title.

    The first ``skipRow`` lines are copied unchanged. The next line is the
    header: its first ``skipCol + 1`` titles are label columns, and the
    remaining titles are grouped by name. For every data row and every
    group, the cell written depends on the number ``n`` of numeric
    replicates:

    * ``n == 0`` -> ``x``
    * ``n == 1`` -> empty cell (no spread can be estimated)
    * ``n > 1``  -> ``round(sd / sqrt(n), 8)`` with ``sd`` the sample
      standard deviation (n - 1 denominator)

    The result is written to the current working directory as
    ``<name>_Avg_SE<ext>``.

    Args:
        inputFile: path of the tab-delimited file to read.
        skipRow: number of leading lines before the header line.
        skipCol: number of leading columns to skip; one further column is
            always treated as a label column.
    """
    try:
        firstValue = skipCol + 1  # index of the first data column
        root, ext = os.path.splitext(os.path.split(inputFile)[1])
        outputFile = root + '_Avg_SE' + (ext or '.txt')

        with open(inputFile, 'r') as f, open(outputFile, 'w') as out:
            # Lines read from the file already end in a newline.
            for _ in range(skipRow):
                out.write(f.readline())

            titles = [title.strip() for title in f.readline().split('\t')]
            # Unique title -> indices of the data columns carrying it,
            # in order of first appearance.
            groups = {}
            for idx, title in enumerate(titles):
                if idx > skipCol:
                    groups.setdefault(title, []).append(idx)
            out.write('\t'.join(titles[:firstValue] + list(groups)) + '\n')

            line = 0
            for data in f:
                text = data.rstrip('\r\n')
                if not text:
                    continue
                fields = text.split('\t')
                row = fields[:firstValue]
                for columns in groups.values():
                    observed = []
                    for idx in columns:
                        try:
                            observed.append(float(fields[idx]))
                        except (ValueError, IndexError):
                            # Skip this replicate only; keep scanning the rest.
                            pass
                    n = len(observed)
                    if n == 0:
                        row.append('x')
                    elif n == 1:
                        row.append('')
                    else:
                        avgItemValue = math.fsum(observed) / n
                        squared = math.fsum((avgItemValue - v) ** 2 for v in observed)
                        # Standard error of the mean: sample SD over sqrt(n).
                        SE = math.sqrt(squared / (n - 1)) / math.sqrt(n)
                        row.append(str(round(SE, 8)))
                out.write('\t'.join(row) + '\n')
                line += 1

                if line % 1000 == 0:
                    print('Processing average... Standard Error... ' + str(line) + ' lines\n')
    except Exception:
        # Interactive tool: report the failure and return to the caller.
        print(traceback.format_exc())
+
def outlier(inputFile):
    """Count, per sample column, how often its value is a row-wise outlier.

    The user is asked (via ``skips()``) how many leading rows and columns to
    skip. After ``skipRow`` lines comes the header; its fields after the
    first ``skipCol + 1`` are the sample titles. For every data row the
    numeric cells (rounded to 3 places) are collected, and a sample is marked
    when its value lies more than two standard deviations from the row mean.
    Rows with fewer than two numeric cells are counted but mark nothing.

    One line per sample is written to ``<name>_Outlier<ext>`` in the current
    working directory: title, number of marks, and marks divided by the
    number of data rows (rounded to 3 places; 0.0 if there are no rows).

    Args:
        inputFile: path of the tab-delimited file to read.
    """
    try:
        skipRow, skipCol = skips()
        firstValue = skipCol + 1  # index of the first sample column
        print('Outlier running')
        root, ext = os.path.splitext(os.path.split(inputFile)[1])
        outputFile = root + '_Outlier' + (ext or '.txt')

        with open(inputFile, 'r') as f, open(outputFile, 'w') as out:
            for _ in range(skipRow):
                f.readline()
            # Drop exactly skipCol + 1 label titles from the header.
            sampleTitle = f.readline().rstrip().split('\t')[firstValue:]
            sampleNum = len(sampleTitle)
            marks = [0] * sampleNum
            geneNum = 0

            for data in f:
                text = data.rstrip('\r\n')
                if not text:
                    continue
                cells = text.split('\t')[firstValue:firstValue + sampleNum]
                # Sample index -> value, rebuilt for every row so a cell that
                # fails to parse never inherits the previous row's value.
                values = {}
                for k, field in enumerate(cells):
                    try:
                        values[k] = round(float(field), 3)
                    except ValueError:
                        pass

                if len(values) > 1:
                    observed = list(values.values())
                    mean = getMean(observed)
                    phi = getSTD(observed)
                    low = mean - 2.0 * phi
                    high = mean + 2.0 * phi
                    for k, value in values.items():
                        if value < low or value > high:
                            marks[k] += 1

                geneNum += 1
                if geneNum % 1000 == 0:
                    print('Outlier running, finished...' + str(geneNum) + ' lines')

            for title, mark in zip(sampleTitle, marks):
                fraction = round(mark * 1.0 / geneNum, 3) if geneNum else 0.0
                out.write(title + '\t' + str(mark) + '\t' + str(fraction) + '\n')
        print('Outlier finished')

    except Exception:
        # Interactive tool: report the failure and return to the menu.
        print(traceback.format_exc())
+
def dashes(string):
    """Return a newline followed by a dashed rule sized to *string*.

    The rule is the fixed dash prefix below plus one dash per character of
    *string*; it is only there to make the selection menu look nicer.
    """
    dash = '-------------------------'
    return '\n' + dash + '-' * len(string)
+
def main():
    """Run the interactive menu: pick a file, then apply operations to it.

    The menu is shown repeatedly until the user chooses "Exit". Unknown
    selections simply show the menu again. If the initial file dialog is
    cancelled the function returns; if a later "Select a new file" dialog is
    cancelled the previously selected file is kept.
    """
    input('Press Enter to select file')
    filename = fileSelector()
    if not filename:
        print('No file selected')
        return

    while True:
        shortName = os.path.split(filename)[1]
        print('\nEnter Selection for file ' + shortName + dashes(shortName))
        print('1. Average\n2. Log2 Normalize\n3. ZScore Normalize\n4. Outlier\n5. Select a new file\n6. Exit')
        select = input('Enter: ').strip()

        if select == '1':
            SE = input('Do you want to compute the standard error [Y/N] (default is no)').strip().lower()
            average(filename, SE == 'y')
        elif select == '2':
            log2Normalize(filename)
        elif select == '3':
            ZScoreNormalize(filename)
        elif select == '4':
            outlier(filename)
        elif select == '5':
            chosen = fileSelector()
            if chosen:
                filename = chosen
        elif select == '6':
            # Returning ends the program without relying on the
            # interactive-shell helper exit().
            return
+
+
+
+
# Start the menu only when run as a script, so the module can be imported
# without launching the interactive UI.
if __name__ == '__main__':
    main()