From ae8b46b4b06a46b92fca5e6b567e6795f1b5dcca Mon Sep 17 00:00:00 2001 From: Zachary Sloan Date: Tue, 2 Apr 2013 23:01:22 +0000 Subject: Finished adding code that uses the plink genotype data if a human dataset and json data if any other species --- .../wqflask/marker_regression/marker_regression.py | 149 +++++--------------- wqflask/wqflask/my_pylmm/pyLMM/input.py | 20 +-- wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 156 ++++++++++++++++----- 3 files changed, 166 insertions(+), 159 deletions(-) diff --git a/wqflask/wqflask/marker_regression/marker_regression.py b/wqflask/wqflask/marker_regression/marker_regression.py index a640d37f..f67c595a 100755 --- a/wqflask/wqflask/marker_regression/marker_regression.py +++ b/wqflask/wqflask/marker_regression/marker_regression.py @@ -62,128 +62,55 @@ class MarkerRegression(object): def gen_data(self, tempdata): """Generates p-values for each marker""" + self.dataset.group.get_markers() + + pheno_vector = np.array([val == "x" and np.nan or float(val) for val in self.vals]) + + if self.dataset.group.species == "human": + p_values, t_stats = self.gen_human_results(pheno_vector) + else: + genotype_data = [marker['genotypes'] for marker in self.dataset.group.markers.markers] + + no_val_samples = self.identify_empty_samples() + trimmed_genotype_data = self.trim_genotypes(genotype_data, no_val_samples) + + genotype_matrix = np.array(trimmed_genotype_data).T + + print("pheno_vector is: ", pf(pheno_vector)) + print("genotype_matrix is: ", pf(genotype_matrix)) + + t_stats, p_values = lmm.run( + pheno_vector, + genotype_matrix, + restricted_max_likelihood=True, + refit=False, + temp_data=tempdata + ) + + self.dataset.group.markers.add_pvalues(p_values) + self.qtl_results = self.dataset.group.markers.markers + + + def gen_human_results(self, pheno_vector): file_base = os.path.join(webqtlConfig.PYLMM_PATH, self.dataset.group.name) plink_input = input.plink(file_base, type='b') - - pheno_vector = np.array([val == "x" and np.nan or float(val) for val in self.vals]) pheno_vector = pheno_vector.reshape((len(pheno_vector), 1)) covariate_matrix = np.ones((pheno_vector.shape[0],1)) kinship_matrix = np.fromfile(open(file_base + '.kin','r'),sep=" ") kinship_matrix.resize((len(plink_input.indivs),len(plink_input.indivs))) - - refit = False - - v = np.isnan(pheno_vector) - keep = True - v - keep = keep.reshape((len(keep),)) - eigen_values = [] - eigen_vectors = [] - - #print("pheno_vector is: ", pf(pheno_vector)) - #print("kinship_matrix is: ", pf(kinship_matrix)) - - if v.sum(): - pheno_vector = pheno_vector[keep] - print("pheno_vector shape is now: ", pf(pheno_vector.shape)) - covariate_matrix = covariate_matrix[keep,:] - print("kinship_matrix shape is: ", pf(kinship_matrix.shape)) - print("len(keep) is: ", pf(keep.shape)) - kinship_matrix = kinship_matrix[keep,:][:,keep] - - #if not v.sum(): - # eigen_values = np.fromfile(file_base + ".kin.kva") - # eigen_vectors = np.fromfile(file_base + ".kin.kve") - - n = kinship_matrix.shape[0] - lmm_ob = lmm.LMM(pheno_vector, - kinship_matrix, - eigen_values, - eigen_vectors, - covariate_matrix) - lmm_ob.fit() - - # Buffers for pvalues and t-stats - p_values = [] - t_statistics = [] - count = 0 - - plink_input.getSNPIterator() - print("# snps is: ", pf(plink_input.numSNPs)) - with Bench("snp iterator loop"): - for snp, this_id in plink_input: - if count > 1000: - break - count += 1 - - x = snp[keep].reshape((n,1)) - #x[[1,50,100,200,3000],:] = np.nan - v = np.isnan(x).reshape((-1,)) - - # Check SNPs for missing values - if v.sum(): - keeps = True - v - xs = x[keeps,:] - # If no variation at this snp or all genotypes missing - if keeps.sum() <= 1 or xs.var() <= 1e-6: - p_values.append(np.nan) - t_statistics.append(np.nan) - continue - - # Its ok to center the genotype - I used options.normalizeGenotype to - # force the removal of missing genotypes as opposed to replacing them with MAF. - - #if not options.normalizeGenotype: - # xs = (xs - xs.mean()) / np.sqrt(xs.var()) - - filtered_pheno = pheno_vector[keeps] - filtered_covariate_matrix = covariate_matrix[keeps,:] - filtered_kinship_matrix = kinship_matrix[keeps,:][:,keeps] - filtered_lmm_ob = lmm.LMM(filtered_pheno,filtered_kinship_matrix,X0=filtered_covariate_matrix) - if refit: - filtered_lmm_ob.fit(X=xs) - else: - #try: - filtered_lmm_ob.fit() - #except: pdb.set_trace() - ts,ps,beta,betaVar = Ls.association(xs,returnBeta=True) - else: - if x.var() == 0: - p_values.append(np.nan) - t_statistics.append(np.nan) - continue - - if refit: - lmm_ob.fit(X=x) - ts,ps,beta,betaVar = lmm_ob.association(x) - p_values.append(ps) - t_statistics.append(ts) - - #genotype_data = [marker['genotypes'] for marker in self.dataset.group.markers.markers] - # - #no_val_samples = self.identify_empty_samples() - #trimmed_genotype_data = self.trim_genotypes(genotype_data, no_val_samples) - # - #genotype_matrix = np.array(trimmed_genotype_data).T - # - #print("pheno_vector is: ", pf(pheno_vector)) - #print("genotype_matrix is: ", pf(genotype_matrix)) - # - #t_stats, p_values = lmm.run( - # pheno_vector, - # genotype_matrix, - # restricted_max_likelihood=True, - # refit=False, - # temp_data=tempdata - #) - - self.dataset.group.get_markers() - self.dataset.group.markers.add_pvalues(p_values) - self.qtl_results = self.dataset.group.markers.markers + p_values, t_stats = lmm.run_human( + pheno_vector, + covariate_matrix, + plink_input, + kinship_matrix + ) + return p_values, t_stats + def identify_empty_samples(self): no_val_samples = [] diff --git a/wqflask/wqflask/my_pylmm/pyLMM/input.py b/wqflask/wqflask/my_pylmm/pyLMM/input.py index 35662072..dfe28a4e 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/input.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/input.py @@ -40,7 +40,7 @@ class plink: # NOW I want to use this module to just read SNPs so I'm allowing # the programmer to turn off the kinship reading. self.readKFile = readKFile - + if self.kFile: self.K = self.readKinship(self.kFile) elif os.path.isfile("%s.kin" % fbase): @@ -50,21 +50,21 @@ class plink: else: self.kFile = None self.K = None - + self.getPhenos(self.phenoFile) - + self.fhandle = None self.snpFileHandle = None def __del__(self): if self.fhandle: self.fhandle.close() if self.snpFileHandle: self.snpFileHandle.close() - + def getSNPIterator(self): if not self.type == 'b': sys.stderr.write("Have only implemented this for binary plink files (bed)\n") return - + # get the number of snps file = self.fbase + '.bim' i = 0 @@ -74,10 +74,10 @@ class plink: self.numSNPs = i self.have_read = 0 self.snpFileHandle = open(file,'r') - + self.BytestoRead = self.N / 4 + (self.N % 4 and 1 or 0) self._formatStr = 'c'*self.BytestoRead - + file = self.fbase + '.bed' self.fhandle = open(file,'rb') @@ -86,12 +86,12 @@ class plink: if not order == '\x01': sys.stderr.write("This is not in SNP major order - you did not handle this case\n") raise StopIteration - + return self - + def __iter__(self): return self.getSNPIterator() - + def next(self): if self.have_read == self.numSNPs: raise StopIteration diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py index f1f195d6..e60f7b02 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py @@ -26,42 +26,122 @@ from scipy import stats from pprint import pformat as pf -#from utility.benchmark import Bench -# -##np.seterr('raise') -# -#def run(pheno_vector, -# genotype_matrix, -# restricted_max_likelihood=True, -# refit=False, -# temp_data=None): -# """Takes the phenotype vector and genotype matrix and returns a set of p-values and t-statistics -# -# restricted_max_likelihood -- whether to use restricted max likelihood; True or False -# refit -- whether to refit the variance component for each marker -# temp_data -- TempData object that stores the progress for each major step of the -# calculations ("calculate_kinship" and "GWAS" take the majority of time) -# -# """ -# -# with Bench("Calculate Kinship"): -# kinship_matrix = calculate_kinship(genotype_matrix, temp_data) -# -# with Bench("Create LMM object"): -# lmm_ob = LMM(pheno_vector, kinship_matrix) -# -# with Bench("LMM_ob fitting"): -# lmm_ob.fit() -# -# with Bench("Doing GWAS"): -# t_stats, p_values = GWAS(pheno_vector, -# genotype_matrix, -# kinship_matrix, -# restricted_max_likelihood=True, -# refit=False, -# temp_data=temp_data) -# Bench().report() -# return t_stats, p_values +from utility.benchmark import Bench + +#np.seterr('raise') + +def run_human(pheno_vector, + covariate_matrix, + plink_input, + kinship_matrix, + refit=False, + temp_data=None): + + v = np.isnan(pheno_vector) + keep = True - v + keep = keep.reshape((len(keep),)) + + if v.sum(): + pheno_vector = pheno_vector[keep] + print("pheno_vector shape is now: ", pf(pheno_vector.shape)) + covariate_matrix = covariate_matrix[keep,:] + print("kinship_matrix shape is: ", pf(kinship_matrix.shape)) + print("len(keep) is: ", pf(keep.shape)) + kinship_matrix = kinship_matrix[keep,:][:,keep] + + n = kinship_matrix.shape[0] + lmm_ob = LMM(pheno_vector, + kinship_matrix, + covariate_matrix) + lmm_ob.fit() + + # Buffers for pvalues and t-stats + p_values = [] + t_stats = [] + count = 0 + with Bench("snp iterator loop"): + for snp, this_id in plink_input: + if count > 1000: + break + count += 1 + + x = snp[keep].reshape((n,1)) + #x[[1,50,100,200,3000],:] = np.nan + v = np.isnan(x).reshape((-1,)) + + # Check SNPs for missing values + if v.sum(): + keeps = True - v + xs = x[keeps,:] + # If no variation at this snp or all genotypes missing + if keeps.sum() <= 1 or xs.var() <= 1e-6: + p_values.append(np.nan) + t_stats.append(np.nan) + continue + + # Its ok to center the genotype - I used options.normalizeGenotype to + # force the removal of missing genotypes as opposed to replacing them with MAF. + + #if not options.normalizeGenotype: + # xs = (xs - xs.mean()) / np.sqrt(xs.var()) + + filtered_pheno = pheno_vector[keeps] + filtered_covariate_matrix = covariate_matrix[keeps,:] + filtered_kinship_matrix = kinship_matrix[keeps,:][:,keeps] + filtered_lmm_ob = lmm.LMM(filtered_pheno,filtered_kinship_matrix,X0=filtered_covariate_matrix) + if refit: + filtered_lmm_ob.fit(X=xs) + else: + #try: + filtered_lmm_ob.fit() + #except: pdb.set_trace() + ts,ps,beta,betaVar = Ls.association(xs,returnBeta=True) + else: + if x.var() == 0: + p_values.append(np.nan) + t_stats.append(np.nan) + continue + if refit: + lmm_ob.fit(X=x) + ts, ps, beta, betaVar = lmm_ob.association(x) + p_values.append(ps) + t_stats.append(ts) + + return p_values, t_stats + + +def run(pheno_vector, + genotype_matrix, + restricted_max_likelihood=True, + refit=False, + temp_data=None): + """Takes the phenotype vector and genotype matrix and returns a set of p-values and t-statistics + + restricted_max_likelihood -- whether to use restricted max likelihood; True or False + refit -- whether to refit the variance component for each marker + temp_data -- TempData object that stores the progress for each major step of the + calculations ("calculate_kinship" and "GWAS" take the majority of time) + + """ + + with Bench("Calculate Kinship"): + kinship_matrix = calculate_kinship(genotype_matrix, temp_data) + + with Bench("Create LMM object"): + lmm_ob = LMM(pheno_vector, kinship_matrix) + + with Bench("LMM_ob fitting"): + lmm_ob.fit() + + with Bench("Doing GWAS"): + t_stats, p_values = GWAS(pheno_vector, + genotype_matrix, + kinship_matrix, + restricted_max_likelihood=True, + refit=False, + temp_data=temp_data) + Bench().report() + return t_stats, p_values def matrixMult(A,B): @@ -212,7 +292,7 @@ def GWAS(pheno_vector, lmm_ob_2.fit(X=xs) else: lmm_ob_2.fit() - ts, ps = lmm_ob_2.association(xs, REML=restricted_max_likelihood) + ts, ps, beta, betaVar = lmm_ob_2.association(xs, REML=restricted_max_likelihood) else: if x.var() == 0: p_values.append(np.nan) @@ -221,7 +301,7 @@ def GWAS(pheno_vector, if refit: lmm_ob.fit(X=x) - ts, ps = lmm_ob.association(x, REML=restricted_max_likelihood) + ts, ps, beta, betaVar = lmm_ob.association(x, REML=restricted_max_likelihood) percent_complete = 45 + int(round((counter/m)*55)) temp_data.store("percent_complete", percent_complete) -- cgit v1.2.3