aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rwxr-xr-xwqflask/wqflask/marker_regression/marker_regression.py149
-rw-r--r--wqflask/wqflask/my_pylmm/pyLMM/input.py20
-rw-r--r--wqflask/wqflask/my_pylmm/pyLMM/lmm.py156
3 files changed, 166 insertions, 159 deletions
diff --git a/wqflask/wqflask/marker_regression/marker_regression.py b/wqflask/wqflask/marker_regression/marker_regression.py
index a640d37f..f67c595a 100755
--- a/wqflask/wqflask/marker_regression/marker_regression.py
+++ b/wqflask/wqflask/marker_regression/marker_regression.py
@@ -62,128 +62,55 @@ class MarkerRegression(object):
def gen_data(self, tempdata):
"""Generates p-values for each marker"""
+ self.dataset.group.get_markers()
+
+ pheno_vector = np.array([val == "x" and np.nan or float(val) for val in self.vals])
+
+ if self.dataset.group.species == "human":
+ p_values, t_stats = self.gen_human_results(pheno_vector)
+ else:
+ genotype_data = [marker['genotypes'] for marker in self.dataset.group.markers.markers]
+
+ no_val_samples = self.identify_empty_samples()
+ trimmed_genotype_data = self.trim_genotypes(genotype_data, no_val_samples)
+
+ genotype_matrix = np.array(trimmed_genotype_data).T
+
+ print("pheno_vector is: ", pf(pheno_vector))
+ print("genotype_matrix is: ", pf(genotype_matrix))
+
+ t_stats, p_values = lmm.run(
+ pheno_vector,
+ genotype_matrix,
+ restricted_max_likelihood=True,
+ refit=False,
+ temp_data=tempdata
+ )
+
+ self.dataset.group.markers.add_pvalues(p_values)
+ self.qtl_results = self.dataset.group.markers.markers
+
+
+ def gen_human_results(self, pheno_vector):
file_base = os.path.join(webqtlConfig.PYLMM_PATH, self.dataset.group.name)
plink_input = input.plink(file_base, type='b')
-
- pheno_vector = np.array([val == "x" and np.nan or float(val) for val in self.vals])
pheno_vector = pheno_vector.reshape((len(pheno_vector), 1))
covariate_matrix = np.ones((pheno_vector.shape[0],1))
kinship_matrix = np.fromfile(open(file_base + '.kin','r'),sep=" ")
kinship_matrix.resize((len(plink_input.indivs),len(plink_input.indivs)))
-
- refit = False
-
- v = np.isnan(pheno_vector)
- keep = True - v
- keep = keep.reshape((len(keep),))
- eigen_values = []
- eigen_vectors = []
-
- #print("pheno_vector is: ", pf(pheno_vector))
- #print("kinship_matrix is: ", pf(kinship_matrix))
-
- if v.sum():
- pheno_vector = pheno_vector[keep]
- print("pheno_vector shape is now: ", pf(pheno_vector.shape))
- covariate_matrix = covariate_matrix[keep,:]
- print("kinship_matrix shape is: ", pf(kinship_matrix.shape))
- print("len(keep) is: ", pf(keep.shape))
- kinship_matrix = kinship_matrix[keep,:][:,keep]
-
- #if not v.sum():
- # eigen_values = np.fromfile(file_base + ".kin.kva")
- # eigen_vectors = np.fromfile(file_base + ".kin.kve")
-
- n = kinship_matrix.shape[0]
- lmm_ob = lmm.LMM(pheno_vector,
- kinship_matrix,
- eigen_values,
- eigen_vectors,
- covariate_matrix)
- lmm_ob.fit()
-
- # Buffers for pvalues and t-stats
- p_values = []
- t_statistics = []
- count = 0
-
- plink_input.getSNPIterator()
- print("# snps is: ", pf(plink_input.numSNPs))
- with Bench("snp iterator loop"):
- for snp, this_id in plink_input:
- if count > 1000:
- break
- count += 1
-
- x = snp[keep].reshape((n,1))
- #x[[1,50,100,200,3000],:] = np.nan
- v = np.isnan(x).reshape((-1,))
-
- # Check SNPs for missing values
- if v.sum():
- keeps = True - v
- xs = x[keeps,:]
- # If no variation at this snp or all genotypes missing
- if keeps.sum() <= 1 or xs.var() <= 1e-6:
- p_values.append(np.nan)
- t_statistics.append(np.nan)
- continue
-
- # Its ok to center the genotype - I used options.normalizeGenotype to
- # force the removal of missing genotypes as opposed to replacing them with MAF.
-
- #if not options.normalizeGenotype:
- # xs = (xs - xs.mean()) / np.sqrt(xs.var())
-
- filtered_pheno = pheno_vector[keeps]
- filtered_covariate_matrix = covariate_matrix[keeps,:]
- filtered_kinship_matrix = kinship_matrix[keeps,:][:,keeps]
- filtered_lmm_ob = lmm.LMM(filtered_pheno,filtered_kinship_matrix,X0=filtered_covariate_matrix)
- if refit:
- filtered_lmm_ob.fit(X=xs)
- else:
- #try:
- filtered_lmm_ob.fit()
- #except: pdb.set_trace()
- ts,ps,beta,betaVar = Ls.association(xs,returnBeta=True)
- else:
- if x.var() == 0:
- p_values.append(np.nan)
- t_statistics.append(np.nan)
- continue
-
- if refit:
- lmm_ob.fit(X=x)
- ts,ps,beta,betaVar = lmm_ob.association(x)
- p_values.append(ps)
- t_statistics.append(ts)
-
- #genotype_data = [marker['genotypes'] for marker in self.dataset.group.markers.markers]
- #
- #no_val_samples = self.identify_empty_samples()
- #trimmed_genotype_data = self.trim_genotypes(genotype_data, no_val_samples)
- #
- #genotype_matrix = np.array(trimmed_genotype_data).T
- #
- #print("pheno_vector is: ", pf(pheno_vector))
- #print("genotype_matrix is: ", pf(genotype_matrix))
- #
- #t_stats, p_values = lmm.run(
- # pheno_vector,
- # genotype_matrix,
- # restricted_max_likelihood=True,
- # refit=False,
- # temp_data=tempdata
- #)
-
- self.dataset.group.get_markers()
- self.dataset.group.markers.add_pvalues(p_values)
- self.qtl_results = self.dataset.group.markers.markers
+ p_values, t_stats = lmm.run_human(
+ pheno_vector,
+ covariate_matrix,
+ plink_input,
+ kinship_matrix
+ )
+ return p_values, t_stats
+
def identify_empty_samples(self):
no_val_samples = []
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/input.py b/wqflask/wqflask/my_pylmm/pyLMM/input.py
index 35662072..dfe28a4e 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/input.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/input.py
@@ -40,7 +40,7 @@ class plink:
# NOW I want to use this module to just read SNPs so I'm allowing
# the programmer to turn off the kinship reading.
self.readKFile = readKFile
-
+
if self.kFile:
self.K = self.readKinship(self.kFile)
elif os.path.isfile("%s.kin" % fbase):
@@ -50,21 +50,21 @@ class plink:
else:
self.kFile = None
self.K = None
-
+
self.getPhenos(self.phenoFile)
-
+
self.fhandle = None
self.snpFileHandle = None
def __del__(self):
if self.fhandle: self.fhandle.close()
if self.snpFileHandle: self.snpFileHandle.close()
-
+
def getSNPIterator(self):
if not self.type == 'b':
sys.stderr.write("Have only implemented this for binary plink files (bed)\n")
return
-
+
# get the number of snps
file = self.fbase + '.bim'
i = 0
@@ -74,10 +74,10 @@ class plink:
self.numSNPs = i
self.have_read = 0
self.snpFileHandle = open(file,'r')
-
+
self.BytestoRead = self.N / 4 + (self.N % 4 and 1 or 0)
self._formatStr = 'c'*self.BytestoRead
-
+
file = self.fbase + '.bed'
self.fhandle = open(file,'rb')
@@ -86,12 +86,12 @@ class plink:
if not order == '\x01':
sys.stderr.write("This is not in SNP major order - you did not handle this case\n")
raise StopIteration
-
+
return self
-
+
def __iter__(self):
return self.getSNPIterator()
-
+
def next(self):
if self.have_read == self.numSNPs:
raise StopIteration
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
index f1f195d6..e60f7b02 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
@@ -26,42 +26,122 @@ from scipy import stats
from pprint import pformat as pf
-#from utility.benchmark import Bench
-#
-##np.seterr('raise')
-#
-#def run(pheno_vector,
-# genotype_matrix,
-# restricted_max_likelihood=True,
-# refit=False,
-# temp_data=None):
-# """Takes the phenotype vector and genotype matrix and returns a set of p-values and t-statistics
-#
-# restricted_max_likelihood -- whether to use restricted max likelihood; True or False
-# refit -- whether to refit the variance component for each marker
-# temp_data -- TempData object that stores the progress for each major step of the
-# calculations ("calculate_kinship" and "GWAS" take the majority of time)
-#
-# """
-#
-# with Bench("Calculate Kinship"):
-# kinship_matrix = calculate_kinship(genotype_matrix, temp_data)
-#
-# with Bench("Create LMM object"):
-# lmm_ob = LMM(pheno_vector, kinship_matrix)
-#
-# with Bench("LMM_ob fitting"):
-# lmm_ob.fit()
-#
-# with Bench("Doing GWAS"):
-# t_stats, p_values = GWAS(pheno_vector,
-# genotype_matrix,
-# kinship_matrix,
-# restricted_max_likelihood=True,
-# refit=False,
-# temp_data=temp_data)
-# Bench().report()
-# return t_stats, p_values
+from utility.benchmark import Bench
+
+#np.seterr('raise')
+
+def run_human(pheno_vector,
+ covariate_matrix,
+ plink_input,
+ kinship_matrix,
+ refit=False,
+ temp_data=None):
+
+ v = np.isnan(pheno_vector)
+ keep = True - v
+ keep = keep.reshape((len(keep),))
+
+ if v.sum():
+ pheno_vector = pheno_vector[keep]
+ print("pheno_vector shape is now: ", pf(pheno_vector.shape))
+ covariate_matrix = covariate_matrix[keep,:]
+ print("kinship_matrix shape is: ", pf(kinship_matrix.shape))
+ print("len(keep) is: ", pf(keep.shape))
+ kinship_matrix = kinship_matrix[keep,:][:,keep]
+
+ n = kinship_matrix.shape[0]
+ lmm_ob = LMM(pheno_vector,
+ kinship_matrix,
+ covariate_matrix)
+ lmm_ob.fit()
+
+ # Buffers for pvalues and t-stats
+ p_values = []
+ t_stats = []
+ count = 0
+ with Bench("snp iterator loop"):
+ for snp, this_id in plink_input:
+ if count > 1000:
+ break
+ count += 1
+
+ x = snp[keep].reshape((n,1))
+ #x[[1,50,100,200,3000],:] = np.nan
+ v = np.isnan(x).reshape((-1,))
+
+ # Check SNPs for missing values
+ if v.sum():
+ keeps = True - v
+ xs = x[keeps,:]
+ # If no variation at this snp or all genotypes missing
+ if keeps.sum() <= 1 or xs.var() <= 1e-6:
+ p_values.append(np.nan)
+ t_stats.append(np.nan)
+ continue
+
+ # Its ok to center the genotype - I used options.normalizeGenotype to
+ # force the removal of missing genotypes as opposed to replacing them with MAF.
+
+ #if not options.normalizeGenotype:
+ # xs = (xs - xs.mean()) / np.sqrt(xs.var())
+
+ filtered_pheno = pheno_vector[keeps]
+ filtered_covariate_matrix = covariate_matrix[keeps,:]
+ filtered_kinship_matrix = kinship_matrix[keeps,:][:,keeps]
+ filtered_lmm_ob = lmm.LMM(filtered_pheno,filtered_kinship_matrix,X0=filtered_covariate_matrix)
+ if refit:
+ filtered_lmm_ob.fit(X=xs)
+ else:
+ #try:
+ filtered_lmm_ob.fit()
+ #except: pdb.set_trace()
+ ts,ps,beta,betaVar = Ls.association(xs,returnBeta=True)
+ else:
+ if x.var() == 0:
+ p_values.append(np.nan)
+ t_stats.append(np.nan)
+ continue
+ if refit:
+ lmm_ob.fit(X=x)
+ ts, ps, beta, betaVar = lmm_ob.association(x)
+ p_values.append(ps)
+ t_stats.append(ts)
+
+ return p_values, t_stats
+
+
+def run(pheno_vector,
+ genotype_matrix,
+ restricted_max_likelihood=True,
+ refit=False,
+ temp_data=None):
+ """Takes the phenotype vector and genotype matrix and returns a set of p-values and t-statistics
+
+ restricted_max_likelihood -- whether to use restricted max likelihood; True or False
+ refit -- whether to refit the variance component for each marker
+ temp_data -- TempData object that stores the progress for each major step of the
+ calculations ("calculate_kinship" and "GWAS" take the majority of time)
+
+ """
+
+ with Bench("Calculate Kinship"):
+ kinship_matrix = calculate_kinship(genotype_matrix, temp_data)
+
+ with Bench("Create LMM object"):
+ lmm_ob = LMM(pheno_vector, kinship_matrix)
+
+ with Bench("LMM_ob fitting"):
+ lmm_ob.fit()
+
+ with Bench("Doing GWAS"):
+ t_stats, p_values = GWAS(pheno_vector,
+ genotype_matrix,
+ kinship_matrix,
+ restricted_max_likelihood=True,
+ refit=False,
+ temp_data=temp_data)
+ Bench().report()
+ return t_stats, p_values
def matrixMult(A,B):
@@ -212,7 +292,7 @@ def GWAS(pheno_vector,
lmm_ob_2.fit(X=xs)
else:
lmm_ob_2.fit()
- ts, ps = lmm_ob_2.association(xs, REML=restricted_max_likelihood)
+ ts, ps, beta, betaVar = lmm_ob_2.association(xs, REML=restricted_max_likelihood)
else:
if x.var() == 0:
p_values.append(np.nan)
@@ -221,7 +301,7 @@ def GWAS(pheno_vector,
if refit:
lmm_ob.fit(X=x)
- ts, ps = lmm_ob.association(x, REML=restricted_max_likelihood)
+ ts, ps, beta, betaVar = lmm_ob.association(x, REML=restricted_max_likelihood)
percent_complete = 45 + int(round((counter/m)*55))
temp_data.store("percent_complete", percent_complete)