From ae8b46b4b06a46b92fca5e6b567e6795f1b5dcca Mon Sep 17 00:00:00 2001
From: Zachary Sloan
Date: Tue, 2 Apr 2013 23:01:22 +0000
Subject: Finished adding code that uses the plink genotype data if a human
 dataset and json data if any other species

---
 .../wqflask/marker_regression/marker_regression.py | 149 +++++---------------
 wqflask/wqflask/my_pylmm/pyLMM/input.py            |  20 +--
 wqflask/wqflask/my_pylmm/pyLMM/lmm.py              | 156 ++++++++++++++++-----
 3 files changed, 166 insertions(+), 159 deletions(-)

diff --git a/wqflask/wqflask/marker_regression/marker_regression.py b/wqflask/wqflask/marker_regression/marker_regression.py
index a640d37f..f67c595a 100755
--- a/wqflask/wqflask/marker_regression/marker_regression.py
+++ b/wqflask/wqflask/marker_regression/marker_regression.py
@@ -62,128 +62,55 @@ class MarkerRegression(object):
     def gen_data(self, tempdata):
         """Generates p-values for each marker"""
 
+        self.dataset.group.get_markers()
+
+        pheno_vector = np.array([val == "x" and np.nan or float(val) for val in self.vals])
+
+        if self.dataset.group.species == "human":
+            p_values, t_stats = self.gen_human_results(pheno_vector)
+        else:
+            genotype_data = [marker['genotypes'] for marker in self.dataset.group.markers.markers]
+            
+            no_val_samples = self.identify_empty_samples()
+            trimmed_genotype_data = self.trim_genotypes(genotype_data, no_val_samples)
+            
+            genotype_matrix = np.array(trimmed_genotype_data).T
+            
+            print("pheno_vector is: ", pf(pheno_vector))
+            print("genotype_matrix is: ", pf(genotype_matrix))
+            
+            t_stats, p_values = lmm.run(
+                pheno_vector,
+                genotype_matrix,
+                restricted_max_likelihood=True,
+                refit=False,
+                temp_data=tempdata
+            )
+        
+        self.dataset.group.markers.add_pvalues(p_values)
 
+        self.qtl_results = self.dataset.group.markers.markers
+
+
+    def gen_human_results(self, pheno_vector):
         file_base = os.path.join(webqtlConfig.PYLMM_PATH, self.dataset.group.name)
         
         plink_input = input.plink(file_base, type='b')
 
-
-        pheno_vector = np.array([val == "x" and np.nan or float(val) for val in self.vals])
         pheno_vector = pheno_vector.reshape((len(pheno_vector), 1))
         covariate_matrix = np.ones((pheno_vector.shape[0],1))
         kinship_matrix = np.fromfile(open(file_base + '.kin','r'),sep=" ")
         kinship_matrix.resize((len(plink_input.indivs),len(plink_input.indivs)))
-        
-        refit = False
-        
-        v = np.isnan(pheno_vector)
-        keep = True - v
-        keep = keep.reshape((len(keep),))
-        eigen_values = []
-        eigen_vectors = []
-        
-        #print("pheno_vector is: ", pf(pheno_vector))
-        #print("kinship_matrix is: ", pf(kinship_matrix))
-        
-        if v.sum():
-            pheno_vector = pheno_vector[keep]
-            print("pheno_vector shape is now: ", pf(pheno_vector.shape))
-            covariate_matrix = covariate_matrix[keep,:]
-            print("kinship_matrix shape is: ", pf(kinship_matrix.shape))
-            print("len(keep) is: ", pf(keep.shape))
-            kinship_matrix = kinship_matrix[keep,:][:,keep]
-            
-        #if not v.sum():
-        #    eigen_values = np.fromfile(file_base + ".kin.kva")
-        #    eigen_vectors = np.fromfile(file_base + ".kin.kve")
-            
-        n = kinship_matrix.shape[0]
-        lmm_ob = lmm.LMM(pheno_vector,
-                         kinship_matrix,
-                         eigen_values,
-                         eigen_vectors,
-                         covariate_matrix)
-        lmm_ob.fit()
-
-        # Buffers for pvalues and t-stats
-        p_values = []
-        t_statistics = []
-        count = 0
-        
-        plink_input.getSNPIterator()
-        print("# snps is: ", pf(plink_input.numSNPs))
-        with Bench("snp iterator loop"):
-            for snp, this_id in plink_input:
-                if count > 1000:
-                    break
-                count += 1
-                
-                x = snp[keep].reshape((n,1))
-                #x[[1,50,100,200,3000],:] = np.nan
-                v = np.isnan(x).reshape((-1,))
-                
-                # Check SNPs for missing values
-                if v.sum():
-                    keeps = True - v
-                    xs = x[keeps,:]
-                    # If no variation at this snp or all genotypes missing 
-                    if keeps.sum() <= 1 or xs.var() <= 1e-6:
-                        p_values.append(np.nan)
-                        t_statistics.append(np.nan)
-                        continue
-
-                    # Its ok to center the genotype -  I used options.normalizeGenotype to
-                    # force the removal of missing genotypes as opposed to replacing them with MAF.
-
-                    #if not options.normalizeGenotype:
-                    #    xs = (xs - xs.mean()) / np.sqrt(xs.var())
-
-                    filtered_pheno = pheno_vector[keeps]
-                    filtered_covariate_matrix = covariate_matrix[keeps,:]
-                    filtered_kinship_matrix = kinship_matrix[keeps,:][:,keeps]
-                    filtered_lmm_ob = lmm.LMM(filtered_pheno,filtered_kinship_matrix,X0=filtered_covariate_matrix)
-                    if refit:
-                        filtered_lmm_ob.fit(X=xs)
-                    else:
-                        #try:
-                        filtered_lmm_ob.fit()
-                        #except: pdb.set_trace()
-                    ts,ps,beta,betaVar = Ls.association(xs,returnBeta=True)
-                else:
-                    if x.var() == 0:
-                        p_values.append(np.nan)
-                        t_statistics.append(np.nan)
-                        continue
-            
-                    if refit:
-                        lmm_ob.fit(X=x)
-                    ts,ps,beta,betaVar = lmm_ob.association(x)
-                p_values.append(ps)
-                t_statistics.append(ts)
-
-        #genotype_data = [marker['genotypes'] for marker in self.dataset.group.markers.markers]
-        #
-        #no_val_samples = self.identify_empty_samples()
-        #trimmed_genotype_data = self.trim_genotypes(genotype_data, no_val_samples)
-        #
-        #genotype_matrix = np.array(trimmed_genotype_data).T
-        #
-        #print("pheno_vector is: ", pf(pheno_vector))
-        #print("genotype_matrix is: ", pf(genotype_matrix))
-        #
-        #t_stats, p_values = lmm.run(
-        #    pheno_vector,
-        #    genotype_matrix,
-        #    restricted_max_likelihood=True,
-        #    refit=False,
-        #    temp_data=tempdata
-        #)
-        
-        self.dataset.group.get_markers()
-        self.dataset.group.markers.add_pvalues(p_values)
 
-        self.qtl_results = self.dataset.group.markers.markers
+        p_values, t_stats = lmm.run_human(
+                pheno_vector,
+                covariate_matrix,
+                plink_input,
+                kinship_matrix
+            )
 
+        return p_values, t_stats
+    
 
     def identify_empty_samples(self):
         no_val_samples = []
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/input.py b/wqflask/wqflask/my_pylmm/pyLMM/input.py
index 35662072..dfe28a4e 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/input.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/input.py
@@ -40,7 +40,7 @@ class plink:
         # NOW I want to use this module to just read SNPs so I'm allowing 
         # the programmer to turn off the kinship reading.
         self.readKFile = readKFile
-    
+
         if self.kFile:
             self.K = self.readKinship(self.kFile)
         elif os.path.isfile("%s.kin" % fbase): 
@@ -50,21 +50,21 @@ class plink:
         else: 
             self.kFile = None
             self.K = None
-    
+
         self.getPhenos(self.phenoFile)
-     
+
         self.fhandle = None
         self.snpFileHandle = None
 
     def __del__(self): 
         if self.fhandle: self.fhandle.close()
         if self.snpFileHandle: self.snpFileHandle.close()
-    
+
     def getSNPIterator(self):
         if not self.type == 'b': 
             sys.stderr.write("Have only implemented this for binary plink files (bed)\n")
             return
-    
+
         # get the number of snps
         file = self.fbase + '.bim'
         i = 0
@@ -74,10 +74,10 @@ class plink:
         self.numSNPs = i
         self.have_read = 0
         self.snpFileHandle = open(file,'r')
-    
+
         self.BytestoRead = self.N / 4 + (self.N % 4 and 1 or 0)
         self._formatStr = 'c'*self.BytestoRead
-     
+
         file = self.fbase + '.bed'
         self.fhandle = open(file,'rb')
      
@@ -86,12 +86,12 @@ class plink:
         if not order == '\x01': 
             sys.stderr.write("This is not in SNP major order - you did not handle this case\n")
             raise StopIteration
-    
+
         return self
-    
+
     def __iter__(self):
         return self.getSNPIterator()
-    
+
     def next(self):
         if self.have_read == self.numSNPs:
             raise StopIteration
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
index f1f195d6..e60f7b02 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
@@ -26,42 +26,122 @@ from scipy import stats
 
 from pprint import pformat as pf
 
-#from utility.benchmark import Bench
-#
-##np.seterr('raise')
-#
-#def run(pheno_vector,
-#        genotype_matrix,
-#        restricted_max_likelihood=True,
-#        refit=False,
-#        temp_data=None):
-#    """Takes the phenotype vector and genotype matrix and returns a set of p-values and t-statistics
-#    
-#    restricted_max_likelihood -- whether to use restricted max likelihood; True or False
-#    refit -- whether to refit the variance component for each marker
-#    temp_data -- TempData object that stores the progress for each major step of the
-#    calculations ("calculate_kinship" and "GWAS" take the majority of time)
-#    
-#    """
-#    
-#    with Bench("Calculate Kinship"):
-#        kinship_matrix = calculate_kinship(genotype_matrix, temp_data)
-#    
-#    with Bench("Create LMM object"):
-#        lmm_ob = LMM(pheno_vector, kinship_matrix)
-#    
-#    with Bench("LMM_ob fitting"):
-#        lmm_ob.fit()
-#
-#    with Bench("Doing GWAS"):
-#        t_stats, p_values = GWAS(pheno_vector,
-#                                genotype_matrix,
-#                                kinship_matrix,
-#                                restricted_max_likelihood=True,
-#                                refit=False,
-#                                temp_data=temp_data)
-#    Bench().report()
-#    return t_stats, p_values
+from utility.benchmark import Bench
+
+#np.seterr('raise')
+
+def run_human(pheno_vector,
+            covariate_matrix,
+            plink_input,
+            kinship_matrix,
+            refit=False,
+            temp_data=None):
+
+    v = np.isnan(pheno_vector)
+    keep = True - v
+    keep = keep.reshape((len(keep),))
+
+    if v.sum():
+        pheno_vector = pheno_vector[keep]
+        print("pheno_vector shape is now: ", pf(pheno_vector.shape))
+        covariate_matrix = covariate_matrix[keep,:]
+        print("kinship_matrix shape is: ", pf(kinship_matrix.shape))
+        print("len(keep) is: ", pf(keep.shape))
+        kinship_matrix = kinship_matrix[keep,:][:,keep]
+
+    n = kinship_matrix.shape[0]
+    lmm_ob = LMM(pheno_vector,
+                kinship_matrix,
+                covariate_matrix)
+    lmm_ob.fit()
+
+    # Buffers for pvalues and t-stats
+    p_values = []
+    t_stats = []
+    count = 0
+    with Bench("snp iterator loop"):
+        for snp, this_id in plink_input:
+            if count > 1000:
+                break
+            count += 1
+
+            x = snp[keep].reshape((n,1))
+            #x[[1,50,100,200,3000],:] = np.nan
+            v = np.isnan(x).reshape((-1,))
+
+            # Check SNPs for missing values
+            if v.sum():
+                keeps = True - v
+                xs = x[keeps,:]
+                # If no variation at this snp or all genotypes missing 
+                if keeps.sum() <= 1 or xs.var() <= 1e-6:
+                    p_values.append(np.nan)
+                    t_stats.append(np.nan)
+                    continue
+
+                # Its ok to center the genotype -  I used options.normalizeGenotype to
+                # force the removal of missing genotypes as opposed to replacing them with MAF.
+
+                #if not options.normalizeGenotype:
+                #    xs = (xs - xs.mean()) / np.sqrt(xs.var())
+
+                filtered_pheno = pheno_vector[keeps]
+                filtered_covariate_matrix = covariate_matrix[keeps,:]
+                filtered_kinship_matrix = kinship_matrix[keeps,:][:,keeps]
+                filtered_lmm_ob = lmm.LMM(filtered_pheno,filtered_kinship_matrix,X0=filtered_covariate_matrix)
+                if refit:
+                    filtered_lmm_ob.fit(X=xs)
+                else:
+                    #try:
+                    filtered_lmm_ob.fit()
+                    #except: pdb.set_trace()
+                ts,ps,beta,betaVar = Ls.association(xs,returnBeta=True)
+            else:
+                if x.var() == 0:
+                    p_values.append(np.nan)
+                    t_stats.append(np.nan)
+                    continue
+                if refit:
+                    lmm_ob.fit(X=x)
+                ts, ps, beta, betaVar = lmm_ob.association(x)
+            p_values.append(ps)
+            t_stats.append(ts)
+            
+    return p_values, t_stats
+
+
+def run(pheno_vector,
+        genotype_matrix,
+        restricted_max_likelihood=True,
+        refit=False,
+        temp_data=None):
+    """Takes the phenotype vector and genotype matrix and returns a set of p-values and t-statistics
+    
+    restricted_max_likelihood -- whether to use restricted max likelihood; True or False
+    refit -- whether to refit the variance component for each marker
+    temp_data -- TempData object that stores the progress for each major step of the
+    calculations ("calculate_kinship" and "GWAS" take the majority of time)
+    
+    """
+    
+    with Bench("Calculate Kinship"):
+        kinship_matrix = calculate_kinship(genotype_matrix, temp_data)
+    
+    with Bench("Create LMM object"):
+        lmm_ob = LMM(pheno_vector, kinship_matrix)
+    
+    with Bench("LMM_ob fitting"):
+        lmm_ob.fit()
+
+    with Bench("Doing GWAS"):
+        t_stats, p_values = GWAS(pheno_vector,
+                                genotype_matrix,
+                                kinship_matrix,
+                                restricted_max_likelihood=True,
+                                refit=False,
+                                temp_data=temp_data)
+    Bench().report()
+    return t_stats, p_values
 
 
 def matrixMult(A,B):
@@ -212,7 +292,7 @@ def GWAS(pheno_vector,
                 lmm_ob_2.fit(X=xs)
             else:
                 lmm_ob_2.fit()
-            ts, ps = lmm_ob_2.association(xs, REML=restricted_max_likelihood)
+            ts, ps, beta, betaVar = lmm_ob_2.association(xs, REML=restricted_max_likelihood)
         else:
             if x.var() == 0:
                 p_values.append(np.nan)
@@ -221,7 +301,7 @@ def GWAS(pheno_vector,
 
             if refit:
                 lmm_ob.fit(X=x)
-            ts, ps = lmm_ob.association(x, REML=restricted_max_likelihood)
+            ts, ps, beta, betaVar = lmm_ob.association(x, REML=restricted_max_likelihood)
             
         percent_complete = 45 + int(round((counter/m)*55))
         temp_data.store("percent_complete", percent_complete)
-- 
cgit 1.4.1