Committing before splitting code that runs pylmm with plink files

and code that runs it with json
author: Zachary Sloan 2013-04-02 22:07:01 +0000
committer: Zachary Sloan 2013-04-02 22:07:01 +0000
commit: bf78deec93a6ef3296b4c8cf38a71d1a03480d21 (patch)
tree: e13051ed1ae680ee74f6bfd45bd2b96127cc148e /wqflask
parent: 51e120ae25a7955a895d5e79d5ee459764a331ea (diff)
download: genenetwork2-bf78deec93a6ef3296b4c8cf38a71d1a03480d21.tar.gz
5 files changed, 71 insertions, 54 deletions
diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index 17881e53..ab8554a0 100755
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -35,6 +35,7 @@ from base import webqtlConfig
 from base import species
 from dbFunction import webqtlDatabaseFunction
 from utility import webqtlUtil
+from utility.benchmark import Bench
 
 from MySQLdb import escape_string as escape
 from pprint import pformat as pf
@@ -73,14 +74,60 @@ class Markers(object):
         self.markers = json.load(json_data_fh)
     
     def add_pvalues(self, p_values):
+        print("length of self.markers:", len(self.markers))
+        print("length of p_values:", len(p_values))
+        
+        # THIS IS only needed for the case when we are limiting the number of p-values calculated
+        if len(self.markers) > len(p_values):
+            self.markers = self.markers[:len(p_values)]
+        
         for marker, p_value in itertools.izip(self.markers, p_values):
             marker['p_value'] = p_value
             print("p_value is:", marker['p_value'])
             marker['lod_score'] = -math.log10(marker['p_value'])
             #Using -log(p) for the LRS; need to ask Rob how he wants to get LRS from p-values
             marker['lrs_value'] = -math.log10(marker['p_value']) * 4.61
+        
+        
+
+
+class HumanMarkers(Markers):
+    
+    def __init__(self, name):
+        marker_data_fh = open(os.path.join(webqtlConfig.PYLMM_PATH + name + '.bim'))
+        self.markers = []
+        for line in marker_data_fh:
+            splat = line.strip().split()
+            marker = {}
+            marker['chr'] = int(splat[0])
+            marker['name'] = splat[1]
+            marker['Mb'] = float(splat[3]) / 1000000
+            self.markers.append(marker)
+            
+        #print("markers is: ", pf(self.markers))
 
 
+    def add_pvalues(self, p_values):
+        #for marker, p_value in itertools.izip(self.markers, p_values):
+        #    if marker['Mb'] <= 0 and marker['chr'] == 0:
+        #        continue
+        #    marker['p_value'] = p_value
+        #    print("p_value is:", marker['p_value'])
+        #    marker['lod_score'] = -math.log10(marker['p_value'])
+        #    #Using -log(p) for the LRS; need to ask Rob how he wants to get LRS from p-values
+        #    marker['lrs_value'] = -math.log10(marker['p_value']) * 4.61
+        
+        super(HumanMarkers, self).add_pvalues(p_values)
+        
+        with Bench("deleting markers"):
+            markers = []
+            for marker in self.markers:
+                if not marker['Mb'] <= 0 and not marker['chr'] == 0:
+                    markers.append(marker)
+            self.markers = markers
+        
+    
+
 class DatasetGroup(object):
     """
     Each group has multiple datasets; each species has multiple groups.
@@ -104,21 +151,17 @@ class DatasetGroup(object):
         
         self.incparentsf1 = False
         self.allsamples = None
-        self.markers = Markers(self.name)
-
-
-    #def read_genotype(self):
-    #    self.read_genotype_file()
-    #
-    #    if not self.genotype:   # Didn'd succeed, so we try method 2
-    #        self.read_genotype_data()
+        
+        
+    def get_markers(self):
+        print("self.species is:", self.species)
+        if self.species == "human":
+            marker_class = HumanMarkers 
+        else:
+            marker_class = Markers
 
-    #def read_genotype_json(self):
-    #    '''Read genotype from json file'''
-    #    
-    #    json_data = open(os.path.join(webqtlConfig.NEWGENODIR + self.name + '.json'))
-    #    markers = json.load(json_data)
-    #    
+        self.markers = marker_class(self.name)
+        
 
     def get_f1_parent_strains(self):
         try:
@@ -321,12 +364,9 @@ class PhenotypeDataSet(DataSet):
                 continue   # for now
                 if not webqtlUtil.hasAccessToConfidentialPhenotypeTrait(privilege=self.privilege, userName=self.userName, authorized_users=this_trait.authorized_users):
                     description = this_trait.pre_publication_description
-            this_trait.description_display = description
-
-            try:
-                this_trait.description_display.decode('ascii')
-            except Exception:
-                this_trait.description_display = this_trait.description_display.decode('utf-8')
+            this_trait.description_display = description.decode('utf-8')
+            
+            
 
             if not this_trait.year.isdigit():
                 this_trait.pubmed_text = "N/A"
diff --git a/wqflask/wqflask/marker_regression/marker_regression.py b/wqflask/wqflask/marker_regression/marker_regression.py
index c3555e8f..a640d37f 100755
--- a/wqflask/wqflask/marker_regression/marker_regression.py
+++ b/wqflask/wqflask/marker_regression/marker_regression.py
@@ -57,7 +57,6 @@ class MarkerRegression(object):
             chromosomes = chromosome_mb_lengths,
             qtl_results = self.qtl_results,
         )
-        
 
 
     def gen_data(self, tempdata):
@@ -67,8 +66,8 @@ class MarkerRegression(object):
         file_base = os.path.join(webqtlConfig.PYLMM_PATH, self.dataset.group.name)
         
         plink_input = input.plink(file_base, type='b')
-        
-        
+
+
         pheno_vector = np.array([val == "x" and np.nan or float(val) for val in self.vals])
         pheno_vector = pheno_vector.reshape((len(pheno_vector), 1))
         covariate_matrix = np.ones((pheno_vector.shape[0],1))
@@ -83,9 +82,6 @@ class MarkerRegression(object):
         eigen_values = []
         eigen_vectors = []
         
-        
-        print("pheno_vector shape is: ", pf(pheno_vector.shape))
-        
         #print("pheno_vector is: ", pf(pheno_vector))
         #print("kinship_matrix is: ", pf(kinship_matrix))
         
@@ -101,9 +97,6 @@ class MarkerRegression(object):
         #    eigen_values = np.fromfile(file_base + ".kin.kva")
         #    eigen_vectors = np.fromfile(file_base + ".kin.kve")
             
-        #print("eigen_values is: ", pf(eigen_values))
-        #print("eigen_vectors is: ", pf(eigen_vectors))
-            
         n = kinship_matrix.shape[0]
         lmm_ob = lmm.LMM(pheno_vector,
                          kinship_matrix,
@@ -121,8 +114,8 @@ class MarkerRegression(object):
         print("# snps is: ", pf(plink_input.numSNPs))
         with Bench("snp iterator loop"):
             for snp, this_id in plink_input:
-                #if count > 10000:
-                #    break            
+                if count > 1000:
+                    break
                 count += 1
                 
                 x = snp[keep].reshape((n,1))
@@ -138,13 +131,13 @@ class MarkerRegression(object):
                         p_values.append(np.nan)
                         t_statistics.append(np.nan)
                         continue
-            
+
                     # Its ok to center the genotype -  I used options.normalizeGenotype to
                     # force the removal of missing genotypes as opposed to replacing them with MAF.
-                    
+
                     #if not options.normalizeGenotype:
                     #    xs = (xs - xs.mean()) / np.sqrt(xs.var())
-                    
+
                     filtered_pheno = pheno_vector[keeps]
                     filtered_covariate_matrix = covariate_matrix[keeps,:]
                     filtered_kinship_matrix = kinship_matrix[keeps,:][:,keeps]
@@ -167,7 +160,6 @@ class MarkerRegression(object):
                     ts,ps,beta,betaVar = lmm_ob.association(x)
                 p_values.append(ps)
                 t_statistics.append(ts)
-        
 
         #genotype_data = [marker['genotypes'] for marker in self.dataset.group.markers.markers]
         #
@@ -187,28 +179,11 @@ class MarkerRegression(object):
         #    temp_data=tempdata
         #)
         
-        print("p_values is: ", pf(p_values))
+        self.dataset.group.get_markers()
         self.dataset.group.markers.add_pvalues(p_values)
 
-        #self.lrs_values = [marker['lrs_value'] for marker in self.dataset.group.markers.markers]
-        #lrs_values_sorted = sorted(self.lrs_values)
-        #
-        #lrs_values_length = len(lrs_values_sorted)
-        #
-        #def lrs_threshold(place):
-        #    return lrs_values_sorted[int((lrs_values_length * place) -1)]
-        #
-        #self.lrs_thresholds = Bunch(
-        #                    suggestive = lrs_threshold(.37),
-        #                    significant = lrs_threshold(.95),
-        #                    highly_significant = lrs_threshold(.99),
-        #                        )
-
         self.qtl_results = self.dataset.group.markers.markers
 
-        for marker in self.qtl_results:
-            if marker['lrs_value'] > webqtlConfig.MAXLRS:
-                marker['lrs_value'] = webqtlConfig.MAXLRS
 
     def identify_empty_samples(self):
         no_val_samples = []
diff --git a/wqflask/wqflask/static/new/javascript/marker_regression.coffee b/wqflask/wqflask/static/new/javascript/marker_regression.coffee
index 6e605fa7..3e14ab6b 100644
--- a/wqflask/wqflask/static/new/javascript/marker_regression.coffee
+++ b/wqflask/wqflask/static/new/javascript/marker_regression.coffee
@@ -2,6 +2,7 @@ $ ->
     class Manhattan_Plot
         constructor: (@plot_height, @plot_width) ->
             @qtl_results = js_data.qtl_results
+            console.log("qtl_results are:", @qtl_results)
             @chromosomes = js_data.chromosomes
 
             @total_length = 0
diff --git a/wqflask/wqflask/static/new/javascript/marker_regression.js b/wqflask/wqflask/static/new/javascript/marker_regression.js
index cb3c09cb..09470daf 100644
--- a/wqflask/wqflask/static/new/javascript/marker_regression.js
+++ b/wqflask/wqflask/static/new/javascript/marker_regression.js
@@ -11,6 +11,7 @@
         this.plot_height = plot_height;
         this.plot_width = plot_width;
         this.qtl_results = js_data.qtl_results;
+        console.log("qtl_results are:", this.qtl_results);
         this.chromosomes = js_data.chromosomes;
         this.total_length = 0;
         this.max_chr = this.get_max_chr();
diff --git a/wqflask/wqflask/views.py b/wqflask/wqflask/views.py
index 46433430..7f5f88e0 100644
--- a/wqflask/wqflask/views.py
+++ b/wqflask/wqflask/views.py
@@ -168,7 +168,7 @@ def marker_regression_page():
         if key in wanted or key.startswith(('value:')):
             start_vars[key] = value
     
-    version = "v5"
+    version = "v13"
     key = "marker_regression:{}:".format(version) + json.dumps(start_vars, sort_keys=True)
     with Bench("Loading cache"):
         result = Redis.get(key)
author	Zachary Sloan	2013-04-02 22:07:01 +0000
committer	Zachary Sloan	2013-04-02 22:07:01 +0000
commit	bf78deec93a6ef3296b4c8cf38a71d1a03480d21 (patch)
tree	e13051ed1ae680ee74f6bfd45bd2b96127cc148e /wqflask
parent	51e120ae25a7955a895d5e79d5ee459764a331ea (diff)
download	genenetwork2-bf78deec93a6ef3296b4c8cf38a71d1a03480d21.tar.gz