LMM code now reads in a gzipped, pickled PLINK SNP iterator object

author: Zachary Sloan 2013-04-18 22:13:25 +0000
committer: Zachary Sloan 2013-04-18 22:13:25 +0000
commit: f36de42faa6565a04c344071a3a4befa60879509 (patch)
tree: 9519de36b297755bf20d5fe0a3b998ae5450807a
parent: a1c44dd7c11013da06dbd782dd0a0ebbde5cc995 (diff)
download: genenetwork2-f36de42faa6565a04c344071a3a4befa60879509.tar.gz
3 files changed, 38 insertions, 20 deletions
diff --git a/misc/notes.txt b/misc/notes.txt
index 10a5729a..a48ee5bf 100644
--- a/misc/notes.txt
+++ b/misc/notes.txt
@@ -161,6 +161,10 @@ du -hms * | sort -n : Gives size used by different directories
 
 ===========================================
 
+rm -rfv
+
+===========================================
+
 cp -a (archive; copies recursively and doesn't follow symbol links)
    -i (interactive, prompts before overwrite)
    -v (verbose)
diff --git a/wqflask/wqflask/marker_regression/marker_regression.py b/wqflask/wqflask/marker_regression/marker_regression.py
index c3e9a934..6ae1318e 100755
--- a/wqflask/wqflask/marker_regression/marker_regression.py
+++ b/wqflask/wqflask/marker_regression/marker_regression.py
@@ -98,7 +98,7 @@ class MarkerRegression(object):
         file_base = os.path.join(webqtlConfig.PYLMM_PATH, self.dataset.group.name)
 
         plink_input = input.plink(file_base, type='b')
-        input_file_name = os.path.join(webqtlConfig.SNP_PATH, self.dataset.group.name + ".snps")
+        input_file_name = os.path.join(webqtlConfig.SNP_PATH, self.dataset.group.name + ".snps.gz")
 
         pheno_vector = pheno_vector.reshape((len(pheno_vector), 1))
         covariate_matrix = np.ones((pheno_vector.shape[0],1))
@@ -142,13 +142,22 @@ class MarkerRegression(object):
 def create_snp_iterator_file(group):
     plink_file_base = os.path.join(webqtlConfig.PYLMM_PATH, group)
     plink_input = input.plink(plink_file_base, type='b')
-    inputs = list(plink_input)
     
-    snp_file_base = os.path.join(webqtlConfig.SNP_PATH, group + ".snps")
+    data = dict(plink_input = list(plink_input),
+                numSNPs = plink_input.numSNPs)
     
-    with open(snp_file_base, "wb") as fh:
-        pickle.dump(inputs, fh)
+    #input_dict = {}
+    #
+    #input_dict['plink_input'] = list(plink_input)
+    #input_dict['numSNPs'] = plink_input.numSNPs
+    #
+    
+    snp_file_base = os.path.join(webqtlConfig.SNP_PATH, group + ".snps.gz")
+    
+    with gzip.open(snp_file_base, "wb") as fh:
+        pickle.dump(data, fh, pickle.HIGHEST_PROTOCOL)
 
 if __name__ == '__main__':
     import cPickle as pickle
+    import gzip
     create_snp_iterator_file("HLC")
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
index ab87e4f0..8c0e0282 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
@@ -27,6 +27,7 @@ from scipy import optimize
 from scipy import stats
 import pdb
 
+import gzip
 import cPickle as pickle
 import simplejson as json
 
@@ -70,28 +71,32 @@ def run_human(pheno_vector,
 
     print("input_file: ", plink_input_file)
 
-    with open(plink_input_file, "rb") as input_file:
-        plink_input = pickle.load(input_file)
+    with Bench("Opening and loading pickle file"):
+        with gzip.open(plink_input_file, "rb") as input_file:
+            data = pickle.load(input_file)
+            
+    plink_input = data['plink_input']
 
     #plink_input.getSNPIterator()
-    #total_snps = plink_input.numSNPs
+    with Bench("Calculating numSNPs"):
+        total_snps = data['numSNPs']
 
     with Bench("snp iterator loop"):
         count = 0
 
-        #with Bench("Create list of inputs"):
-        #    inputs = list(plink_input)
+        with Bench("Create list of inputs"):
+            inputs = list(plink_input)
+            
+        with Bench("Divide into chunks"):
+            results = chunks.divide_into_chunks(inputs, 64)
+            
+        result_store = []
+        identifier = uuid.uuid4()
+        for part, result in enumerate(results):
+            data_store = temp_data.TempData(identifier, part)
             
-        #with Bench("Divide into chunks"):
-        #    results = chunks.divide_into_chunks(inputs, 64)
-        #    
-        #result_store = []
-        #identifier = uuid.uuid4()
-        #for part, result in enumerate(results):
-        #    data_store = temp_data.TempData(identifier, part)
-        #    
-        #    data_store.store(data=pickle.dumps(result))
-        #    result_store.append(data_store)
+            data_store.store(data=pickle.dumps(result))
+            result_store.append(data_store)
 
         for snp, this_id in plink_input:
             with Bench("part before association"):
author	Zachary Sloan	2013-04-18 22:13:25 +0000
committer	Zachary Sloan	2013-04-18 22:13:25 +0000
commit	f36de42faa6565a04c344071a3a4befa60879509 (patch)
tree	9519de36b297755bf20d5fe0a3b998ae5450807a
parent	a1c44dd7c11013da06dbd782dd0a0ebbde5cc995 (diff)
download	genenetwork2-f36de42faa6565a04c344071a3a4befa60879509.tar.gz