From 9bb60bb18ae5ac70fe480095554796b7c18f1b6c Mon Sep 17 00:00:00 2001
From: zsloan
Date: Fri, 25 May 2018 15:52:40 +0000
Subject: Fixed issue causing anonymous collections to not work on my branch
 and staging, though still not sure why it's working on production without
 that change

Added script to convert the dryad format genotype files to BIMBAM

removed db_uri from parameters of parse_db_uri in gen_select_dataset.py, since it can now just pull it from settings as a global variable
---
 wqflask/maintenance/convert_dryad_to_bimbam.py | 70 ++++++++++++++++++++++++++
 wqflask/maintenance/gen_select_dataset.py      |  2 +-
 wqflask/maintenance/quantile_normalize.py      |  4 +-
 3 files changed, 73 insertions(+), 3 deletions(-)
 create mode 100644 wqflask/maintenance/convert_dryad_to_bimbam.py

(limited to 'wqflask/maintenance')

diff --git a/wqflask/maintenance/convert_dryad_to_bimbam.py b/wqflask/maintenance/convert_dryad_to_bimbam.py
new file mode 100644
index 00000000..e833b395
--- /dev/null
+++ b/wqflask/maintenance/convert_dryad_to_bimbam.py
@@ -0,0 +1,70 @@
+#!/usr/bin/python
+
+"""
+Convert data dryad files to a BIMBAM _geno and _snps file
+
+
+"""
+
+from __future__ import print_function, division, absolute_import
+import sys
+sys.path.append("..")
+
+
+def read_dryad_file(filename):
+    exclude_count = 0
+    marker_list = []
+    sample_dict = {}
+    sample_list = []
+    geno_rows = []
+    with open(filename, 'r') as the_file:
+        for i, line in enumerate(the_file):
+            if i > 0:
+                if line.split(" ")[1] == "no":
+                    sample_name = line.split(" ")[0]
+                    sample_list.append(sample_name)
+                    sample_dict[sample_name] = line.split(" ")[2:]
+                else:
+                    exclude_count += 1
+            else:
+                marker_list = line.split(" ")[2:]
+
+    for i, marker in enumerate(marker_list):
+        this_row = []
+        this_row.append(marker)
+        this_row.append("X")
+        this_row.append("Y")
+        for sample in sample_list:
+            this_row.append(sample_dict[sample][i])
+        geno_rows.append(this_row)
+
+    print(exclude_count)
+
+    return geno_rows
+
+    #for i, marker in enumerate(marker_list):
+    #    this_row = []
+    #    this_row.append(marker)
+    #    this_row.append("X")
+    #    this_row.append("Y")
+    #    with open(filename, 'r') as the_file:
+    #        for j, line in enumerate(the_file):
+    #            if j > 0:
+    #                this_row.append(line.split(" ")[i+2])
+    #        print("row: " + str(i))
+    #        geno_rows.append(this_row)
+    #            
+    #return geno_rows
+
+def write_bimbam_files(geno_rows):
+    with open('/home/zas1024/cfw_data/CFW_geno.txt', 'w') as geno_fh:
+        for row in geno_rows:
+            geno_fh.write(", ".join(row) + "\n")
+
+def convert_dryad_to_bimbam(filename):
+    geno_file_rows = read_dryad_file(filename)
+    write_bimbam_files(geno_file_rows)
+
+if __name__=="__main__":
+    input_filename = "/home/zas1024/cfw_data/" + sys.argv[1] + ".txt"
+    convert_dryad_to_bimbam(input_filename)
\ No newline at end of file
diff --git a/wqflask/maintenance/gen_select_dataset.py b/wqflask/maintenance/gen_select_dataset.py
index 2825c6ea..18b2dac9 100644
--- a/wqflask/maintenance/gen_select_dataset.py
+++ b/wqflask/maintenance/gen_select_dataset.py
@@ -63,7 +63,7 @@ from pprint import pformat as pf
 
 #conn = Engine.connect()
 
-def parse_db_uri(db_uri):
+def parse_db_uri():
     """Converts a database URI to the db name, host name, user name, and password"""
 
     parsed_uri = urlparse.urlparse(SQL_URI)
diff --git a/wqflask/maintenance/quantile_normalize.py b/wqflask/maintenance/quantile_normalize.py
index c11073fb..41a3aad8 100644
--- a/wqflask/maintenance/quantile_normalize.py
+++ b/wqflask/maintenance/quantile_normalize.py
@@ -59,7 +59,7 @@ def set_data(dataset_name):
     orig_file = "/home/zas1024/cfw_data/" + dataset_name + ".txt"
 
     sample_list = []
-    with open(orig_file, 'r') as orig_fh, open('quant_norm.csv', 'r') as quant_fh:
+    with open(orig_file, 'r') as orig_fh, open('/home/zas1024/cfw_data/quant_norm.csv', 'r') as quant_fh:
         for i, (line1, line2) in enumerate(izip(orig_fh, quant_fh)):
             trait_dict = {}
             sample_list = []
@@ -118,7 +118,7 @@ if __name__ == '__main__':
 
     #out_filename = sys.argv[1][:-4] + '_quantnorm.txt'
 
-    #success, _ = bulk(es, set_data(sys.argv[1]))
+    success, _ = bulk(es, set_data(sys.argv[1]))
 
     response = es.search(
         index = "traits", doc_type = "trait", body = {
-- 
cgit 1.4.1