about summary refs log tree commit diff
path: root/gn2/maintenance/geno_to_json.py
diff options
context:
space:
mode:
Diffstat (limited to 'gn2/maintenance/geno_to_json.py')
-rw-r--r--gn2/maintenance/geno_to_json.py196
1 files changed, 196 insertions, 0 deletions
diff --git a/gn2/maintenance/geno_to_json.py b/gn2/maintenance/geno_to_json.py
new file mode 100644
index 00000000..7be2ed83
--- /dev/null
+++ b/gn2/maintenance/geno_to_json.py
@@ -0,0 +1,196 @@
+#!/usr/bin/python
+
+"""
+Convert .geno files to json
+
+This file goes through all of the genofiles in the genofile directory (.geno)
+and converts them to json files that are used when running the marker regression
+code
+
+"""
+
+import sys
+sys.path.append("..")
+import os
+import glob
+import traceback
+import gzip
+
+#import numpy as np
+#from pyLMM import lmm
+
+import simplejson as json
+
+from pprint import pformat as pf
+
+#from gn2.utility.tools import flat_files
+
+
+class EmptyConfigurations(Exception):
+    pass
+
+
+class Marker:
+    def __init__(self):
+        self.name = None
+        self.chr = None
+        self.cM = None
+        self.Mb = None
+        self.genotypes = []
+
+
+class ConvertGenoFile:
+
+    def __init__(self, input_file, output_file):
+
+        self.input_file = input_file
+        self.output_file = output_file
+
+        self.mb_exists = False
+        self.cm_exists = False
+        self.markers = []
+
+        self.latest_row_pos = None
+        self.latest_col_pos = None
+
+        self.latest_row_value = None
+        self.latest_col_value = None
+
+    def convert(self):
+
+        self.haplotype_notation = {
+            '@mat': "1",
+            '@pat': "0",
+            '@het': "0.5",
+            '@unk': "NA"
+        }
+
+        self.configurations = {}
+        #self.skipped_cols = 3
+
+        # if self.input_file.endswith(".geno.gz"):
+        #    print("self.input_file: ", self.input_file)
+        #    self.input_fh = gzip.open(self.input_file)
+        # else:
+        self.input_fh = open(self.input_file)
+
+        with open(self.output_file, "w") as self.output_fh:
+            # if self.file_type == "geno":
+            self.process_csv()
+            # elif self.file_type == "snps":
+            #    self.process_snps_file()
+
+    def process_csv(self):
+        for row_count, row in enumerate(self.process_rows()):
+            row_items = row.split("\t")
+
+            this_marker = Marker()
+            this_marker.name = row_items[1]
+            this_marker.chr = row_items[0]
+            if self.cm_exists and self.mb_exists:
+                this_marker.cM = row_items[2]
+                this_marker.Mb = row_items[3]
+                genotypes = row_items[4:]
+            elif self.cm_exists:
+                this_marker.cM = row_items[2]
+                genotypes = row_items[3:]
+            elif self.mb_exists:
+                this_marker.Mb = row_items[2]
+                genotypes = row_items[3:]
+            else:
+                genotypes = row_items[2:]
+            for item_count, genotype in enumerate(genotypes):
+                if genotype.upper() in self.configurations:
+                    this_marker.genotypes.append(
+                        self.configurations[genotype.upper()])
+                else:
+                    this_marker.genotypes.append("NA")
+
+            #print("this_marker is:", pf(this_marker.__dict__))
+            # if this_marker.chr == "14":
+            self.markers.append(this_marker.__dict__)
+
+        with open(self.output_file, 'w') as fh:
+            json.dump(self.markers, fh, indent="   ", sort_keys=True)
+
+            # print('configurations:', str(configurations))
+            #self.latest_col_pos = item_count + self.skipped_cols
+            #self.latest_col_value = item
+
+            # if item_count != 0:
+            #    self.output_fh.write(" ")
+            # self.output_fh.write(self.configurations[item.upper()])
+
+            # self.output_fh.write("\n")
+
+    def process_rows(self):
+        for self.latest_row_pos, row in enumerate(self.input_fh):
+            # if self.input_file.endswith(".geno.gz"):
+            #    print("row: ", row)
+            self.latest_row_value = row
+            # Take care of headers
+            if not row.strip():
+                continue
+            if row.startswith('#'):
+                continue
+            if row.startswith('Chr'):
+                if 'Mb' in row.split():
+                    self.mb_exists = True
+                if 'cM' in row.split():
+                    self.cm_exists = True
+                continue
+            if row.startswith('@'):
+                key, _separater, value = row.partition(':')
+                key = key.strip()
+                value = value.strip()
+                if key in self.haplotype_notation:
+                    self.configurations[value] = self.haplotype_notation[key]
+                continue
+            if not len(self.configurations):
+                raise EmptyConfigurations
+            yield row
+
+    @classmethod
+    def process_all(cls, old_directory, new_directory):
+        os.chdir(old_directory)
+        for input_file in glob.glob("*"):
+            if not input_file.endswith(('geno', '.geno.gz')):
+                continue
+            group_name = ".".join(input_file.split('.')[:-1])
+            output_file = os.path.join(new_directory, group_name + ".json")
+            print("%s -> %s" % (
+                os.path.join(old_directory, input_file), output_file))
+            convertob = ConvertGenoFile(input_file, output_file)
+            try:
+                convertob.convert()
+            except EmptyConfigurations as why:
+                print("  No config info? Continuing...")
+                #excepted = True
+                continue
+            except Exception as why:
+
+                print("  Exception:", why)
+                print(traceback.print_exc())
+                print("    Found in row %s at tabular column %s" % (convertob.latest_row_pos,
+                                                                    convertob.latest_col_pos))
+                print("    Column is:", convertob.latest_col_value)
+                print("    Row is:", convertob.latest_row_value)
+                break
+
+    # def process_snps_file(cls, snps_file, new_directory):
+    #    output_file = os.path.join(new_directory, "mouse_families.json")
+    #    print("%s -> %s" % (snps_file, output_file))
+    #    convertob = ConvertGenoFile(input_file, output_file)
+
+
+if __name__ == "__main__":
+    Old_Geno_Directory = """/export/local/home/zas1024/gn2-zach/genotype_files/genotype"""
+    New_Geno_Directory = """/export/local/home/zas1024/gn2-zach/genotype_files/genotype/json"""
+    #Input_File = """/home/zas1024/gene/genotype_files/genotypes/BXD.geno"""
+    #Output_File = """/home/zas1024/gene/wqflask/wqflask/pylmm/data/bxd.snps"""
+    #convertob = ConvertGenoFile("/home/zas1024/gene/genotype_files/genotypes/SRxSHRSPF2.geno", "/home/zas1024/gene/genotype_files/new_genotypes/SRxSHRSPF2.json")
+    # convertob.convert()
+    ConvertGenoFile.process_all(Old_Geno_Directory, New_Geno_Directory)
+    # ConvertGenoFiles(Geno_Directory)
+
+    #process_csv(Input_File, Output_File)