aboutsummaryrefslogtreecommitdiff
path: root/wqflask/maintenance
diff options
context:
space:
mode:
Diffstat (limited to 'wqflask/maintenance')
-rw-r--r-- wqflask/maintenance/convert_geno_to_bimbam.py 77
1 files changed, 12 insertions, 65 deletions
diff --git a/wqflask/maintenance/convert_geno_to_bimbam.py b/wqflask/maintenance/convert_geno_to_bimbam.py
index 05006d5c..45522705 100644
--- a/wqflask/maintenance/convert_geno_to_bimbam.py
+++ b/wqflask/maintenance/convert_geno_to_bimbam.py
@@ -17,17 +17,12 @@ import glob
import traceback
import gzip
-#import numpy as np
-#from pyLMM import lmm
-
import simplejson as json
from pprint import pformat as pf
class EmptyConfigurations(Exception): pass
-
-
class Marker(object):
def __init__(self):
self.name = None
@@ -39,47 +34,34 @@ class Marker(object):
class ConvertGenoFile(object):
def __init__(self, input_file, output_files):
-
self.input_file = input_file
self.output_files = output_files
-
+
self.mb_exists = False
self.cm_exists = False
self.markers = []
-
+
self.latest_row_pos = None
self.latest_col_pos = None
-
+
self.latest_row_value = None
self.latest_col_value = None
-
- def convert(self):
+ def convert(self):
self.haplotype_notation = {
'@mat': "1",
'@pat': "0",
'@het': "0.5",
'@unk': "NA"
}
-
+
self.configurations = {}
- #self.skipped_cols = 3
-
- #if self.input_file.endswith(".geno.gz"):
- # print("self.input_file: ", self.input_file)
- # self.input_fh = gzip.open(self.input_file)
- #else:
self.input_fh = open(self.input_file)
-
- with open(self.output_files[0], "w") as self.geno_fh:
- #if self.file_type == "geno":
- self.process_csv()
- #elif self.file_type == "snps":
- # self.process_snps_file()
+ self.process_csv()
def process_csv(self):
- for row_count, row in enumerate(self.process_rows()):
+ for row in self.process_rows():
row_items = row.split("\t")
this_marker = Marker()
@@ -102,53 +84,30 @@ class ConvertGenoFile(object):
this_marker.genotypes.append(self.configurations[genotype.upper().strip()])
else:
this_marker.genotypes.append("NA")
-
- #print("this_marker is:", pf(this_marker.__dict__))
- #if this_marker.chr == "14":
+
self.markers.append(this_marker.__dict__)
self.write_to_bimbam()
-
- # with open(self.output_file, 'w') as fh:
- # json.dump(self.markers, fh, indent=" ", sort_keys=True)
-
- # print('configurations:', str(configurations))
- #self.latest_col_pos = item_count + self.skipped_cols
- #self.latest_col_value = item
-
- #if item_count != 0:
- # self.output_fh.write(" ")
- #self.output_fh.write(self.configurations[item.upper()])
-
- #self.output_fh.write("\n")
def write_to_bimbam(self):
with open(self.output_files[0], "w") as geno_fh:
- # geno_fh.write(str(len(self.sample_list)) + "\n")
- # geno_fh.write("2\n")
- # geno_fh.write("IND")
- # for sample in self.sample_list:
- # geno_fh.write(" " + sample)
- # geno_fh.write("\n")
for marker in self.markers:
geno_fh.write(marker['name'])
geno_fh.write(", X, Y")
geno_fh.write(", " + ", ".join(marker['genotypes']))
geno_fh.write("\n")
-
- #pheno_fh = open(self.output_files[1], 'w')
+
with open(self.output_files[1], "w") as pheno_fh:
for sample in self.sample_list:
pheno_fh.write("1\n")
-
+
with open(self.output_files[2], "w") as snp_fh:
for marker in self.markers:
if self.mb_exists:
snp_fh.write(marker['name'] +", " + str(int(float(marker['Mb'])*1000000)) + ", " + marker['chr'] + "\n")
else:
snp_fh.write(marker['name'] +", " + str(int(float(marker['cM'])*1000000)) + ", " + marker['chr'] + "\n")
-
-
+
def get_sample_list(self, row_contents):
self.sample_list = []
if self.mb_exists:
@@ -164,8 +123,6 @@ class ConvertGenoFile(object):
def process_rows(self):
for self.latest_row_pos, row in enumerate(self.input_fh):
- #if self.input_file.endswith(".geno.gz"):
- # print("row: ", row)
self.latest_row_value = row
# Take care of headers
if not row.strip():
@@ -208,10 +165,8 @@ class ConvertGenoFile(object):
convertob.convert()
except EmptyConfigurations as why:
print(" No config info? Continuing...")
- #excepted = True
continue
except Exception as why:
-
print(" Exception:", why)
print(traceback.print_exc())
print(" Found in row %s at tabular column %s" % (convertob.latest_row_pos,
@@ -219,12 +174,6 @@ class ConvertGenoFile(object):
print(" Column is:", convertob.latest_col_value)
print(" Row is:", convertob.latest_row_value)
break
-
- #def process_snps_file(cls, snps_file, new_directory):
- # output_file = os.path.join(new_directory, "mouse_families.json")
- # print("%s -> %s" % (snps_file, output_file))
- # convertob = ConvertGenoFile(input_file, output_file)
-
if __name__=="__main__":
Old_Geno_Directory = """/home/zas1024/genotype_files/genotype/"""
@@ -234,6 +183,4 @@ if __name__=="__main__":
#convertob = ConvertGenoFile("/home/zas1024/gene/genotype_files/genotypes/SRxSHRSPF2.geno", "/home/zas1024/gene/genotype_files/new_genotypes/SRxSHRSPF2.json")
#convertob.convert()
ConvertGenoFile.process_all(Old_Geno_Directory, New_Geno_Directory)
- #ConvertGenoFiles(Geno_Directory)
-
- #process_csv(Input_File, Output_File)
\ No newline at end of file
+ #ConvertGenoFiles(Geno_Directory)
\ No newline at end of file