From 36ceda09d76c898c5f818236122f5d431261a5b1 Mon Sep 17 00:00:00 2001 From: zsloan Date: Mon, 23 Apr 2018 16:19:24 +0000 Subject: Changed GEMMA mapping to use -lmm 2 (Likelihood ratio test) as a parameter instead of -lmm 1 (Wald test) Added script to convert .geno files to JSON to maintenance folder (geno_to_json.py) --- wqflask/maintenance/geno_to_json.py | 195 +++++++++++++++++++++ wqflask/wqflask/marker_regression/gemma_mapping.py | 10 +- .../marker_regression/marker_regression_gn1.py | 8 +- .../templates/show_trait_mapping_tools.html | 13 ++ 4 files changed, 214 insertions(+), 12 deletions(-) create mode 100644 wqflask/maintenance/geno_to_json.py (limited to 'wqflask') diff --git a/wqflask/maintenance/geno_to_json.py b/wqflask/maintenance/geno_to_json.py new file mode 100644 index 00000000..789a1691 --- /dev/null +++ b/wqflask/maintenance/geno_to_json.py @@ -0,0 +1,195 @@ +#!/usr/bin/python + +""" +Convert .geno files to json + +This file goes through all of the genofiles in the genofile directory (.geno) +and converts them to json files that are used when running the marker regression +code + +""" + +from __future__ import print_function, division, absolute_import +import sys +sys.path.append("..") +import os +import glob +import traceback +import gzip + +#import numpy as np +#from pyLMM import lmm + +import simplejson as json + +from pprint import pformat as pf + +class EmptyConfigurations(Exception): pass + + + +class Marker(object): + def __init__(self): + self.name = None + self.chr = None + self.cM = None + self.Mb = None + self.genotypes = [] + +class ConvertGenoFile(object): + + def __init__(self, input_file, output_file): + + self.input_file = input_file + self.output_file = output_file + + self.mb_exists = False + self.cm_exists = False + self.markers = [] + + self.latest_row_pos = None + self.latest_col_pos = None + + self.latest_row_value = None + self.latest_col_value = None + + def convert(self): + + self.haplotype_notation = { + '@mat': "1", + '@pat': "0", + '@het': "0.5", + '@unk': "NA" + } + + self.configurations = {} + #self.skipped_cols = 3 + + #if self.input_file.endswith(".geno.gz"): + # print("self.input_file: ", self.input_file) + # self.input_fh = gzip.open(self.input_file) + #else: + self.input_fh = open(self.input_file) + + with open(self.output_file, "w") as self.output_fh: + #if self.file_type == "geno": + self.process_csv() + #elif self.file_type == "snps": + # self.process_snps_file() + + + def process_csv(self): + for row_count, row in enumerate(self.process_rows()): + row_items = row.split("\t") + + this_marker = Marker() + this_marker.name = row_items[1] + this_marker.chr = row_items[0] + if self.cm_exists and self.mb_exists: + this_marker.cM = row_items[2] + this_marker.Mb = row_items[3] + genotypes = row_items[4:] + elif self.cm_exists: + this_marker.cM = row_items[2] + genotypes = row_items[3:] + elif self.mb_exists: + this_marker.Mb = row_items[2] + genotypes = row_items[3:] + else: + genotypes = row_items[2:] + for item_count, genotype in enumerate(genotypes): + if genotype.upper() in self.configurations: + this_marker.genotypes.append(self.configurations[genotype.upper()]) + else: + this_marker.genotypes.append("NA") + + #print("this_marker is:", pf(this_marker.__dict__)) + #if this_marker.chr == "14": + self.markers.append(this_marker.__dict__) + + with open(self.output_file, 'w') as fh: + json.dump(self.markers, fh, indent=" ", sort_keys=True) + + # print('configurations:', str(configurations)) + #self.latest_col_pos = item_count + self.skipped_cols + #self.latest_col_value = item + + #if item_count != 0: + # self.output_fh.write(" ") + #self.output_fh.write(self.configurations[item.upper()]) + + #self.output_fh.write("\n") + + + def process_rows(self): + for self.latest_row_pos, row in enumerate(self.input_fh): + #if self.input_file.endswith(".geno.gz"): + # print("row: ", row) + self.latest_row_value = row + # Take care of headers + if not row.strip(): + continue + if row.startswith('#'): + continue + if row.startswith('Chr'): + if 'Mb' in row.split(): + self.mb_exists = True + if 'cM' in row.split(): + self.cm_exists = True + continue + if row.startswith('@'): + key, _separater, value = row.partition(':') + key = key.strip() + value = value.strip() + if key in self.haplotype_notation: + self.configurations[value] = self.haplotype_notation[key] + continue + if not len(self.configurations): + raise EmptyConfigurations + yield row + + @classmethod + def process_all(cls, old_directory, new_directory): + os.chdir(old_directory) + for input_file in glob.glob("*"): + if not input_file.endswith(('geno', '.geno.gz')): + continue + group_name = ".".join(input_file.split('.')[:-1]) + output_file = os.path.join(new_directory, group_name + ".json") + print("%s -> %s" % ( + os.path.join(old_directory, input_file), output_file)) + convertob = ConvertGenoFile(input_file, output_file) + try: + convertob.convert() + except EmptyConfigurations as why: + print(" No config info? Continuing...") + #excepted = True + continue + except Exception as why: + + print(" Exception:", why) + print(traceback.print_exc()) + print(" Found in row %s at tabular column %s" % (convertob.latest_row_pos, + convertob.latest_col_pos)) + print(" Column is:", convertob.latest_col_value) + print(" Row is:", convertob.latest_row_value) + break + + #def process_snps_file(cls, snps_file, new_directory): + # output_file = os.path.join(new_directory, "mouse_families.json") + # print("%s -> %s" % (snps_file, output_file)) + # convertob = ConvertGenoFile(input_file, output_file) + + + +if __name__=="__main__": + Old_Geno_Directory = """/home/zas1024/genotype_files/genotype/""" + New_Geno_Directory = """/home/zas1024/genotype_files/genotype/json/""" + #Input_File = """/home/zas1024/gene/genotype_files/genotypes/BXD.geno""" + #Output_File = """/home/zas1024/gene/wqflask/wqflask/pylmm/data/bxd.snps""" + #convertob = ConvertGenoFile("/home/zas1024/gene/genotype_files/genotypes/SRxSHRSPF2.geno", "/home/zas1024/gene/genotype_files/new_genotypes/SRxSHRSPF2.json") + #convertob.convert() + ConvertGenoFile.process_all(Old_Geno_Directory, New_Geno_Directory) + #ConvertGenoFiles(Geno_Directory) + + #process_csv(Input_File, Output_File) \ No newline at end of file diff --git a/wqflask/wqflask/marker_regression/gemma_mapping.py b/wqflask/wqflask/marker_regression/gemma_mapping.py index 157e4f33..0e31e73e 100644 --- a/wqflask/wqflask/marker_regression/gemma_mapping.py +++ b/wqflask/wqflask/marker_regression/gemma_mapping.py @@ -33,7 +33,7 @@ def run_gemma(this_dataset, samples, vals, covariates, method, use_loco): gen_covariates_file(this_dataset, covariates) if method == "gemma_plink": - gemma_command = GEMMA_COMMAND + ' -bfile %s/%s -k %s/%s.cXX.txt -lmm 1 -maf 0.1' % (flat_files('mapping'), + gemma_command = GEMMA_COMMAND + ' -bfile %s/%s -k %s/%s.cXX.txt -lmm 2 -maf 0.1' % (flat_files('mapping'), this_dataset.group.name, flat_files('mapping'), this_dataset.group.name) @@ -43,7 +43,7 @@ def run_gemma(this_dataset, samples, vals, covariates, method, use_loco): webqtlConfig.GENERATED_IMAGE_DIR, this_dataset.group.name) else: - #gemma_command = GEMMA_COMMAND + ' -bfile %s/%s -k %s/%s.sXX.txt -lmm 1 -maf 0.1 -o %s_output' % (flat_files('mapping'), + #gemma_command = GEMMA_COMMAND + ' -bfile %s/%s -k %s/%s.sXX.txt -lmm 2 -maf 0.1 -o %s_output' % (flat_files('mapping'), gemma_command += ' -outdir %s -o %s_output' % (webqtlConfig.GENERATED_IMAGE_DIR, this_dataset.group.name) else: @@ -69,20 +69,20 @@ def run_gemma(this_dataset, samples, vals, covariates, method, use_loco): gwa_output_filename = this_dataset.group.name + "_GWA_" + ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(6)) if covariates != "": - gemma_command += ' -c %s/%s_covariates.txt -a %s/%s_snps.txt -lmm 1 -maf 0.1 -debug > %s/gn2/%s.json' % (flat_files('mapping'), + gemma_command += ' -c %s/%s_covariates.txt -a %s/%s_snps.txt -lmm 2 -maf 0.1 -debug > %s/gn2/%s.json' % (flat_files('mapping'), this_dataset.group.name, flat_files('genotype/bimbam'), genofile_name, TEMPDIR, gwa_output_filename) else: - gemma_command += ' -a %s/%s_snps.txt -lmm 1 -maf 0.1 -debug > %s/gn2/%s.json' % (flat_files('genotype/bimbam'), + gemma_command += ' -a %s/%s_snps.txt -lmm 2 -maf 0.1 -debug > %s/gn2/%s.json' % (flat_files('genotype/bimbam'), genofile_name, TEMPDIR, gwa_output_filename) else: - gemma_command = GEMMA_COMMAND + ' -g %s/%s_geno.txt -p %s/%s_pheno.txt -a %s/%s_snps.txt -k %s/%s.cXX.txt -lmm 1 -maf 0.1' % (flat_files('genotype/bimbam'), + gemma_command = GEMMA_COMMAND + ' -g %s/%s_geno.txt -p %s/%s_pheno.txt -a %s/%s_snps.txt -k %s/%s.cXX.txt -lmm 2 -maf 0.1' % (flat_files('genotype/bimbam'), genofile_name, flat_files('genotype/bimbam'), genofile_name, diff --git a/wqflask/wqflask/marker_regression/marker_regression_gn1.py b/wqflask/wqflask/marker_regression/marker_regression_gn1.py index da713325..8e9c5b4c 100644 --- a/wqflask/wqflask/marker_regression/marker_regression_gn1.py +++ b/wqflask/wqflask/marker_regression/marker_regression_gn1.py @@ -1062,10 +1062,6 @@ class MarkerRegression(object): #draw gray blocks for 3' and 5' UTR blocks if cdsStart and cdsEnd: - logger.debug("txStart:", txStart) - logger.debug("cdsStart:", cdsStart) - logger.debug("txEnd:", txEnd) - logger.debug("cdsEnd:", cdsEnd) utrStartPix = (txStart-startMb)*plotXScale + xLeftOffset utrEndPix = (cdsStart-startMb)*plotXScale + xLeftOffset if (utrStartPix < xLeftOffset): @@ -1721,8 +1717,8 @@ class MarkerRegression(object): for i, qtlresult in enumerate(self.qtlresults): m = 0 thisLRSColor = self.colorCollection[0] - if qtlresult['chr'] != previous_chr and self.selectedChr == -1: + if self.manhattan_plot != True: canvas.drawPolygon(LRSCoordXY,edgeColor=thisLRSColor,closed=0, edgeWidth=lrsEdgeWidth, clipX=(xLeftOffset, xLeftOffset + plotWidth)) @@ -1760,7 +1756,6 @@ class MarkerRegression(object): AdditiveCoordXY = [] previous_chr = qtlresult['chr'] previous_chr_as_int += 1 - newStartPosX = (self.ChrLengthDistList[previous_chr_as_int - 1]+self.GraphInterval)*plotXScale if newStartPosX != oldStartPosX: startPosX += newStartPosX @@ -1773,7 +1768,6 @@ class MarkerRegression(object): this_chr = str(self.ChrList[self.selectedChr][1]+1) if self.selectedChr == -1 or str(qtlresult['chr']) == this_chr: Xc = startPosX + (qtlresult['Mb']-startMb)*plotXScale - # updated by NL 06-18-2011: # fix the over limit LRS graph issue since genotype trait may give infinite LRS; # for any lrs is over than 460(LRS max in this system), it will be reset to 460 diff --git a/wqflask/wqflask/templates/show_trait_mapping_tools.html b/wqflask/wqflask/templates/show_trait_mapping_tools.html index 0ecf1eb9..d40a7bd6 100644 --- a/wqflask/wqflask/templates/show_trait_mapping_tools.html +++ b/wqflask/wqflask/templates/show_trait_mapping_tools.html @@ -342,6 +342,19 @@ +