from pprint import pformat as pf
import string
import math
from decimal import Decimal
import random
import sys
import datetime
import os
import collections
import uuid

import numpy as np
import pickle
import itertools

import simplejson as json

from flask import Flask, g

from gn2.base.trait import GeneralTrait, create_trait
from gn2.base import data_set  # import create_dataset
from gn2.base import species
from gn2.base import webqtlConfig
from gn2.utility import webqtlUtil, helper_functions, hmac, Plot, Bunch, temp_data
from gn2.utility.redis_tools import get_redis_conn
from gn2.wqflask.database import database_connection
from gn2.wqflask.marker_regression import gemma_mapping, rqtl_mapping, qtlreaper_mapping, plink_mapping
from gn2.wqflask.show_trait.SampleList import SampleList
from gn2.utility.tools import locate, get_setting, locate_ignore_error, GEMMA_COMMAND, PLINK_COMMAND, TEMPDIR
from gn2.utility.external import shell
from gn2.base.webqtlConfig import TMPDIR, GENERATED_TEXT_DIR

Redis = get_redis_conn()


class RunMapping:
    def __init__(self, start_vars, temp_uuid):
        helper_functions.get_species_dataset_trait(self, start_vars)

        # needed to pass temp_uuid to gn1 mapping code (marker_regression_gn1.py)
        self.temp_uuid = temp_uuid

        # Needed to zoom in or remap temp traits like PCA traits
        if "temp_trait" in start_vars and start_vars['temp_trait'] != "False":
            self.temp_trait = "True"
            self.group = self.dataset.group.name

        self.hash_of_inputs = start_vars['hash_of_inputs']
        self.dataid = start_vars['dataid']

        self.json_data = {}
        self.json_data['lodnames'] = ['lod.hk']

        # Sometimes a group may have a genofile that only includes a subset of samples
        genofile_samplelist = []
        if 'genofile' in start_vars:
            if start_vars['genofile'] != "":
                self.genofile_string = start_vars['genofile']
                self.dataset.group.genofile = self.genofile_string.split(":")[0]
                genofile_samplelist = get_genofile_samplelist(self.dataset)

        all_samples_ordered = self.dataset.group.all_samples_ordered()

        self.vals = []
        self.samples = []
        self.sample_vals = start_vars['sample_vals']
        self.vals_hash = start_vars['vals_hash']
        sample_val_dict = json.loads(self.sample_vals)
        samples = sample_val_dict.keys()
        if len(genofile_samplelist) != 0:
            for sample in genofile_samplelist:
                self.samples.append(sample)
                if sample in samples:
                    self.vals.append(sample_val_dict[sample])
                else:
                    self.vals.append("x")
        else:
            for sample in self.dataset.group.samplelist:
                if sample in samples:
                    self.vals.append(sample_val_dict[sample])
                    self.samples.append(sample)

        if 'n_samples' in start_vars:
            self.n_samples = start_vars['n_samples']
        else:
            self.n_samples = len([val for val in self.vals if val != "x"])

        self.mapping_method = start_vars['method']
        if "results_path" in start_vars:
            self.mapping_results_path = start_vars['results_path']
        else:
            mapping_results_filename = "_".join(
                [self.dataset.group.name, self.mapping_method, self.vals_hash]).replace("/", "_")
            self.mapping_results_path = "{}{}.csv".format(
                webqtlConfig.GENERATED_IMAGE_DIR, mapping_results_filename)

        self.pair_scan = False
        self.manhattan_plot = False
        if 'manhattan_plot' in start_vars:
            if start_vars['manhattan_plot'].lower() != "false":
                self.color_scheme = "alternating"
                if "color_scheme" in start_vars:
                    self.color_scheme = start_vars['color_scheme']
                    if self.color_scheme == "single":
                        self.manhattan_single_color = start_vars['manhattan_single_color']
                self.manhattan_plot = True

        self.maf = start_vars['maf']  # Minor allele frequency
        if "use_loco" in start_vars:
            self.use_loco = start_vars['use_loco']
        else:
            self.use_loco = None

        self.suggestive = ""
        self.significant = ""

        if 'transform' in start_vars:
            self.transform = start_vars['transform']
        else:
            self.transform = ""
        self.score_type = "LRS"  # ZS: LRS or LOD
        self.mapping_scale = "physic"
        if "mapping_scale" in start_vars:
            self.mapping_scale = start_vars['mapping_scale']
        self.num_perm = 0
        self.perm_output = []
        self.bootstrap_results = []
        self.covariates = start_vars['covariates'] if "covariates" in start_vars else ""
        self.categorical_vars = []
        self.geno_db_exists = False

        # ZS: This is passed to GN1 code for single chr mapping
        self.selected_chr = -1
        if "selected_chr" in start_vars:
            # ZS: Needs to be -1 if showing full map; there's probably a better way to fix this
            if int(start_vars['selected_chr']) != -1:
                self.selected_chr = int(start_vars['selected_chr']) + 1
            else:
                self.selected_chr = int(start_vars['selected_chr'])
        if "startMb" in start_vars:
            self.startMb = start_vars['startMb']
        if "endMb" in start_vars:
            self.endMb = start_vars['endMb']
        if "graphWidth" in start_vars:
            self.graphWidth = start_vars['graphWidth']
        if "lrsMax" in start_vars:
            self.lrsMax = start_vars['lrsMax']
        if "haplotypeAnalystCheck" in start_vars:
            self.haplotypeAnalystCheck = start_vars['haplotypeAnalystCheck']
        if "startMb" in start_vars:
            # This is to ensure showGenes, Legend, etc are checked the first time you open
            # the mapping page, since startMb will only not be set during the first load
            if "permCheck" in start_vars:
                self.permCheck = "ON"
            else:
                self.permCheck = False
            self.num_perm = int(start_vars['num_perm'])

            self.LRSCheck = start_vars['LRSCheck']

            if "showSNP" in start_vars:
                self.showSNP = start_vars['showSNP']
            else:
                self.showSNP = False

            if "showHomology" in start_vars:
                self.showHomology = start_vars['showHomology']
            else:
                self.showHomology = False

            if "showGenes" in start_vars:
                self.showGenes = start_vars['showGenes']
            else:
                self.showGenes = False

            if "viewLegend" in start_vars:
                self.viewLegend = start_vars['viewLegend']
            else:
                self.viewLegend = False
        else:
            try:
                if int(start_vars['num_perm']) > 0:
                    self.num_perm = int(start_vars['num_perm'])
            except:
                self.num_perm = 0

            if self.num_perm > 0:
                self.permCheck = "ON"
            else:
                self.permCheck = False
            self.showSNP = "ON"
            self.showGenes = "ON"
            self.viewLegend = "ON"

        # self.dataset.group.get_markers()
        if self.mapping_method == "gemma":
            self.first_run = True
            self.output_files = None
            if 'output_files' in start_vars:
                self.output_files = start_vars['output_files']
            # ZS: check if first run so existing result files can be used if it isn't
            # (for example zooming on a chromosome, etc)
            if 'first_run' in start_vars:
                self.first_run = False
            self.score_type = "-logP"
            self.manhattan_plot = True
            # The same call is made whether or not LOCO is used; use_loco is passed through
            marker_obs, self.output_files = gemma_mapping.run_gemma(
                self.this_trait, self.dataset, self.samples, self.vals, self.covariates,
                self.use_loco, self.maf, self.first_run, self.output_files)
            results = marker_obs
        elif self.mapping_method == "rqtl_plink":
            results = self.run_rqtl_plink()
        elif self.mapping_method == "rqtl_geno":
            self.perm_strata = []
            if "perm_strata" in start_vars and "categorical_vars" in start_vars:
                self.categorical_vars = start_vars["categorical_vars"].split(",")
                if len(self.categorical_vars) and start_vars["perm_strata"] == "True":
                    primary_samples = SampleList(dataset=self.dataset,
                                                 sample_names=self.samples,
                                                 this_trait=self.this_trait)

                    self.perm_strata = get_perm_strata(
                        self.this_trait, primary_samples, self.categorical_vars, self.samples)
            self.score_type = "-logP"
            self.control_marker = start_vars['control_marker']
            self.do_control = start_vars['do_control']
            if 'mapmethod_rqtl' in start_vars:
                self.method = start_vars['mapmethod_rqtl']
            else:
                self.method = "em"
            self.model = start_vars['mapmodel_rqtl']
            self.pair_scan = False
            if start_vars['pair_scan'] == "true":
                self.pair_scan = True
            if self.permCheck and self.num_perm > 0:
                self.perm_output, self.suggestive, self.significant, results = rqtl_mapping.run_rqtl(
                    self.this_trait.name, self.vals, self.samples, self.dataset, self.pair_scan,
                    self.mapping_scale, self.model, self.method, self.num_perm, self.perm_strata,
                    self.do_control, self.control_marker, self.manhattan_plot, self.covariates)
            else:
                results = rqtl_mapping.run_rqtl(
                    self.this_trait.name, self.vals, self.samples, self.dataset, self.pair_scan,
                    self.mapping_scale, self.model, self.method, self.num_perm, self.perm_strata,
                    self.do_control, self.control_marker, self.manhattan_plot, self.covariates)
        elif self.mapping_method == "reaper":
            if "startMb" in start_vars:
                # ZS: Check if first time page loaded, so it can default to ON
                if "additiveCheck" in start_vars:
                    self.additiveCheck = start_vars['additiveCheck']
                else:
                    self.additiveCheck = False

                if "bootCheck" in start_vars:
                    self.bootCheck = "ON"
                else:
                    self.bootCheck = False
                self.num_bootstrap = int(start_vars['num_bootstrap'])
            else:
                self.additiveCheck = "ON"
                try:
                    if int(start_vars['num_bootstrap']) > 0:
                        self.bootCheck = "ON"
                        self.num_bootstrap = int(start_vars['num_bootstrap'])
                    else:
                        self.bootCheck = False
                        self.num_bootstrap = 0
                except:
                    self.bootCheck = False
                    self.num_bootstrap = 0

            self.control_marker = start_vars['control_marker']
            self.do_control = start_vars['do_control']
            self.first_run = True
            self.output_files = None
            # ZS: check if first run so existing result files can be used if it isn't
            # (for example zooming on a chromosome, etc)
            if 'first_run' in start_vars:
                self.first_run = False
                if 'output_files' in start_vars:
                    self.output_files = start_vars['output_files'].split(",")

            (results, self.perm_output, self.suggestive, self.significant,
             self.bootstrap_results, self.output_files) = qtlreaper_mapping.run_reaper(
                self.this_trait, self.dataset, self.samples, self.vals, self.json_data,
                self.num_perm, self.bootCheck, self.num_bootstrap, self.do_control,
                self.control_marker, self.manhattan_plot, self.first_run, self.output_files)
        elif self.mapping_method == "plink":
            self.score_type = "-logP"
            self.manhattan_plot = True
            results = plink_mapping.run_plink(
                self.this_trait, self.dataset, self.species, self.vals, self.maf)
            # results = self.run_plink()

        self.no_results = False
        if len(results) == 0:
            self.no_results = True
        else:
            # Check if genotypes exist in the DB in order to create links for markers
            self.geno_db_exists = geno_db_exists(self.dataset, results[0]['name'])

            if self.pair_scan == True:
                self.figure_data = results[0]
                self.table_data = results[1]
            else:
                self.qtl_results = []
                self.results_for_browser = []
                self.annotations_for_browser = []
                highest_chr = 1  # This is needed in order to convert the highest chr to X/Y
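                # Each result marker is packaged for two consumers: browser_marker /
                # annot_marker feed the genome browser (position in bp, an optional link
                # back to the group's Geno dataset, and an approximate p-value derived
                # from the LRS or LOD score), while markers that carry a score are also
                # appended to self.qtl_results for the results table and CSV export.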
                for marker in results:
                    marker['hmac'] = hmac.data_hmac('{}:{}'.format(
                        marker['name'], self.dataset.group.name + "Geno"))
                    if 'Mb' in marker:
                        this_ps = marker['Mb'] * 1000000
                    else:
                        this_ps = marker['cM'] * 1000000

                    browser_marker = dict(
                        chr=str(marker['chr']),
                        rs=marker['name'],
                        ps=this_ps,
                        url="/show_trait?trait_id=" + marker['name'] + "&dataset=" + self.dataset.group.name + "Geno"
                    )

                    if self.geno_db_exists == "True":
                        annot_marker = dict(
                            name=str(marker['name']),
                            chr=str(marker['chr']),
                            rs=marker['name'],
                            pos=this_ps,
                            url="/show_trait?trait_id=" + marker['name'] + "&dataset=" + self.dataset.group.name + "Geno"
                        )
                    else:
                        annot_marker = dict(
                            name=str(marker['name']),
                            chr=str(marker['chr']),
                            rs=marker['name'],
                            pos=this_ps
                        )

                    if 'lrs_value' in marker and marker['lrs_value'] > 0:
                        browser_marker['p_wald'] = 10**-(marker['lrs_value'] / 4.61)
                    elif 'lod_score' in marker and marker['lod_score'] > 0:
                        browser_marker['p_wald'] = 10**-(marker['lod_score'])
                    else:
                        browser_marker['p_wald'] = 0

                    self.results_for_browser.append(browser_marker)
                    self.annotations_for_browser.append(annot_marker)

                    if str(marker['chr']) > '0' or str(marker['chr']) == "X" or str(marker['chr']) == "X/Y":
                        if str(marker['chr']) > str(highest_chr) or str(marker['chr']) == "X" or str(marker['chr']) == "X/Y":
                            highest_chr = marker['chr']

                    if ('lod_score' in marker.keys()) or ('lrs_value' in marker.keys()):
                        if 'Mb' in marker.keys():
                            marker['display_pos'] = "Chr" + str(marker['chr']) + \
                                ": " + "{:.6f}".format(marker['Mb'])
                        elif 'cM' in marker.keys():
                            marker['display_pos'] = "Chr" + str(marker['chr']) + \
                                ": " + "{:.3f}".format(marker['cM'])
                        else:
                            marker['display_pos'] = "N/A"
                        self.qtl_results.append(marker)

                total_markers = len(self.qtl_results)

                export_mapping_results(self.dataset, self.this_trait, self.qtl_results,
                                       self.mapping_results_path, self.mapping_method,
                                       self.mapping_scale, self.score_type, self.transform,
                                       self.covariates, self.n_samples, self.vals_hash)

                if len(self.qtl_results) > 30000:
                    self.qtl_results = trim_markers_for_figure(self.qtl_results)
                    self.results_for_browser = trim_markers_for_figure(
                        self.results_for_browser)
                    filtered_annotations = []
                    for marker in self.results_for_browser:
                        for annot_marker in self.annotations_for_browser:
                            if annot_marker['rs'] == marker['rs']:
                                filtered_annotations.append(annot_marker)
                                break
                    self.annotations_for_browser = filtered_annotations
                    browser_files = write_input_for_browser(
                        self.dataset, self.results_for_browser, self.annotations_for_browser)
                else:
                    browser_files = write_input_for_browser(
                        self.dataset, self.results_for_browser, self.annotations_for_browser)

                self.trimmed_markers = trim_markers_for_table(results)

                chr_lengths = get_chr_lengths(
                    self.mapping_scale, self.mapping_method, self.dataset, self.qtl_results)

                # ZS: For zooming into genome browser, need to pass chromosome name instead of number
                if self.dataset.group.species == "mouse":
                    if self.selected_chr == 20:
                        this_chr = "X"
                    else:
                        this_chr = str(self.selected_chr)
                elif self.dataset.group.species == "rat":
                    if self.selected_chr == 21:
                        this_chr = "X"
                    else:
                        this_chr = str(self.selected_chr)
                else:
                    if self.selected_chr == 22:
                        this_chr = "X"
                    elif self.selected_chr == 23:
                        this_chr = "Y"
                    else:
                        this_chr = str(self.selected_chr)

                if self.mapping_method != "gemma":
                    if self.score_type == "LRS":
                        significant_for_browser = self.significant / 4.61
                    else:
                        significant_for_browser = self.significant
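
                    # js_data is serialized into the mapping results template for the genome
                    # browser and plotting code; the gemma branch below omits the permutation
                    # output and significance threshold, which are not populated for GEMMA here.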
                    self.js_data = dict(
                        #result_score_type = self.score_type,
                        #this_trait = self.this_trait.name,
                        #data_set = self.dataset.name,
                        #maf = self.maf,
                        #manhattan_plot = self.manhattan_plot,
                        #mapping_scale = self.mapping_scale,
                        #chromosomes = chromosome_mb_lengths,
                        #qtl_results = self.qtl_results,
                        categorical_vars=self.categorical_vars,
                        chr_lengths=chr_lengths,
                        num_perm=self.num_perm,
                        perm_results=self.perm_output,
                        significant=significant_for_browser,
                        browser_files=browser_files,
                        selected_chr=this_chr,
                        total_markers=total_markers
                    )
                else:
                    self.js_data = dict(
                        chr_lengths=chr_lengths,
                        browser_files=browser_files,
                        selected_chr=this_chr,
                        total_markers=total_markers
                    )

    def run_rqtl_plink(self):
        # os.chdir("") never do this inside a webserver!!
        plink_output_filename = webqtlUtil.genRandStr("%s_%s_" % (
            self.dataset.group.name, self.this_trait.name))
        plink_mapping.gen_pheno_txt_file_plink(
            self.this_trait, self.dataset, self.vals, pheno_filename=plink_output_filename)

        rqtl_command = './plink --noweb --ped %s.ped --no-fid --no-parents --no-sex --no-pheno --map %s.map --pheno %s/%s.txt --pheno-name %s --maf %s --missing-phenotype -9999 --out %s%s --assoc ' % (
            self.dataset.group.name, self.dataset.group.name, TMPDIR, plink_output_filename,
            self.this_trait.name, self.maf, TMPDIR, plink_output_filename)

        os.system(rqtl_command)

        count, p_values = self.parse_rqtl_output(plink_output_filename)

    def identify_empty_samples(self):
        no_val_samples = []
        for sample_count, val in enumerate(self.vals):
            if val == "x":
                no_val_samples.append(sample_count)
        return no_val_samples

    def trim_genotypes(self, genotype_data, no_value_samples):
        trimmed_genotype_data = []
        for marker in genotype_data:
            new_genotypes = []
            for item_count, genotype in enumerate(marker):
                if item_count in no_value_samples:
                    continue
                try:
                    genotype = float(genotype)
                except ValueError:
                    genotype = np.nan
                new_genotypes.append(genotype)
            trimmed_genotype_data.append(new_genotypes)
        return trimmed_genotype_data


def export_mapping_results(dataset, trait, markers, results_path, mapping_method,
                           mapping_scale, score_type, transform, covariates,
                           n_samples, vals_hash):
    if mapping_scale == "physic":
        scale_string = "Mb"
    else:
        scale_string = "cM"
    with open(results_path, "w+") as output_file:
        output_file.write(
            "Time/Date: " + datetime.datetime.now().strftime("%x / %X") + "\n")
        output_file.write(
            "Population: " + dataset.group.species.title() + " " + dataset.group.name + "\n")
        output_file.write("Data Set: " + dataset.fullname + "\n")
        output_file.write("Trait: " + trait.display_name + "\n")
        output_file.write("Trait Hash: " + vals_hash + "\n")
        output_file.write("N Samples: " + str(n_samples) + "\n")
        output_file.write("Mapping Tool: " + str(mapping_method) + "\n")
        if len(transform) > 0:
            transform_text = "Transform - "
            if transform == "qnorm":
                transform_text += "Quantile Normalized"
            elif transform == "log2" or transform == "log10":
                transform_text += transform.capitalize()
            elif transform == "sqrt":
                transform_text += "Square Root"
            elif transform == "zscore":
                transform_text += "Z-Score"
            elif transform == "invert":
                transform_text += "Invert +/-"
            else:
                transform_text = ""
            output_file.write(transform_text + "\n")
        if dataset.type == "ProbeSet":
            if trait.symbol:
                output_file.write("Gene Symbol: " + trait.symbol + "\n")
            output_file.write("Location: " + str(trait.chr) +
                              " @ " + str(trait.mb) + " Mb\n")
        if len(covariates) > 0:
            output_file.write("Cofactors (dataset - trait):\n")
            for covariate in covariates.split(","):
                trait_name = covariate.split(":")[0]
                dataset_name = covariate.split(":")[1]
                output_file.write(dataset_name + " - " + trait_name + "\n")
        output_file.write("\n")
        output_file.write("Name,Chr,")
        if score_type.lower() == "-logp":
            score_type = "-logP"
        output_file.write(scale_string + "," + score_type)
        if "additive" in list(markers[0].keys()):
            output_file.write(",Additive")
        if "dominance" in list(markers[0].keys()):
            output_file.write(",Dominance")
        output_file.write("\n")
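        # One CSV row per marker: name, chromosome, position (Mb or cM), the score
        # column (LOD when the score type is -logP, otherwise LRS), plus optional
        # additive and dominance effect columns when those keys are present.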
        for i, marker in enumerate(markers):
            output_file.write(marker['name'] + "," + str(marker['chr']) + ",")
            output_file.write(str(marker[scale_string]) + ",")
            if score_type == "-logP":
                output_file.write(str(marker['lod_score']))
            else:
                output_file.write(str(marker['lrs_value']))
            if "additive" in list(marker.keys()):
                output_file.write("," + str(marker['additive']))
            if "dominance" in list(marker.keys()):
                output_file.write("," + str(marker['dominance']))
            if i < (len(markers) - 1):
                output_file.write("\n")


def trim_markers_for_figure(markers):
    if 'p_wald' in list(markers[0].keys()):
        score_type = 'p_wald'
    elif 'lod_score' in list(markers[0].keys()):
        score_type = 'lod_score'
    else:
        score_type = 'lrs_value'

    filtered_markers = []
    low_counter = 0
    med_counter = 0
    high_counter = 0
    for marker in markers:
        if score_type == 'p_wald':
            if marker[score_type] > 0.1:
                if low_counter % 20 == 0:
                    filtered_markers.append(marker)
                low_counter += 1
            elif 0.1 >= marker[score_type] > 0.01:
                if med_counter % 10 == 0:
                    filtered_markers.append(marker)
                med_counter += 1
            elif 0.01 >= marker[score_type] > 0.001:
                if high_counter % 2 == 0:
                    filtered_markers.append(marker)
                high_counter += 1
            else:
                filtered_markers.append(marker)
        elif score_type == 'lod_score':
            if marker[score_type] < 1:
                if low_counter % 20 == 0:
                    filtered_markers.append(marker)
                low_counter += 1
            elif 1 <= marker[score_type] < 2:
                if med_counter % 10 == 0:
                    filtered_markers.append(marker)
                med_counter += 1
            elif 2 <= marker[score_type] <= 3:
                if high_counter % 2 == 0:
                    filtered_markers.append(marker)
                high_counter += 1
            else:
                filtered_markers.append(marker)
        else:
            if marker[score_type] < 4.61:
                if low_counter % 20 == 0:
                    filtered_markers.append(marker)
                low_counter += 1
            elif 4.61 <= marker[score_type] < (2 * 4.61):
                if med_counter % 10 == 0:
                    filtered_markers.append(marker)
                med_counter += 1
            elif (2 * 4.61) <= marker[score_type] <= (3 * 4.61):
                if high_counter % 2 == 0:
                    filtered_markers.append(marker)
                high_counter += 1
            else:
                filtered_markers.append(marker)

    return filtered_markers


def trim_markers_for_table(markers):
    if 'lod_score' in list(markers[0].keys()):
        sorted_markers = sorted(
            markers, key=lambda k: k['lod_score'], reverse=True)
    else:
        sorted_markers = sorted(
            markers, key=lambda k: k['lrs_value'], reverse=True)

    # ZS: So we end up with a list of at most 25000 markers
    if len(sorted_markers) >= 25000:
        trimmed_sorted_markers = sorted_markers[:25000]
        return trimmed_sorted_markers
    else:
        return sorted_markers


def write_input_for_browser(this_dataset, gwas_results, annotations):
    file_base = this_dataset.group.name + "_" + ''.join(
        random.choice(string.ascii_uppercase + string.digits) for _ in range(6))
    gwas_filename = file_base + "_GWAS"
    annot_filename = file_base + "_ANNOT"
    gwas_path = "{}/gn2/".format(TEMPDIR) + gwas_filename
    annot_path = "{}/gn2/".format(TEMPDIR) + annot_filename

    with open(gwas_path + ".json", "w") as gwas_file, open(annot_path + ".json", "w") as annot_file:
        gwas_file.write(json.dumps(gwas_results))
        annot_file.write(json.dumps(annotations))

    return [gwas_filename, annot_filename]
def geno_db_exists(this_dataset, first_marker):
    """Check if genotypes are databased

    This checks two things:
    - A genotypes dataset exists for this group
    - The first marker in the genotype file is in the genotypes dataset, since
      there might be a mismatch between the file and databased markers
    """

    geno_db_name = this_dataset.group.name + "Geno"
    try:
        geno_db = data_set.create_dataset(
            dataset_name=geno_db_name, get_samplelist=False)
        geno_trait = create_trait(name=first_marker, dataset_name=geno_db_name)
        return "True"
    except:
        return "False"


def get_chr_lengths(mapping_scale, mapping_method, dataset, qtl_results):
    chr_lengths = []
    if mapping_scale == "physic":
        with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as db_cursor:
            for i, the_chr in enumerate(dataset.species.chromosomes.chromosomes(db_cursor)):
                this_chr = {
                    "chr": dataset.species.chromosomes.chromosomes(db_cursor)[the_chr].name,
                    "size": str(dataset.species.chromosomes.chromosomes(db_cursor)[the_chr].length)
                }
                chr_lengths.append(this_chr)
    else:
        this_chr = 1
        highest_pos = 0
        for i, result in enumerate(qtl_results):
            chr_as_num = 0
            try:
                chr_as_num = int(result['chr'])
            except:
                chr_as_num = 20
            if chr_as_num > this_chr or i == (len(qtl_results) - 1):
                if i == (len(qtl_results) - 1):
                    if mapping_method == "reaper":
                        highest_pos = float(result['cM']) * 1000000
                    else:
                        highest_pos = float(result['Mb']) * 1000000
                    chr_lengths.append(
                        {"chr": str(this_chr), "size": str(highest_pos)})
                else:
                    chr_lengths.append(
                        {"chr": str(this_chr), "size": str(highest_pos)})
                    this_chr = chr_as_num
            else:
                if mapping_method == "reaper":
                    if float(result['cM']) > highest_pos:
                        highest_pos = float(result['cM']) * 1000000
                else:
                    if float(result['Mb']) > highest_pos:
                        highest_pos = float(result['Mb']) * 1000000

    return chr_lengths


def get_genofile_samplelist(dataset):
    genofile_samplelist = []

    genofile_json = dataset.group.get_genofiles()
    for genofile in genofile_json:
        if genofile['location'] == dataset.group.genofile and 'sample_list' in genofile:
            genofile_samplelist = genofile['sample_list']

    return genofile_samplelist


def get_perm_strata(this_trait, sample_list, categorical_vars, used_samples):
    perm_strata_strings = []
    for sample in used_samples:
        if sample in list(sample_list.sample_attribute_values.keys()):
            combined_string = ""
            for var in categorical_vars:
                if var in sample_list.sample_attribute_values[sample]:
                    combined_string += str(
                        sample_list.sample_attribute_values[sample][var])
                else:
                    combined_string += "NA"
        else:
            combined_string = "NA"

        perm_strata_strings.append(combined_string)

    d = dict([(y, x + 1)
              for x, y in enumerate(sorted(set(perm_strata_strings)))])
    list_to_numbers = [d[x] for x in perm_strata_strings]
    perm_strata = list_to_numbers

    return perm_strata
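

# Note on the score conversions used above (a sketch for orientation, not part of
# the mapping logic itself): LRS and LOD scores are related by LOD = LRS / 4.61,
# and the value passed to the genome browser as 'p_wald' is the approximate
# p-value 10 ** -LOD. For example:
#
#     lrs = 18.44
#     lod = lrs / 4.61        # -> 4.0
#     p_approx = 10 ** -lod   # -> 1e-04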