From 7d9ead16f45cc68d1543c673487891b03c990efe Mon Sep 17 00:00:00 2001 From: zsloan Date: Fri, 7 Feb 2020 12:44:15 -0600 Subject: Users can now add and do mapping on genotype files that only include a subset of samples/strains Also filtered the results used by Christian's genome browser, so now it should have an easier time loading when dealing with larger numbers of markers --- wqflask/base/data_set.py | 9 ++ wqflask/wqflask/marker_regression/gemma_mapping.py | 15 +-- wqflask/wqflask/marker_regression/run_mapping.py | 101 +++++++++++++++------ wqflask/wqflask/show_trait/show_trait.py | 11 +-- 4 files changed, 89 insertions(+), 47 deletions(-) diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py index 49600c64..7fe9a8ac 100644 --- a/wqflask/base/data_set.py +++ b/wqflask/base/data_set.py @@ -347,6 +347,15 @@ class DatasetGroup(object): if maternal and paternal: self.parlist = [maternal, paternal] + def get_genofiles(self): + jsonfile = "%s/%s.json" % (webqtlConfig.GENODIR, self.name) + try: + f = open(jsonfile) + except: + return None + jsondata = json.load(f) + return jsondata['genofile'] + def get_samplelist(self): result = None key = "samplelist:v3:" + self.name diff --git a/wqflask/wqflask/marker_regression/gemma_mapping.py b/wqflask/wqflask/marker_regression/gemma_mapping.py index 3d3f26f3..e2b15c26 100644 --- a/wqflask/wqflask/marker_regression/gemma_mapping.py +++ b/wqflask/wqflask/marker_regression/gemma_mapping.py @@ -39,7 +39,7 @@ def run_gemma(this_trait, this_dataset, samples, vals, covariates, use_loco, maf chr_list_string += this_chromosomes[i+1].name if covariates != "": - gen_covariates_file(this_dataset, covariates) + gen_covariates_file(this_dataset, covariates, samples) if use_loco == "True": generate_k_command = GEMMA_WRAPPER_COMMAND + ' --json --loco ' + chr_list_string + ' -- ' + GEMMAOPTS + ' -g %s/%s_geno.txt -p %s/gn2/%s.txt -a %s/%s_snps.txt -gk > %s/gn2/%s.json' % (flat_files('genotype/bimbam'), @@ -122,7 +122,7 @@ def gen_pheno_txt_file(this_dataset, genofile_name, vals, trait_filename): else: outfile.write(value + "\n") -def gen_covariates_file(this_dataset, covariates): +def gen_covariates_file(this_dataset, covariates, samples): covariate_list = covariates.split(",") covariate_data_object = [] for covariate in covariate_list: @@ -140,11 +140,12 @@ def gen_covariates_file(this_dataset, covariates): trait_sample_data = trait_ob.data logger.debug("SAMPLE DATA:", trait_sample_data) for index, sample in enumerate(trait_samples): - if sample in trait_sample_data: - sample_value = trait_sample_data[sample].value - this_covariate_data.append(sample_value) - else: - this_covariate_data.append("-9") + if sample in samples: + if sample in trait_sample_data: + sample_value = trait_sample_data[sample].value + this_covariate_data.append(sample_value) + else: + this_covariate_data.append("-9") covariate_data_object.append(this_covariate_data) with open("{}/{}_covariates.txt".format(flat_files('mapping'), this_dataset.group.name), "w") as outfile: diff --git a/wqflask/wqflask/marker_regression/run_mapping.py b/wqflask/wqflask/marker_regression/run_mapping.py index a65dafd5..ac05c4ad 100644 --- a/wqflask/wqflask/marker_regression/run_mapping.py +++ b/wqflask/wqflask/marker_regression/run_mapping.py @@ -61,33 +61,42 @@ class RunMapping(object): self.json_data = {} self.json_data['lodnames'] = ['lod.hk'] + #ZS: Sometimes a group may have a genofile that only includes a subset of samples + genofile_samplelist = [] + if 'genofile' in start_vars: + if start_vars['genofile'] != "": + self.genofile_string = start_vars['genofile'] + self.dataset.group.genofile = self.genofile_string.split(":")[0] + genofile_samplelist = get_genofile_samplelist(self.dataset) + all_samples_ordered = self.dataset.group.all_samples_ordered() self.vals = [] if 'samples' in start_vars: self.samples = start_vars['samples'].split(",") for sample in self.samples: - value = start_vars.get('value:' + sample) - if value: - self.vals.append(value) - else: - self.samples = [] - - for sample in self.dataset.group.samplelist: - # sample is actually the name of an individual - in_trait_data = False - for item in self.this_trait.data: - if self.this_trait.data[item].name == sample: - value = start_vars['value:' + self.this_trait.data[item].name] - self.samples.append(self.this_trait.data[item].name) - self.vals.append(value) - in_trait_data = True - break - if not in_trait_data: + if (len(genofile_samplelist) == 0) or (sample in genofile_samplelist): value = start_vars.get('value:' + sample) if value: - self.samples.append(sample) self.vals.append(value) + else: + self.samples = [] + + for sample in self.dataset.group.samplelist: # sample is actually the name of an individual + if (len(genofile_samplelist) == 0) or (sample in genofile_samplelist): + in_trait_data = False + for item in self.this_trait.data: + if self.this_trait.data[item].name == sample: + value = start_vars['value:' + self.this_trait.data[item].name] + self.samples.append(self.this_trait.data[item].name) + self.vals.append(value) + in_trait_data = True + break + if not in_trait_data: + value = start_vars.get('value:' + sample) + if value: + self.samples.append(sample) + self.vals.append(value) self.num_vals = start_vars['num_vals'] @@ -177,10 +186,6 @@ class RunMapping(object): self.showGenes = "ON" self.viewLegend = "ON" - if 'genofile' in start_vars: - if start_vars['genofile'] != "": - self.genofile_string = start_vars['genofile'] - self.dataset.group.genofile = self.genofile_string.split(":")[0] self.dataset.group.get_markers() if self.mapping_method == "gemma": self.first_run = True @@ -321,7 +326,7 @@ class RunMapping(object): else: self.qtl_results = [] - self.qtl_results_for_browser = [] + self.results_for_browser = [] self.annotations_for_browser = [] highest_chr = 1 #This is needed in order to convert the highest chr to X/Y for marker in results: @@ -357,7 +362,7 @@ class RunMapping(object): else: browser_marker['p_wald'] = 0 - self.qtl_results_for_browser.append(browser_marker) + self.results_for_browser.append(browser_marker) self.annotations_for_browser.append(annot_marker) if marker['chr'] > 0 or marker['chr'] == "X" or marker['chr'] == "X/Y": if marker['chr'] > highest_chr or marker['chr'] == "X" or marker['chr'] == "X/Y": @@ -365,19 +370,28 @@ class RunMapping(object): if ('lod_score' in marker.keys()) or ('lrs_value' in marker.keys()): self.qtl_results.append(marker) - browser_files = write_input_for_browser(self.dataset, self.qtl_results_for_browser, self.annotations_for_browser) - with Bench("Exporting Results"): export_mapping_results(self.dataset, self.this_trait, self.qtl_results, self.mapping_results_path, self.mapping_scale, self.score_type) with Bench("Trimming Markers for Figure"): if len(self.qtl_results) > 30000: self.qtl_results = trim_markers_for_figure(self.qtl_results) + self.results_for_browser = trim_markers_for_figure(self.results_for_browser) + filtered_annotations = [] + for marker in self.results_for_browser: + for annot_marker in self.annotations_for_browser: + if annot_marker['rs'] == marker['rs']: + filtered_annotations.append(annot_marker) + break + self.annotations_for_browser = filtered_annotations + browser_files = write_input_for_browser(self.dataset, self.results_for_browser, self.annotations_for_browser) + else: + browser_files = write_input_for_browser(self.dataset, self.results_for_browser, self.annotations_for_browser) with Bench("Trimming Markers for Table"): self.trimmed_markers = trim_markers_for_table(results) - chr_lengths = get_chr_lengths(self.mapping_scale, self.dataset, self.qtl_results_for_browser) + chr_lengths = get_chr_lengths(self.mapping_scale, self.dataset, self.qtl_results) #ZS: For zooming into genome browser, need to pass chromosome name instead of number if self.dataset.group.species == "mouse": @@ -498,7 +512,9 @@ def export_mapping_results(dataset, trait, markers, results_path, mapping_scale, output_file.write("\n") def trim_markers_for_figure(markers): - if 'lod_score' in markers[0].keys(): + if 'p_wald' in markers[0].keys(): + score_type = 'p_wald' + elif 'lod_score' in markers[0].keys(): score_type = 'lod_score' else: score_type = 'lrs_value' @@ -508,7 +524,22 @@ def trim_markers_for_figure(markers): med_counter = 0 high_counter = 0 for marker in markers: - if score_type == 'lod_score': + if score_type == 'p_wald': + if marker[score_type] > 0.1: + if low_counter % 20 == 0: + filtered_markers.append(marker) + low_counter += 1 + elif 0.1 >= marker[score_type] > 0.01: + if med_counter % 10 == 0: + filtered_markers.append(marker) + med_counter += 1 + elif 0.01 >= marker[score_type] > 0.001: + if high_counter % 2 == 0: + filtered_markers.append(marker) + high_counter += 1 + else: + filtered_markers.append(marker) + elif score_type == 'lod_score': if marker[score_type] < 1: if low_counter % 20 == 0: filtered_markers.append(marker) @@ -595,4 +626,14 @@ def get_chr_lengths(mapping_scale, dataset, qtl_results): if float(result['ps']) > highest_pos: highest_pos = float(result['ps']) - return chr_lengths \ No newline at end of file + return chr_lengths + +def get_genofile_samplelist(dataset): + genofile_samplelist = [] + + genofile_json = dataset.group.get_genofiles() + for genofile in genofile_json: + if genofile['location'] == dataset.group.genofile and 'sample_list' in genofile: + genofile_samplelist = genofile['sample_list'] + + return genofile_samplelist \ No newline at end of file diff --git a/wqflask/wqflask/show_trait/show_trait.py b/wqflask/wqflask/show_trait/show_trait.py index c81d68d0..4efaa968 100644 --- a/wqflask/wqflask/show_trait/show_trait.py +++ b/wqflask/wqflask/show_trait/show_trait.py @@ -161,7 +161,7 @@ class ShowTrait(object): for i, this_chr in enumerate(self.dataset.species.chromosomes.chromosomes): self.chr_list.append([self.dataset.species.chromosomes.chromosomes[this_chr].name, i]) - self.genofiles = get_genofiles(self.dataset) + self.genofiles = self.dataset.group.get_genofiles() self.has_num_cases = has_num_cases(self.this_trait) @@ -508,15 +508,6 @@ def get_nearest_marker(this_trait, this_db): else: return result[0][0] -def get_genofiles(this_dataset): - jsonfile = "%s/%s.json" % (webqtlConfig.GENODIR, this_dataset.group.name) - try: - f = open(jsonfile) - except: - return None - jsondata = json.load(f) - return jsondata['genofile'] - def get_table_widths(sample_groups, has_num_cases=False): stats_table_width = 250 if len(sample_groups) > 1: -- cgit v1.2.3