diff options
| author | zsloan | 2022-03-22 15:41:45 -0500 | 
|---|---|---|
| committer | GitHub | 2022-03-22 15:41:45 -0500 | 
| commit | f4b02281cdf0a29d82caf8c06ce96f947e5cf623 (patch) | |
| tree | a81c350a5a08b3b7cdf8375b728b48951791a14c | |
| parent | 7b2901817a1aabd947483f87b5a2a2d33618de7e (diff) | |
| parent | a75634cc5637168165e601f09dc9ad820e6e443f (diff) | |
| download | genenetwork3-f4b02281cdf0a29d82caf8c06ce96f947e5cf623.tar.gz | |
Merge pull request #29 from zsloan/feature/add_rqtl_pairscan
Feature/add rqtl pairscan
| -rw-r--r-- | gn3/api/rqtl.py | 11 | ||||
| -rw-r--r-- | gn3/computations/rqtl.py | 188 | ||||
| -rw-r--r-- | scripts/rqtl_wrapper.R | 137 | 
3 files changed, 267 insertions, 69 deletions
| diff --git a/gn3/api/rqtl.py b/gn3/api/rqtl.py index b5627c5..70ebe12 100644 --- a/gn3/api/rqtl.py +++ b/gn3/api/rqtl.py @@ -6,7 +6,8 @@ from flask import current_app from flask import jsonify from flask import request -from gn3.computations.rqtl import generate_rqtl_cmd, process_rqtl_output, process_perm_output +from gn3.computations.rqtl import generate_rqtl_cmd, process_rqtl_mapping, \ + process_rqtl_pairscan, process_perm_output from gn3.computations.gemma import do_paths_exist rqtl = Blueprint("rqtl", __name__) @@ -25,7 +26,7 @@ run the rqtl_wrapper script and return the results as JSON # Split kwargs by those with values and boolean ones that just convert to True/False kwargs = ["covarstruct", "model", "method", "nperm", "scale", "control_marker"] - boolean_kwargs = ["addcovar", "interval", "pstrata"] + boolean_kwargs = ["addcovar", "interval", "pstrata", "pairscan"] all_kwargs = kwargs + boolean_kwargs rqtl_kwargs = {"geno": genofile, "pheno": phenofile} @@ -48,9 +49,11 @@ run the rqtl_wrapper script and return the results as JSON "output", rqtl_cmd.get('output_file'))): os.system(rqtl_cmd.get('rqtl_cmd')) - rqtl_output['results'] = process_rqtl_output(rqtl_cmd.get('output_file')) + if "pairscan" in rqtl_bool_kwargs: + rqtl_output['results'] = process_rqtl_pairscan(rqtl_cmd.get('output_file'), genofile) + else: + rqtl_output['results'] = process_rqtl_mapping(rqtl_cmd.get('output_file')) - rqtl_output['results'] = process_rqtl_output(rqtl_cmd.get('output_file')) if int(rqtl_kwargs['nperm']) > 0: rqtl_output['perm_results'], rqtl_output['suggestive'], rqtl_output['significant'] = \ process_perm_output(rqtl_cmd.get('output_file')) diff --git a/gn3/computations/rqtl.py b/gn3/computations/rqtl.py index b3539a9..65ee6de 100644 --- a/gn3/computations/rqtl.py +++ b/gn3/computations/rqtl.py @@ -1,6 +1,7 @@ """Procedures related rqtl computations""" import os -from typing import Dict, List, Union +from bisect import bisect +from typing import Dict, List, Tuple, Union import numpy as np @@ -15,9 +16,7 @@ def generate_rqtl_cmd(rqtl_wrapper_cmd: str, rqtl_wrapper_bool_kwargs: list) -> Dict: """Given the base rqtl_wrapper command and dict of keyword arguments, return the full rqtl_wrapper command and an -output filename generated from a hash of the genotype and phenotype files - - """ +output filename generated from a hash of the genotype and phenotype files""" # Generate a hash from contents of the genotype and phenotype files _hash = get_hash_of_files( @@ -47,11 +46,9 @@ output filename generated from a hash of the genotype and phenotype files } -def process_rqtl_output(file_name: str) -> List: +def process_rqtl_mapping(file_name: str) -> List: """Given an output file name, read in R/qtl results and return - a List of marker objects - - """ + a List of marker objects""" marker_obs = [] # Later I should probably redo this using csv.read to avoid the # awkwardness with removing quotes with [1:-1] @@ -80,12 +77,181 @@ def process_rqtl_output(file_name: str) -> List: return marker_obs +def process_rqtl_pairscan(file_name: str, geno_file: str) -> List: + """Given an output file name, read in R/qtl pair-scan results and return +a list of both the JSON needed for the d3panels figure and a list of results +to be used when generating the results table (which will include marker names)""" + figure_data = pairscan_for_figure(file_name) + table_data = pairscan_for_table(file_name, geno_file) -def process_perm_output(file_name: str): - """Given base filename, read in R/qtl permutation output and calculate - suggestive and significant thresholds + return [figure_data, table_data] + +def pairscan_for_figure(file_name: str) -> Dict: + """Given an output file name, read in R/qtl pair-scan results and return + the JSON needed for the d3panels figure""" + figure_data = {} + + # Open the file with the actual results, written as a list of lists + with open(os.path.join(current_app.config.get("TMPDIR", "/tmp"), + "output", file_name), "r") as the_file: + lod_results = [] + for i, line in enumerate(the_file): + if i == 0: # Skip first line + continue + line_items = [item.rstrip('\n') for item in line.split(",")] + lod_results.append(line_items[1:]) # Append all but first item in line + figure_data['lod'] = lod_results + + # Open the map file with the list of markers/pseudomarkers and their positions + with open(os.path.join(current_app.config.get("TMPDIR", "/tmp"), + "output", "MAP_" + file_name), "r") as the_file: + chr_list = [] # type: List + pos_list = [] # type: List + for i, line in enumerate(the_file): + if i == 0: # Skip first line + continue + line_items = [item.rstrip('\n') for item in line.split(",")] + chr_list.append(line_items[1][1:-1]) + pos_list.append(line_items[2]) + figure_data['chr'] = chr_list + figure_data['pos'] = pos_list + + return figure_data + +def get_marker_list(map_file: str) -> List: + """ + Open the map file with the list of markers/pseudomarkers and create list of marker obs + + PARAMETERS: + map_file: The map file output by R/qtl containing marker/pseudomarker positions + """ + + marker_list = [] + with open(os.path.join(current_app.config.get("TMPDIR", "/tmp"), + "output", map_file), "r") as map_fh: + for line in map_fh.readlines()[1:]: + line_items = [item.rstrip('\n') for item in line.split(",")] + this_marker = { + 'name': line_items[0], + 'chr': line_items[1][1:-1], # Strip quotes from beginning and end of chr string + 'pos': line_items[2] + } + + marker_list.append(this_marker) + + return marker_list + +def generate_table_rows(results_file: str, marker_list: List, original_markers: Dict) -> List: + """ + Open the file with the actual R/qtl pair-scan results and write them as + they will be displayed in the results table + PARAMETERS: + results_file: The filename containing R/qtl pair-scan results + marker_list: List of marker/pseudomarker names/positions from results + original_markers: Dict of markers from the .geno file, for finding proximal/distal + markers to each pseudomarker """ + + table_data = [] + with open(os.path.join(current_app.config.get("TMPDIR", "/tmp"), + "output", results_file), "r") as the_file: + for i, line in enumerate(the_file.readlines()[1:]): + marker_1 = marker_list[i] + marker_1['proximal'], marker_1['distal'] = find_nearest_marker(marker_1['chr'], + marker_1['pos'], + original_markers) + line_items = [item.rstrip('\n') for item in line.split(",")] + for j, item in enumerate(line_items[1:]): + marker_2 = marker_list[j] + marker_2['proximal'], marker_2['distal'] = find_nearest_marker(marker_2['chr'], + marker_2['pos'], + original_markers) + try: + lod_score = f"{float(item):.3f}" + except ValueError: + lod_score = f"{item}" + + table_data.append({ + 'proximal1': marker_1['proximal'], + 'distal1': marker_1['distal'], + 'pos1': f"Chr {marker_1['chr']} @ {float(marker_1['pos']):.1f} cM", + 'lod': lod_score, + 'proximal2': marker_2['proximal'], + 'distal2': marker_2['distal'], + 'pos2': f"Chr {marker_2['chr']} @ {float(marker_2['pos']):.1f} cM" + }) + + return table_data + +def pairscan_for_table(file_name: str, geno_file: str) -> List: + """ + Read in R/qtl pair-scan results and return as List representing + table row contents + + PARAMETERS: + file_name: The filename containing R/qtl pair-scan results + geno_file: Filename of the genotype file (used to get marker positions) + """ + + # Open the map file with the list of markers/pseudomarkers and create list of marker obs + marker_list = get_marker_list("MAP_" + file_name) + + # Get the list of original markers from the .geno file + original_markers = build_marker_pos_dict(geno_file) + + # Open the file with the actual results and write the results as + # they will be displayed in the results table + table_data = generate_table_rows(file_name, marker_list, original_markers) + + return sorted(table_data, key=lambda i: float(i['lod']), reverse=True)[:500] + +def build_marker_pos_dict(genotype_file: str) -> Dict: + """Gets list of markers and their positions from .geno file + + Basically a pared-down version of parse_genotype_file for R/qtl pair-scan""" + + with open(genotype_file, "r") as infile: + contents = infile.readlines() + + # Get all lines after the metadata + lines = tuple(line for line in contents if + ((not line.strip().startswith("#")) and + (not line.strip().startswith("@")) and + (not line.strip() == ""))) + + header_items = lines[0].split("\t") + mb_exists = "Mb" in header_items + pos_column = header_items.index("Mb") if mb_exists else header_items.index("cM") + + the_markers = {"1": {}} # type: Dict[str, Dict] + for line in lines[1:]: # The lines with markers + line_items = line.split("\t") + this_chr = line_items[0] + if this_chr not in the_markers: + the_markers[this_chr] = {} + the_markers[this_chr][str(float(line_items[pos_column]))] = line_items[1] + + return the_markers + +def find_nearest_marker(the_chr: str, the_pos: str, marker_list: Dict) -> Tuple[str, str]: + """Given a chromosome and position of a pseudomarker (from R/qtl pair-scan results), + return the nearest real marker""" + + pos_list = [float(pos) for pos in marker_list[the_chr]] + + # Get the position of the pseudomarker in the list of markers for the chr + the_pos_index = bisect(pos_list, float(the_pos)) + + proximal_marker = marker_list[the_chr][str(pos_list[the_pos_index-1])] + distal_marker = marker_list[the_chr][str(pos_list[the_pos_index])] + + return proximal_marker, distal_marker + +def process_perm_output(file_name: str) -> Tuple[List, float, float]: + """Given base filename, read in R/qtl permutation output and calculate + suggestive and significant thresholds""" + perm_results = [] with open(os.path.join(current_app.config.get("TMPDIR", "/tmp"), "output", "PERM_" + file_name), "r", encoding="utf-8") as the_file: diff --git a/scripts/rqtl_wrapper.R b/scripts/rqtl_wrapper.R index 13c2684..ea2c345 100644 --- a/scripts/rqtl_wrapper.R +++ b/scripts/rqtl_wrapper.R @@ -12,6 +12,7 @@ option_list = list( make_option(c("--covarstruct"), type="character", help="File detailing which covariates are categorical or numerical"), make_option(c("--model"), type="character", default="normal", help="Mapping Model - Normal or Non-Parametric"), make_option(c("--method"), type="character", default="hk", help="Mapping Method - hk (Haley Knott), ehk (Extended Haley Knott), mr (Marker Regression), em (Expectation-Maximization), imp (Imputation)"), + make_option(c("--pairscan"), action="store_true", default=NULL, help="Run Pair Scan - the R/qtl function scantwo"), make_option(c("-i", "--interval"), action="store_true", default=NULL, help="Use interval mapping"), make_option(c("--nperm"), type="integer", default=0, help="Number of permutations"), make_option(c("--pstrata"), action="store_true", default=NULL, help="Use permutation strata (stored as final column/vector in phenotype input file)"), @@ -171,9 +172,15 @@ verbose_print('Generating cross object\n') cross_object = geno_to_csvr(geno_file, trait_names, trait_vals, cross_file, type) # Calculate genotype probabilities -if (!is.null(opt$interval)) { +if (!is.null(opt$pairscan)) { + verbose_print('Calculating genotype probabilities for pair-scan\n') + cross_object <- calc.genoprob(cross_object, step=10) +} else if (!is.null(opt$interval)) { verbose_print('Calculating genotype probabilities with interval mapping\n') cross_object <- calc.genoprob(cross_object, step=5, stepwidth="max") +} else if (!is.null(opt$pairscan)) { + verbose_print('Calculating genotype probabilities for pair-scan\n') + cross_object <- calc.genoprob(cross_object, step=10) } else { verbose_print('Calculating genotype probabilities\n') cross_object <- calc.genoprob(cross_object) @@ -188,6 +195,7 @@ if (type == "4-way") { # Pull covariates out of cross object, if they exist covars <- c() # Holds the covariates which should be passed to R/qtl if (!is.null(opt$addcovar)) { + verbose_print('Pulling covariates out of cross object\n') # If perm strata are being used, it'll be included as the final column in the phenotype file if (!is.null(opt$pstrata)) { covar_names = trait_names[2:(length(trait_names)-1)] @@ -223,16 +231,30 @@ if (!is.null(opt$addcovar)) { # Pull permutation strata out of cross object, if it is being used perm_strata = vector() if (!is.null(opt$pstrata)) { + verbose_print('Pulling permutation strata out of cross object\n') strata_col = trait_names[length(trait_names)] perm_strata <- pull.pheno(cross_object, strata_col) } # If a marker name is supplied as covariate, get its vector of values and add them as a covariate if (!is.null(opt$control)) { + verbose_print('Creating marker covariates and binding them to covariates vector\n') marker_covars = create_marker_covars(cross_object, opt$control) covars <- cbind(covars, marker_covars) } +if (!is.null(opt$pairscan)) { + verbose_print("Running scantwo") + scan_func <- function(...){ + scantwo(...) + } +} else { + verbose_print("Running scanone") + scan_func <- function(...){ + scanone(...) + } +} + # Calculate permutations if (opt$nperm > 0) { if (!is.null(opt$filename)){ @@ -243,19 +265,19 @@ if (opt$nperm > 0) { if (!is.null(opt$addcovar) || !is.null(opt$control)){ if (!is.null(opt$pstrata)) { - verbose_print('Running ', opt$nperm, ' permutations with cofactors and strata\n') - perm_results = scanone(cross_object, pheno.col=1, addcovar=covars, n.perm=opt$nperm, perm.strata=perm_strata, model=opt$model, method=opt$method) + verbose_print('Running permutations with cofactors and strata\n') + perm_results = scan_func(cross_object, pheno.col=1, addcovar=covars, n.perm=opt$nperm, perm.strata=perm_strata, model=opt$model, method=opt$method) } else { - verbose_print('Running ', opt$nperm, ' permutations with cofactors\n') - perm_results = scanone(cross_object, pheno.col=1, addcovar=covars, n.perm=opt$nperm, model=opt$model, method=opt$method) + verbose_print('Running permutations with cofactors\n') + perm_results = scan_func(cross_object, pheno.col=1, addcovar=covars, n.perm=opt$nperm, model=opt$model, method=opt$method) } } else { if (!is.null(opt$pstrata)) { - verbose_print('Running ', opt$nperm, ' permutations with strata\n') - perm_results = scanone(cross_object, pheno.col=1, n.perm=opt$nperm, perm.strata=perm_strata, model=opt$model, method=opt$method) + verbose_print('Running permutations with strata\n') + perm_results = scan_func(cross_object, pheno.col=1, n.perm=opt$nperm, perm.strata=perm_strata, model=opt$model, method=opt$method) } else { - verbose_print('Running ', opt$nperm, ' permutations\n') - perm_results = scanone(cross_object, pheno.col=1, n.perm=opt$nperm, model=opt$model, method=opt$method) + verbose_print('Running permutations\n') + perm_results = scan_func(cross_object, pheno.col=1, n.perm=opt$nperm, model=opt$model, method=opt$method) } } write.csv(perm_results, perm_out_file) @@ -268,57 +290,64 @@ if (!is.null(opt$filename)){ } if (!is.null(opt$addcovar) || !is.null(opt$control)){ - verbose_print('Running scanone with cofactors\n') - qtl_results = scanone(cross_object, pheno.col=1, addcovar=covars, model=opt$model, method=opt$method) + verbose_print('Running scan with cofactors\n') + qtl_results = scan_func(cross_object, pheno.col=1, addcovar=covars, model=opt$model, method=opt$method) } else { - verbose_print('Running scanone\n') - qtl_results = scanone(cross_object, pheno.col=1, model=opt$model, method=opt$method) + verbose_print('Running scan\n') + qtl_results = scan_func(cross_object, pheno.col=1, model=opt$model, method=opt$method) } -#QTL main effects on adjusted longevity -getEffects <- function(sdata, gtsprob, marker = "1_24042124", model = "longevity ~ sex + site + cohort + treatment", trait = "longevity"){ - rownames(sdata) <- 1:nrow(sdata) - rownames(gtsprob) <- 1:nrow(gtsprob) - mp <- gtsprob[, grep(marker, colnames(gtsprob))] - gts <- unlist(lapply(lapply(lapply(apply(mp,1,function(x){which(x > 0.85)}),names), strsplit, ":"), function(x){ - if(length(x) > 0){ return(x[[1]][2]); }else{ return(NA) } - })) - - ismissing <- which(apply(sdata, 1, function(x){any(is.na(x))})) - if(length(ismissing) > 0){ - sdata <- sdata[-ismissing, ] - gts <- gts[-ismissing] +verbose_print('Writing results to CSV file\n') +if (!is.null(opt$pairscan)) { + map_out_file = file.path(opt$outdir, paste("MAP_", opt$filename, sep = "" )) + write.csv(qtl_results[1], out_file) + write.csv(qtl_results[2], map_out_file) +} else { + # QTL main effects on adjusted longevity + getEffects <- function(sdata, gtsprob, marker = "1_24042124", model = "longevity ~ sex + site + cohort + treatment", trait = "longevity"){ + rownames(sdata) <- 1:nrow(sdata) + rownames(gtsprob) <- 1:nrow(gtsprob) + mp <- gtsprob[, grep(marker, colnames(gtsprob))] + gts <- unlist(lapply(lapply(lapply(apply(mp,1,function(x){which(x > 0.85)}),names), strsplit, ":"), function(x){ + if(length(x) > 0){ return(x[[1]][2]); }else{ return(NA) } + })) + + ismissing <- which(apply(sdata, 1, function(x){any(is.na(x))})) + if(length(ismissing) > 0){ + sdata <- sdata[-ismissing, ] + gts <- gts[-ismissing] + } + + mlm <- lm(as.formula(model), data = sdata) + pheAdj <- rep(NA, nrow(sdata)) + adj <- residuals(mlm) + mean(sdata[, trait]) + pheAdj[as.numeric(names(adj))] <- adj + means <- c(mean(pheAdj[which(gts == "AC")],na.rm=TRUE),mean(pheAdj[which(gts == "AD")],na.rm=TRUE),mean(pheAdj[which(gts == "BC")],na.rm=TRUE),mean(pheAdj[which(gts == "BD")],na.rm=TRUE)) + std <- function(x) sd(x,na.rm=TRUE)/sqrt(length(x)) + stderrs <- c(std(pheAdj[which(gts == "AC")]),std(pheAdj[which(gts == "AD")]),std(pheAdj[which(gts == "BC")]),std(pheAdj[which(gts == "BD")])) + paste0(round(means,0), " ± ", round(stderrs,2)) } - mlm <- lm(as.formula(model), data = sdata) - pheAdj <- rep(NA, nrow(sdata)) - adj <- residuals(mlm) + mean(sdata[, trait]) - pheAdj[as.numeric(names(adj))] <- adj - means <- c(mean(pheAdj[which(gts == "AC")],na.rm=TRUE),mean(pheAdj[which(gts == "AD")],na.rm=TRUE),mean(pheAdj[which(gts == "BC")],na.rm=TRUE),mean(pheAdj[which(gts == "BD")],na.rm=TRUE)) - std <- function(x) sd(x,na.rm=TRUE)/sqrt(length(x)) - stderrs <- c(std(pheAdj[which(gts == "AC")]),std(pheAdj[which(gts == "AD")]),std(pheAdj[which(gts == "BC")]),std(pheAdj[which(gts == "BD")])) - paste0(round(means,0), " ± ", round(stderrs,2)) -} + if (type == "4-way") { + verbose_print("Get phenotype name + genoprob + all phenotypes + models for 4-way crosses") + traitname <- colnames(pull.pheno(cross_object))[1] + gtsp <- pull.genoprob(cross_object) + allpheno <- pull.pheno(cross_object) + if (!is.null(opt$addcovar)) { + model <- paste0(traitname, " ~ ", paste0(covar_names, sep="", collapse=" + ")) + } else { + model <- paste0(traitname, " ~ 1 ") + } -if (type == "4-way") { - verbose_print("Get phenotype name + genoprob + all phenotypes + models for 4-way crosses") - traitname <- colnames(pull.pheno(cross_object))[1] - gtsp <- pull.genoprob(cross_object) - allpheno <- pull.pheno(cross_object) - if (!is.null(opt$addcovar)) { - model <- paste0(traitname, " ~ ", paste0(covar_names, sep="", collapse=" + ")) - } else { - model <- paste0(traitname, " ~ 1 ") + meffects <- c() + verbose_print("Getting QTL main effects for 4-way crosses") + for(marker in rownames(qtl_results)){ + meff <- getEffects(allpheno, gtsp, marker = marker, model, trait = traitname) + meffects <- rbind(meffects, meff) + } + qtl_results <- cbind(data.frame(qtl_results[,1:3]), meffects) + colnames(qtl_results)[4:7] <- c("AC", "AD", "BC", "BD") } - meffects <- c() - verbose_print("Getting QTL main effects for 4-way crosses") - for(marker in rownames(qtl_results)){ - meff <- getEffects(allpheno, gtsp, marker = marker, model, trait = traitname) - meffects <- rbind(meffects, meff) - } - qtl_results <- cbind(data.frame(qtl_results[,1:3]), meffects) - colnames(qtl_results)[4:7] <- c("AC", "AD", "BC", "BD") + write.csv(qtl_results, out_file) } - -write.csv(qtl_results, out_file) | 
