From 0bfe2990ce31fbd808b680a883619575d864aef5 Mon Sep 17 00:00:00 2001 From: zsloan Date: Tue, 25 May 2021 20:28:02 +0000 Subject: Add rqtl_wrapper.R script and necessary imports to guix.scm --- scripts/rqtl_wrapper.R | 204 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 204 insertions(+) create mode 100644 scripts/rqtl_wrapper.R (limited to 'scripts') diff --git a/scripts/rqtl_wrapper.R b/scripts/rqtl_wrapper.R new file mode 100644 index 0000000..ae970d4 --- /dev/null +++ b/scripts/rqtl_wrapper.R @@ -0,0 +1,204 @@ +library(optparse) +library(qtl) +library(stringi) +library(stringr) + +tmp_dir = Sys.getenv("TMPDIR") + +option_list = list( + make_option(c("-g", "--geno"), type="character", help=".geno file containing a dataset's genotypes"), + make_option(c("-p", "--pheno"), type="character", help="File containing two columns - sample names and values"), + make_option(c("-c", "--addcovar"), action="store_true", default=NULL, help="Use covariates (included as extra columns in the phenotype input file)"), + make_option(c("--model"), type="character", default="normal", help="Mapping Model - Normal or Non-Parametric"), + make_option(c("--method"), type="character", default="hk", help="Mapping Method - hk (Haley Knott), ehk (Extended Haley Knott), mr (Marker Regression), em (Expectation-Maximization), imp (Imputation)"), + make_option(c("-i", "--interval"), action="store_true", default=NULL, help="Use interval mapping"), + make_option(c("--nperm"), type="integer", default=0, help="Number of permutations"), + make_option(c("-s", "--scale"), type="character", default="mb", help="Mapping scale - Megabases (Mb) or Centimorgans (cM)"), + make_option(c("--control"), type="character", default=NULL, help="Name of marker (contained in genotype file) to be used as a control"), + make_option(c("-o", "--outdir"), type="character", default=file.path(tmp_dir, "output"), help="Directory in which to write result file"), + make_option(c("-f", "--filename"), type="character", default=NULL, help="Name to use for result file"), + make_option(c("-v", "--verbose"), action="store_true", default=NULL, help="Show extra information") +); + +opt_parser = OptionParser(option_list=option_list); +opt = parse_args(opt_parser); + +verbose_print <- function(...){ + if (!is.null(opt$verbose)) { + for(item in list(...)){ + cat(item) + } + cat("\n") + } +} + +if (is.null(opt$geno) || is.null(opt$pheno)){ + print_help(opt_parser) + stop("Both a genotype and phenotype file must be provided.", call.=FALSE) +} + +geno_file = opt$geno +pheno_file = opt$pheno + +# Generate randomized filename for cross object +cross_file = file.path(tmp_dir, "cross", paste(stri_rand_strings(1, 8), ".cross", sep = "")) + +trim <- function( x ) { gsub("(^[[:space:]]+|[[:space:]]+$)", "", x) } + +get_geno_code <- function(header, name = 'unk'){ + mat = which(unlist(lapply(header,function(x){ length(grep(paste('@',name,sep=''), x)) })) == 1) + return(trim(strsplit(header[mat],':')[[1]][2])) +} + +geno_to_csvr <- function(genotypes, trait_names, trait_vals, out, sex = NULL, + mapping_scale = "Mb", verbose = FALSE){ + # Assume a geno header is not longer than 40 lines + header = readLines(genotypes, 40) + + # Major hack to skip the geno headers + toskip = which(unlist(lapply(header, function(x){ length(grep("Chr\t", x)) })) == 1) - 1 + + type <- get_geno_code(header, 'type') + + # Get the genotype codes + if(type == '4-way'){ + genocodes <- NULL + } else { + genocodes <- c(get_geno_code(header, 'mat'), get_geno_code(header, 'het'), + get_geno_code(header, 'pat')) + } + genodata <- read.csv(genotypes, sep='\t', skip=toskip, header=TRUE, + na.strings=get_geno_code(header,'unk'), + colClasses='character', comment.char = '#') + + verbose_print('Genodata:', toskip, " ", dim(genodata), genocodes, '\n') + + # If there isn't a sex phenotype, treat all as males + if(is.null(sex)) sex <- rep('m', (ncol(genodata)-4)) + + cross_items = list() + + # Add trait and covar phenotypes + for (i in 1:length(trait_names)){ + cross_items[[i]] <- c(trait_names[i], '', '', unlist(trait_vals[[i]])) + } + + # Sex phenotype for the mice + cross_items[[length(trait_names) + 1]] <- c('sex', '', '', sex) + # Genotypes + cross_items[[length(trait_names) + 2]] <- cbind(genodata[,c('Locus','Chr', mapping_scale)], + genodata[, 5:ncol(genodata)]) + + out_csvr <- do.call(rbind, cross_items) + + # Save it to a file + write.table(out_csvr, file=out, row.names=FALSE, col.names=FALSE, quote=FALSE, sep=',') + + # Load the created cross file using R/qtl read.cross + if (type == '4-way') { + verbose_print('Loading in as 4-WAY\n') + cross = read.cross(file=out, 'csvr', genotypes=NULL, crosstype="4way") + } else if(type == 'f2') { + verbose_print('Loading in as F2\n') + cross = read.cross(file=out, 'csvr', genotypes=genocodes, crosstype="f2") + } else { + verbose_print('Loading in as normal\n') + cross = read.cross(file=out, 'csvr', genotypes=genocodes) + } + if (type == 'riset') { + # If its a RIL, convert to a RIL in R/qtl + verbose_print('Converting to RISELF\n') + cross <- convert2riself(cross) + } + + return(cross) +} + +create_marker_covars <- function(the_cross, control_marker){ + #' Given a string of one or more marker names (comma separated), fetch + #' the markers' values from the genotypes and return them as vectors/a vector + #' of values + + # In case spaces are added in the string of marker names + covariate_names <- strsplit(str_replace(control_marker, " ", ""), ",") + + genotypes <- pull.geno(the_cross) + covariates_in_geno <- which(covariate_names %in% colnames(genotypes)) + covariate_names <- covariate_names[covariates_in_geno] + marker_covars <- genotypes[, unlist(covariate_names)] + + return(marker_covars) +} + +# Get phenotype vector from input file +df <- read.table(pheno_file, na.strings = "x", header=TRUE, check.names=FALSE) +sample_names <- df$Sample +trait_names <- colnames(df)[2:length(colnames(df))] + +trait_vals <- vector(mode = "list", length = length(trait_names)) +for (i in 1:length(trait_names)) { + this_trait <- trait_names[i] + this_vals <- df[this_trait] + trait_vals[[i]] <- this_vals +} + +# Since there will always only be one non-covar phenotype, its name will be in the first column +pheno_name = unlist(trait_names)[1] + +verbose_print('Generating cross object\n') +cross_object = geno_to_csvr(geno_file, trait_names, trait_vals, cross_file) + +# Calculate genotype probabilities +if (!is.null(opt$interval)) { + verbose_print('Calculating genotype probabilities with interval mapping\n') + cross_object <- calc.genoprob(cross_object, step=5, stepwidth="max") +} else { + verbose_print('Calculating genotype probabilities\n') + cross_object <- calc.genoprob(cross_object) +} + +# Pull covariates out of cross object, if they exist +covars = vector() +if (!is.null(opt$addcovar)) { + covar_names = trait_names[2:length(trait_names)] + covars <- pull.pheno(cross_object, covar_names) +} + +# If a marker name is supplied as covariate, get its vector of values and add them as a covariate +if (!is.null(opt$control)) { + marker_covars = create_marker_covars(cross_object, opt$control) + covars <- cbind(covars, marker_covars) +} + +# Calculate permutations +if (opt$nperm > 0) { + if (!is.null(opt$filename)){ + perm_out_file = file.path(opt$outdir, paste("PERM_", opt$filename, sep = "")) + } else { + perm_out_file = file.path(opt$outdir, paste(pheno_name, "_PERM_", stri_rand_strings(1, 8), ".csv", sep = "")) + } + + if (!is.null(opt$addcovar) || !is.null(opt$control)){ + verbose_print('Running permutations with cofactors\n') + perm_results = scanone(cross_object, pheno.col=1, addcovar=covars, n.perm=opt$nperm, model=opt$model, method=opt$method) + } else { + verbose_print('Running permutations\n') + perm_results = scanone(cross_object, pheno.col=1, n.perm=opt$nperm, model=opt$model, method=opt$method) + } + write.csv(perm_results, perm_out_file) +} + +if (!is.null(opt$filename)){ + out_file = file.path(opt$outdir, opt$filename) +} else { + out_file = file.path(opt$outdir, paste(pheno_name, "_", stri_rand_strings(1, 8), ".csv", sep = "")) +} + +if (!is.null(opt$addcovar) || !is.null(opt$control)){ + verbose_print('Running scanone with cofactors\n') + qtl_results = scanone(cross_object, pheno.col=1, addcovar=covars, model=opt$model, method=opt$method) +} else { + verbose_print('Running scanone\n') + qtl_results = scanone(cross_object, pheno.col=1, model=opt$model, method=opt$method) +} +write.csv(qtl_results, out_file) \ No newline at end of file -- cgit v1.2.3