diff options
-rw-r--r-- | gn3/api/rqtl.py | 58 | ||||
-rw-r--r-- | gn3/app.py | 2 | ||||
-rw-r--r-- | gn3/commands.py | 14 | ||||
-rw-r--r-- | gn3/computations/rqtl.py | 103 | ||||
-rw-r--r-- | gn3/settings.py | 1 | ||||
-rw-r--r-- | guix.scm | 2 | ||||
-rw-r--r-- | mypy.ini | 3 | ||||
-rw-r--r-- | scripts/rqtl_wrapper.R | 229 | ||||
-rw-r--r-- | tests/unit/computations/test_rqtl.py | 42 | ||||
-rw-r--r-- | tests/unit/test_commands.py | 25 |
10 files changed, 479 insertions, 0 deletions
diff --git a/gn3/api/rqtl.py b/gn3/api/rqtl.py new file mode 100644 index 0000000..ebb746c --- /dev/null +++ b/gn3/api/rqtl.py @@ -0,0 +1,58 @@ +"""Endpoints for running the rqtl cmd""" +import os + +from flask import Blueprint +from flask import current_app +from flask import jsonify +from flask import request + +from gn3.computations.rqtl import generate_rqtl_cmd, process_rqtl_output, process_perm_output +from gn3.computations.gemma import do_paths_exist + +rqtl = Blueprint("rqtl", __name__) + +@rqtl.route("/compute", methods=["POST"]) +def compute(): + """Given at least a geno_file and pheno_file, generate and +run the rqtl_wrapper script and return the results as JSON + + """ + genofile = request.form['geno_file'] + phenofile = request.form['pheno_file'] + + if not do_paths_exist([genofile, phenofile]): + raise FileNotFoundError + + # Split kwargs by those with values and boolean ones that just convert to True/False + kwargs = ["model", "method", "nperm", "scale", "control_marker"] + boolean_kwargs = ["addcovar", "interval", "pstrata"] + all_kwargs = kwargs + boolean_kwargs + + rqtl_kwargs = {"geno": genofile, "pheno": phenofile} + rqtl_bool_kwargs = [] + for kwarg in all_kwargs: + if kwarg in request.form: + if kwarg in kwargs: + rqtl_kwargs[kwarg] = request.form[kwarg] + if kwarg in boolean_kwargs: + rqtl_bool_kwargs.append(kwarg) + + rqtl_cmd = generate_rqtl_cmd( + rqtl_wrapper_cmd=current_app.config.get("RQTL_WRAPPER_CMD"), + rqtl_wrapper_kwargs=rqtl_kwargs, + rqtl_wrapper_bool_kwargs=rqtl_bool_kwargs + ) + + rqtl_output = {} + if not os.path.isfile(os.path.join(current_app.config.get("TMPDIR"), + "output", rqtl_cmd.get('output_file'))): + os.system(rqtl_cmd.get('rqtl_cmd')) + + rqtl_output['results'] = process_rqtl_output(rqtl_cmd.get('output_file')) + + rqtl_output['results'] = process_rqtl_output(rqtl_cmd.get('output_file')) + if int(rqtl_kwargs['nperm']) > 0: + rqtl_output['perm_results'], rqtl_output['suggestive'], rqtl_output['significant'] = \ + process_perm_output(rqtl_cmd.get('output_file')) + + return jsonify(rqtl_output) @@ -5,6 +5,7 @@ from typing import Dict from typing import Union from flask import Flask from gn3.api.gemma import gemma +from gn3.api.rqtl import rqtl from gn3.api.general import general from gn3.api.correlation import correlation from gn3.api.data_entry import data_entry @@ -28,6 +29,7 @@ def create_app(config: Union[Dict, str, None] = None) -> Flask: app.config.from_pyfile(config) app.register_blueprint(general, url_prefix="/api/") app.register_blueprint(gemma, url_prefix="/api/gemma") + app.register_blueprint(rqtl, url_prefix="/api/rqtl") app.register_blueprint(correlation, url_prefix="/api/correlation") app.register_blueprint(data_entry, url_prefix="/api/dataentry") return app diff --git a/gn3/commands.py b/gn3/commands.py index 4b0d62d..14bd295 100644 --- a/gn3/commands.py +++ b/gn3/commands.py @@ -30,6 +30,20 @@ def compose_gemma_cmd(gemma_wrapper_cmd: str = "gemma-wrapper", cmd += " ".join([f"{arg}" for arg in gemma_args]) return cmd +def compose_rqtl_cmd(rqtl_wrapper_cmd: str, + rqtl_wrapper_kwargs: Dict, + rqtl_wrapper_bool_kwargs: list) -> str: + """Compose a valid R/qtl command given the correct input""" + # Add kwargs with values + cmd = f"Rscript { rqtl_wrapper_cmd } " + " ".join( + [f"--{key} {val}" for key, val in rqtl_wrapper_kwargs.items()]) + + # Add boolean kwargs (kwargs that are either on or off, like --interval) + if rqtl_wrapper_bool_kwargs: + cmd += " " + cmd += " ".join([f"--{val}" for val in rqtl_wrapper_bool_kwargs]) + + return cmd def queue_cmd(conn: Redis, job_queue: str, diff --git a/gn3/computations/rqtl.py b/gn3/computations/rqtl.py new file mode 100644 index 0000000..0433b3f --- /dev/null +++ b/gn3/computations/rqtl.py @@ -0,0 +1,103 @@ +"""Procedures related rqtl computations""" +import os +from typing import Dict, List, Union + +import numpy as np + +from flask import current_app + +from gn3.commands import compose_rqtl_cmd +from gn3.computations.gemma import generate_hash_of_string +from gn3.fs_helpers import get_hash_of_files + +def generate_rqtl_cmd(rqtl_wrapper_cmd: str, + rqtl_wrapper_kwargs: Dict, + rqtl_wrapper_bool_kwargs: list) -> Dict: + """Given the base rqtl_wrapper command and +dict of keyword arguments, return the full rqtl_wrapper command and an +output filename generated from a hash of the genotype and phenotype files + + """ + + # Generate a hash from contents of the genotype and phenotype files + _hash = get_hash_of_files( + [v for k, v in rqtl_wrapper_kwargs.items() if k in ["g", "p"]]) + + # Append to hash a hash of keyword arguments + _hash += generate_hash_of_string( + ",".join([f"{k}:{v}" for k, v in rqtl_wrapper_kwargs.items() if k not in ["g", "p"]])) + + # Append to hash a hash of boolean keyword arguments + _hash += generate_hash_of_string( + ",".join(rqtl_wrapper_bool_kwargs)) + + # Temporarily substitute forward-slashes in hash with underscores + _hash = _hash.replace("/", "_") + + _output_filename = f"{_hash}-output.csv" + rqtl_wrapper_kwargs["filename"] = _output_filename + + return { + "output_file": + _output_filename, + "rqtl_cmd": + compose_rqtl_cmd(rqtl_wrapper_cmd=rqtl_wrapper_cmd, + rqtl_wrapper_kwargs=rqtl_wrapper_kwargs, + rqtl_wrapper_bool_kwargs=rqtl_wrapper_bool_kwargs) + } + + +def process_rqtl_output(file_name: str) -> List: + """Given an output file name, read in R/qtl results and return + a List of marker objects + + """ + marker_obs = [] + # Later I should probably redo this using csv.read to avoid the + # awkwardness with removing quotes with [1:-1] + with open(os.path.join(current_app.config.get("TMPDIR", "/tmp"), + "output", file_name), "r") as the_file: + for line in the_file: + line_items = line.split(",") + if line_items[1][1:-1] == "chr" or not line_items: + # Skip header line + continue + + # Convert chr to int if possible + the_chr: Union[int, str] + try: + the_chr = int(line_items[1][1:-1]) + except ValueError: + the_chr = line_items[1][1:-1] + this_marker = { + "name": line_items[0][1:-1], + "chr": the_chr, + "cM": float(line_items[2]), + "Mb": float(line_items[2]), + "lod_score": float(line_items[3]) + } + marker_obs.append(this_marker) + + return marker_obs + + +def process_perm_output(file_name: str): + """Given base filename, read in R/qtl permutation output and calculate + suggestive and significant thresholds + + """ + perm_results = [] + with open(os.path.join(current_app.config.get("TMPDIR", "/tmp"), + "output", "PERM_" + file_name), "r") as the_file: + for i, line in enumerate(the_file): + if i == 0: + # Skip header line + continue + + line_items = line.split(",") + perm_results.append(float(line_items[1])) + + suggestive = np.percentile(np.array(perm_results), 67) + significant = np.percentile(np.array(perm_results), 95) + + return perm_results, suggestive, significant diff --git a/gn3/settings.py b/gn3/settings.py index 2057ce1..ecfd502 100644 --- a/gn3/settings.py +++ b/gn3/settings.py @@ -6,6 +6,7 @@ import os BCRYPT_SALT = "$2b$12$mxLvu9XRLlIaaSeDxt8Sle" # Change this! DATA_DIR = "" GEMMA_WRAPPER_CMD = os.environ.get("GEMMA_WRAPPER", "gemma-wrapper") +RQTL_WRAPPER_CMD = os.environ.get("RQTL_WRAPPER") CACHEDIR = "" REDIS_URI = "redis://localhost:6379/0" REDIS_JOB_QUEUE = "GN3::job-queue" @@ -87,6 +87,8 @@ ; ("r-ctl" ,r-ctl) ("r-qtl" ,r-qtl) ("r-optparse" ,r-optparse) + ("r-stringi" ,r-stringi) + ("r-stringr" ,r-stringr) ; ("r-wgcna" ,r-wgcna) )) (build-system python-build-system) @@ -3,6 +3,9 @@ [mypy-scipy.*] ignore_missing_imports = True +[mypy-numpy.*] +ignore_missing_imports = True + [mypy-MySQLdb.*] ignore_missing_imports = True diff --git a/scripts/rqtl_wrapper.R b/scripts/rqtl_wrapper.R new file mode 100644 index 0000000..f7e0406 --- /dev/null +++ b/scripts/rqtl_wrapper.R @@ -0,0 +1,229 @@ +library(optparse) +library(qtl) +library(stringi) +library(stringr) + +tmp_dir = Sys.getenv("TMPDIR") + +option_list = list( + make_option(c("-g", "--geno"), type="character", help=".geno file containing a dataset's genotypes"), + make_option(c("-p", "--pheno"), type="character", help="File containing two columns - sample names and values"), + make_option(c("-c", "--addcovar"), action="store_true", default=NULL, help="Use covariates (included as extra columns in the phenotype input file)"), + make_option(c("--model"), type="character", default="normal", help="Mapping Model - Normal or Non-Parametric"), + make_option(c("--method"), type="character", default="hk", help="Mapping Method - hk (Haley Knott), ehk (Extended Haley Knott), mr (Marker Regression), em (Expectation-Maximization), imp (Imputation)"), + make_option(c("-i", "--interval"), action="store_true", default=NULL, help="Use interval mapping"), + make_option(c("--nperm"), type="integer", default=0, help="Number of permutations"), + make_option(c("--pstrata"), action="store_true", default=NULL, help="Use permutation strata (stored as final column/vector in phenotype input file)"), + make_option(c("-s", "--scale"), type="character", default="mb", help="Mapping scale - Megabases (Mb) or Centimorgans (cM)"), + make_option(c("--control"), type="character", default=NULL, help="Name of marker (contained in genotype file) to be used as a control"), + make_option(c("-o", "--outdir"), type="character", default=file.path(tmp_dir, "output"), help="Directory in which to write result file"), + make_option(c("-f", "--filename"), type="character", default=NULL, help="Name to use for result file"), + make_option(c("-v", "--verbose"), action="store_true", default=NULL, help="Show extra information") +); + +opt_parser = OptionParser(option_list=option_list); +opt = parse_args(opt_parser); + +verbose_print <- function(...){ + if (!is.null(opt$verbose)) { + for(item in list(...)){ + cat(item) + } + cat("\n") + } +} + +if (is.null(opt$geno) || is.null(opt$pheno)){ + print_help(opt_parser) + stop("Both a genotype and phenotype file must be provided.", call.=FALSE) +} + +geno_file = opt$geno +pheno_file = opt$pheno + +# Generate randomized filename for cross object +cross_file = file.path(tmp_dir, "cross", paste(stri_rand_strings(1, 8), ".cross", sep = "")) + +trim <- function( x ) { gsub("(^[[:space:]]+|[[:space:]]+$)", "", x) } + +get_geno_code <- function(header, name = 'unk'){ + mat = which(unlist(lapply(header,function(x){ length(grep(paste('@',name,sep=''), x)) })) == 1) + return(trim(strsplit(header[mat],':')[[1]][2])) +} + +geno_to_csvr <- function(genotypes, trait_names, trait_vals, out, sex = NULL, + mapping_scale = "Mb", verbose = FALSE){ + # Assume a geno header is not longer than 40 lines + header = readLines(genotypes, 40) + + # Major hack to skip the geno headers + toskip = which(unlist(lapply(header, function(x){ length(grep("Chr\t", x)) })) == 1) - 1 + + type <- get_geno_code(header, 'type') + + # Get the genotype codes + if(type == '4-way'){ + genocodes <- NULL + } else { + genocodes <- c(get_geno_code(header, 'mat'), get_geno_code(header, 'het'), + get_geno_code(header, 'pat')) + } + genodata <- read.csv(genotypes, sep='\t', skip=toskip, header=TRUE, + na.strings=get_geno_code(header,'unk'), + colClasses='character', comment.char = '#') + + verbose_print('Genodata:', toskip, " ", dim(genodata), genocodes, '\n') + + # If there isn't a sex phenotype, treat all as males + if(is.null(sex)) sex <- rep('m', (ncol(genodata)-4)) + + cross_items = list() + + # Add trait and covar phenotypes + for (i in 1:length(trait_names)){ + cross_items[[i]] <- c(trait_names[i], '', '', unlist(trait_vals[[i]])) + } + + # Sex phenotype for the mice + cross_items[[length(trait_names) + 1]] <- c('sex', '', '', sex) + # Genotypes + cross_items[[length(trait_names) + 2]] <- cbind(genodata[,c('Locus','Chr', mapping_scale)], + genodata[, 5:ncol(genodata)]) + + out_csvr <- do.call(rbind, cross_items) + + # Save it to a file + write.table(out_csvr, file=out, row.names=FALSE, col.names=FALSE, quote=FALSE, sep=',') + + # Load the created cross file using R/qtl read.cross + if (type == '4-way') { + verbose_print('Loading in as 4-WAY\n') + cross = read.cross(file=out, 'csvr', genotypes=NULL, crosstype="4way") + } else if(type == 'f2') { + verbose_print('Loading in as F2\n') + cross = read.cross(file=out, 'csvr', genotypes=genocodes, crosstype="f2") + } else { + verbose_print('Loading in as normal\n') + cross = read.cross(file=out, 'csvr', genotypes=genocodes) + } + if (type == 'riset') { + # If its a RIL, convert to a RIL in R/qtl + verbose_print('Converting to RISELF\n') + cross <- convert2riself(cross) + } + + return(cross) +} + +create_marker_covars <- function(the_cross, control_marker){ + #' Given a string of one or more marker names (comma separated), fetch + #' the markers' values from the genotypes and return them as vectors/a vector + #' of values + + # In case spaces are added in the string of marker names + covariate_names <- strsplit(str_replace(control_marker, " ", ""), ",") + + genotypes <- pull.geno(the_cross) + covariates_in_geno <- which(covariate_names %in% colnames(genotypes)) + covariate_names <- covariate_names[covariates_in_geno] + marker_covars <- genotypes[, unlist(covariate_names)] + + return(marker_covars) +} + +# Get phenotype vector from input file +df <- read.table(pheno_file, na.strings = "x", header=TRUE, check.names=FALSE) +sample_names <- df$Sample +trait_names <- colnames(df)[2:length(colnames(df))] + +# Since there will always only be one non-covar phenotype, its name will be in the first column +pheno_name = unlist(trait_names)[1] + +trait_vals <- vector(mode = "list", length = length(trait_names)) +for (i in 1:length(trait_names)) { + this_trait <- trait_names[i] + this_vals <- df[this_trait] + trait_vals[[i]] <- this_vals + + trait_names[i] = paste("T_", this_trait, sep = "") +} + +verbose_print('Generating cross object\n') +cross_object = geno_to_csvr(geno_file, trait_names, trait_vals, cross_file) + +# Calculate genotype probabilities +if (!is.null(opt$interval)) { + verbose_print('Calculating genotype probabilities with interval mapping\n') + cross_object <- calc.genoprob(cross_object, step=5, stepwidth="max") +} else { + verbose_print('Calculating genotype probabilities\n') + cross_object <- calc.genoprob(cross_object) +} + +# Pull covariates out of cross object, if they exist +covars = vector(mode = "list", length = length(trait_names) - 1) +if (!is.null(opt$addcovar)) { + #If perm strata are being used, it'll be included as the final column in the phenotype file + if (!is.null(opt$pstrata)) { + covar_names = trait_names[3:length(trait_names) - 1] + } else { + covar_names = trait_names[3:length(trait_names)] + } + covars <- pull.pheno(cross_object, covar_names) +} + +# Pull permutation strata out of cross object, if it is being used +perm_strata = vector() +if (!is.null(opt$pstrata)) { + strata_col = trait_names[length(trait_names)] + perm_strata <- pull.pheno(cross_object, strata_col) +} + +# If a marker name is supplied as covariate, get its vector of values and add them as a covariate +if (!is.null(opt$control)) { + marker_covars = create_marker_covars(cross_object, opt$control) + covars <- cbind(covars, marker_covars) +} + +# Calculate permutations +if (opt$nperm > 0) { + if (!is.null(opt$filename)){ + perm_out_file = file.path(opt$outdir, paste("PERM_", opt$filename, sep = "" )) + } else { + perm_out_file = file.path(opt$outdir, paste(pheno_name, "_PERM_", stri_rand_strings(1, 8), sep = "")) + } + + if (!is.null(opt$addcovar) || !is.null(opt$control)){ + if (!is.null(opt$pstrata)) { + verbose_print('Running permutations with cofactors and strata\n') + perm_results = scanone(cross_object, pheno.col=1, addcovar=covars, n.perm=opt$nperm, perm.strata=perm_strata, model=opt$model, method=opt$method) + } else { + verbose_print('Running permutations with cofactors\n') + perm_results = scanone(cross_object, pheno.col=1, addcovar=covars, n.perm=opt$nperm, model=opt$model, method=opt$method) + } + } else { + if (!is.null(opt$pstrata)) { + verbose_print('Running permutations with strata\n') + perm_results = scanone(cross_object, pheno.col=1, n.perm=opt$nperm, perm.strata=perm_strata, model=opt$model, method=opt$method) + } else { + verbose_print('Running permutations\n') + perm_results = scanone(cross_object, pheno.col=1, n.perm=opt$nperm, model=opt$model, method=opt$method) + } + } + write.csv(perm_results, perm_out_file) +} + +if (!is.null(opt$filename)){ + out_file = file.path(opt$outdir, opt$filename) +} else { + out_file = file.path(opt$outdir, paste(pheno_name, "_", stri_rand_strings(1, 8), sep = "")) +} + +if (!is.null(opt$addcovar) || !is.null(opt$control)){ + verbose_print('Running scanone with cofactors\n') + qtl_results = scanone(cross_object, pheno.col=1, addcovar=covars, model=opt$model, method=opt$method) +} else { + verbose_print('Running scanone\n') + qtl_results = scanone(cross_object, pheno.col=1, model=opt$model, method=opt$method) +} +write.csv(qtl_results, out_file) diff --git a/tests/unit/computations/test_rqtl.py b/tests/unit/computations/test_rqtl.py new file mode 100644 index 0000000..955d0ab --- /dev/null +++ b/tests/unit/computations/test_rqtl.py @@ -0,0 +1,42 @@ +"""Test cases for procedures defined in computations.rqtl""" +import unittest + +from unittest import mock +from gn3.computations.rqtl import generate_rqtl_cmd + +class TestRqtl(unittest.TestCase): + """Test cases for computations.rqtl module""" + @mock.patch("gn3.computations.rqtl.generate_hash_of_string") + @mock.patch("gn3.computations.rqtl.get_hash_of_files") + def test_generate_rqtl_command(self, mock_get_hash_files, mock_generate_hash_string): + """Test computing mapping results with R/qtl""" + mock_get_hash_files.return_value = "my-hash1" + mock_generate_hash_string.return_value = "my-hash2" + + self.assertEqual( + generate_rqtl_cmd(rqtl_wrapper_cmd="rqtl-wrapper", + rqtl_wrapper_kwargs={ + "g": "genofile", + "p": "phenofile", + "model": "normal", + "method": "hk", + "nperm": 1000, + "scale": "Mb", + "control": "rs123456" + }, + rqtl_wrapper_bool_kwargs=[ + "addcovar", + "interval" + ]), { + "output_file": + "my-hash1my-hash2my-hash2-output.csv", + "rqtl_cmd": ( + "Rscript rqtl-wrapper " + "--g genofile --p phenofile " + "--model normal --method hk " + "--nperm 1000 --scale Mb " + "--control rs123456 " + "--filename my-hash1my-hash2my-hash2-output.csv " + "--addcovar --interval" + ) + }) diff --git a/tests/unit/test_commands.py b/tests/unit/test_commands.py index aafb3a2..f36ba55 100644 --- a/tests/unit/test_commands.py +++ b/tests/unit/test_commands.py @@ -6,6 +6,7 @@ from datetime import datetime from typing import Callable from unittest import mock from gn3.commands import compose_gemma_cmd +from gn3.commands import compose_rqtl_cmd from gn3.commands import queue_cmd from gn3.commands import run_cmd from gn3.exceptions import RedisConnectionError @@ -53,6 +54,30 @@ class TestCommands(unittest.TestCase): "-p /tmp/gf13Ad0tRX/phenofile.txt" " -gk")) + def test_compose_rqtl_cmd(self): + """Test that the R/qtl cmd is composed correctly""" + self.assertEqual( + compose_rqtl_cmd(rqtl_wrapper_cmd="rqtl-wrapper", + rqtl_wrapper_kwargs={ + "g": "the_genofile", + "p": "the_phenofile", + "model": "np", + "method": "ehk", + "nperm": 2000, + "scale": "Mb", + "control": "rs234567" + }, + rqtl_wrapper_bool_kwargs=[ + "addcovar" + ]), + ("Rscript rqtl-wrapper " + "--g the_genofile --p the_phenofile " + "--model np --method ehk " + "--nperm 2000 --scale Mb " + "--control rs234567 " + "--addcovar") + ) + def test_queue_cmd_exception_raised_when_redis_is_down(self): """Test that the correct error is raised when Redis is unavailable""" self.assertRaises(RedisConnectionError, |