aboutsummaryrefslogtreecommitdiff
path: root/scripts/rqtl_wrapper.R
blob: ae970d42bf3279e8dca67b0b94bb145574092853 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
library(optparse)
library(qtl)
library(stringi)
library(stringr)

tmp_dir = Sys.getenv("TMPDIR")

option_list = list(
  make_option(c("-g", "--geno"), type="character", help=".geno file containing a dataset's genotypes"),
  make_option(c("-p", "--pheno"), type="character", help="File containing two columns - sample names and values"),
  make_option(c("-c", "--addcovar"), action="store_true", default=NULL, help="Use covariates (included as extra columns in the phenotype input file)"),
  make_option(c("--model"), type="character", default="normal", help="Mapping Model - Normal or Non-Parametric"),
  make_option(c("--method"), type="character", default="hk", help="Mapping Method - hk (Haley Knott), ehk (Extended Haley Knott), mr (Marker Regression), em (Expectation-Maximization), imp (Imputation)"),
  make_option(c("-i", "--interval"), action="store_true", default=NULL, help="Use interval mapping"),
  make_option(c("--nperm"), type="integer", default=0, help="Number of permutations"),
  make_option(c("-s", "--scale"), type="character", default="mb", help="Mapping scale - Megabases (Mb) or Centimorgans (cM)"),
  make_option(c("--control"), type="character", default=NULL, help="Name of marker (contained in genotype file) to be used as a control"),
  make_option(c("-o", "--outdir"), type="character", default=file.path(tmp_dir, "output"), help="Directory in which to write result file"),
  make_option(c("-f", "--filename"), type="character", default=NULL, help="Name to use for result file"),
  make_option(c("-v", "--verbose"), action="store_true", default=NULL, help="Show extra information")
);

opt_parser = OptionParser(option_list=option_list);
opt = parse_args(opt_parser);

verbose_print <- function(...){
  if (!is.null(opt$verbose)) {
    for(item in list(...)){
      cat(item)
    }
    cat("\n")
  }
}

if (is.null(opt$geno) || is.null(opt$pheno)){
  print_help(opt_parser)
  stop("Both a genotype and phenotype file must be provided.", call.=FALSE)
}

geno_file = opt$geno
pheno_file = opt$pheno

# Generate randomized filename for cross object
cross_file = file.path(tmp_dir, "cross", paste(stri_rand_strings(1, 8), ".cross", sep = ""))

trim <- function( x ) { gsub("(^[[:space:]]+|[[:space:]]+$)", "", x) }

get_geno_code <- function(header, name = 'unk'){
  mat = which(unlist(lapply(header,function(x){ length(grep(paste('@',name,sep=''), x)) })) == 1)
  return(trim(strsplit(header[mat],':')[[1]][2]))
}

geno_to_csvr <- function(genotypes, trait_names, trait_vals, out, sex = NULL,
                         mapping_scale = "Mb", verbose = FALSE){
  # Assume a geno header is not longer than 40 lines
  header = readLines(genotypes, 40)

  # Major hack to skip the geno headers
  toskip = which(unlist(lapply(header, function(x){ length(grep("Chr\t", x)) })) == 1) - 1

  type <- get_geno_code(header, 'type')

  # Get the genotype codes
  if(type == '4-way'){
    genocodes <- NULL
  } else {
    genocodes <- c(get_geno_code(header, 'mat'), get_geno_code(header, 'het'),
                   get_geno_code(header, 'pat'))
  }
  genodata <- read.csv(genotypes, sep='\t', skip=toskip, header=TRUE,
                       na.strings=get_geno_code(header,'unk'),
                       colClasses='character', comment.char = '#')

  verbose_print('Genodata:', toskip, " ", dim(genodata), genocodes, '\n')

  # If there isn't a sex phenotype, treat all as males
  if(is.null(sex)) sex <- rep('m', (ncol(genodata)-4))

  cross_items = list()

  # Add trait and covar phenotypes
  for (i in 1:length(trait_names)){
    cross_items[[i]] <- c(trait_names[i], '', '', unlist(trait_vals[[i]]))
  }

  # Sex phenotype for the mice
  cross_items[[length(trait_names) + 1]] <- c('sex', '', '', sex)
  # Genotypes
  cross_items[[length(trait_names) + 2]] <- cbind(genodata[,c('Locus','Chr', mapping_scale)],
                                                  genodata[, 5:ncol(genodata)])

  out_csvr <- do.call(rbind, cross_items)

  # Save it to a file
  write.table(out_csvr, file=out, row.names=FALSE, col.names=FALSE, quote=FALSE, sep=',')

  # Load the created cross file using R/qtl read.cross
  if (type == '4-way') {
    verbose_print('Loading in as 4-WAY\n')
    cross = read.cross(file=out, 'csvr', genotypes=NULL, crosstype="4way")
  } else if(type == 'f2') {
    verbose_print('Loading in as F2\n')
    cross = read.cross(file=out, 'csvr', genotypes=genocodes, crosstype="f2")
  } else {
    verbose_print('Loading in as normal\n')
    cross = read.cross(file=out, 'csvr', genotypes=genocodes)
  }
  if (type == 'riset') {
    # If its a RIL, convert to a RIL in R/qtl
    verbose_print('Converting to RISELF\n')
    cross <- convert2riself(cross)
  }

  return(cross)
}

create_marker_covars <- function(the_cross, control_marker){
  #' Given a string of one or more marker names (comma separated), fetch
  #' the markers' values from the genotypes and return them as vectors/a vector
  #' of values

  # In case spaces are added in the string of marker names
  covariate_names <- strsplit(str_replace(control_marker, " ", ""), ",")

  genotypes <- pull.geno(the_cross)
  covariates_in_geno <- which(covariate_names %in% colnames(genotypes))
  covariate_names <- covariate_names[covariates_in_geno]
  marker_covars <- genotypes[, unlist(covariate_names)]

  return(marker_covars)
}

# Get phenotype vector from input file
df <- read.table(pheno_file, na.strings = "x", header=TRUE, check.names=FALSE)
sample_names <- df$Sample
trait_names <- colnames(df)[2:length(colnames(df))]

trait_vals <- vector(mode = "list", length = length(trait_names))
for (i in 1:length(trait_names)) {
  this_trait <- trait_names[i]
  this_vals <- df[this_trait]
  trait_vals[[i]] <- this_vals
}

# Since there will always only be one non-covar phenotype, its name will be in the first column
pheno_name = unlist(trait_names)[1]

verbose_print('Generating cross object\n')
cross_object = geno_to_csvr(geno_file, trait_names, trait_vals, cross_file)

# Calculate genotype probabilities
if (!is.null(opt$interval)) {
  verbose_print('Calculating genotype probabilities with interval mapping\n')
  cross_object <- calc.genoprob(cross_object, step=5, stepwidth="max")
} else {
  verbose_print('Calculating genotype probabilities\n')
  cross_object <- calc.genoprob(cross_object)
}

# Pull covariates out of cross object, if they exist
covars = vector()
if (!is.null(opt$addcovar)) {
  covar_names = trait_names[2:length(trait_names)]
  covars <- pull.pheno(cross_object, covar_names)
}

# If a marker name is supplied as covariate, get its vector of values and add them as a covariate
if (!is.null(opt$control)) {
  marker_covars = create_marker_covars(cross_object, opt$control)
  covars <- cbind(covars, marker_covars)
}

# Calculate permutations
if (opt$nperm > 0) {
  if (!is.null(opt$filename)){
    perm_out_file = file.path(opt$outdir, paste("PERM_", opt$filename, sep = ""))
  } else {
    perm_out_file = file.path(opt$outdir, paste(pheno_name, "_PERM_", stri_rand_strings(1, 8), ".csv", sep = ""))
  }

  if (!is.null(opt$addcovar) || !is.null(opt$control)){
    verbose_print('Running permutations with cofactors\n')
    perm_results = scanone(cross_object, pheno.col=1, addcovar=covars, n.perm=opt$nperm, model=opt$model, method=opt$method)
  } else {
    verbose_print('Running permutations\n')
    perm_results = scanone(cross_object, pheno.col=1, n.perm=opt$nperm, model=opt$model, method=opt$method)
  }
  write.csv(perm_results, perm_out_file)
}

if (!is.null(opt$filename)){
  out_file = file.path(opt$outdir, opt$filename)
} else {
  out_file = file.path(opt$outdir, paste(pheno_name, "_", stri_rand_strings(1, 8), ".csv", sep = ""))
}

if (!is.null(opt$addcovar) || !is.null(opt$control)){
  verbose_print('Running scanone with cofactors\n')
  qtl_results = scanone(cross_object, pheno.col=1, addcovar=covars, model=opt$model, method=opt$method)
} else {
  verbose_print('Running scanone\n')
  qtl_results = scanone(cross_object, pheno.col=1, model=opt$model, method=opt$method)
}
write.csv(qtl_results, out_file)