aboutsummaryrefslogtreecommitdiff
path: root/scripts/rqtl_wrapper.R
blob: 31c12775bd0cae8f687c8eb20d369ac475e6d105 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
library(optparse)
library(qtl)
library(stringi)
library(stringr)


tmp_dir = Sys.getenv("TMPDIR")
if (!dir.exists(tmp_dir)) {
  tmp_dir = "/tmp"
}

option_list = list(
  make_option(c("-g", "--geno"), type="character", help=".geno file containing a dataset's genotypes"),
  make_option(c("-p", "--pheno"), type="character", help="File containing two columns - sample names and values"),
  make_option(c("-c", "--addcovar"), action="store_true", default=NULL, help="Use covariates (included as extra columns in the phenotype input file)"),
  make_option(c("--covarstruct"), type="character", help="File detailing which covariates are categorical or numerical"),
  make_option(c("--model"), type="character", default="normal", help="Mapping Model - Normal or Non-Parametric"),
  make_option(c("--method"), type="character", default="hk", help="Mapping Method - hk (Haley Knott), ehk (Extended Haley Knott), mr (Marker Regression), em (Expectation-Maximization), imp (Imputation)"),
  make_option(c("--pairscan"), action="store_true", default=NULL, help="Run Pair Scan - the R/qtl function scantwo"),
  make_option(c("-i", "--interval"), action="store_true", default=NULL, help="Use interval mapping"),
  make_option(c("--nperm"), type="integer", default=0, help="Number of permutations"),
  make_option(c("--pstrata"), action="store_true", default=NULL, help="Use permutation strata (stored as final column/vector in phenotype input file)"),
  make_option(c("-s", "--scale"), type="character", default="mb", help="Mapping scale - Megabases (Mb) or Centimorgans (cM)"),
  make_option(c("--control"), type="character", default=NULL, help="Name of marker (contained in genotype file) to be used as a control"),
  make_option(c("-o", "--outdir"), type="character", default=file.path(tmp_dir, "gn3"), help="Directory in which to write result file"),
  make_option(c("-f", "--filename"), type="character", default=NULL, help="Name to use for result file"),
  make_option(c("-v", "--verbose"), action="store_true", default=NULL, help="Show extra information")
);

opt_parser = OptionParser(option_list=option_list);
opt = parse_args(opt_parser);

verbose_print <- function(...){
  if (!is.null(opt$verbose)) {
    for(item in list(...)){
      cat(item)
    }
    cat("\n")
  }
}

adjustXprobs <- function(cross){
  sex <- getsex(cross)$sex
  pr <- cross$geno[["X"]]$prob
  stopifnot(!is.null(pr), !is.null(sex))

  for(i in 1:ncol(pr)) {
      pr[sex==0,i,3:4] <- 0
      pr[sex==1,i,1:2] <- 0
      pr[,i,] <- pr[,i,]/rowSums(pr[,i,])
  }
  cross$geno[["X"]]$prob <- pr
  invisible(cross)
}

if (is.null(opt$geno) || is.null(opt$pheno)){
  print_help(opt_parser)
  stop("Both a genotype and phenotype file must be provided.", call.=FALSE)
}

geno_file = opt$geno
pheno_file = opt$pheno

# Generate randomized filename for cross object
cross_file = file.path(opt$outdir, "gn3", paste(stri_rand_strings(1, 8), ".cross", sep = ""))

trim <- function( x ) { gsub("(^[[:space:]]+|[[:space:]]+$)", "", x) }

get_geno_code <- function(header, name = 'unk'){
  mat = which(unlist(lapply(header,function(x){ length(grep(paste('@',name,sep=''), x)) })) == 1)
  return(trim(strsplit(header[mat],':')[[1]][2]))
}

geno_to_csvr <- function(genotypes, trait_names, trait_vals, out, type, sex = NULL,
                         mapping_scale = "Mb", verbose = FALSE){
  # Assume a geno header is not longer than 40 lines
  header = readLines(genotypes, 40)

  # Major hack to skip the geno headers
  toskip = which(unlist(lapply(header, function(x){ length(grep("Chr\t", x)) })) == 1) - 1

  type <- get_geno_code(header, 'type')

  # Get the genotype codes
  if(type == '4-way'){
    genocodes <- NULL
  } else {
    genocodes <- c(get_geno_code(header, 'mat'), get_geno_code(header, 'het'),
                   get_geno_code(header, 'pat'))
  }
  genodata <- read.csv(genotypes, sep='\t', skip=toskip, header=TRUE,
                       na.strings=get_geno_code(header,'unk'),
                       colClasses='character', comment.char = '#')

  verbose_print('Genodata:', toskip, " ", dim(genodata), genocodes, '\n')

  # If there isn't a sex phenotype, treat all as males
  if(is.null(sex)) sex <- rep('m', (ncol(genodata)-4))

  cross_items = list()

  # Add trait and covar phenotypes
  for (i in 1:length(trait_names)){
    cross_items[[i]] <- c(trait_names[i], '', '', unlist(trait_vals[[i]]))
  }

  # Sex phenotype for the mice
  cross_items[[length(trait_names) + 1]] <- c('sex', '', '', sex)
  # Genotypes
  cross_items[[length(trait_names) + 2]] <- cbind(genodata[,c('Locus','Chr', mapping_scale)],
                                                  genodata[, 5:ncol(genodata)])

  out_csvr <- do.call(rbind, cross_items)

  # Save it to a file
  write.table(out_csvr, file=out, row.names=FALSE, col.names=FALSE, quote=FALSE, sep=',')

  # Load the created cross file using R/qtl read.cross
  if (type == '4-way') {
    verbose_print('Loading in as 4-WAY\n')
    cross = read.cross(file=out, 'csvr', genotypes=NULL, crosstype="4way")
  } else if(type == 'f2') {
    verbose_print('Loading in as F2\n')
    cross = read.cross(file=out, 'csvr', genotypes=genocodes, crosstype="f2")
  } else {
    verbose_print('Loading in as normal\n')
    cross = read.cross(file=out, 'csvr', genotypes=genocodes)
  }
  if (type == 'riset') {
    # If its a RIL, convert to a RIL in R/qtl
    verbose_print('Converting to RISELF\n')
    cross <- convert2riself(cross)
  }

  return(cross)
}

create_marker_covars <- function(the_cross, control_marker){
  #' Given a string of one or more marker names (comma separated), fetch
  #' the markers' values from the genotypes and return them as vectors/a vector
  #' of values

  # In case spaces are added in the string of marker names
  covariate_names <- strsplit(str_replace(control_marker, " ", ""), ",")

  genotypes <- pull.geno(the_cross)
  covariates_in_geno <- which(covariate_names %in% colnames(genotypes))
  covariate_names <- covariate_names[covariates_in_geno]
  marker_covars <- genotypes[, unlist(covariate_names)]

  return(marker_covars)
}

# Get phenotype vector from input file
df <- read.table(pheno_file, na.strings = "x", header=TRUE, check.names=FALSE)
sample_names <- df$Sample
trait_names <- colnames(df)[2:length(colnames(df))]

# Since there will always only be one non-covar phenotype, its name will be in the first column
pheno_name = unlist(trait_names)[1]

trait_vals <- vector(mode = "list", length = length(trait_names))
for (i in 1:length(trait_names)) {
  this_trait <- trait_names[i]
  this_vals <- df[this_trait]
  trait_vals[[i]] <- this_vals

  trait_names[i] = paste("T_", this_trait, sep = "")
}

# Get type of genotypes, since it needs to be checked before calc.genoprob
header = readLines(geno_file, 40)
type <- get_geno_code(header, 'type')

verbose_print('Generating cross object\n')
cross_object = geno_to_csvr(geno_file, trait_names, trait_vals, cross_file, type)

# Calculate genotype probabilities
if (!is.null(opt$pairscan)) {
  verbose_print('Calculating genotype probabilities for pair-scan\n')
  cross_object <- calc.genoprob(cross_object, step=10)
} else if (!is.null(opt$interval)) {
  verbose_print('Calculating genotype probabilities with interval mapping\n')
  cross_object <- calc.genoprob(cross_object, step=5, stepwidth="max")
} else if (!is.null(opt$pairscan)) {
  verbose_print('Calculating genotype probabilities for pair-scan\n')
  cross_object <- calc.genoprob(cross_object, step=10)
} else {
  verbose_print('Calculating genotype probabilities\n')
  cross_object <- calc.genoprob(cross_object)
}

# If 4way, adjust X chromosome genotype probabilities
if (type == "4-way") {
  verbose_print('Adjusting genotype probabilities for 4way cross')
  cross_object <- adjustXprobs(cross_object)
}

# Pull covariates out of cross object, if they exist
covars <- c() # Holds the covariates which should be passed to R/qtl
if (!is.null(opt$addcovar)) {
  verbose_print('Pulling covariates out of cross object\n')
  # If perm strata are being used, it'll be included as the final column in the phenotype file
  if (!is.null(opt$pstrata)) {
    covar_names = trait_names[2:(length(trait_names)-1)]
  } else {
    covar_names = trait_names[2:length(trait_names)]
  }
  covars <- pull.pheno(cross_object, covar_names)
  # Read in the covar description file
  covarDescr <- read.table(opt$covarstruct, sep="\t", header=FALSE)
  for(x in 1:nrow(covarDescr)){
    cat(covarDescr[x, 1])
    name <- paste0("T_", covarDescr[x, 1]) # The covar description file doesn't have T_ in trait names (the cross object does)
    type <- covarDescr[x, 2]
    if(type == "categorical"){
      verbose_print('Binding covars to covars\n')
      if (nrow(covarDescr) < 2){
        this_covar = covars
      } else {
        this_covar = covars[,name]
      }
      if(length(table(this_covar)) > 2){ # More then 2 levels create the model matrix for the factor
        mdata <- data.frame(toExpand = as.factor(this_covar))
        options(na.action='na.pass')
        modelmatrix <- model.matrix(~ toExpand + 0, mdata)[,-1]
        covars <- cbind(covars, modelmatrix)
      }else{ # 2 levels? just bind the trait as covar
        covars <- cbind(covars, this_covar)
      }
    }
  }
}

# Pull permutation strata out of cross object, if it is being used
perm_strata = vector()
if (!is.null(opt$pstrata)) {
  verbose_print('Pulling permutation strata out of cross object\n')
  strata_col = trait_names[length(trait_names)]
  perm_strata <- pull.pheno(cross_object, strata_col)
}

# If a marker name is supplied as covariate, get its vector of values and add them as a covariate
if (!is.null(opt$control)) {
  verbose_print('Creating marker covariates and binding them to covariates vector\n')
  marker_covars = create_marker_covars(cross_object, opt$control)
  covars <- cbind(covars, marker_covars)
}

if (!is.null(opt$pairscan)) {
  verbose_print("Running scantwo")
  scan_func <- function(...){
    scantwo(...)
  }
} else {
  verbose_print("Running scanone")
  scan_func <- function(...){
    scanone(...)
  }
}

# Calculate permutations
if (opt$nperm > 0) {
  if (!is.null(opt$filename)){
    perm_out_file = file.path(opt$outdir, "gn3", paste("PERM_", opt$filename, sep = "" ))
  } else {
    perm_out_file = file.path(opt$outdir, "gn3", paste(pheno_name, "_PERM_", stri_rand_strings(1, 8), sep = ""))
  }

  if (!is.null(opt$addcovar) || !is.null(opt$control)){
    if (!is.null(opt$pstrata)) {
      verbose_print('Running permutations with cofactors and strata\n')
      perm_results = scan_func(cross_object, pheno.col=1, addcovar=covars, n.perm=opt$nperm, perm.strata=perm_strata, model=opt$model, method=opt$method)
    } else {
      verbose_print('Running permutations with cofactors\n')
      perm_results = scan_func(cross_object, pheno.col=1, addcovar=covars, n.perm=opt$nperm, model=opt$model, method=opt$method)
    }
  } else {
    if (!is.null(opt$pstrata)) {
      verbose_print('Running permutations with strata\n')
      perm_results = scan_func(cross_object, pheno.col=1, n.perm=opt$nperm, perm.strata=perm_strata, model=opt$model, method=opt$method)
    } else {
      verbose_print('Running permutations\n')
      perm_results = scan_func(cross_object, pheno.col=1, n.perm=opt$nperm, model=opt$model, method=opt$method)
    }
  }
  write.csv(perm_results, perm_out_file)
}

if (!is.null(opt$filename)){
  out_file = file.path(opt$outdir, "gn3", opt$filename)
} else {
  out_file = file.path(opt$outdir, "gn3", paste(pheno_name, "_", stri_rand_strings(1, 8), sep = ""))
}

if (!is.null(opt$addcovar) || !is.null(opt$control)){
  verbose_print('Running scan with cofactors\n')
  qtl_results = scan_func(cross_object, pheno.col=1, addcovar=covars, model=opt$model, method=opt$method)
} else {
  verbose_print('Running scan\n')
  qtl_results = scan_func(cross_object, pheno.col=1, model=opt$model, method=opt$method)
}

verbose_print('Writing results to CSV file\n')
if (!is.null(opt$pairscan)) {
  map_out_file = file.path(opt$outdir, "gn3", paste("MAP_", opt$filename, sep = "" ))
  write.csv(qtl_results[1], out_file)
  write.csv(qtl_results[2], map_out_file)
} else {
  # QTL main effects on adjusted longevity
  getEffects <- function(sdata, gtsprob, marker = "1_24042124", model = "longevity ~ sex + site + cohort + treatment", trait = "longevity"){
    rownames(sdata) <- 1:nrow(sdata)
    rownames(gtsprob) <- 1:nrow(gtsprob)
    mp <- gtsprob[, grep(marker, colnames(gtsprob))]
    gts <- unlist(lapply(lapply(lapply(apply(mp,1,function(x){which(x > 0.85)}),names), strsplit, ":"), function(x){
      if(length(x) > 0){ return(x[[1]][2]); }else{ return(NA) }
    }))

    ismissing <- which(apply(sdata, 1, function(x){any(is.na(x))}))
    if(length(ismissing) > 0){
      sdata <- sdata[-ismissing, ]
      gts <- gts[-ismissing]
    }

    mlm <- lm(as.formula(model), data = sdata)
    pheAdj <- rep(NA, nrow(sdata))
    adj <- residuals(mlm) + mean(sdata[, trait])
    pheAdj[as.numeric(names(adj))] <- adj
    means <- c(mean(pheAdj[which(gts == "AC")],na.rm=TRUE),mean(pheAdj[which(gts == "AD")],na.rm=TRUE),mean(pheAdj[which(gts == "BC")],na.rm=TRUE),mean(pheAdj[which(gts == "BD")],na.rm=TRUE))
    std <- function(x) sd(x,na.rm=TRUE)/sqrt(length(x))
    stderrs <- c(std(pheAdj[which(gts == "AC")]),std(pheAdj[which(gts == "AD")]),std(pheAdj[which(gts == "BC")]),std(pheAdj[which(gts == "BD")]))
    paste0(round(means,0), " ± ", round(stderrs,2))
  }

  if (type == "4-way") {
    verbose_print("Get phenotype name + genoprob + all phenotypes + models for 4-way crosses")
    traitname <- colnames(pull.pheno(cross_object))[1]
    gtsp <- pull.genoprob(cross_object)
    allpheno <- pull.pheno(cross_object)
    if (!is.null(opt$addcovar)) {
      model <- paste0(traitname, " ~ ", paste0(covar_names, sep="", collapse=" + "))
    } else {
      model <- paste0(traitname, " ~ 1 ")
    }

    meffects <- c()
    verbose_print("Getting QTL main effects for 4-way crosses")
    for(marker in  rownames(qtl_results)){
      meff <- getEffects(allpheno, gtsp, marker = marker, model, trait = traitname)
      meffects <- rbind(meffects, meff)
    }
    qtl_results <- cbind(data.frame(qtl_results[,1:3]), meffects)
    colnames(qtl_results)[4:7] <- c("AC", "AD", "BC", "BD")
  }

  write.csv(qtl_results, out_file)
}