From 22339cb85c5db370ee6a2c247de90e603cdf85a5 Mon Sep 17 00:00:00 2001 From: zsloan Date: Wed, 12 Oct 2016 16:06:22 +0000 Subject: Removed unused gemma code from marker_regression.py and changed the order of a couple functions in collect.py to more accurately reflect when they're called --- wqflask/wqflask/collect.py | 32 +++++---- .../wqflask/marker_regression/marker_regression.py | 78 ---------------------- wqflask/wqflask/user_manager.py | 1 - 3 files changed, 15 insertions(+), 96 deletions(-) diff --git a/wqflask/wqflask/collect.py b/wqflask/wqflask/collect.py index 81d03d6c..bf465cdc 100644 --- a/wqflask/wqflask/collect.py +++ b/wqflask/wqflask/collect.py @@ -52,7 +52,7 @@ class AnonCollection(object): self.id = None self.created_timestamp = datetime.datetime.utcnow().strftime('%b %d %Y %I:%M%p') self.changed_timestamp = self.created_timestamp #ZS: will be updated when changes are made - + #ZS: Find id and set it if the collection doesn't already exist if Redis.get(self.key) == "None" or Redis.get(self.key) == None: Redis.set(self.key, None) #ZS: For some reason I get the error "Operation against a key holding the wrong kind of value" if I don't do this @@ -66,6 +66,7 @@ class AnonCollection(object): collection_exists = True self.id = collection['id'] break + if self.id == None: self.id = str(uuid.uuid4()) @@ -191,7 +192,19 @@ class UserCollection(object): # Probably have to change that return redirect(url_for('view_collection', uc_id=uc.id)) - +def process_traits(unprocessed_traits): + #print("unprocessed_traits are:", unprocessed_traits) + if isinstance(unprocessed_traits, basestring): + unprocessed_traits = unprocessed_traits.split(",") + traits = set() + for trait in unprocessed_traits: + #print("trait is:", trait) + data, _separator, hmac = trait.rpartition(':') + data = data.strip() + assert hmac==user_manager.actual_hmac_creation(data), "Data tampering?" + traits.add (str(data)) + return traits + def report_change(len_before, len_now): new_length = len_now - len_before if new_length: @@ -224,7 +237,6 @@ def collections_add(): collections = collection_names, ) - @app.route("/collections/new") def collections_new(): params = request.args @@ -248,20 +260,6 @@ def collections_new(): else: CauseAnError - -def process_traits(unprocessed_traits): - #print("unprocessed_traits are:", unprocessed_traits) - if isinstance(unprocessed_traits, basestring): - unprocessed_traits = unprocessed_traits.split(",") - traits = set() - for trait in unprocessed_traits: - #print("trait is:", trait) - data, _separator, hmac = trait.rpartition(':') - data = data.strip() - assert hmac==user_manager.actual_hmac_creation(data), "Data tampering?" - traits.add (str(data)) - return traits - def create_new(collection_name): params = request.args diff --git a/wqflask/wqflask/marker_regression/marker_regression.py b/wqflask/wqflask/marker_regression/marker_regression.py index f340e309..7ad5d25a 100644 --- a/wqflask/wqflask/marker_regression/marker_regression.py +++ b/wqflask/wqflask/marker_regression/marker_regression.py @@ -309,49 +309,6 @@ class MarkerRegression(object): perm_results = self.perm_output, ) - - def run_gemma(self): - """Generates p-values for each marker using GEMMA""" - - self.gen_pheno_txt_file() - - #os.chdir("/home/zas1024/gene/web/gemma") - - gemma_command = GEMMA_COMMAND + ' -bfile %s -k output_%s.cXX.txt -lmm 1 -o %s_output' % ( - self.dataset.group.name, - self.dataset.group.name, - self.dataset.group.name) - #logger.debug("gemma_command:" + gemma_command) - - os.system(gemma_command) - - included_markers, p_values = self.parse_gemma_output() - - self.dataset.group.get_specified_markers(markers = included_markers) - self.dataset.group.markers.add_pvalues(p_values) - return self.dataset.group.markers.markers - - def parse_gemma_output(self): - included_markers = [] - p_values = [] - # Use a temporary file name here! - with open(webqtlConfig.GENERATED_TEXT_DIR+"/{}_output.assoc.txt".format(self.dataset.group.name)) as output_file: - for line in output_file: - if line.startswith("chr"): - continue - else: - included_markers.append(line.split("\t")[1]) - p_values.append(float(line.split("\t")[10])) - #p_values[line.split("\t")[1]] = float(line.split("\t")[10]) - #logger.debug("p_values: ", p_values) - return included_markers, p_values - - def gen_pheno_txt_file(self): - """Generates phenotype file for GEMMA""" - with open(webqtlConfig.GENERATED_TEXT_DIR+"{}.fam".format(self.dataset.group.name), "w") as outfile: - for i, sample in enumerate(self.samples): - outfile.write(str(sample) + " " + str(sample) + " 0 0 0 " + str(self.vals[i]) + "\n") - def run_rqtl_plink(self): # os.chdir("") never do this inside a webserver!! @@ -425,41 +382,6 @@ class MarkerRegression(object): output_file.close() - def gen_pheno_txt_file_rqtl(self, pheno_filename = ''): - ped_sample_list = self.get_samples_from_ped_file() - output_file = open("%s%s.txt" % (TMPDIR, pheno_filename), "wb") - header = 'FID\tIID\t%s\n' % self.this_trait.name - output_file.write(header) - - new_value_list = [] - - #if valueDict does not include some strain, value will be set to -9999 as missing value - for i, sample in enumerate(ped_sample_list): - try: - value = self.vals[i] - value = str(value).replace('value=','') - value = value.strip() - except: - value = -9999 - - new_value_list.append(value) - - - new_line = '' - for i, sample in enumerate(ped_sample_list): - j = i+1 - value = new_value_list[i] - new_line += '%s\t%s\t%s\n'%(sample, sample, value) - - if j%1000 == 0: - output_file.write(newLine) - new_line = '' - - if new_line: - output_file.write(new_line) - - output_file.close() - # get strain name from ped file in order def get_samples_from_ped_file(self): ped_file= open("{}/{}.ped".format(PLINK_PATH, self.dataset.group.name),"r") diff --git a/wqflask/wqflask/user_manager.py b/wqflask/wqflask/user_manager.py index 598af0a6..1e831896 100644 --- a/wqflask/wqflask/user_manager.py +++ b/wqflask/wqflask/user_manager.py @@ -158,7 +158,6 @@ def verify_cookie(cookie): assert the_signature == actual_hmac_creation(the_uuid), "Uh-oh, someone tampering with the cookie?" return the_uuid - def create_signed_cookie(): the_uuid = str(uuid.uuid4()) signature = actual_hmac_creation(the_uuid) -- cgit v1.2.3 From 1cdd044b2acba69058aaf7b0719eef5d1a727cfc Mon Sep 17 00:00:00 2001 From: zsloan Date: Wed, 12 Oct 2016 16:52:29 +0000 Subject: Moved plink code out of marker_regression, though I don't think plink is working right now Ran some end of line conversion on rqtl_mapping because there were a few ^M's --- .../wqflask/marker_regression/marker_regression.py | 160 +-------- wqflask/wqflask/marker_regression/plink_mapping.py | 168 +++++++++ wqflask/wqflask/marker_regression/rqtl_mapping.py | 384 ++++++++++----------- 3 files changed, 363 insertions(+), 349 deletions(-) create mode 100644 wqflask/wqflask/marker_regression/plink_mapping.py diff --git a/wqflask/wqflask/marker_regression/marker_regression.py b/wqflask/wqflask/marker_regression/marker_regression.py index 7ad5d25a..fbd9a816 100644 --- a/wqflask/wqflask/marker_regression/marker_regression.py +++ b/wqflask/wqflask/marker_regression/marker_regression.py @@ -35,7 +35,7 @@ from utility import helper_functions from utility import Plot, Bunch from utility import temp_data from utility.benchmark import Bench -from wqflask.marker_regression import gemma_mapping, rqtl_mapping, qtlreaper_mapping +from wqflask.marker_regression import gemma_mapping, rqtl_mapping, qtlreaper_mapping, plink_mapping from utility.tools import locate, locate_ignore_error, PYLMM_COMMAND, GEMMA_COMMAND, PLINK_COMMAND, TEMPDIR from utility.external import shell @@ -212,7 +212,8 @@ class MarkerRegression(object): self.control_marker, self.manhattan_plot) elif self.mapping_method == "plink": - results = self.run_plink() + results = plink_mapping.run_plink(self.this_trait, self.dataset, self.species, self.vals, self.maf) + #results = self.run_plink() elif self.mapping_method == "pylmm": logger.debug("RUNNING PYLMM") if self.num_perm > 0: @@ -314,7 +315,7 @@ class MarkerRegression(object): output_filename = webqtlUtil.genRandStr("%s_%s_"%(self.dataset.group.name, self.this_trait.name)) - self.gen_pheno_txt_file_plink(pheno_filename = output_filename) + plink_mapping.gen_pheno_txt_file_plink(self.this_trait, self.dataset, self.vals, pheno_filename = output_filename) rqtl_command = './plink --noweb --ped %s.ped --no-fid --no-parents --no-sex --no-pheno --map %s.map --pheno %s/%s.txt --pheno-name %s --maf %s --missing-phenotype -9999 --out %s%s --assoc ' % (self.dataset.group.name, self.dataset.group.name, TMPDIR, plink_output_filename, self.this_trait.name, self.maf, TMPDIR, plink_output_filename) @@ -322,159 +323,6 @@ class MarkerRegression(object): count, p_values = self.parse_rqtl_output(plink_output_filename) - def run_plink(self): - plink_output_filename = webqtlUtil.genRandStr("%s_%s_"%(self.dataset.group.name, self.this_trait.name)) - - self.gen_pheno_txt_file_plink(pheno_filename = plink_output_filename) - - plink_command = PLINK_COMMAND + ' --noweb --ped %s/%s.ped --no-fid --no-parents --no-sex --no-pheno --map %s/%s.map --pheno %s%s.txt --pheno-name %s --maf %s --missing-phenotype -9999 --out %s%s --assoc ' % (PLINK_PATH, self.dataset.group.name, PLINK_PATH, self.dataset.group.name, TMPDIR, plink_output_filename, self.this_trait.name, self.maf, TMPDIR, plink_output_filename) - logger.debug("plink_command:", plink_command) - - os.system(plink_command) - - count, p_values = self.parse_plink_output(plink_output_filename) - - #for marker in self.dataset.group.markers.markers: - # if marker['name'] not in included_markers: - # logger.debug("marker:", marker) - # self.dataset.group.markers.markers.remove(marker) - # #del self.dataset.group.markers.markers[marker] - - logger.debug("p_values:", pf(p_values)) - - self.dataset.group.markers.add_pvalues(p_values) - - return self.dataset.group.markers.markers - - - def gen_pheno_txt_file_plink(self, pheno_filename = ''): - ped_sample_list = self.get_samples_from_ped_file() - output_file = open("%s%s.txt" % (TMPDIR, pheno_filename), "wb") - header = 'FID\tIID\t%s\n' % self.this_trait.name - output_file.write(header) - - new_value_list = [] - - #if valueDict does not include some strain, value will be set to -9999 as missing value - for i, sample in enumerate(ped_sample_list): - try: - value = self.vals[i] - value = str(value).replace('value=','') - value = value.strip() - except: - value = -9999 - - new_value_list.append(value) - - - new_line = '' - for i, sample in enumerate(ped_sample_list): - j = i+1 - value = new_value_list[i] - new_line += '%s\t%s\t%s\n'%(sample, sample, value) - - if j%1000 == 0: - output_file.write(newLine) - new_line = '' - - if new_line: - output_file.write(new_line) - - output_file.close() - - # get strain name from ped file in order - def get_samples_from_ped_file(self): - ped_file= open("{}/{}.ped".format(PLINK_PATH, self.dataset.group.name),"r") - line = ped_file.readline() - sample_list=[] - - while line: - lineList = string.split(string.strip(line), '\t') - lineList = map(string.strip, lineList) - - sample_name = lineList[0] - sample_list.append(sample_name) - - line = ped_file.readline() - - return sample_list - - def parse_plink_output(self, output_filename): - plink_results={} - - threshold_p_value = 0.01 - - result_fp = open("%s%s.qassoc"% (TMPDIR, output_filename), "rb") - - header_line = result_fp.readline()# read header line - line = result_fp.readline() - - value_list = [] # initialize value list, this list will include snp, bp and pvalue info - p_value_dict = {} - count = 0 - - while line: - #convert line from str to list - line_list = self.build_line_list(line=line) - - # only keep the records whose chromosome name is in db - if self.species.chromosomes.chromosomes.has_key(int(line_list[0])) and line_list[-1] and line_list[-1].strip()!='NA': - - chr_name = self.species.chromosomes.chromosomes[int(line_list[0])] - snp = line_list[1] - BP = line_list[2] - p_value = float(line_list[-1]) - if threshold_p_value >= 0 and threshold_p_value <= 1: - if p_value < threshold_p_value: - p_value_dict[snp] = float(p_value) - - if plink_results.has_key(chr_name): - value_list = plink_results[chr_name] - - # pvalue range is [0,1] - if threshold_p_value >=0 and threshold_p_value <= 1: - if p_value < threshold_p_value: - value_list.append((snp, BP, p_value)) - count += 1 - - plink_results[chr_name] = value_list - value_list = [] - else: - if threshold_p_value >= 0 and threshold_p_value <= 1: - if p_value < threshold_p_value: - value_list.append((snp, BP, p_value)) - count += 1 - - if value_list: - plink_results[chr_name] = value_list - - value_list=[] - - line = result_fp.readline() - else: - line = result_fp.readline() - - #if p_value_list: - # min_p_value = min(p_value_list) - #else: - # min_p_value = 0 - - return count, p_value_dict - - ###################################################### - # input: line: str,one line read from file - # function: convert line from str to list; - # output: lineList list - ####################################################### - def build_line_list(self, line=None): - - line_list = string.split(string.strip(line),' ')# irregular number of whitespaces between columns - line_list = [item for item in line_list if item <>''] - line_list = map(string.strip, line_list) - - return line_list - - def run_permutations(self, temp_uuid): """Runs permutations and gets significant and suggestive LOD scores""" diff --git a/wqflask/wqflask/marker_regression/plink_mapping.py b/wqflask/wqflask/marker_regression/plink_mapping.py new file mode 100644 index 00000000..c80dc700 --- /dev/null +++ b/wqflask/wqflask/marker_regression/plink_mapping.py @@ -0,0 +1,168 @@ +import string +import os + +from base.webqtlConfig import TMPDIR +from utility import webqtlUtil +from utility.tools import PLINK_COMMAND + +import utility.logger +logger = utility.logger.getLogger(__name__ ) + +def run_plink(this_trait, dataset, species, vals, maf): + plink_output_filename = webqtlUtil.genRandStr("%s_%s_"%(dataset.group.name, this_trait.name)) + + gen_pheno_txt_file_plink(this_trait, dataset, vals, pheno_filename = plink_output_filename) + + plink_command = PLINK_COMMAND + ' --noweb --ped %s/%s.ped --no-fid --no-parents --no-sex --no-pheno --map %s/%s.map --pheno %s%s.txt --pheno-name %s --maf %s --missing-phenotype -9999 --out %s%s --assoc ' % (PLINK_PATH, + dataset.group.name, + PLINK_PATH, + dataset.group.name, + TMPDIR, + plink_output_filename, + this_trait.name, + maf, + TMPDIR, + plink_output_filename) + logger.debug("plink_command:", plink_command) + + os.system(plink_command) + + count, p_values = parse_plink_output(plink_output_filename, species) + + #for marker in self.dataset.group.markers.markers: + # if marker['name'] not in included_markers: + # logger.debug("marker:", marker) + # self.dataset.group.markers.markers.remove(marker) + # #del self.dataset.group.markers.markers[marker] + + logger.debug("p_values:", pf(p_values)) + + dataset.group.markers.add_pvalues(p_values) + + return dataset.group.markers.markers + + +def gen_pheno_txt_file_plink(this_trait, dataset, vals, pheno_filename = ''): + ped_sample_list = get_samples_from_ped_file(dataset) + output_file = open("%s%s.txt" % (TMPDIR, pheno_filename), "wb") + header = 'FID\tIID\t%s\n' % this_trait.name + output_file.write(header) + + new_value_list = [] + + #if valueDict does not include some strain, value will be set to -9999 as missing value + for i, sample in enumerate(ped_sample_list): + try: + value = vals[i] + value = str(value).replace('value=','') + value = value.strip() + except: + value = -9999 + + new_value_list.append(value) + + new_line = '' + for i, sample in enumerate(ped_sample_list): + j = i+1 + value = new_value_list[i] + new_line += '%s\t%s\t%s\n'%(sample, sample, value) + + if j%1000 == 0: + output_file.write(newLine) + new_line = '' + + if new_line: + output_file.write(new_line) + + output_file.close() + +# get strain name from ped file in order +def get_samples_from_ped_file(dataset): + ped_file= open("{}/{}.ped".format(PLINK_PATH, dataset.group.name),"r") + line = ped_file.readline() + sample_list=[] + + while line: + lineList = string.split(string.strip(line), '\t') + lineList = map(string.strip, lineList) + + sample_name = lineList[0] + sample_list.append(sample_name) + + line = ped_file.readline() + + return sample_list + +def parse_plink_output(output_filename, species): + plink_results={} + + threshold_p_value = 0.01 + + result_fp = open("%s%s.qassoc"% (TMPDIR, output_filename), "rb") + + header_line = result_fp.readline()# read header line + line = result_fp.readline() + + value_list = [] # initialize value list, this list will include snp, bp and pvalue info + p_value_dict = {} + count = 0 + + while line: + #convert line from str to list + line_list = build_line_list(line=line) + + # only keep the records whose chromosome name is in db + if species.chromosomes.chromosomes.has_key(int(line_list[0])) and line_list[-1] and line_list[-1].strip()!='NA': + + chr_name = species.chromosomes.chromosomes[int(line_list[0])] + snp = line_list[1] + BP = line_list[2] + p_value = float(line_list[-1]) + if threshold_p_value >= 0 and threshold_p_value <= 1: + if p_value < threshold_p_value: + p_value_dict[snp] = float(p_value) + + if plink_results.has_key(chr_name): + value_list = plink_results[chr_name] + + # pvalue range is [0,1] + if threshold_p_value >=0 and threshold_p_value <= 1: + if p_value < threshold_p_value: + value_list.append((snp, BP, p_value)) + count += 1 + + plink_results[chr_name] = value_list + value_list = [] + else: + if threshold_p_value >= 0 and threshold_p_value <= 1: + if p_value < threshold_p_value: + value_list.append((snp, BP, p_value)) + count += 1 + + if value_list: + plink_results[chr_name] = value_list + + value_list=[] + + line = result_fp.readline() + else: + line = result_fp.readline() + + #if p_value_list: + # min_p_value = min(p_value_list) + #else: + # min_p_value = 0 + + return count, p_value_dict + +###################################################### +# input: line: str,one line read from file +# function: convert line from str to list; +# output: lineList list +####################################################### +def build_line_list(line=None): + line_list = string.split(string.strip(line),' ')# irregular number of whitespaces between columns + line_list = [item for item in line_list if item <>''] + line_list = map(string.strip, line_list) + + return line_list \ No newline at end of file diff --git a/wqflask/wqflask/marker_regression/rqtl_mapping.py b/wqflask/wqflask/marker_regression/rqtl_mapping.py index 93bf717c..afd28563 100644 --- a/wqflask/wqflask/marker_regression/rqtl_mapping.py +++ b/wqflask/wqflask/marker_regression/rqtl_mapping.py @@ -1,193 +1,191 @@ -import rpy2.robjects as ro -import numpy as np - -from base.webqtlConfig import TMPDIR -from utility import webqtlUtil -from utility.tools import locate, TEMPDIR - -def run_rqtl_geno(vals, dataset, method, model, permCheck, num_perm, do_control, control_marker, manhattan_plot, pair_scan): - geno_to_rqtl_function(dataset) - - ## Get pointers to some common R functions - r_library = ro.r["library"] # Map the library function - r_c = ro.r["c"] # Map the c function - r_sum = ro.r["sum"] # Map the sum function - plot = ro.r["plot"] # Map the plot function - postscript = ro.r["postscript"] # Map the postscript function - png = ro.r["png"] # Map the png function - dev_off = ro.r["dev.off"] # Map the device off function - - print(r_library("qtl")) # Load R/qtl - - ## Get pointers to some R/qtl functions - scanone = ro.r["scanone"] # Map the scanone function - scantwo = ro.r["scantwo"] # Map the scantwo function - calc_genoprob = ro.r["calc.genoprob"] # Map the calc.genoprob function - read_cross = ro.r["read.cross"] # Map the read.cross function - write_cross = ro.r["write.cross"] # Map the write.cross function - GENOtoCSVR = ro.r["GENOtoCSVR"] # Map the local GENOtoCSVR function - - crossname = dataset.group.name - genofilelocation = locate(crossname + ".geno", "genotype") - crossfilelocation = TMPDIR + crossname + ".cross" - - #print("Conversion of geno to cross at location:", genofilelocation, " to ", crossfilelocation) - - cross_object = GENOtoCSVR(genofilelocation, crossfilelocation) # TODO: Add the SEX if that is available - - if manhattan_plot: - cross_object = calc_genoprob(cross_object) - else: - cross_object = calc_genoprob(cross_object, step=1, stepwidth="max") - - cross_object = add_phenotype(cross_object, sanitize_rqtl_phenotype(vals)) # Add the phenotype - - # for debug: write_cross(cross_object, "csvr", "test.csvr") - - # Scan for QTLs - covar = create_covariates(control_marker, cross_object) # Create the additive covariate matrix - - if pair_scan: - if do_control == "true": # If sum(covar) > 0 we have a covariate matrix - print("Using covariate"); result_data_frame = scantwo(cross_object, pheno = "the_pheno", addcovar = covar, model=model, method=method, n_cluster = 16) - else: - print("No covariates"); result_data_frame = scantwo(cross_object, pheno = "the_pheno", model=model, method=method, n_cluster = 16) - - #print("Pair scan results:", result_data_frame) - - pair_scan_filename = webqtlUtil.genRandStr("scantwo_") + ".png" - png(file=TEMPDIR+pair_scan_filename) - plot(result_data_frame) - dev_off() - - return process_pair_scan_results(result_data_frame) - else: - if do_control == "true": - print("Using covariate"); result_data_frame = scanone(cross_object, pheno = "the_pheno", addcovar = covar, model=model, method=method) - else: - print("No covariates"); result_data_frame = scanone(cross_object, pheno = "the_pheno", model=model, method=method) - - if num_perm > 0 and permCheck == "ON": # Do permutation (if requested by user) - if do_control == "true": - perm_data_frame = scanone(cross_object, pheno_col = "the_pheno", addcovar = covar, n_perm = num_perm, model=model, method=method) - else: - perm_data_frame = scanone(cross_object, pheno_col = "the_pheno", n_perm = num_perm, model=model, method=method) - - perm_output, suggestive, significant = process_rqtl_perm_results(num_perm, perm_data_frame) # Functions that sets the thresholds for the webinterface - return perm_output, suggestive, significant, process_rqtl_results(result_data_frame) - else: - return process_rqtl_results(result_data_frame) - -def geno_to_rqtl_function(dataset): # TODO: Need to figure out why some genofiles have the wrong format and don't convert properly - - ro.r(""" - trim <- function( x ) { gsub("(^[[:space:]]+|[[:space:]]+$)", "", x) } - - getGenoCode <- function(header, name = 'unk'){ - mat = which(unlist(lapply(header,function(x){ length(grep(paste('@',name,sep=''), x)) })) == 1) - return(trim(strsplit(header[mat],':')[[1]][2])) - } - - GENOtoCSVR <- function(genotypes = '%s', out = 'cross.csvr', phenotype = NULL, sex = NULL, verbose = FALSE){ - header = readLines(genotypes, 40) # Assume a geno header is not longer than 40 lines - toskip = which(unlist(lapply(header, function(x){ length(grep("Chr\t", x)) })) == 1)-1 # Major hack to skip the geno headers - - genocodes <- c(getGenoCode(header, 'mat'), getGenoCode(header, 'het'), getGenoCode(header, 'pat')) # Get the genotype codes - type <- getGenoCode(header, 'type') - genodata <- read.csv(genotypes, sep='\t', skip=toskip, header=TRUE, na.strings=getGenoCode(header,'unk'), colClasses='character', comment.char = '#') - cat('Genodata:', toskip, " ", dim(genodata), genocodes, '\n') - if(is.null(phenotype)) phenotype <- runif((ncol(genodata)-4)) # If there isn't a phenotype, generate a random one - if(is.null(sex)) sex <- rep('m', (ncol(genodata)-4)) # If there isn't a sex phenotype, treat all as males - outCSVR <- rbind(c('Pheno', '', '', phenotype), # Phenotype - c('sex', '', '', sex), # Sex phenotype for the mice - cbind(genodata[,c('Locus','Chr', 'cM')], genodata[, 5:ncol(genodata)])) # Genotypes - write.table(outCSVR, file = out, row.names=FALSE, col.names=FALSE,quote=FALSE, sep=',') # Save it to a file - require(qtl) - cross = read.cross(file=out, 'csvr', genotypes=genocodes) # Load the created cross file using R/qtl read.cross - if(type == 'riset') cross <- convert2riself(cross) # If its a RIL, convert to a RIL in R/qtl - return(cross) - } - """ % (dataset.group.name + ".geno")) - -def add_phenotype(cross, pheno_as_string): - ro.globalenv["the_cross"] = cross - ro.r('the_cross$pheno <- cbind(pull.pheno(the_cross), the_pheno = '+ pheno_as_string +')') - return ro.r["the_cross"] - -def create_covariates(control_marker, cross): - ro.globalenv["the_cross"] = cross - ro.r('genotypes <- pull.geno(the_cross)') # Get the genotype matrix - userinputS = control_marker.replace(" ", "").split(",") # TODO: sanitize user input, Never Ever trust a user - covariate_names = ', '.join('"{0}"'.format(w) for w in userinputS) - #print("Marker names of selected covariates:", covariate_names) - ro.r('covnames <- c(' + covariate_names + ')') - ro.r('covInGeno <- which(covnames %in% colnames(genotypes))') - ro.r('covnames <- covnames[covInGeno]') - ro.r("cat('covnames (purged): ', covnames,'\n')") - ro.r('covariates <- genotypes[,covnames]') # Get the covariate matrix by using the marker name as index to the genotype file - #print("R/qtl matrix of covariates:", ro.r["covariates"]) - return ro.r["covariates"] - -def sanitize_rqtl_phenotype(vals): - pheno_as_string = "c(" - for i, val in enumerate(vals): - if val == "x": - if i < (len(vals) - 1): - pheno_as_string += "NA," - else: - pheno_as_string += "NA" - else: - if i < (len(vals) - 1): - pheno_as_string += str(val) + "," - else: - pheno_as_string += str(val) - pheno_as_string += ")" - return pheno_as_string - -def process_pair_scan_results(result): - pair_scan_results = [] - - result = result[1] - output = [tuple([result[j][i] for j in range(result.ncol)]) for i in range(result.nrow)] - #print("R/qtl scantwo output:", output) - - for i, line in enumerate(result.iter_row()): - marker = {} - marker['name'] = result.rownames[i] - marker['chr1'] = output[i][0] - marker['Mb'] = output[i][1] - marker['chr2'] = int(output[i][2]) - pair_scan_results.append(marker) - - #print("pair_scan_results:", pair_scan_results) - - return pair_scan_results - -def process_rqtl_perm_results(num_perm, results): - perm_vals = [] - for line in str(results).split("\n")[1:(num_perm+1)]: - #print("R/qtl permutation line:", line.split()) - perm_vals.append(float(line.split()[1])) - - perm_output = perm_vals - suggestive = np.percentile(np.array(perm_vals), 67) - significant = np.percentile(np.array(perm_vals), 95) - print("SIGNIFICANT:", significant) - - return perm_output, suggestive, significant - -def process_rqtl_results(result): # TODO: how to make this a one liner and not copy the stuff in a loop - qtl_results = [] - - output = [tuple([result[j][i] for j in range(result.ncol)]) for i in range(result.nrow)] - #print("R/qtl scanone output:", output) - - for i, line in enumerate(result.iter_row()): - marker = {} - marker['name'] = result.rownames[i] - marker['chr'] = output[i][0] - marker['Mb'] = output[i][1] - marker['lod_score'] = output[i][2] - qtl_results.append(marker) - - return qtl_results \ No newline at end of file +import rpy2.robjects as ro +import numpy as np + +from base.webqtlConfig import TMPDIR +from utility import webqtlUtil +from utility.tools import locate, TEMPDIR + +def run_rqtl_geno(vals, dataset, method, model, permCheck, num_perm, do_control, control_marker, manhattan_plot, pair_scan): + geno_to_rqtl_function(dataset) + + ## Get pointers to some common R functions + r_library = ro.r["library"] # Map the library function + r_c = ro.r["c"] # Map the c function + r_sum = ro.r["sum"] # Map the sum function + plot = ro.r["plot"] # Map the plot function + postscript = ro.r["postscript"] # Map the postscript function + png = ro.r["png"] # Map the png function + dev_off = ro.r["dev.off"] # Map the device off function + + print(r_library("qtl")) # Load R/qtl + + ## Get pointers to some R/qtl functions + scanone = ro.r["scanone"] # Map the scanone function + scantwo = ro.r["scantwo"] # Map the scantwo function + calc_genoprob = ro.r["calc.genoprob"] # Map the calc.genoprob function + read_cross = ro.r["read.cross"] # Map the read.cross function + write_cross = ro.r["write.cross"] # Map the write.cross function + GENOtoCSVR = ro.r["GENOtoCSVR"] # Map the local GENOtoCSVR function + + crossname = dataset.group.name + genofilelocation = locate(crossname + ".geno", "genotype") + crossfilelocation = TMPDIR + crossname + ".cross" + + #print("Conversion of geno to cross at location:", genofilelocation, " to ", crossfilelocation) + + cross_object = GENOtoCSVR(genofilelocation, crossfilelocation) # TODO: Add the SEX if that is available + + if manhattan_plot: + cross_object = calc_genoprob(cross_object) + else: + cross_object = calc_genoprob(cross_object, step=1, stepwidth="max") + + cross_object = add_phenotype(cross_object, sanitize_rqtl_phenotype(vals)) # Add the phenotype + + # for debug: write_cross(cross_object, "csvr", "test.csvr") + + # Scan for QTLs + covar = create_covariates(control_marker, cross_object) # Create the additive covariate matrix + + if pair_scan: + if do_control == "true": # If sum(covar) > 0 we have a covariate matrix + print("Using covariate"); result_data_frame = scantwo(cross_object, pheno = "the_pheno", addcovar = covar, model=model, method=method, n_cluster = 16) + else: + print("No covariates"); result_data_frame = scantwo(cross_object, pheno = "the_pheno", model=model, method=method, n_cluster = 16) + + #print("Pair scan results:", result_data_frame) + + pair_scan_filename = webqtlUtil.genRandStr("scantwo_") + ".png" + png(file=TEMPDIR+pair_scan_filename) + plot(result_data_frame) + dev_off() + + return process_pair_scan_results(result_data_frame) + else: + if do_control == "true": + print("Using covariate"); result_data_frame = scanone(cross_object, pheno = "the_pheno", addcovar = covar, model=model, method=method) + else: + print("No covariates"); result_data_frame = scanone(cross_object, pheno = "the_pheno", model=model, method=method) + + if num_perm > 0 and permCheck == "ON": # Do permutation (if requested by user) + if do_control == "true": + perm_data_frame = scanone(cross_object, pheno_col = "the_pheno", addcovar = covar, n_perm = num_perm, model=model, method=method) + else: + perm_data_frame = scanone(cross_object, pheno_col = "the_pheno", n_perm = num_perm, model=model, method=method) + + perm_output, suggestive, significant = process_rqtl_perm_results(num_perm, perm_data_frame) # Functions that sets the thresholds for the webinterface + return perm_output, suggestive, significant, process_rqtl_results(result_data_frame) + else: + return process_rqtl_results(result_data_frame) + +def geno_to_rqtl_function(dataset): # TODO: Need to figure out why some genofiles have the wrong format and don't convert properly + + ro.r(""" + trim <- function( x ) { gsub("(^[[:space:]]+|[[:space:]]+$)", "", x) } + + getGenoCode <- function(header, name = 'unk'){ + mat = which(unlist(lapply(header,function(x){ length(grep(paste('@',name,sep=''), x)) })) == 1) + return(trim(strsplit(header[mat],':')[[1]][2])) + } + + GENOtoCSVR <- function(genotypes = '%s', out = 'cross.csvr', phenotype = NULL, sex = NULL, verbose = FALSE){ + header = readLines(genotypes, 40) # Assume a geno header is not longer than 40 lines + toskip = which(unlist(lapply(header, function(x){ length(grep("Chr\t", x)) })) == 1)-1 # Major hack to skip the geno headers + + genocodes <- c(getGenoCode(header, 'mat'), getGenoCode(header, 'het'), getGenoCode(header, 'pat')) # Get the genotype codes + type <- getGenoCode(header, 'type') + genodata <- read.csv(genotypes, sep='\t', skip=toskip, header=TRUE, na.strings=getGenoCode(header,'unk'), colClasses='character', comment.char = '#') + cat('Genodata:', toskip, " ", dim(genodata), genocodes, '\n') + if(is.null(phenotype)) phenotype <- runif((ncol(genodata)-4)) # If there isn't a phenotype, generate a random one + if(is.null(sex)) sex <- rep('m', (ncol(genodata)-4)) # If there isn't a sex phenotype, treat all as males + outCSVR <- rbind(c('Pheno', '', '', phenotype), # Phenotype + c('sex', '', '', sex), # Sex phenotype for the mice + cbind(genodata[,c('Locus','Chr', 'cM')], genodata[, 5:ncol(genodata)])) # Genotypes + write.table(outCSVR, file = out, row.names=FALSE, col.names=FALSE,quote=FALSE, sep=',') # Save it to a file + require(qtl) + cross = read.cross(file=out, 'csvr', genotypes=genocodes) # Load the created cross file using R/qtl read.cross + if(type == 'riset') cross <- convert2riself(cross) # If its a RIL, convert to a RIL in R/qtl + return(cross) + } + """ % (dataset.group.name + ".geno")) + +def add_phenotype(cross, pheno_as_string): + ro.globalenv["the_cross"] = cross + ro.r('the_cross$pheno <- cbind(pull.pheno(the_cross), the_pheno = '+ pheno_as_string +')') + return ro.r["the_cross"] + +def create_covariates(control_marker, cross): + ro.globalenv["the_cross"] = cross + ro.r('genotypes <- pull.geno(the_cross)') # Get the genotype matrix + userinputS = control_marker.replace(" ", "").split(",") # TODO: sanitize user input, Never Ever trust a user + covariate_names = ', '.join('"{0}"'.format(w) for w in userinputS) + #print("Marker names of selected covariates:", covariate_names) + ro.r('covnames <- c(' + covariate_names + ')') + ro.r('covInGeno <- which(covnames %in% colnames(genotypes))') + ro.r('covnames <- covnames[covInGeno]') + ro.r("cat('covnames (purged): ', covnames,'\n')") + ro.r('covariates <- genotypes[,covnames]') # Get the covariate matrix by using the marker name as index to the genotype file + #print("R/qtl matrix of covariates:", ro.r["covariates"]) + return ro.r["covariates"] + +def sanitize_rqtl_phenotype(vals): + pheno_as_string = "c(" + for i, val in enumerate(vals): + if val == "x": + if i < (len(vals) - 1): + pheno_as_string += "NA," + else: + pheno_as_string += "NA" + else: + if i < (len(vals) - 1): + pheno_as_string += str(val) + "," + else: + pheno_as_string += str(val) + pheno_as_string += ")" + return pheno_as_string + +def process_pair_scan_results(result): + pair_scan_results = [] + + result = result[1] + output = [tuple([result[j][i] for j in range(result.ncol)]) for i in range(result.nrow)] + #print("R/qtl scantwo output:", output) + + for i, line in enumerate(result.iter_row()): + marker = {} + marker['name'] = result.rownames[i] + marker['chr1'] = output[i][0] + marker['Mb'] = output[i][1] + marker['chr2'] = int(output[i][2]) + pair_scan_results.append(marker) + + return pair_scan_results + +def process_rqtl_perm_results(num_perm, results): + perm_vals = [] + for line in str(results).split("\n")[1:(num_perm+1)]: + #print("R/qtl permutation line:", line.split()) + perm_vals.append(float(line.split()[1])) + + perm_output = perm_vals + suggestive = np.percentile(np.array(perm_vals), 67) + significant = np.percentile(np.array(perm_vals), 95) + + return perm_output, suggestive, significant + +def process_rqtl_results(result): # TODO: how to make this a one liner and not copy the stuff in a loop + qtl_results = [] + + output = [tuple([result[j][i] for j in range(result.ncol)]) for i in range(result.nrow)] + #print("R/qtl scanone output:", output) + + for i, line in enumerate(result.iter_row()): + marker = {} + marker['name'] = result.rownames[i] + marker['chr'] = output[i][0] + marker['Mb'] = output[i][1] + marker['lod_score'] = output[i][2] + qtl_results.append(marker) + + return qtl_results + \ No newline at end of file -- cgit v1.2.3