Moved plink code out of marker_regression, though I don't think plink is working right now

Ran some end of line conversion on rqtl_mapping because there were a few ^M's
author: zsloan 2016-10-12 16:52:29 +0000
committer: zsloan 2016-10-12 16:52:29 +0000
commit: 1cdd044b2acba69058aaf7b0719eef5d1a727cfc (patch)
tree: 276691d34798512cc8b6c29491b45d3acede4a06 /wqflask
parent: 22339cb85c5db370ee6a2c247de90e603cdf85a5 (diff)
download: genenetwork2-1cdd044b2acba69058aaf7b0719eef5d1a727cfc.tar.gz
3 files changed, 363 insertions, 349 deletions
diff --git a/wqflask/wqflask/marker_regression/marker_regression.py b/wqflask/wqflask/marker_regression/marker_regression.py
index 7ad5d25a..fbd9a816 100644
--- a/wqflask/wqflask/marker_regression/marker_regression.py
+++ b/wqflask/wqflask/marker_regression/marker_regression.py
@@ -35,7 +35,7 @@ from utility import helper_functions
 from utility import Plot, Bunch
 from utility import temp_data
 from utility.benchmark import Bench
-from wqflask.marker_regression import gemma_mapping, rqtl_mapping, qtlreaper_mapping
+from wqflask.marker_regression import gemma_mapping, rqtl_mapping, qtlreaper_mapping, plink_mapping
 
 from utility.tools import locate, locate_ignore_error, PYLMM_COMMAND, GEMMA_COMMAND, PLINK_COMMAND, TEMPDIR
 from utility.external import shell
@@ -212,7 +212,8 @@ class MarkerRegression(object):
                                                                                                                                                         self.control_marker,
                                                                                                                                                         self.manhattan_plot)
         elif self.mapping_method == "plink":
-            results = self.run_plink()
+            results = plink_mapping.run_plink(self.this_trait, self.dataset, self.species, self.vals, self.maf)
+            #results = self.run_plink()
         elif self.mapping_method == "pylmm":
             logger.debug("RUNNING PYLMM")
             if self.num_perm > 0:
@@ -314,7 +315,7 @@ class MarkerRegression(object):
 
         output_filename = webqtlUtil.genRandStr("%s_%s_"%(self.dataset.group.name, self.this_trait.name))
 
-        self.gen_pheno_txt_file_plink(pheno_filename = output_filename)
+        plink_mapping.gen_pheno_txt_file_plink(self.this_trait, self.dataset, self.vals, pheno_filename = output_filename)
 
         rqtl_command = './plink --noweb --ped %s.ped --no-fid --no-parents --no-sex --no-pheno --map %s.map --pheno %s/%s.txt --pheno-name %s --maf %s --missing-phenotype -9999 --out %s%s --assoc ' % (self.dataset.group.name, self.dataset.group.name, TMPDIR, plink_output_filename, self.this_trait.name, self.maf, TMPDIR, plink_output_filename)
 
@@ -322,159 +323,6 @@ class MarkerRegression(object):
 
         count, p_values = self.parse_rqtl_output(plink_output_filename)
 
-    def run_plink(self):
-        plink_output_filename = webqtlUtil.genRandStr("%s_%s_"%(self.dataset.group.name, self.this_trait.name))
-
-        self.gen_pheno_txt_file_plink(pheno_filename = plink_output_filename)
-
-        plink_command = PLINK_COMMAND + ' --noweb --ped %s/%s.ped --no-fid --no-parents --no-sex --no-pheno --map %s/%s.map --pheno %s%s.txt --pheno-name %s --maf %s --missing-phenotype -9999 --out %s%s --assoc ' % (PLINK_PATH, self.dataset.group.name, PLINK_PATH, self.dataset.group.name, TMPDIR, plink_output_filename, self.this_trait.name, self.maf, TMPDIR, plink_output_filename)
-        logger.debug("plink_command:", plink_command)
-
-        os.system(plink_command)
-
-        count, p_values = self.parse_plink_output(plink_output_filename)
-
-        #for marker in self.dataset.group.markers.markers:
-        #    if marker['name'] not in included_markers:
-        #        logger.debug("marker:", marker)
-        #        self.dataset.group.markers.markers.remove(marker)
-        #        #del self.dataset.group.markers.markers[marker]
-
-        logger.debug("p_values:", pf(p_values))
-
-        self.dataset.group.markers.add_pvalues(p_values)
-
-        return self.dataset.group.markers.markers
-
-
-    def gen_pheno_txt_file_plink(self, pheno_filename = ''):
-        ped_sample_list = self.get_samples_from_ped_file()
-        output_file = open("%s%s.txt" % (TMPDIR, pheno_filename), "wb")
-        header = 'FID\tIID\t%s\n' % self.this_trait.name
-        output_file.write(header)
-
-        new_value_list = []
-
-        #if valueDict does not include some strain, value will be set to -9999 as missing value
-        for i, sample in enumerate(ped_sample_list):
-            try:
-                value = self.vals[i]
-                value = str(value).replace('value=','')
-                value = value.strip()
-            except:
-                value = -9999
-
-            new_value_list.append(value)
-
-
-        new_line = ''
-        for i, sample in enumerate(ped_sample_list):
-            j = i+1
-            value = new_value_list[i]
-            new_line += '%s\t%s\t%s\n'%(sample, sample, value)
-
-            if j%1000 == 0:
-                output_file.write(newLine)
-                new_line = ''
-
-        if new_line:
-            output_file.write(new_line)
-
-        output_file.close()
-
-    # get strain name from ped file in order
-    def get_samples_from_ped_file(self):
-        ped_file= open("{}/{}.ped".format(PLINK_PATH, self.dataset.group.name),"r")
-        line = ped_file.readline()
-        sample_list=[]
-
-        while line:
-            lineList = string.split(string.strip(line), '\t')
-            lineList = map(string.strip, lineList)
-
-            sample_name = lineList[0]
-            sample_list.append(sample_name)
-
-            line = ped_file.readline()
-
-        return sample_list
-
-    def parse_plink_output(self, output_filename):
-        plink_results={}
-
-        threshold_p_value = 0.01
-
-        result_fp = open("%s%s.qassoc"% (TMPDIR, output_filename), "rb")
-
-        header_line = result_fp.readline()# read header line
-        line = result_fp.readline()
-
-        value_list = [] # initialize value list, this list will include snp, bp and pvalue info
-        p_value_dict = {}
-        count = 0
-
-        while line:
-            #convert line from str to list
-            line_list = self.build_line_list(line=line)
-
-            # only keep the records whose chromosome name is in db
-            if self.species.chromosomes.chromosomes.has_key(int(line_list[0])) and line_list[-1] and line_list[-1].strip()!='NA':
-
-                chr_name = self.species.chromosomes.chromosomes[int(line_list[0])]
-                snp = line_list[1]
-                BP = line_list[2]
-                p_value = float(line_list[-1])
-                if threshold_p_value >= 0 and threshold_p_value <= 1:
-                    if p_value < threshold_p_value:
-                        p_value_dict[snp] = float(p_value)
-
-                if plink_results.has_key(chr_name):
-                    value_list = plink_results[chr_name]
-
-                    # pvalue range is [0,1]
-                    if threshold_p_value >=0 and threshold_p_value <= 1:
-                        if p_value < threshold_p_value:
-                            value_list.append((snp, BP, p_value))
-                            count += 1
-
-                    plink_results[chr_name] = value_list
-                    value_list = []
-                else:
-                    if threshold_p_value >= 0 and threshold_p_value <= 1:
-                        if p_value < threshold_p_value:
-                            value_list.append((snp, BP, p_value))
-                            count += 1
-
-                    if value_list:
-                        plink_results[chr_name] = value_list
-
-                    value_list=[]
-
-                line = result_fp.readline()
-            else:
-                line = result_fp.readline()
-
-        #if p_value_list:
-        #    min_p_value = min(p_value_list)
-        #else:
-        #    min_p_value = 0
-
-        return count, p_value_dict
-
-    ######################################################
-    # input: line: str,one line read from file
-    # function: convert line from str to list;
-    # output: lineList list
-    #######################################################
-    def build_line_list(self, line=None):
-
-        line_list = string.split(string.strip(line),' ')# irregular number of whitespaces between columns
-        line_list = [item for item in line_list if item <>'']
-        line_list = map(string.strip, line_list)
-
-        return line_list
-
-
     def run_permutations(self, temp_uuid):
         """Runs permutations and gets significant and suggestive LOD scores"""
 
diff --git a/wqflask/wqflask/marker_regression/plink_mapping.py b/wqflask/wqflask/marker_regression/plink_mapping.py
new file mode 100644
index 00000000..c80dc700
--- /dev/null
+++ b/wqflask/wqflask/marker_regression/plink_mapping.py
@@ -0,0 +1,168 @@
+import string
+import os
+
+from base.webqtlConfig import TMPDIR
+from utility import webqtlUtil
+from utility.tools import PLINK_COMMAND
+
+import utility.logger
+logger = utility.logger.getLogger(__name__ )
+
+def run_plink(this_trait, dataset, species, vals, maf):
+    plink_output_filename = webqtlUtil.genRandStr("%s_%s_"%(dataset.group.name, this_trait.name))
+
+    gen_pheno_txt_file_plink(this_trait, dataset, vals, pheno_filename = plink_output_filename)
+
+    plink_command = PLINK_COMMAND + ' --noweb --ped %s/%s.ped --no-fid --no-parents --no-sex --no-pheno --map %s/%s.map --pheno %s%s.txt --pheno-name %s --maf %s --missing-phenotype -9999 --out %s%s --assoc ' % (PLINK_PATH, 
+                                                                                                                                                                                                                    dataset.group.name, 
+                                                                                                                                                                                                                    PLINK_PATH, 
+                                                                                                                                                                                                                    dataset.group.name, 
+                                                                                                                                                                                                                    TMPDIR, 
+                                                                                                                                                                                                                    plink_output_filename, 
+                                                                                                                                                                                                                    this_trait.name, 
+                                                                                                                                                                                                                    maf, 
+                                                                                                                                                                                                                    TMPDIR, 
+                                                                                                                                                                                                                    plink_output_filename)
+    logger.debug("plink_command:", plink_command)
+
+    os.system(plink_command)
+
+    count, p_values = parse_plink_output(plink_output_filename, species)
+
+    #for marker in self.dataset.group.markers.markers:
+    #    if marker['name'] not in included_markers:
+    #        logger.debug("marker:", marker)
+    #        self.dataset.group.markers.markers.remove(marker)
+    #        #del self.dataset.group.markers.markers[marker]
+
+    logger.debug("p_values:", pf(p_values))
+
+    dataset.group.markers.add_pvalues(p_values)
+
+    return dataset.group.markers.markers
+
+
+def gen_pheno_txt_file_plink(this_trait, dataset, vals, pheno_filename = ''):
+    ped_sample_list = get_samples_from_ped_file(dataset)
+    output_file = open("%s%s.txt" % (TMPDIR, pheno_filename), "wb")
+    header = 'FID\tIID\t%s\n' % this_trait.name
+    output_file.write(header)
+
+    new_value_list = []
+
+    #if valueDict does not include some strain, value will be set to -9999 as missing value
+    for i, sample in enumerate(ped_sample_list):
+        try:
+            value = vals[i]
+            value = str(value).replace('value=','')
+            value = value.strip()
+        except:
+            value = -9999
+
+        new_value_list.append(value)
+
+    new_line = ''
+    for i, sample in enumerate(ped_sample_list):
+        j = i+1
+        value = new_value_list[i]
+        new_line += '%s\t%s\t%s\n'%(sample, sample, value)
+
+        if j%1000 == 0:
+            output_file.write(newLine)
+            new_line = ''
+
+    if new_line:
+        output_file.write(new_line)
+
+    output_file.close() 
+    
+# get strain name from ped file in order
+def get_samples_from_ped_file(dataset):
+    ped_file= open("{}/{}.ped".format(PLINK_PATH, dataset.group.name),"r")
+    line = ped_file.readline()
+    sample_list=[]
+
+    while line:
+        lineList = string.split(string.strip(line), '\t')
+        lineList = map(string.strip, lineList)
+
+        sample_name = lineList[0]
+        sample_list.append(sample_name)
+
+        line = ped_file.readline()
+
+    return sample_list
+    
+def parse_plink_output(output_filename, species):
+    plink_results={}
+
+    threshold_p_value = 0.01
+
+    result_fp = open("%s%s.qassoc"% (TMPDIR, output_filename), "rb")
+
+    header_line = result_fp.readline()# read header line
+    line = result_fp.readline()
+
+    value_list = [] # initialize value list, this list will include snp, bp and pvalue info
+    p_value_dict = {}
+    count = 0
+
+    while line:
+        #convert line from str to list
+        line_list = build_line_list(line=line)
+
+        # only keep the records whose chromosome name is in db
+        if species.chromosomes.chromosomes.has_key(int(line_list[0])) and line_list[-1] and line_list[-1].strip()!='NA':
+
+            chr_name = species.chromosomes.chromosomes[int(line_list[0])]
+            snp = line_list[1]
+            BP = line_list[2]
+            p_value = float(line_list[-1])
+            if threshold_p_value >= 0 and threshold_p_value <= 1:
+                if p_value < threshold_p_value:
+                    p_value_dict[snp] = float(p_value)
+
+            if plink_results.has_key(chr_name):
+                value_list = plink_results[chr_name]
+
+                # pvalue range is [0,1]
+                if threshold_p_value >=0 and threshold_p_value <= 1:
+                    if p_value < threshold_p_value:
+                        value_list.append((snp, BP, p_value))
+                        count += 1
+
+                plink_results[chr_name] = value_list
+                value_list = []
+            else:
+                if threshold_p_value >= 0 and threshold_p_value <= 1:
+                    if p_value < threshold_p_value:
+                        value_list.append((snp, BP, p_value))
+                        count += 1
+
+                if value_list:
+                    plink_results[chr_name] = value_list
+
+                value_list=[]
+
+            line = result_fp.readline()
+        else:
+            line = result_fp.readline()
+
+    #if p_value_list:
+    #    min_p_value = min(p_value_list)
+    #else:
+    #    min_p_value = 0
+
+    return count, p_value_dict
+    
+######################################################
+# input: line: str,one line read from file
+# function: convert line from str to list;
+# output: lineList list
+#######################################################
+def build_line_list(line=None):
+    line_list = string.split(string.strip(line),' ')# irregular number of whitespaces between columns
+    line_list = [item for item in line_list if item <>'']
+    line_list = map(string.strip, line_list)
+
+    return line_list
\ No newline at end of file
diff --git a/wqflask/wqflask/marker_regression/rqtl_mapping.py b/wqflask/wqflask/marker_regression/rqtl_mapping.py
index 93bf717c..afd28563 100644
--- a/wqflask/wqflask/marker_regression/rqtl_mapping.py
+++ b/wqflask/wqflask/marker_regression/rqtl_mapping.py
@@ -1,193 +1,191 @@
-import rpy2.robjects as ro
-import numpy as np
-
-from base.webqtlConfig import TMPDIR
-from utility import webqtlUtil
-from utility.tools import locate, TEMPDIR
-
-def run_rqtl_geno(vals, dataset, method, model, permCheck, num_perm, do_control, control_marker, manhattan_plot, pair_scan):
-    geno_to_rqtl_function(dataset)
-
-    ## Get pointers to some common R functions
-    r_library     = ro.r["library"]                 # Map the library function
-    r_c           = ro.r["c"]                       # Map the c function
-    r_sum         = ro.r["sum"]                     # Map the sum function
-    plot          = ro.r["plot"]                    # Map the plot function
-    postscript    = ro.r["postscript"]              # Map the postscript function
-    png           = ro.r["png"]                     # Map the png function
-    dev_off       = ro.r["dev.off"]                 # Map the device off function
-
-    print(r_library("qtl"))                         # Load R/qtl
-
-    ## Get pointers to some R/qtl functions
-    scanone         = ro.r["scanone"]               # Map the scanone function
-    scantwo         = ro.r["scantwo"]               # Map the scantwo function
-    calc_genoprob   = ro.r["calc.genoprob"]         # Map the calc.genoprob function
-    read_cross      = ro.r["read.cross"]            # Map the read.cross function
-    write_cross     = ro.r["write.cross"]           # Map the write.cross function
-    GENOtoCSVR      = ro.r["GENOtoCSVR"]            # Map the local GENOtoCSVR function
-
-    crossname = dataset.group.name
-    genofilelocation  = locate(crossname + ".geno", "genotype")
-    crossfilelocation = TMPDIR + crossname + ".cross"
-
-    #print("Conversion of geno to cross at location:", genofilelocation, " to ", crossfilelocation)
-
-    cross_object = GENOtoCSVR(genofilelocation, crossfilelocation)                                  # TODO: Add the SEX if that is available
-
-    if manhattan_plot:
-        cross_object = calc_genoprob(cross_object)
-    else:
-        cross_object = calc_genoprob(cross_object, step=1, stepwidth="max")
-
-    cross_object = add_phenotype(cross_object, sanitize_rqtl_phenotype(vals))                 # Add the phenotype
-
-    # for debug: write_cross(cross_object, "csvr", "test.csvr")
-
-    # Scan for QTLs
-    covar = create_covariates(control_marker, cross_object)                                                    # Create the additive covariate matrix
-
-    if pair_scan:
-        if do_control == "true":                                                # If sum(covar) > 0 we have a covariate matrix
-            print("Using covariate"); result_data_frame = scantwo(cross_object, pheno = "the_pheno", addcovar = covar, model=model, method=method, n_cluster = 16)
-        else:
-            print("No covariates"); result_data_frame = scantwo(cross_object, pheno = "the_pheno", model=model, method=method, n_cluster = 16)
-
-        #print("Pair scan results:", result_data_frame)
-
-        pair_scan_filename = webqtlUtil.genRandStr("scantwo_") + ".png"
-        png(file=TEMPDIR+pair_scan_filename)
-        plot(result_data_frame)
-        dev_off()
-
-        return process_pair_scan_results(result_data_frame)
-    else:
-        if do_control == "true":
-            print("Using covariate"); result_data_frame = scanone(cross_object, pheno = "the_pheno", addcovar = covar, model=model, method=method)
-        else:
-            print("No covariates"); result_data_frame = scanone(cross_object, pheno = "the_pheno", model=model, method=method)
-
-        if num_perm > 0 and permCheck == "ON":                                                                   # Do permutation (if requested by user)
-            if do_control == "true":
-                perm_data_frame = scanone(cross_object, pheno_col = "the_pheno", addcovar = covar, n_perm = num_perm, model=model, method=method)
-            else:
-                perm_data_frame = scanone(cross_object, pheno_col = "the_pheno", n_perm = num_perm, model=model, method=method)
-
-            perm_output, suggestive, significant = process_rqtl_perm_results(num_perm, perm_data_frame)          # Functions that sets the thresholds for the webinterface
-            return perm_output, suggestive, significant, process_rqtl_results(result_data_frame)
-        else:
-            return process_rqtl_results(result_data_frame)
-
-def geno_to_rqtl_function(dataset):        # TODO: Need to figure out why some genofiles have the wrong format and don't convert properly
-
-    ro.r("""
-       trim <- function( x ) { gsub("(^[[:space:]]+|[[:space:]]+$)", "", x) }
-
-       getGenoCode <- function(header, name = 'unk'){
-         mat = which(unlist(lapply(header,function(x){ length(grep(paste('@',name,sep=''), x)) })) == 1)
-         return(trim(strsplit(header[mat],':')[[1]][2]))
-       }
-
-       GENOtoCSVR <- function(genotypes = '%s', out = 'cross.csvr', phenotype = NULL, sex = NULL, verbose = FALSE){
-         header = readLines(genotypes, 40)                                                                                 # Assume a geno header is not longer than 40 lines
-         toskip = which(unlist(lapply(header, function(x){ length(grep("Chr\t", x)) })) == 1)-1                            # Major hack to skip the geno headers
-
-         genocodes <- c(getGenoCode(header, 'mat'), getGenoCode(header, 'het'), getGenoCode(header, 'pat'))                # Get the genotype codes
-         type <- getGenoCode(header, 'type')
-         genodata <- read.csv(genotypes, sep='\t', skip=toskip, header=TRUE, na.strings=getGenoCode(header,'unk'), colClasses='character', comment.char = '#')
-         cat('Genodata:', toskip, " ", dim(genodata), genocodes, '\n')
-         if(is.null(phenotype)) phenotype <- runif((ncol(genodata)-4))                                                     # If there isn't a phenotype, generate a random one
-         if(is.null(sex)) sex <- rep('m', (ncol(genodata)-4))                                                              # If there isn't a sex phenotype, treat all as males
-         outCSVR <- rbind(c('Pheno', '', '', phenotype),                                                                   # Phenotype
-                          c('sex', '', '', sex),                                                                           # Sex phenotype for the mice
-                          cbind(genodata[,c('Locus','Chr', 'cM')], genodata[, 5:ncol(genodata)]))                          # Genotypes
-         write.table(outCSVR, file = out, row.names=FALSE, col.names=FALSE,quote=FALSE, sep=',')                           # Save it to a file
-         require(qtl)
-         cross = read.cross(file=out, 'csvr', genotypes=genocodes)                                                         # Load the created cross file using R/qtl read.cross
-         if(type == 'riset') cross <- convert2riself(cross)                                                                # If its a RIL, convert to a RIL in R/qtl
-         return(cross)
-      }
-    """ % (dataset.group.name + ".geno"))
-        
-def add_phenotype(cross, pheno_as_string):
-    ro.globalenv["the_cross"] = cross
-    ro.r('the_cross$pheno <- cbind(pull.pheno(the_cross), the_pheno = '+ pheno_as_string +')')
-    return ro.r["the_cross"]
-
-def create_covariates(control_marker, cross):
-    ro.globalenv["the_cross"] = cross
-    ro.r('genotypes <- pull.geno(the_cross)')                             # Get the genotype matrix
-    userinputS = control_marker.replace(" ", "").split(",")                 # TODO: sanitize user input, Never Ever trust a user
-    covariate_names = ', '.join('"{0}"'.format(w) for w in userinputS)
-    #print("Marker names of selected covariates:", covariate_names)
-    ro.r('covnames <- c(' + covariate_names + ')')
-    ro.r('covInGeno <- which(covnames %in% colnames(genotypes))')
-    ro.r('covnames <- covnames[covInGeno]')
-    ro.r("cat('covnames (purged): ', covnames,'\n')")
-    ro.r('covariates <- genotypes[,covnames]')                            # Get the covariate matrix by using the marker name as index to the genotype file
-    #print("R/qtl matrix of covariates:", ro.r["covariates"])
-    return ro.r["covariates"]
-
-def sanitize_rqtl_phenotype(vals):
-    pheno_as_string = "c("
-    for i, val in enumerate(vals):
-        if val == "x":
-            if i < (len(vals) - 1):
-                pheno_as_string +=  "NA,"
-            else:
-                pheno_as_string += "NA"
-        else:
-            if i < (len(vals) - 1):
-                pheno_as_string += str(val) + ","
-            else:
-                pheno_as_string += str(val)
-    pheno_as_string += ")"
-    return pheno_as_string
-
-def process_pair_scan_results(result):
-    pair_scan_results = []
-
-    result = result[1]
-    output = [tuple([result[j][i] for j in range(result.ncol)]) for i in range(result.nrow)]
-    #print("R/qtl scantwo output:", output)
-
-    for i, line in enumerate(result.iter_row()):
-        marker = {}
-        marker['name'] = result.rownames[i]
-        marker['chr1'] = output[i][0]
-        marker['Mb'] = output[i][1]
-        marker['chr2'] = int(output[i][2])
-        pair_scan_results.append(marker)
-
-    #print("pair_scan_results:", pair_scan_results)
-
-    return pair_scan_results
-
-def process_rqtl_perm_results(num_perm, results):
-    perm_vals = []
-    for line in str(results).split("\n")[1:(num_perm+1)]:
-        #print("R/qtl permutation line:", line.split())
-        perm_vals.append(float(line.split()[1]))
-
-    perm_output = perm_vals
-    suggestive = np.percentile(np.array(perm_vals), 67)
-    significant = np.percentile(np.array(perm_vals), 95)
-    print("SIGNIFICANT:", significant)
-
-    return perm_output, suggestive, significant
-    
-def process_rqtl_results(result):        # TODO: how to make this a one liner and not copy the stuff in a loop
-    qtl_results = []
-
-    output = [tuple([result[j][i] for j in range(result.ncol)]) for i in range(result.nrow)]
-    #print("R/qtl scanone output:", output)
-
-    for i, line in enumerate(result.iter_row()):
-        marker = {}
-        marker['name'] = result.rownames[i]
-        marker['chr'] = output[i][0]
-        marker['Mb'] = output[i][1]
-        marker['lod_score'] = output[i][2]
-        qtl_results.append(marker)
-
-    return qtl_results 
\ No newline at end of file
+import rpy2.robjects as ro
+import numpy as np
+
+from base.webqtlConfig import TMPDIR
+from utility import webqtlUtil
+from utility.tools import locate, TEMPDIR
+
+def run_rqtl_geno(vals, dataset, method, model, permCheck, num_perm, do_control, control_marker, manhattan_plot, pair_scan):
+    geno_to_rqtl_function(dataset)
+
+    ## Get pointers to some common R functions
+    r_library     = ro.r["library"]                 # Map the library function
+    r_c           = ro.r["c"]                       # Map the c function
+    r_sum         = ro.r["sum"]                     # Map the sum function
+    plot          = ro.r["plot"]                    # Map the plot function
+    postscript    = ro.r["postscript"]              # Map the postscript function
+    png           = ro.r["png"]                     # Map the png function
+    dev_off       = ro.r["dev.off"]                 # Map the device off function
+
+    print(r_library("qtl"))                         # Load R/qtl
+
+    ## Get pointers to some R/qtl functions
+    scanone         = ro.r["scanone"]               # Map the scanone function
+    scantwo         = ro.r["scantwo"]               # Map the scantwo function
+    calc_genoprob   = ro.r["calc.genoprob"]         # Map the calc.genoprob function
+    read_cross      = ro.r["read.cross"]            # Map the read.cross function
+    write_cross     = ro.r["write.cross"]           # Map the write.cross function
+    GENOtoCSVR      = ro.r["GENOtoCSVR"]            # Map the local GENOtoCSVR function
+
+    crossname = dataset.group.name
+    genofilelocation  = locate(crossname + ".geno", "genotype")
+    crossfilelocation = TMPDIR + crossname + ".cross"
+
+    #print("Conversion of geno to cross at location:", genofilelocation, " to ", crossfilelocation)
+
+    cross_object = GENOtoCSVR(genofilelocation, crossfilelocation)                                  # TODO: Add the SEX if that is available
+
+    if manhattan_plot:
+        cross_object = calc_genoprob(cross_object)
+    else:
+        cross_object = calc_genoprob(cross_object, step=1, stepwidth="max")
+
+    cross_object = add_phenotype(cross_object, sanitize_rqtl_phenotype(vals))                 # Add the phenotype
+
+    # for debug: write_cross(cross_object, "csvr", "test.csvr")
+
+    # Scan for QTLs
+    covar = create_covariates(control_marker, cross_object)                                                    # Create the additive covariate matrix
+
+    if pair_scan:
+        if do_control == "true":                                                # If sum(covar) > 0 we have a covariate matrix
+            print("Using covariate"); result_data_frame = scantwo(cross_object, pheno = "the_pheno", addcovar = covar, model=model, method=method, n_cluster = 16)
+        else:
+            print("No covariates"); result_data_frame = scantwo(cross_object, pheno = "the_pheno", model=model, method=method, n_cluster = 16)
+
+        #print("Pair scan results:", result_data_frame)
+
+        pair_scan_filename = webqtlUtil.genRandStr("scantwo_") + ".png"
+        png(file=TEMPDIR+pair_scan_filename)
+        plot(result_data_frame)
+        dev_off()
+
+        return process_pair_scan_results(result_data_frame)
+    else:
+        if do_control == "true":
+            print("Using covariate"); result_data_frame = scanone(cross_object, pheno = "the_pheno", addcovar = covar, model=model, method=method)
+        else:
+            print("No covariates"); result_data_frame = scanone(cross_object, pheno = "the_pheno", model=model, method=method)
+
+        if num_perm > 0 and permCheck == "ON":                                                                   # Do permutation (if requested by user)
+            if do_control == "true":
+                perm_data_frame = scanone(cross_object, pheno_col = "the_pheno", addcovar = covar, n_perm = num_perm, model=model, method=method)
+            else:
+                perm_data_frame = scanone(cross_object, pheno_col = "the_pheno", n_perm = num_perm, model=model, method=method)
+
+            perm_output, suggestive, significant = process_rqtl_perm_results(num_perm, perm_data_frame)          # Functions that sets the thresholds for the webinterface
+            return perm_output, suggestive, significant, process_rqtl_results(result_data_frame)
+        else:
+            return process_rqtl_results(result_data_frame)
+
+def geno_to_rqtl_function(dataset):        # TODO: Need to figure out why some genofiles have the wrong format and don't convert properly
+
+    ro.r("""
+       trim <- function( x ) { gsub("(^[[:space:]]+|[[:space:]]+$)", "", x) }
+
+       getGenoCode <- function(header, name = 'unk'){
+         mat = which(unlist(lapply(header,function(x){ length(grep(paste('@',name,sep=''), x)) })) == 1)
+         return(trim(strsplit(header[mat],':')[[1]][2]))
+       }
+
+       GENOtoCSVR <- function(genotypes = '%s', out = 'cross.csvr', phenotype = NULL, sex = NULL, verbose = FALSE){
+         header = readLines(genotypes, 40)                                                                                 # Assume a geno header is not longer than 40 lines
+         toskip = which(unlist(lapply(header, function(x){ length(grep("Chr\t", x)) })) == 1)-1                            # Major hack to skip the geno headers
+
+         genocodes <- c(getGenoCode(header, 'mat'), getGenoCode(header, 'het'), getGenoCode(header, 'pat'))                # Get the genotype codes
+         type <- getGenoCode(header, 'type')
+         genodata <- read.csv(genotypes, sep='\t', skip=toskip, header=TRUE, na.strings=getGenoCode(header,'unk'), colClasses='character', comment.char = '#')
+         cat('Genodata:', toskip, " ", dim(genodata), genocodes, '\n')
+         if(is.null(phenotype)) phenotype <- runif((ncol(genodata)-4))                                                     # If there isn't a phenotype, generate a random one
+         if(is.null(sex)) sex <- rep('m', (ncol(genodata)-4))                                                              # If there isn't a sex phenotype, treat all as males
+         outCSVR <- rbind(c('Pheno', '', '', phenotype),                                                                   # Phenotype
+                          c('sex', '', '', sex),                                                                           # Sex phenotype for the mice
+                          cbind(genodata[,c('Locus','Chr', 'cM')], genodata[, 5:ncol(genodata)]))                          # Genotypes
+         write.table(outCSVR, file = out, row.names=FALSE, col.names=FALSE,quote=FALSE, sep=',')                           # Save it to a file
+         require(qtl)
+         cross = read.cross(file=out, 'csvr', genotypes=genocodes)                                                         # Load the created cross file using R/qtl read.cross
+         if(type == 'riset') cross <- convert2riself(cross)                                                                # If its a RIL, convert to a RIL in R/qtl
+         return(cross)
+      }
+    """ % (dataset.group.name + ".geno"))
+        
+def add_phenotype(cross, pheno_as_string):
+    ro.globalenv["the_cross"] = cross
+    ro.r('the_cross$pheno <- cbind(pull.pheno(the_cross), the_pheno = '+ pheno_as_string +')')
+    return ro.r["the_cross"]
+
+def create_covariates(control_marker, cross):
+    ro.globalenv["the_cross"] = cross
+    ro.r('genotypes <- pull.geno(the_cross)')                             # Get the genotype matrix
+    userinputS = control_marker.replace(" ", "").split(",")                 # TODO: sanitize user input, Never Ever trust a user
+    covariate_names = ', '.join('"{0}"'.format(w) for w in userinputS)
+    #print("Marker names of selected covariates:", covariate_names)
+    ro.r('covnames <- c(' + covariate_names + ')')
+    ro.r('covInGeno <- which(covnames %in% colnames(genotypes))')
+    ro.r('covnames <- covnames[covInGeno]')
+    ro.r("cat('covnames (purged): ', covnames,'\n')")
+    ro.r('covariates <- genotypes[,covnames]')                            # Get the covariate matrix by using the marker name as index to the genotype file
+    #print("R/qtl matrix of covariates:", ro.r["covariates"])
+    return ro.r["covariates"]
+
+def sanitize_rqtl_phenotype(vals):
+    pheno_as_string = "c("
+    for i, val in enumerate(vals):
+        if val == "x":
+            if i < (len(vals) - 1):
+                pheno_as_string +=  "NA,"
+            else:
+                pheno_as_string += "NA"
+        else:
+            if i < (len(vals) - 1):
+                pheno_as_string += str(val) + ","
+            else:
+                pheno_as_string += str(val)
+    pheno_as_string += ")"
+    return pheno_as_string
+
+def process_pair_scan_results(result):
+    pair_scan_results = []
+
+    result = result[1]
+    output = [tuple([result[j][i] for j in range(result.ncol)]) for i in range(result.nrow)]
+    #print("R/qtl scantwo output:", output)
+
+    for i, line in enumerate(result.iter_row()):
+        marker = {}
+        marker['name'] = result.rownames[i]
+        marker['chr1'] = output[i][0]
+        marker['Mb'] = output[i][1]
+        marker['chr2'] = int(output[i][2])
+        pair_scan_results.append(marker)
+
+    return pair_scan_results
+
+def process_rqtl_perm_results(num_perm, results):
+    perm_vals = []
+    for line in str(results).split("\n")[1:(num_perm+1)]:
+        #print("R/qtl permutation line:", line.split())
+        perm_vals.append(float(line.split()[1]))
+
+    perm_output = perm_vals
+    suggestive = np.percentile(np.array(perm_vals), 67)
+    significant = np.percentile(np.array(perm_vals), 95)
+
+    return perm_output, suggestive, significant
+    
+def process_rqtl_results(result):        # TODO: how to make this a one liner and not copy the stuff in a loop
+    qtl_results = []
+
+    output = [tuple([result[j][i] for j in range(result.ncol)]) for i in range(result.nrow)]
+    #print("R/qtl scanone output:", output)
+
+    for i, line in enumerate(result.iter_row()):
+        marker = {}
+        marker['name'] = result.rownames[i]
+        marker['chr'] = output[i][0]
+        marker['Mb'] = output[i][1]
+        marker['lod_score'] = output[i][2]
+        qtl_results.append(marker)
+
+    return qtl_results 
+    
\ No newline at end of file
author	zsloan	2016-10-12 16:52:29 +0000
committer	zsloan	2016-10-12 16:52:29 +0000
commit	1cdd044b2acba69058aaf7b0719eef5d1a727cfc (patch)
tree	276691d34798512cc8b6c29491b45d3acede4a06 /wqflask
parent	22339cb85c5db370ee6a2c247de90e603cdf85a5 (diff)
download	genenetwork2-1cdd044b2acba69058aaf7b0719eef5d1a727cfc.tar.gz