From b815236123ff8e144bd84f349357a1852df95651 Mon Sep 17 00:00:00 2001
From: zsloan
Date: Thu, 23 Sep 2021 19:17:34 +0000
Subject: Fix covariates as described by Danny

---
 scripts/rqtl_wrapper.R | 53 +++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 48 insertions(+), 5 deletions(-)

(limited to 'scripts')

diff --git a/scripts/rqtl_wrapper.R b/scripts/rqtl_wrapper.R
index 7518175..20c0e06 100644
--- a/scripts/rqtl_wrapper.R
+++ b/scripts/rqtl_wrapper.R
@@ -9,6 +9,7 @@ option_list = list(
   make_option(c("-g", "--geno"), type="character", help=".geno file containing a dataset's genotypes"),
   make_option(c("-p", "--pheno"), type="character", help="File containing two columns - sample names and values"),
   make_option(c("-c", "--addcovar"), action="store_true", default=NULL, help="Use covariates (included as extra columns in the phenotype input file)"),
+  make_option(c("--covarstruct"), type="character", help="File detailing which covariates are categorical or numerical"),
   make_option(c("--model"), type="character", default="normal", help="Mapping Model - Normal or Non-Parametric"),
   make_option(c("--method"), type="character", default="hk", help="Mapping Method - hk (Haley Knott), ehk (Extended Haley Knott), mr (Marker Regression), em (Expectation-Maximization), imp (Imputation)"),
   make_option(c("-i", "--interval"), action="store_true", default=NULL, help="Use interval mapping"),
@@ -33,6 +34,20 @@ verbose_print <- function(...){
   }
 }
 
+adjustXprobs <- function(cross){
+  sex <- getsex(cross)$sex
+  pr <- cross$geno[["X"]]$prob
+  stopifnot(!is.null(pr), !is.null(sex))
+
+  for(i in 1:ncol(pr)) {
+      pr[sex==0,i,3:4] <- 0
+      pr[sex==1,i,1:2] <- 0
+      pr[,i,] <- pr[,i,]/rowSums(pr[,i,])
+  }
+  cross$geno[["X"]]$prob <- pr
+  invisible(cross)
+}
+
 if (is.null(opt$geno) || is.null(opt$pheno)){
   print_help(opt_parser)
   stop("Both a genotype and phenotype file must be provided.", call.=FALSE)
@@ -51,7 +66,7 @@ get_geno_code <- function(header, name = 'unk'){
   return(trim(strsplit(header[mat],':')[[1]][2]))
 }
 
-geno_to_csvr <- function(genotypes, trait_names, trait_vals, out, sex = NULL,
+geno_to_csvr <- function(genotypes, trait_names, trait_vals, out, type, sex = NULL,
                          mapping_scale = "Mb", verbose = FALSE){
   # Assume a geno header is not longer than 40 lines
   header = readLines(genotypes, 40)
@@ -148,8 +163,12 @@ for (i in 1:length(trait_names)) {
   trait_names[i] = paste("T_", this_trait, sep = "")
 }
 
+# Get type of genotypes, since it needs to be checked before calc.genoprob
+header = readLines(geno_file, 40)
+type <- get_geno_code(header, 'type')
+
 verbose_print('Generating cross object\n')
-cross_object = geno_to_csvr(geno_file, trait_names, trait_vals, cross_file)
+cross_object = geno_to_csvr(geno_file, trait_names, trait_vals, cross_file, type)
 
 # Calculate genotype probabilities
 if (!is.null(opt$interval)) {
@@ -160,16 +179,40 @@ if (!is.null(opt$interval)) {
   cross_object <- calc.genoprob(cross_object)
 }
 
+# If 4way, adjust X chromosome genotype probabilities
+if (type == "4-way") {
+  verbose_print('Adjusting genotype probabilities for 4way cross')
+  cross_object <- adjustXprobs(cross_object)
+}
+
 # Pull covariates out of cross object, if they exist
-covars = vector(mode = "list", length = length(trait_names) - 1)
+covars <- c() # Holds the covariates which should be passed to R/qtl
 if (!is.null(opt$addcovar)) {
-  #If perm strata are being used, it'll be included as the final column in the phenotype file
+  # If perm strata are being used, it'll be included as the final column in the phenotype file
   if (!is.null(opt$pstrata)) {
-    covar_names = trait_names[3:length(trait_names) - 1]
+    covar_names = trait_names[2:(length(trait_names)-1)]
   } else {
     covar_names = trait_names[2:length(trait_names)]
   }
   covars <- pull.pheno(cross_object, covar_names)
+  # Read in the covar description file
+  covarDescr <- read.table(opt$covarstruct, sep="\t", header=FALSE)
+  for(x in 1:nrow(covarDescr)){
+    cat(covarDescr[x, 1])
+    name <- paste0("T_", covarDescr[x, 1]) # The covar description file doesn't have T_ in trait names (the cross object does)
+    type <- covarDescr[x, 2]
+    if(type == "categorical"){
+      if(length(table(covars[,name])) > 2){ # More then 2 levels create the model matrix for the factor
+        mdata <- data.frame(toExpand = as.factor(covars[, name]))
+        options(na.action='na.pass')
+        modelmatrix <- model.matrix(~ toExpand + 0, mdata)[,-1]
+        covars <- cbind(covars, modelmatrix)
+      }else{ # 2 levels? just bind the trait as covar
+        verbose_print('Binding covars to covars\n')
+        covars <- cbind(covars, covars[,name])
+      }
+    }
+  }
 }
 
 # Pull permutation strata out of cross object, if it is being used
-- 
cgit 1.4.1


From baa801e81c5b0f7c7b30c6ca880a806a6bce54db Mon Sep 17 00:00:00 2001
From: zsloan
Date: Thu, 14 Oct 2021 18:03:50 +0000
Subject: Incorporated Danny's code for calculating QTL main effects into
 rqtl_wrapper.R

---
 scripts/rqtl_wrapper.R | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

(limited to 'scripts')

diff --git a/scripts/rqtl_wrapper.R b/scripts/rqtl_wrapper.R
index 20c0e06..523888f 100644
--- a/scripts/rqtl_wrapper.R
+++ b/scripts/rqtl_wrapper.R
@@ -269,4 +269,47 @@ if (!is.null(opt$addcovar) || !is.null(opt$control)){
   verbose_print('Running scanone\n')
   qtl_results = scanone(cross_object, pheno.col=1, model=opt$model, method=opt$method)
 }
+
+#QTL main effects on adjusted longevity
+getEffects <- function(sdata, gtsprob, marker = "1_24042124", model = "longevity ~ sex + site + cohort + treatment", trait = "longevity"){
+  rownames(sdata) <- 1:nrow(sdata)
+  rownames(gtsprob) <- 1:nrow(gtsprob)
+  mp <- gtsprob[, grep(marker, colnames(gtsprob))]
+  gts <- unlist(lapply(lapply(lapply(apply(mp,1,function(x){which(x > 0.85)}),names), strsplit, ":"), function(x){
+    if(length(x) > 0){ return(x[[1]][2]); }else{ return(NA) }
+  }))
+
+  ismissing <- which(apply(sdata, 1, function(x){any(is.na(x))}))
+  if(length(ismissing) > 0){
+    sdata <- sdata[-ismissing, ]
+    gts <- gts[-ismissing]
+  }
+
+  mlm <- lm(as.formula(model), data = sdata)
+  pheAdj <- rep(NA, nrow(sdata))
+  adj <- residuals(mlm) + mean(sdata[, trait])
+  pheAdj[as.numeric(names(adj))] <- adj
+  means <- c(mean(pheAdj[which(gts == "AC")],na.rm=TRUE),mean(pheAdj[which(gts == "AD")],na.rm=TRUE),mean(pheAdj[which(gts == "BC")],na.rm=TRUE),mean(pheAdj[which(gts == "BD")],na.rm=TRUE))
+  std <- function(x) sd(x,na.rm=TRUE)/sqrt(length(x))
+  stderrs <- c(std(pheAdj[which(gts == "AC")]),std(pheAdj[which(gts == "AD")]),std(pheAdj[which(gts == "BC")]),std(pheAdj[which(gts == "BD")]))
+  paste0(round(means,0), " ± ", round(stderrs,2))
+}
+
+if (type == "4-way") {
+  verbose_print("Get phenotype name + genoprob + all phenotypes + models for 4-way crosses")
+  traitname <- colnames(pull.pheno(cross_object))[1]
+  gtsp <- pull.genoprob(cross_object)
+  allpheno <- pull.pheno(cross_object)
+  model <- paste0(traitname, " ~ ", paste0(covar_names, sep="", collapse=" + "))
+
+  meffects <- c()
+  verbose_print("Getting QTL main effects for 4-way crosses")
+  for(marker in  rownames(qtl_results)){
+    meff <- getEffects(allpheno, gtsp, marker = marker, model, trait = traitname)
+    meffects <- rbind(meffects, meff)
+  }
+  qtl_results <- cbind(data.frame(qtl_results[,1:3]), meffects)
+  colnames(qtl_results)[4:7] <- c("AC", "AD", "BC", "BD")
+}
+
 write.csv(qtl_results, out_file)
-- 
cgit 1.4.1


From 77099cac68e8f4792bf54d8e1f7ce6f315bedfa7 Mon Sep 17 00:00:00 2001
From: zsloan
Date: Thu, 14 Oct 2021 21:00:15 +0000
Subject: Fixed when model is set to account for situations with no covariates

---
 scripts/rqtl_wrapper.R | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'scripts')

diff --git a/scripts/rqtl_wrapper.R b/scripts/rqtl_wrapper.R
index 523888f..b7a9ae0 100644
--- a/scripts/rqtl_wrapper.R
+++ b/scripts/rqtl_wrapper.R
@@ -300,7 +300,11 @@ if (type == "4-way") {
   traitname <- colnames(pull.pheno(cross_object))[1]
   gtsp <- pull.genoprob(cross_object)
   allpheno <- pull.pheno(cross_object)
-  model <- paste0(traitname, " ~ ", paste0(covar_names, sep="", collapse=" + "))
+  if (!is.null(opt$addcovar)) {
+    model <- paste0(traitname, " ~ ", paste0(covar_names, sep="", collapse=" + "))
+  } else {
+    model <- paste0(traitname, " ~ 1 ")
+  }
 
   meffects <- c()
   verbose_print("Getting QTL main effects for 4-way crosses")
-- 
cgit 1.4.1


From 453570cbf22a92de72fa0401895c99a923831942 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Tue, 5 Oct 2021 15:14:25 +0300
Subject: script modification : add debug statements

---
 scripts/wgcna_analysis.R | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

(limited to 'scripts')

diff --git a/scripts/wgcna_analysis.R b/scripts/wgcna_analysis.R
index 17b3537..a5cbbe3 100644
--- a/scripts/wgcna_analysis.R
+++ b/scripts/wgcna_analysis.R
@@ -6,11 +6,15 @@ library(rjson)
 
 options(stringsAsFactors = FALSE);
 
-imgDir = Sys.getenv("GENERATED_IMAGE_DIR")
+cat("Running the wgcna analysis script\n")
+
 # load expression data **assumes from json files row(traits)(columns info+samples)
 # pass the file_path as arg
 # pass the file path to read json data
 
+
+
+
 args = commandArgs(trailingOnly=TRUE)
 
 if (length(args)==0) {
@@ -21,6 +25,7 @@ if (length(args)==0) {
 }
 
 inputData <- fromJSON(file = json_file_path)
+imgDir = inputData$TMPDIR
 
 
 trait_sample_data <- do.call(rbind, inputData$trait_sample_data)
@@ -83,6 +88,11 @@ network <- blockwiseModules(dataExpr,
 
 
+cat("Generated network \n")
+
+network
+
+
 genImageRandStr <- function(prefix){
 
 	randStr <- paste(prefix,stri_rand_strings(1, 9, pattern = "[A-Za-z0-9]"),sep="_")
@@ -95,6 +105,10 @@ mergedColors <- labels2colors(network$colors)
 imageLoc <- file.path(imgDir,genImageRandStr("WGCNAoutput"))
 png(imageLoc,width=1000,height=600,type='cairo-png')
 
+
+cat("Generating the CLuster  dendrogram\n")
+
+
 plotDendroAndColors(network$dendrograms[[1]],mergedColors[network$blockGenes[[1]]],
 "Module colors",
 dendroLabels = FALSE, hang = 0.03,
-- 
cgit 1.4.1


From d1be5270b99959f99802a7704562eeaaeb504122 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Wed, 13 Oct 2021 07:29:24 +0300
Subject: minor fixes for sript enable annotations

---
 scripts/wgcna_analysis.R | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'scripts')

diff --git a/scripts/wgcna_analysis.R b/scripts/wgcna_analysis.R
index a5cbbe3..b0d25a9 100644
--- a/scripts/wgcna_analysis.R
+++ b/scripts/wgcna_analysis.R
@@ -13,8 +13,6 @@ cat("Running the wgcna analysis script\n")
 # pass the file path to read json data
 
 
-
-
 args = commandArgs(trailingOnly=TRUE)
 
 if (length(args)==0) {
@@ -100,6 +98,7 @@ genImageRandStr <- function(prefix){
 	return(paste(randStr,".png",sep=""))
 }
 
+
 mergedColors <- labels2colors(network$colors)
 
 imageLoc <- file.path(imgDir,genImageRandStr("WGCNAoutput"))
@@ -111,7 +110,7 @@ cat("Generating the CLuster  dendrogram\n")
 
 plotDendroAndColors(network$dendrograms[[1]],mergedColors[network$blockGenes[[1]]],
 "Module colors",
-dendroLabels = FALSE, hang = 0.03,
+dendroLabels = NULL, hang = 0.03,
 addGuide = TRUE, guideHang = 0.05)
 
 
-- 
cgit 1.4.1


From 8f036415975d6e224e5e94277997329c0f1fa159 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Fri, 29 Oct 2021 09:49:28 +0300
Subject: Feature/biweight reimplementation (#47)

* add biweight reimplementation with pingouin

* delete biweight scripts and tests

* add python-pingouin to guix file

* delete biweight paths

* mypy fix:pingouin mising imports

* pep8 formatting && pylint fixes---
 gn3/computations/biweight.py                | 27 ------------------
 gn3/computations/correlations.py            | 11 ++++----
 gn3/settings.py                             |  3 --
 guix.scm                                    |  1 +
 mypy.ini                                    |  3 ++
 scripts/calculate_biweight.R                | 43 -----------------------------
 tests/unit/computations/test_biweight.py    | 21 --------------
 tests/unit/computations/test_correlation.py | 11 --------
 8 files changed, 9 insertions(+), 111 deletions(-)
 delete mode 100644 gn3/computations/biweight.py
 delete mode 100644 scripts/calculate_biweight.R
 delete mode 100644 tests/unit/computations/test_biweight.py

(limited to 'scripts')

diff --git a/gn3/computations/biweight.py b/gn3/computations/biweight.py
deleted file mode 100644
index 7accd0c..0000000
--- a/gn3/computations/biweight.py
+++ /dev/null
@@ -1,27 +0,0 @@
-"""module contains script to call biweight midcorrelation in R"""
-import subprocess
-
-from typing import List
-from typing import Tuple
-
-from gn3.settings import BIWEIGHT_RSCRIPT
-
-
-def calculate_biweight_corr(trait_vals: List,
-                            target_vals: List,
-                            path_to_script: str = BIWEIGHT_RSCRIPT,
-                            command: str = "Rscript"
-                            ) -> Tuple[float, float]:
-    """biweight function"""
-
-    args_1 = ' '.join(str(trait_val) for trait_val in trait_vals)
-    args_2 = ' '.join(str(target_val) for target_val in target_vals)
-    cmd = [command, path_to_script] + [args_1] + [args_2]
-
-    results = subprocess.check_output(cmd, universal_newlines=True)
-    try:
-        (corr_coeff, p_val) = tuple(
-            [float(y.strip()) for y in results.split()])
-        return (corr_coeff, p_val)
-    except Exception as error:
-        raise error
diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py
index bb13ff1..c930df0 100644
--- a/gn3/computations/correlations.py
+++ b/gn3/computations/correlations.py
@@ -8,7 +8,7 @@ from typing import Optional
 from typing import Callable
 
 import scipy.stats
-from gn3.computations.biweight import calculate_biweight_corr
+import pingouin as pg
 
 
 def map_shared_keys_to_values(target_sample_keys: List,
@@ -102,11 +102,10 @@ package :not packaged in guix
 
     """
 
-    try:
-        results = calculate_biweight_corr(x_val, y_val)
-        return results
-    except Exception as error:
-        raise error
+    results = pg.corr(x_val, y_val, method="bicor")
+    corr_coeff = results["r"].values[0]
+    p_val = results["p-val"].values[0]
+    return (corr_coeff, p_val)
 
 
 def filter_shared_sample_keys(this_samplelist,
diff --git a/gn3/settings.py b/gn3/settings.py
index d5f1d3c..e85eeff 100644
--- a/gn3/settings.py
+++ b/gn3/settings.py
@@ -22,9 +22,6 @@ SQLALCHEMY_TRACK_MODIFICATIONS = False
 
 GN2_BASE_URL = "http://www.genenetwork.org/"
 
-# biweight script
-BIWEIGHT_RSCRIPT = "~/genenetwork3/scripts/calculate_biweight.R"
-
 # wgcna script
 WGCNA_RSCRIPT = "wgcna_analysis.R"
 # qtlreaper command
diff --git a/guix.scm b/guix.scm
index d8b1596..81e8389 100644
--- a/guix.scm
+++ b/guix.scm
@@ -110,6 +110,7 @@
                        ("r-rjson" ,r-rjson)
                        ("python-plotly" ,python-plotly)
                        ("python-pandas" ,python-pandas)
+                       ("python-pingouin" ,python-pingouin)
                        ("rust-qtlreaper" ,rust-qtlreaper)
                        ("python-flask-cors" ,python-flask-cors)))
   (build-system python-build-system)
diff --git a/mypy.ini b/mypy.ini
index 5d66812..a507703 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -11,3 +11,6 @@ ignore_missing_imports = True
 
 [mypy-ipfshttpclient.*]
 ignore_missing_imports = True
+
+[mypy-pingouin.*]
+ignore_missing_imports = True
\ No newline at end of file
diff --git a/scripts/calculate_biweight.R b/scripts/calculate_biweight.R
deleted file mode 100644
index 8d8366e..0000000
--- a/scripts/calculate_biweight.R
+++ /dev/null
@@ -1,43 +0,0 @@
-
-library(testthat)
-library(WGCNA)
-
-arg_values <- commandArgs(trailingOnly = TRUE)
-ParseArgs <- function(args){
-
-    trait_vals <- as.numeric(unlist(strsplit(args[1], split=" ")))
-    target_vals <- as.numeric(unlist(strsplit(args[2], split=" ")))
-
-    return(list(trait_vals= c(trait_vals),target_vals = c(target_vals)))
-
-}
-BiweightMidCorrelation <- function(trait_val,target_val){
-
-    results <-bicorAndPvalue(as.numeric(unlist(trait_val)),as.numeric(unlist(target_val)))
-    return ((c(c(results$bicor)[1],c(results$p)[1])))
-
-}
-
-
-
-test_that("biweight results"),{
-    vec_1 <- c(1,2,3,4)
-    vec_2 <- c(1,2,3,4)
-
-    results <- BiweightMidCorrelation(vec_1,vec_2)
-    expect_equal(c(1.0,0.0),results)
-}
-
-
-test_that("parsing args "),{
-    my_args <- c("1 2 3 4","5 6 7 8")
-    results <- ParseArgs(my_args)
-
-    expect_equal(results[1],c(1,2,3,4))
-    expect_equal(results[2],c(5,6,7,8))
-}
-
-parsed_values <- ParseArgs(arg_values)
-
-
-cat(BiweightMidCorrelation(parsed_values[1],parsed_values[2]))
\ No newline at end of file
diff --git a/tests/unit/computations/test_biweight.py b/tests/unit/computations/test_biweight.py
deleted file mode 100644
index ad404f1..0000000
--- a/tests/unit/computations/test_biweight.py
+++ /dev/null
@@ -1,21 +0,0 @@
-"""test for biweight script"""
-from unittest import TestCase
-from unittest import mock
-
-from gn3.computations.biweight import calculate_biweight_corr
-
-
-class TestBiweight(TestCase):
-    """test class for biweight"""
-
-    @mock.patch("gn3.computations.biweight.subprocess.check_output")
-    def test_calculate_biweight_corr(self, mock_check_output):
-        """test for calculate_biweight_corr func"""
-        mock_check_output.return_value = "0.1 0.5"
-        results = calculate_biweight_corr(command="Rscript",
-                                          path_to_script="./r_script.R",
-                                          trait_vals=[
-                                              1.2, 1.1, 1.9],
-                                          target_vals=[1.9, 0.4, 1.1])
-
-        self.assertEqual(results, (0.1, 0.5))
diff --git a/tests/unit/computations/test_correlation.py b/tests/unit/computations/test_correlation.py
index fc52ec1..96d9c6d 100644
--- a/tests/unit/computations/test_correlation.py
+++ b/tests/unit/computations/test_correlation.py
@@ -5,7 +5,6 @@ from unittest import mock
 from collections import namedtuple
 
 from gn3.computations.correlations import normalize_values
-from gn3.computations.correlations import do_bicor
 from gn3.computations.correlations import compute_sample_r_correlation
 from gn3.computations.correlations import compute_all_sample_correlation
 from gn3.computations.correlations import filter_shared_sample_keys
@@ -98,16 +97,6 @@ class TestCorrelation(TestCase):
 
         self.assertEqual(results, expected_results)
 
-    @mock.patch("gn3.computations.correlations.calculate_biweight_corr")
-    def test_bicor(self, mock_biweight):
-        """Test for doing biweight mid correlation """
-        mock_biweight.return_value = (1.0, 0.0)
-
-        results = do_bicor(x_val=[1, 2, 3], y_val=[4, 5, 6])
-
-        self.assertEqual(results, (1.0, 0.0)
-                         )
-
     @mock.patch("gn3.computations.correlations.compute_corr_coeff_p_value")
     @mock.patch("gn3.computations.correlations.normalize_values")
     def test_compute_sample_r_correlation(self, norm_vals, compute_corr):
-- 
cgit 1.4.1