about summary refs log tree commit diff
diff options
context:
space:
mode:
authorPjotr Prins2017-10-13 12:28:26 +0000
committerPjotr Prins2017-10-13 15:27:24 +0000
commita610dd723a233aed1abe31aa32e3137b23b5f983 (patch)
tree66172cf10d0d85f5a61219c1b7b074cbadb45575
parent25ad9756ebecfdb2f01b6f87c35bd731e3a3186d (diff)
downloadpangemma-a610dd723a233aed1abe31aa32e3137b23b5f983.tar.gz
OpenBlas: preparing for dgemm use
-rw-r--r--INSTALL.md45
-rw-r--r--Makefile10
-rw-r--r--README.md14
-rw-r--r--src/debug.cpp28
-rw-r--r--src/debug.h9
-rw-r--r--src/eigenlib.cpp13
-rw-r--r--src/fastblas.cpp212
-rw-r--r--src/fastblas.h29
-rw-r--r--src/gemma.cpp27
-rw-r--r--src/lmm.cpp5
-rw-r--r--src/mathfunc.cpp7
-rw-r--r--src/mathfunc.h5
-rw-r--r--src/param.h2
-rw-r--r--test/src/unittests-math.cpp116
14 files changed, 494 insertions, 28 deletions
diff --git a/INSTALL.md b/INSTALL.md
index e450a2a..897b062 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -14,7 +14,7 @@ GEMMA runs on Linux and MAC OSX and the runtime has the following
 dependencies:
 
 * C++ tool chain >= 4.9
-* GNU Science library (GSL) 1.x (GEMMA does not currently work with GSL >= 2).
+* GNU Science library (GSL) 1.x (note that 2.x is not yet supported)
 * blas/openblas
 * lapack
 * [Eigen3 library](http://eigen.tuxfamily.org/dox/)
@@ -100,3 +100,46 @@ GEMMA includes the shunit2 test framework (version 2.0).
 or
 
     ./run_tests.sh
+
+## Optimizing performance
+
+### OpenBlas
+
+Linking against a built-from-source OpenBlas is a first optimization
+step because it will optimize code for the local architecture. When
+you check the output .log file of GEMMA after a run, it will tell you
+how the linked-in OpenBlas was compiled.
+
+To link a new version, compile OpenBlas as per
+[instructions](http://www.openblas.net/).  You can start with the default:
+
+    make -j 4
+
+or play with the switches
+
+    make USE_THREAD=1 NUM_THREADS=16 NO_AFFINITY=1 -j 4
+
+rendering for example:
+
+        OpenBLAS build complete. (BLAS CBLAS)
+        OS               ... Linux
+        Architecture     ... x86_64
+        BINARY           ... 64bit
+        C compiler       ... GCC  (command line : gcc)
+        Library Name     ... libopenblas_haswellp-r0.3.0.dev.a (Multi threaded; Max num-threads is 16)
+
+        To install the library, you can run "make PREFIX=/path/to/your/installation install".
+
+
+This generates a static library which you can link using the full path
+with using the GEMMA Makefile:
+
+    time env OPENBLAS_NUM_THREADS=4 make EIGEN_INCLUDE_PATH=~/.guix-profile/include/eigen3 LIBS="~/tmp/OpenBLAS/libopenblas_haswellp-r0.3.0.dev.a -lgsl -lgslcblas -pthread -lz  -llapack" WITH_OPENBLAS=1 -j 4 fast-check
+
+    make EIGEN_INCLUDE_PATH=~/.guix-profile/include/eigen3 LIBS="~/tmp/OpenBLAS/libopenblas_haswellp-r0.3.0.dev.a -lgsl -lgslcblas -pthread -lz  -llapack" WITH_OPENBLAS=1 -j 4 unittests
+
+Batch of 1000:
+
+real    4m24.923s
+user    4m33.576s
+sys     0m11.004s
diff --git a/Makefile b/Makefile
index 3062afb..7fd518d 100644
--- a/Makefile
+++ b/Makefile
@@ -66,15 +66,17 @@ else
   CPP = g++
 endif
 
-ifdef OPENBLAS
-  WITH_LAPACK =  # OPENBLAS usually includes LAPACK
+ifdef WITH_OPENBLAS
+  OPENBLAS=1
+  # WITH_LAPACK =  # OPENBLAS usually includes LAPACK
+  CPPFLAGS += -DOPENBLAS
 endif
 
 ifdef DEBUG
-  CPPFLAGS = -g $(GCC_FLAGS) -std=gnu++11 -isystem/$(EIGEN_INCLUDE_PATH) -Icontrib/catch-1.9.7 -Isrc
+  CPPFLAGS += -g $(GCC_FLAGS) -std=gnu++11 -isystem/$(EIGEN_INCLUDE_PATH) -Icontrib/catch-1.9.7 -Isrc
 else
   # release mode
-  CPPFLAGS = -DNDEBUG $(GCC_FLAGS) -std=gnu++11 -isystem/$(EIGEN_INCLUDE_PATH) -Icontrib/catch-1.9.7 -Isrc
+  CPPFLAGS += -DNDEBUG $(GCC_FLAGS) -std=gnu++11 -isystem/$(EIGEN_INCLUDE_PATH) -Icontrib/catch-1.9.7 -Isrc
 endif
 
 ifdef SHOW_COMPILER_WARNINGS
diff --git a/README.md b/README.md
index d6209a8..a932d6b 100644
--- a/README.md
+++ b/README.md
@@ -162,11 +162,12 @@ unpack the file.
 LAPACK and BLAS libraries. There is no need to install these
 libraries.
 
-### Building from source
+### Optimizing performance
+
+Precompiled binaries and libraries may not be optimal for your particular
+hardware. See [INSTALL.md](INSTALL.md).
 
-*Note that GEMMA currently does not work with GSL 2.x. We recommend
-linking to the latest version of GSL 1.x, which is GSL 1.16 as of this
-writing.*
+### Building from source
 
 More information on source code, dependencies and installation can be
 found in [INSTALL.md](INSTALL.md).
@@ -180,7 +181,8 @@ Dept. of Biostatistics<br>
 University of Michigan<br>
 2012-2017
 
-Peter Carbonetto, Tim Flutre, Matthew Stephens, Pjotr Prins and others
-have also contributed to the development of this software.
+Peter Carbonetto, Tim Flutre, Matthew Stephens,
+[Pjotr Prins](http://thebird.nl/) and others have also contributed to
+the development of this software.
 
 [latest_release]: https://github.com/genetics-statistics/GEMMA/releases "Most recent stable releases"
diff --git a/src/debug.cpp b/src/debug.cpp
index 0d3c9cc..dacb89d 100644
--- a/src/debug.cpp
+++ b/src/debug.cpp
@@ -18,6 +18,34 @@
 #include "debug.h"
 #include "mathfunc.h"
 
+static bool debug_mode     = false;
+static bool debug_no_check = false;
+static bool debug_strict   = false;
+
+void debug_set_debug_mode(bool setting) { debug_mode = setting; }
+void debug_set_no_check_mode(bool setting) {debug_no_check = setting; }
+void debug_set_strict_mode(bool setting) { debug_strict = setting; }
+
+bool is_debug_mode() { return debug_mode; };
+bool is_no_check_mode() { return debug_no_check; };
+bool is_strict_mode() { return debug_strict; };
+
+/*
+  Helper function to make sure gsl allocations do their job because
+  gsl_matrix_alloc does not initiatize values (behaviour that changed
+  in GSL2) we introduced a 'strict mode' by initializing the buffer
+  with NaNs. This happens in STRICT mode without NO-CHECKS
+  (i.e. -strict option).
+*/
+gsl_matrix *gsl_matrix_safe_alloc(size_t rows,size_t cols) {
+  gsl_matrix *m = gsl_matrix_alloc(rows,cols);
+  enforce_msg(m,"Not enough memory"); // just to be sure when there is no error handler set
+  if (debug_strict && !debug_no_check) {
+    gsl_matrix_set_all(m, nan(""));
+  }
+  return m;
+}
+
 // Helper function called by macro validate_K(K, check)
 void do_validate_K(const gsl_matrix *K, bool do_check, bool strict, const char *__file, int __line) {
   if (do_check) {
diff --git a/src/debug.h b/src/debug.h
index 06ca5cb..0f7b38f 100644
--- a/src/debug.h
+++ b/src/debug.h
@@ -10,6 +10,15 @@ void gemma_gsl_error_handler (const char * reason,
                               const char * file,
                               int line, int gsl_errno);
 
+void debug_set_debug_mode(bool setting);
+void debug_set_no_check_mode(bool setting);
+void debug_set_strict_mode(bool setting);
+
+bool is_debug_mode();
+bool is_no_check_mode();
+bool is_strict_mode();
+
+gsl_matrix *gsl_matrix_safe_alloc(size_t rows,size_t cols);
 
 // Validation routines
 void do_validate_K(const gsl_matrix *K, bool do_check, bool strict, const char *__file, int __line);
diff --git a/src/eigenlib.cpp b/src/eigenlib.cpp
index a8c545c..4d6aacc 100644
--- a/src/eigenlib.cpp
+++ b/src/eigenlib.cpp
@@ -17,16 +17,18 @@
 */
 
 #include "Eigen/Dense"
-#include "gsl/gsl_linalg.h"
+// #include "gsl/gsl_linalg.h"
 #include "gsl/gsl_matrix.h"
-#include "gsl/gsl_vector.h"
+// #include "gsl/gsl_vector.h"
 #include <cmath>
 #include <iostream>
 #include <vector>
+#include <cblas.h>
 
 using namespace std;
 using namespace Eigen;
 
+
 // On two different clusters, compare eigen vs lapack/gsl:
 //
 // dgemm, 5x or 0.5x faster or slower than lapack, 5x or 4x faster than gsl
@@ -57,8 +59,6 @@ void eigenlib_dgemm(const char *TransA, const char *TransB, const double alpha,
       C_mat = alpha * A_mat.transpose() * B_mat.transpose() + beta * C_mat;
     }
   }
-
-  return;
 }
 
 void eigenlib_dgemv(const char *TransA, const double alpha, const gsl_matrix *A,
@@ -75,15 +75,12 @@ void eigenlib_dgemv(const char *TransA, const double alpha, const gsl_matrix *A,
   } else {
     y_vec = alpha * A_mat.transpose() * x_vec + beta * y_vec;
   }
-
-  return;
 }
 
 void eigenlib_invert(gsl_matrix *A) {
   Map<Matrix<double, Dynamic, Dynamic, RowMajor>> A_mat(A->data, A->size1,
                                                         A->size2);
   A_mat = A_mat.inverse();
-  return;
 }
 
 void eigenlib_dsyr(const double alpha, const gsl_vector *b, gsl_matrix *A) {
@@ -92,7 +89,6 @@ void eigenlib_dsyr(const double alpha, const gsl_vector *b, gsl_matrix *A) {
   Map<Matrix<double, Dynamic, 1>, 0, OuterStride<Dynamic>> b_vec(
       b->data, b->size, OuterStride<Dynamic>(b->stride));
   A_mat = alpha * b_vec * b_vec.transpose() + A_mat;
-  return;
 }
 
 void eigenlib_eigensymm(const gsl_matrix *G, gsl_matrix *U, gsl_vector *eval) {
@@ -108,5 +104,4 @@ void eigenlib_eigensymm(const gsl_matrix *G, gsl_matrix *U, gsl_vector *eval) {
     abort();
   eval_vec = es.eigenvalues();
   U_mat = es.eigenvectors();
-  return;
 }
diff --git a/src/fastblas.cpp b/src/fastblas.cpp
new file mode 100644
index 0000000..38ca326
--- /dev/null
+++ b/src/fastblas.cpp
@@ -0,0 +1,212 @@
+/*
+    Genome-wide Efficient Mixed Model Association (GEMMA)
+    Copyright (C) 2011-2017, Xiang Zhou
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "gsl/gsl_matrix.h"
+#include <cmath>
+#include <iomanip>
+#include <vector>
+#include <cblas.h>
+#include "debug.h"
+#include "fastblas.h"
+#include "mathfunc.h"
+
+using namespace std;
+
+/*
+   Reasonably fast function to copy data from standard C array into
+   gsl_matrix. Avoid it for performance critical sections.
+*/
+gsl_matrix *fast_copy(gsl_matrix *m, const double *mem) {
+  auto rows = m->size1;
+  auto cols = m->size2;
+  if (is_strict_mode()) { // slower correct version
+    for (auto r=0; r<rows; r++) {
+      for (auto c=0; c<cols; c++) {
+        gsl_matrix_set(m,r,c,mem[r*cols+c]);
+      }
+    }
+  } else { // faster goes by row
+    for (auto r=0; r<rows; r++) {
+      auto v = gsl_vector_calloc(cols);
+      assert(v->size == cols);
+      assert(v->block->size == cols);
+      assert(v->stride == 1);
+      memcpy(v->block->data,&mem[r*cols],cols*sizeof(double));
+      gsl_matrix_set_row(m,r,v);
+    }
+  }
+  return m;
+}
+
+/*
+    Helper function fast_cblas_dgemm runs the local dgemm
+*/
+void fast_cblas_dgemm(OPENBLAS_CONST enum CBLAS_ORDER Order,
+                      OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
+                      OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB,
+                      OPENBLAS_CONST blasint M,
+                      OPENBLAS_CONST blasint N,
+                      OPENBLAS_CONST blasint K,
+                      OPENBLAS_CONST double alpha,
+                      OPENBLAS_CONST double *A,
+                      OPENBLAS_CONST blasint lda,
+                      OPENBLAS_CONST double *B,
+                      OPENBLAS_CONST blasint ldb,
+                      OPENBLAS_CONST double beta,
+                      double *C,
+                      OPENBLAS_CONST blasint ldc) {
+#ifndef NDEBUG
+  size_t i,j;
+  if (is_debug_mode()) {
+    printf (" Top left corner of matrix A: \n");
+    for (i=0; i<min(M,6); i++) {
+      for (j=0; j<min(K,6); j++) {
+        printf ("%12.0f", A[j+i*K]);
+      }
+      printf ("\n");
+    }
+
+    printf ("\n Top left corner of matrix B: \n");
+    for (i=0; i<min(K,6); i++) {
+      for (j=0; j<min(N,6); j++) {
+        printf ("%12.0f", B[j+i*N]);
+      }
+      printf ("\n");
+    }
+
+    printf ("\n Top left corner of matrix C: \n");
+    for (i=0; i<min(M,6); i++) {
+      for (j=0; j<min(N,6); j++) {
+        printf ("%12.5G", C[j+i*N]);
+      }
+      printf ("\n");
+    }
+
+    cout << scientific << setprecision(3) << "* RowMajor " << Order << "\t" ;
+    cout << "transA " << TransA << "\t" ;
+    cout << "transB " << TransB << "\t" ;
+    cout << "m " << M << "\t" ;
+    cout << "n " << N << "\t" ;
+    cout << "k " << K << "\n" ;
+    cout << "* lda " << lda << "\t" ;
+    cout << "ldb " << ldb << "\t" ;
+    cout << "ldc " << ldc << "\t" ;
+    cout << "alpha " << alpha << "\t" ;
+    cout << "beta " << beta << "\n" ;
+    cout << "* A03 " << A[3] << "\t" ;
+    cout << "B03 " << B[3] << "\t" ;
+    cout << "C03 " << C[3] << "\t" ;
+    cout << "Asum " << sum(A,M,K) << "\t" ;
+    cout << "Bsum " << sum(B,K,N) << "\n" ;
+    cout << "Csum " << sum(C,M,N) << "\n" ;
+  }
+#endif // NDEBUG
+
+  cblas_dgemm(Order,TransA,TransB,M,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
+
+#ifndef NDEBUG
+  if (is_debug_mode()) {
+    printf (" Top left corner of matrix A (cols=k %i, rows=m %i): \n",K,M);
+    for (i=0; i<min(M,6); i++) {
+      for (j=0; j<min(K,6); j++) {
+        printf ("%12.0f", A[j+i*K]);
+      }
+      printf ("\n");
+    }
+
+    printf ("\n Top left corner of matrix B: \n");
+    for (i=0; i<min(K,6); i++) {
+      for (j=0; j<min(N,6); j++) {
+        printf ("%12.0f", B[j+i*N]);
+      }
+      printf ("\n");
+    }
+
+    printf ("\n Top left corner of matrix C: \n");
+    for (i=0; i<min(M,6); i++) {
+      for (j=0; j<min(N,6); j++) {
+      printf ("%12.5G", C[j+i*N]);
+      }
+      printf ("\n");
+    }
+  }
+#endif // NDEBUG
+}
+
+/*
+    Helper function fast_cblas_dgemm converts a GEMMA layout to cblas_dgemm.
+*/
+static void fast_cblas_dgemm(const char *TransA, const char *TransB, const double alpha,
+                             const gsl_matrix *A, const gsl_matrix *B, const double beta,
+                             gsl_matrix *C) {
+  // C++ is row-major
+  auto transA = (*TransA == 'N' || *TransA == 'n' ? CblasNoTrans : CblasTrans);
+  auto transB = (*TransB == 'N' || *TransB == 'n' ? CblasNoTrans : CblasTrans);
+  // A(m x k) * B(k x n) = C(m x n))
+  auto rowsA = A->size1;
+  auto colsA = A->size2;
+  blasint M = A->size1;
+  blasint K = B->size1;
+  assert(K == colsA);
+  blasint N = B->size2;
+  // cout << M << "," << N "," << K << endl;
+  // Layout = CblasRowMajor: Trans: K , NoTrans M
+  blasint lda = (transA==CblasNoTrans ? K : M );
+  blasint ldb = (transB==CblasNoTrans ? N : K );
+  blasint ldc = N;
+  cout << rowsA << endl;
+  assert(transA == CblasNoTrans);
+  assert(transB == CblasNoTrans);
+  assert(rowsA == 2000);
+  assert(colsA == 200);
+  assert(lda == K);
+  assert(ldb == N);
+  assert(ldc == N);
+  assert(A->size2 == A->tda);
+  assert(B->size2 == B->tda);
+  assert(C->size2 == C->tda);
+
+  // cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+  //             m, n, k, alpha, A, k, B, n, beta, C, n);
+  // m = 2000, k = 200, n = 1000;
+  assert(M==2000);
+  assert(K==200);
+  assert(N==1000);
+
+  auto k = K;
+  auto m = M;
+  auto n = N;
+
+  fast_cblas_dgemm(CblasRowMajor, transA, transB, M, N, K, alpha,
+              /* A */ A->data,
+              /* lda */ lda,
+              /* B */ B->data,
+              /* ldb */ ldb,
+              /* beta */ beta,
+              /* C */ C->data, ldc);
+}
+
+/*
+   Use the fasted/supported way to call BLAS dgemm
+*/
+
+void fast_dgemm(const char *TransA, const char *TransB, const double alpha,
+                const gsl_matrix *A, const gsl_matrix *B, const double beta,
+                gsl_matrix *C) {
+  fast_cblas_dgemm(TransA,TransB,alpha,A,B,beta,C);
+}
diff --git a/src/fastblas.h b/src/fastblas.h
new file mode 100644
index 0000000..3c28729
--- /dev/null
+++ b/src/fastblas.h
@@ -0,0 +1,29 @@
+#ifndef __FASTBLAS_H__
+#define __FASTBLAS_H__
+
+#include <assert.h>
+#include <iostream>
+#include "gsl/gsl_matrix.h"
+
+gsl_matrix *fast_copy(gsl_matrix *m, const double *mem);
+
+void fast_cblas_dgemm(OPENBLAS_CONST enum CBLAS_ORDER Order,
+                      OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
+                      OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB,
+                      OPENBLAS_CONST blasint M,
+                      OPENBLAS_CONST blasint N,
+                      OPENBLAS_CONST blasint K,
+                      OPENBLAS_CONST double alpha,
+                      OPENBLAS_CONST double *A,
+                      OPENBLAS_CONST blasint lda,
+                      OPENBLAS_CONST double *B,
+                      OPENBLAS_CONST blasint ldb,
+                      OPENBLAS_CONST double beta,
+                      double *C,
+                      OPENBLAS_CONST blasint ldc);
+
+void fast_dgemm(const char *TransA, const char *TransB, const double alpha,
+                const gsl_matrix *A, const gsl_matrix *B, const double beta,
+                gsl_matrix *C);
+
+#endif
diff --git a/src/gemma.cpp b/src/gemma.cpp
index 76ff999..95630c6 100644
--- a/src/gemma.cpp
+++ b/src/gemma.cpp
@@ -23,6 +23,17 @@
 #include <iostream>
 #include <string>
 #include <sys/stat.h>
+#ifdef OPENBLAS
+#pragma message "Compiling with OPENBLAS"
+extern "C" {
+  // these functions are defined in cblas.h - but if we include that we
+  // conflicts with other BLAS includes
+  int openblas_get_num_threads(void);
+  int openblas_get_parallel(void);
+  char* openblas_get_config(void);
+  char* openblas_get_corename(void);
+}
+#endif
 
 #include "gsl/gsl_blas.h"
 #include "gsl/gsl_cdf.h"
@@ -1578,10 +1589,13 @@ void GEMMA::Assign(int argc, char **argv, PARAM &cPar) {
       cPar.window_ns = atoi(str.c_str());
     } else if (strcmp(argv[i], "-debug") == 0) {
       cPar.mode_debug = true;
+      debug_set_debug_mode(true);
     } else if (strcmp(argv[i], "-no-check") == 0) {
       cPar.mode_check = false;
+      debug_set_no_check_mode(true);
     } else if (strcmp(argv[i], "-strict") == 0) {
       cPar.mode_strict = true;
+      debug_set_strict_mode(true);
     } else {
       cout << "error! unrecognized option: " << argv[i] << endl;
       cPar.error = true;
@@ -3081,9 +3095,16 @@ void GEMMA::WriteLog(int argc, char **argv, PARAM &cPar) {
   }
 
   outfile << "##" << endl;
-  outfile << "## GEMMA Version = " << version << endl;
-  outfile << "## GSL Version   = " << GSL_VERSION << endl;
-  outfile << "## Eigen Version = " << EIGEN_WORLD_VERSION << "." << EIGEN_MAJOR_VERSION << "." << EIGEN_MINOR_VERSION << endl;
+  outfile << "## GEMMA Version    = " << version << endl;
+  outfile << "## GSL Version      = " << GSL_VERSION << endl;
+  outfile << "## Eigen Version    = " << EIGEN_WORLD_VERSION << "." << EIGEN_MAJOR_VERSION << "." << EIGEN_MINOR_VERSION << endl;
+  #ifdef OPENBLAS
+  outfile << "## OpenBlas         = " << openblas_get_config() << " - " << openblas_get_corename() << endl;
+
+  outfile << "##   threads        = " << openblas_get_num_threads() << endl;
+  string* pStr = new string[4] { "sequential", "threaded", "openmp" };
+  outfile << "##   parallel type  = " << pStr[openblas_get_parallel()] << endl;
+  #endif
 
   outfile << "##" << endl;
   outfile << "## Command Line Input = ";
diff --git a/src/lmm.cpp b/src/lmm.cpp
index a49c8c5..71aa184 100644
--- a/src/lmm.cpp
+++ b/src/lmm.cpp
@@ -39,6 +39,7 @@
 #include "gsl/gsl_vector.h"
 
 #include "eigenlib.h"
+
 #include "gzstream.h"
 #include "io.h"
 #include "lapack.h"
@@ -1269,6 +1270,7 @@ void LMM::AnalyzeGene(const gsl_matrix *U, const gsl_vector *eval,
   return;
 }
 
+
 void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp,
                   const gsl_matrix *U, const gsl_vector *eval,
                   const gsl_matrix *UtW, const gsl_vector *Uty,
@@ -1358,7 +1360,7 @@ void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp,
   };
 
   const auto num_snps = indicator_snp.size();
-  const size_t progress_step = (num_snps/20>d_pace ? num_snps/20 : d_pace);
+  const size_t progress_step = (num_snps/50>d_pace ? num_snps/50 : d_pace);
 
   for (size_t t = 0; t < num_snps; ++t) {
     if (t % progress_step == 0 || t == (num_snps - 1)) {
@@ -1429,6 +1431,7 @@ void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp,
       batch_compute(msize);
   }
   batch_compute(c % msize);
+  ProgressBar("Reading SNPs", num_snps - 1, num_snps - 1);
   // cout << "Counted SNPs " << c << " sumStat " << sumStat.size() << endl;
   cout << endl;
 
diff --git a/src/mathfunc.cpp b/src/mathfunc.cpp
index 4203837..fbbc061 100644
--- a/src/mathfunc.cpp
+++ b/src/mathfunc.cpp
@@ -313,6 +313,13 @@ bool isMatrixIllConditioned(const gsl_vector *eigenvalues, double max_ratio) {
   return ret_valid;
 }
 
+double sum(const double *m, size_t rows, size_t cols) {
+  double sum = 0.0;
+  for (auto i = 0; i<rows*cols; i++)
+    sum += m[i];
+  return sum;
+}
+
 double SumVector(const gsl_vector *v) {
   double sum = 0;
   for (int i = 0; i < v->size; i++ ) {
diff --git a/src/mathfunc.h b/src/mathfunc.h
index 6e20b37..804bc20 100644
--- a/src/mathfunc.h
+++ b/src/mathfunc.h
@@ -27,7 +27,7 @@
 #define EIGEN_MINVALUE 1e-10
 
 using namespace std;
-using namespace Eigen;
+
 
 bool has_nan(const vector<double> v);
 
@@ -43,6 +43,7 @@ bool isMatrixPositiveDefinite(const gsl_matrix *G);
 bool isMatrixIllConditioned(const gsl_vector *eigenvalues, double max_ratio=CONDITIONED_MAXRATIO);
 bool isMatrixSymmetric(const gsl_matrix *G);
 gsl_vector *getEigenValues(const gsl_matrix *G);
+double sum(const double *m, size_t rows, size_t cols);
 double SumVector(const gsl_vector *v);
 double CenterVector(gsl_vector *y);
 void CenterVector(gsl_vector *y, const gsl_matrix *W);
@@ -57,6 +58,6 @@ void KroneckerSym(const gsl_matrix *K, const gsl_matrix *V, gsl_matrix *H);
 double UcharToDouble02(const unsigned char c);
 unsigned char Double02ToUchar(const double dosage);
 void uchar_matrix_get_row(const vector<vector<unsigned char>> &X,
-                          const size_t i_row, VectorXd &x_row);
+                          const size_t i_row, Eigen::VectorXd &x_row);
 
 #endif
diff --git a/src/param.h b/src/param.h
index efea99a..4b473c0 100644
--- a/src/param.h
+++ b/src/param.h
@@ -26,7 +26,7 @@
 #include <set>
 #include <vector>
 
-#define K_BATCH_SIZE 1000 // #snps used for batched K
+#define K_BATCH_SIZE 10000 // #snps used for batched K
 #define DEFAULT_PACE 1000
 
 using namespace std;
diff --git a/test/src/unittests-math.cpp b/test/src/unittests-math.cpp
index ac4c180..6486707 100644
--- a/test/src/unittests-math.cpp
+++ b/test/src/unittests-math.cpp
@@ -1,14 +1,22 @@
 #include <catch.hpp>
 #include <iostream>
 #include "gsl/gsl_matrix.h"
-#include "mathfunc.h"
+#include <cblas.h>
+
 #include <algorithm>
 #include <limits>
 #include <numeric>
 
+#include "debug.h"
+#include "mathfunc.h"
+#include "fastblas.h"
+
 using namespace std;
 
 TEST_CASE( "Math functions", "[math]" ) {
+  debug_set_debug_mode(true);
+  debug_set_no_check_mode(false);
+  debug_set_strict_mode(true);
   double data[] = { 2,-1, 0,
                    -1, 2,-1,
                     0,-1, 2};
@@ -51,3 +59,109 @@ TEST_CASE( "Math functions", "[math]" ) {
   REQUIRE (std::isnan(v3[2]));
   REQUIRE(has_nan(v3));
 }
+
+TEST_CASE("cblas_dgemm", "[math]") {
+   double *A, *B, *C;
+   int m, n, k, i, j;
+   double alpha, beta;
+
+   printf ("\n This example computes real matrix C=alpha*A*B+beta*C using \n"
+           " Intel(R) MKL function dgemm, where A, B, and  C are matrices and \n"
+           " alpha and beta are double precision scalars\n\n");
+
+   m = 2000, k = 200, n = 1000;
+   printf (" Initializing data for matrix multiplication C=A*B for matrix \n"
+           " A(%ix%i) and matrix B(%ix%i)\n\n", m, k, k, n);
+   alpha = 1.0; beta = 0.0;
+
+   printf (" Allocating memory for matrices aligned on 64-byte boundary for better \n"
+           " performance \n\n");
+   A = (double *)malloc( m*k*sizeof( double ));
+   B = (double *)malloc( k*n*sizeof( double ));
+   C = (double *)malloc( m*n*sizeof( double ));
+
+   printf (" Intializing matrix data \n\n");
+   for (i = 0; i < (m*k); i++) {
+     A[i] = (double)(i+1);
+   }
+
+   for (i = 0; i < (k*n); i++) {
+     B[i] = (double)(-i-1);
+   }
+
+   for (i = 0; i < (m*n); i++) {
+     C[i] = 0.0;
+   }
+
+   printf (" Computing matrix product using Intel(R) MKL dgemm function via CBLAS interface \n\n");
+   assert(m==2000);
+   assert(k==200);
+   assert(n==1000);
+   //cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+   //            m, n, k, alpha, A, k, B, n, beta, C, n);
+   fast_cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                    m, n, k, alpha, A, k, B, n, beta, C, n);
+
+   REQUIRE(trunc(C[0]) == -2666620100.0 );
+   REQUIRE(trunc(C[1]) == -2666640200.0 );
+   REQUIRE(trunc(C[2003]) == -10627000400.0 );
+
+}
+
+TEST_CASE("fast_dgemm", "[math]") {
+   double *A, *B, *C;
+   int m, n, k, i, j;
+   double alpha, beta;
+
+   printf ("\n This example computes real matrix C=alpha*A*B+beta*C using \n"
+           " Intel(R) MKL function dgemm, where A, B, and  C are matrices and \n"
+           " alpha and beta are double precision scalars\n\n");
+
+   m = 2000, k = 200, n = 1000;
+   printf (" Initializing data for matrix multiplication C=A*B for matrix \n"
+           " A(%ix%i) and matrix B(%ix%i)\n\n", m, k, k, n);
+   alpha = 1.0; beta = 0.0;
+
+   printf (" Allocating memory for matrices aligned on 64-byte boundary for better \n"
+           " performance \n\n");
+   A = (double *)malloc( m*k*sizeof( double ));
+   B = (double *)malloc( k*n*sizeof( double ));
+   C = (double *)malloc( m*n*sizeof( double ));
+
+   printf (" Intializing matrix data \n\n");
+   for (i = 0; i < (m*k); i++) {
+     A[i] = (double)(i+1);
+   }
+
+   for (i = 0; i < (k*n); i++) {
+     B[i] = (double)(-i-1);
+   }
+
+   for (i = 0; i < (m*n); i++) {
+     C[i] = 0.0;
+   }
+
+   printf (" Computing matrix product using Intel(R) MKL dgemm function via CBLAS interface \n\n");
+   // cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+   //            m, n, k, alpha, A, k, B, n, beta, C, n);
+   // eigenlib_dgemm(const char *TransA, const char *TransB, const double alpha,
+   //                const gsl_matrix *A, const gsl_matrix *B, const double beta,
+   //                gsl_matrix *C) {
+   gsl_matrix *AM = gsl_matrix_safe_alloc(m,k); // rows x cols
+   gsl_matrix *BM = gsl_matrix_safe_alloc(k,n);
+   gsl_matrix *CM = gsl_matrix_calloc(m,n);
+
+   fast_copy(AM,A);
+   fast_copy(BM,B);
+   fast_copy(CM,C);
+   fast_dgemm("N","N",alpha,AM,BM,beta,CM);
+   printf ("\n Computations completed.\n\n");
+   A = AM->data;
+   B = BM->data;
+   C = CM->data;
+
+   REQUIRE(trunc(C[0]) == -2666620100.0 );
+   REQUIRE(trunc(C[1]) == -2666640200.0 );
+   REQUIRE(trunc(C[2003]) == -10627000400.0 );
+
+}