/* Genome-wide Efficient Mixed Model Association (GEMMA) Copyright © 2011-2017, Xiang Zhou Copyright © 2017, Peter Carbonetto Copyright © 2017-2020, Pjotr Prins This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <algorithm> // std::min #include <cmath> #include <iomanip> #include <vector> #include "debug.h" #include "mathfunc.h" #include <string.h> #include "fastblas.h" const char *FastblasTrans = "T"; const char *FastblasNoTrans = "N"; using namespace std; /* Reasonably fast function to copy data from standard C array into gsl_matrix. Avoid it for performance critical sections. */ gsl_matrix *fast_copy(gsl_matrix *m, const double *mem) { auto rows = m->size1; auto cols = m->size2; if (is_strict_mode()) { // slower correct version for (size_t r=0; r<rows; r++) { for (size_t c=0; c<cols; c++) { gsl_matrix_set(m,r,c,mem[r*cols+c]); } } } else { // faster goes by row auto v = gsl_vector_calloc(cols); enforce(v); // just to be sure for (size_t r=0; r<rows; r++) { assert(v->size == cols); assert(v->block->size == cols); assert(v->stride == 1); memcpy(v->block->data,&mem[r*cols],cols*sizeof(double)); gsl_matrix_set_row(m,r,v); } gsl_vector_free(v); } return m; } /* Helper function fast_cblas_dgemm runs the local dgemm */ void fast_cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const size_t M, const size_t N, const size_t K, const double alpha, const double *A, const size_t lda, const double *B, const size_t ldb, const double beta, double *C, const size_t ldc) { #ifndef NDEBUG if (is_debug_mode()) { #ifdef DISABLED size_t i,j; printf (" Top left corner of matrix A: \n"); for (i=0; i<min(M,6); i++) { for (j=0; j<min(K,6); j++) { printf ("%12.0f", A[j+i*K]); } printf ("\n"); } printf ("\n Top left corner of matrix B: \n"); for (i=0; i<min(K,6); i++) { for (j=0; j<min(N,6); j++) { printf ("%12.0f", B[j+i*N]); } printf ("\n"); } printf ("\n Top left corner of matrix C: \n"); for (i=0; i<min(M,6); i++) { for (j=0; j<min(N,6); j++) { printf ("%12.5G", C[j+i*N]); } printf ("\n"); } #endif cout << scientific << setprecision(3) << "* RowMajor " << Order << "\t" ; cout << "transA " << TransA << "\t" ; cout << "transB " << TransB << "\t" ; cout << "m " << M << "\t" ; cout << "n " << N << "\t" ; cout << "k " << K << "\n" ; cout << "* lda " << lda << "\t" ; cout << "ldb " << ldb << "\t" ; cout << "ldc " << ldc << "\t" ; cout << "alpha " << alpha << "\t" ; cout << "beta " << beta << "\n" ; cout << "* A03 " << A[3] << "\t" ; cout << "B03 " << B[3] << "\t" ; cout << "C03 " << C[3] << "\t" ; cout << "Asum " << sum(A,M,K) << "\t" ; cout << "Bsum " << sum(B,K,N) << "\n" ; cout << "Csum " << sum(C,M,N) << "\n" ; } #endif // NDEBUG // Check for (integer) overflows enforce(M>0); enforce(N>0); enforce(K>0); // check_int_mult_overflow(560000,8000); // fails on default int (32-bits) check_int_mult_overflow(M,K); check_int_mult_overflow(N,K); check_int_mult_overflow(M,N); cblas_dgemm(Order,TransA,TransB,M,N,K,alpha,A,lda,B,ldb,beta,C,ldc); #ifndef NDEBUG #ifdef DISABLED if (is_debug_mode()) { printf (" Top left corner of matrix A (cols=k %i, rows=m %i): \n",K,M); for (i=0; i<min(M,6); i++) { for (j=0; j<min(K,6); j++) { printf ("%12.0f", A[j+i*K]); } printf ("\n"); } printf ("\n Top left corner of matrix B: \n"); for (i=0; i<min(K,6); i++) { for (j=0; j<min(N,6); j++) { printf ("%12.0f", B[j+i*N]); } printf ("\n"); } printf ("\n Top left corner of matrix C: \n"); for (i=0; i<min(M,6); i++) { for (j=0; j<min(N,6); j++) { printf ("%12.5G", C[j+i*N]); } printf ("\n"); } } #endif #endif // NDEBUG } /* Helper function fast_cblas_dgemm converts a GEMMA layout to cblas_dgemm. */ static void fast_cblas_dgemm(const char *TransA, const char *TransB, const double alpha, const gsl_matrix *A, const gsl_matrix *B, const double beta, gsl_matrix *C) { // C++ is row-major auto transA = (*TransA == 'N' || *TransA == 'n' ? CblasNoTrans : CblasTrans); auto transB = (*TransB == 'N' || *TransB == 'n' ? CblasNoTrans : CblasTrans); const size_t M = C->size1; const size_t N = C->size2; const size_t MA = (transA == CblasNoTrans) ? A->size1 : A->size2; const size_t NA = (transA == CblasNoTrans) ? A->size2 : A->size1; const size_t MBx = (transB == CblasNoTrans) ? B->size1 : B->size2; const size_t NB = (transB == CblasNoTrans) ? B->size2 : B->size1; if (M == MA && N == NB && NA == MBx) { /* [MxN] = [MAxNA][MBxNB] */ auto K = NA; // Check for (integer) overflows enforce(M>0); enforce(N>0); enforce(K>0); // check_int_mult_overflow(560000,8000); check_int_mult_overflow(M,K); check_int_mult_overflow(N,K); check_int_mult_overflow(M,N); cblas_dgemm (CblasRowMajor, transA, transB, M, N, NA, alpha, A->data, A->tda, B->data, B->tda, beta, C->data, C->tda); } else { fail_msg("Range error in dgemm"); } } /* Use the fast/supported way to call BLAS dgemm */ void fast_dgemm(const char *TransA, const char *TransB, const double alpha, const gsl_matrix *A, const gsl_matrix *B, const double beta, gsl_matrix *C) { fast_cblas_dgemm(TransA,TransB,alpha,A,B,beta,C); #ifdef DISABLE if (is_check_mode()) { // ---- validate with original implementation gsl_matrix *C1 = gsl_matrix_alloc(C->size1,C->size2); eigenlib_dgemm(TransA,TransB,alpha,A,B,beta,C1); enforce_msg(gsl_matrix_equal(C,C1),"dgemm outcomes are not equal for fast & eigenlib"); gsl_matrix_free(C1); } #endif } void fast_eigen_dgemm(const char *TransA, const char *TransB, const double alpha, const gsl_matrix *A, const gsl_matrix *B, const double beta, gsl_matrix *C) { fast_cblas_dgemm(TransA,TransB,alpha,A,B,beta,C); } /* * Inverse in place */ #include <gsl/gsl_permutation.h> // #include <gsl/gsl_linalg.h> extern "C" { int gsl_linalg_LU_invert(const gsl_matrix * LU, const gsl_permutation * p, gsl_matrix * inverse); int gsl_linalg_LU_decomp(gsl_matrix * A, gsl_permutation * p, int * signum); } void gsl_matrix_inv(gsl_matrix *m) { size_t n=m->size1; gsl_matrix *temp1=gsl_matrix_calloc(n,n); gsl_matrix_memcpy(temp1,m); gsl_permutation *p=gsl_permutation_calloc(n); int sign=0; gsl_linalg_LU_decomp(temp1,p,&sign); gsl_matrix *inverse=gsl_matrix_calloc(n,n); gsl_linalg_LU_invert(temp1,p,inverse); gsl_matrix_memcpy(m,inverse); gsl_permutation_free(p); gsl_matrix_free(temp1); gsl_matrix_free(inverse); } void fast_inverse(gsl_matrix *m) { gsl_matrix_inv(m); }