From eb905500b6c654860aa9f9d9c77c7c2614e0ad10 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 23 Feb 2010 13:06:49 +0100 Subject: [PATCH] significant speedup in the matrix-matrix products --- Eigen/src/Core/arch/SSE/PacketMath.h | 39 +- .../Core/products/GeneralBlockPanelKernel.h | 545 ++++++++++-------- Eigen/src/Core/products/GeneralMatrixMatrix.h | 8 +- .../Core/products/SelfadjointMatrixMatrix.h | 66 ++- .../Core/products/TriangularMatrixMatrix.h | 30 +- .../Core/products/TriangularSolverMatrix.h | 22 +- Eigen/src/Core/util/Macros.h | 2 +- bench/bench_gemm.cpp | 4 +- bench/bench_gemm_blas.cpp | 27 +- 9 files changed, 429 insertions(+), 314 deletions(-) diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index a5a56f759..de96aaffa 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -184,11 +184,12 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pload(const float* from) { template<> EIGEN_STRONG_INLINE Packet2d ei_pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); } template<> EIGEN_STRONG_INLINE Packet4i ei_pload(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast(from)); } -#if (!defined __GNUC__) && (!defined __ICC) -template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_ps(from); } -template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); } -template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast(from)); } -#else +// #if (!defined __GNUC__) && (!defined __ICC) +// template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_ps(from); } +// template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); } +// template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast(from)); } +// #else + // Fast unaligned loads. Note that here we cannot directly use intrinsics: this would // require pointer casting to incompatible pointer types and leads to invalid code // because of the strict aliasing rule. 
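For reference, here is a minimal standalone sketch (not part of the patch) of the two techniques introduced above, assuming an SSE2 target: the unaligned float load rebuilt from two 64-bit halves with _mm_load_sd/_mm_loadh_pd, which sidesteps the strict-aliasing problem of the old inline-asm version, and the per-lane broadcast that ei_punpackp performs with _mm_shuffle_epi32. The function names (loadu_ps_sketch, punpackp_sketch) are illustrative only.

    #include <emmintrin.h>   // SSE2 intrinsics
    #include <cstdio>

    // Unaligned 4-float load assembled from two 64-bit halves, as in the new ei_ploadu.
    static inline __m128 loadu_ps_sketch(const float* from)
    {
      __m128d res = _mm_load_sd(reinterpret_cast<const double*>(from));    // lanes 0,1
      res = _mm_loadh_pd(res, reinterpret_cast<const double*>(from + 2));  // lanes 2,3
      return _mm_castpd_ps(res);
    }

    // Broadcast each of the four lanes of vecs[0] into its own register, as in ei_punpackp.
    static inline void punpackp_sketch(__m128* vecs)
    {
      vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
      vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA));
      vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF));
      vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
    }

    int main()
    {
      float data[5] = {0.f, 1.f, 2.f, 3.f, 4.f};
      __m128 v[4];
      v[0] = loadu_ps_sketch(data + 1);   // unaligned by construction
      punpackp_sketch(v);
      float out[4];
      _mm_storeu_ps(out, v[2]);           // expect {3,3,3,3}: broadcast of lane 2
      std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
      return 0;
    }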
The "dummy" stuff are required to enforce @@ -197,28 +198,27 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from) { EIGEN_ template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD - __m128 res; - asm volatile ("movsd %[from0], %[r]" : [r] "=x" (res) : [from0] "m" (*from), [dummy] "m" (*(from+1)) ); - asm volatile ("movhps %[from2], %[r]" : [r] "+x" (res) : [from2] "m" (*(from+2)), [dummy] "m" (*(from+3)) ); - return res; + __m128d res; + res = _mm_load_sd((const double*)(from)) ; + res = _mm_loadh_pd(res, (const double*)(from+2)) ; + return _mm_castpd_ps(res); } template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD __m128d res; - asm volatile ("movsd %[from0], %[r]" : [r] "=x" (res) : [from0] "m" (*from) ); - asm volatile ("movhpd %[from1], %[r]" : [r] "+x" (res) : [from1] "m" (*(from+1)) ); + res = _mm_load_sd(from) ; + res = _mm_loadh_pd(res,from+1); return res; } template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD - __m128i res; - asm volatile ("movsd %[from0], %[r]" : [r] "=x" (res) : [from0] "m" (*from), [dummy] "m" (*(from+1)) ); - asm volatile ("movhps %[from2], %[r]" : [r] "+x" (res) : [from2] "m" (*(from+2)), [dummy] "m" (*(from+3)) ); - return res; + __m128d res; + res = _mm_load_sd((const double*)(from)) ; + res = _mm_loadh_pd(res, (const double*)(from+2)) ; + return _mm_castpd_si128(res); } -#endif template<> EIGEN_STRONG_INLINE void ei_pstore(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); } template<> EIGEN_STRONG_INLINE void ei_pstore(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); } @@ -277,6 +277,13 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_pabs(const Packet4i& a) #endif } +EIGEN_STRONG_INLINE void ei_punpackp(Packet4f* vecs) +{ + vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55)); + vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA)); + vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF)); + vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00)); +} #ifdef __SSE3__ // TODO implement SSE2 versions as well as integer versions diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 18e913b0e..dfc92c346 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -30,7 +30,7 @@ #ifdef EIGEN_HAS_FUSE_CJMADD #define CJMADD(A,B,C,T) C = cj.pmadd(A,B,C); #else -#define CJMADD(A,B,C,T) T = A; T = cj.pmul(T,B); C = ei_padd(C,T); +#define CJMADD(A,B,C,T) T = B; T = cj.pmul(A,T); C = ei_padd(C,T); #endif // optimized GEneral packed Block * packed Panel product kernel @@ -48,9 +48,66 @@ struct ei_gebp_kernel const int peeled_mc = (rows/mr)*mr; const int peeled_mc2 = peeled_mc + (rows-peeled_mc >= PacketSize ? PacketSize : 0); const int peeled_kc = (depth/4)*4; + + Scalar* unpackedB = const_cast(blockB - strideB * nr * PacketSize); + // loops on each micro vertical panel of rhs (depth x nr) for(int j2=0; j2 we select a mr x nr micro block of res which is entirely // stored into mr/packet_size x nr registers. 
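The hunks above restructure the gebp micro kernel around an mr x nr block of the result held entirely in registers while the packed lhs and the unpacked rhs panel are streamed through once. A scalar (non-vectorized) sketch of that register-blocking idea follows; the 2 x 4 shape, the packing layout and the name gebp_2x4_sketch are simplified assumptions, not the actual Eigen kernel.

    #include <cstdio>

    // blockA: 2 x depth lhs panel stored as [a0(k), a1(k)] for k = 0..depth-1
    // blockB: depth x 4 rhs panel stored as [b0(k), b1(k), b2(k), b3(k)]
    void gebp_2x4_sketch(double* res, int resStride,
                         const double* blockA, const double* blockB, int depth)
    {
      // the 2 x 4 result block lives in local variables (registers) for the whole loop
      double C00=0, C01=0, C02=0, C03=0;
      double C10=0, C11=0, C12=0, C13=0;
      for (int k = 0; k < depth; ++k)
      {
        const double a0 = blockA[2*k+0], a1 = blockA[2*k+1];
        const double b0 = blockB[4*k+0], b1 = blockB[4*k+1];
        const double b2 = blockB[4*k+2], b3 = blockB[4*k+3];
        C00 += a0*b0; C01 += a0*b1; C02 += a0*b2; C03 += a0*b3;
        C10 += a1*b0; C11 += a1*b1; C12 += a1*b2; C13 += a1*b3;
      }
      // column-major writeback of the 2 x 4 block, touching res only once
      res[0*resStride+0] += C00; res[1*resStride+0] += C01;
      res[2*resStride+0] += C02; res[3*resStride+0] += C03;
      res[0*resStride+1] += C10; res[1*resStride+1] += C11;
      res[2*resStride+1] += C12; res[3*resStride+1] += C13;
    }

    int main()
    {
      const int depth = 3, resStride = 2;              // 2 x 4 result, column-major
      const double A[2*3] = {1,2, 1,2, 1,2};
      const double B[3*4] = {1,1,1,1, 1,1,1,1, 1,1,1,1};
      double res[2*4] = {0};
      gebp_2x4_sketch(res, resStride, A, B, depth);
      std::printf("res(0,0)=%g res(1,3)=%g\n", res[0], res[3*resStride+1]); // expect 3 and 6
      return 0;
    }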
@@ -65,19 +122,31 @@ struct ei_gebp_kernel // gets res block as register PacketType C0, C1, C2, C3, C4, C5, C6, C7; - C0 = ei_ploadu(&res[(j2+0)*resStride + i]); - C1 = ei_ploadu(&res[(j2+1)*resStride + i]); - if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i]); - if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i]); - C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]); - C5 = ei_ploadu(&res[(j2+1)*resStride + i + PacketSize]); - if(nr==4) C6 = ei_ploadu(&res[(j2+2)*resStride + i + PacketSize]); - if(nr==4) C7 = ei_ploadu(&res[(j2+3)*resStride + i + PacketSize]); + C0 = ei_pset1(Scalar(0)); + C1 = ei_pset1(Scalar(0)); + if(nr==4) C2 = ei_pset1(Scalar(0)); + if(nr==4) C3 = ei_pset1(Scalar(0)); + C4 = ei_pset1(Scalar(0)); + C5 = ei_pset1(Scalar(0)); + if(nr==4) C6 = ei_pset1(Scalar(0)); + if(nr==4) C7 = ei_pset1(Scalar(0)); + + Scalar* r0 = &res[(j2+0)*resStride + i]; + Scalar* r1 = r0 + resStride; + Scalar* r2 = r1 + resStride; + Scalar* r3 = r2 + resStride; + + #ifdef EIGEN_VECTORIZE_SSE + _mm_prefetch((const char*)(r0+16), _MM_HINT_T0); + _mm_prefetch((const char*)(r1+16), _MM_HINT_T0); + _mm_prefetch((const char*)(r2+16), _MM_HINT_T0); + _mm_prefetch((const char*)(r3+16), _MM_HINT_T0); + #endif // performs "inner" product // TODO let's check wether the flowing peeled loop could not be // optimized via optimal prefetching from one loop to the other - const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr]; + const Scalar* blB = unpackedB; for(int k=0; k=PacketSize) { @@ -256,81 +343,76 @@ struct ei_gebp_kernel if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i]); // performs "inner" product - const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr]; + const Scalar* blB = unpackedB; for(int k=0; k do the same but with nr==1 for(int j2=packet_cols; j2 for(int j2=0; j2 for(int j2=0; j2 for(int k=0; k(Blocking::Max_kc,depth); // cache block size along the K direction int mc = std::min(Blocking::Max_mc,rows); // cache block size along the M direction - Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize); + Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc*8); + std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols; + Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); + Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr; // For each horizontal panel of the rhs, and corresponding panel of the lhs... 
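Two of the changes visible above are that the accumulators now start at zero (so the result block is merged into res only once, after the depth loop) and that the destination columns are prefetched before the inner loop starts. Below is a minimal sketch of both, assuming an SSE-capable target; the function name and the single 4-wide column are illustrative, not the actual kernel.

    #include <xmmintrin.h>   // _mm_prefetch, SSE intrinsics
    #include <cstdio>

    // Accumulate one 4-float column of the result: prefetch the destination,
    // accumulate into a zeroed register, then read-modify-write res once at the end.
    void accumulate_column_sketch(float* r0, const float* a, const float* b, int depth)
    {
      _mm_prefetch(reinterpret_cast<const char*>(r0 + 16), _MM_HINT_T0);
      __m128 C0 = _mm_setzero_ps();
      for (int k = 0; k < depth; ++k)
        C0 = _mm_add_ps(C0, _mm_mul_ps(_mm_loadu_ps(a + 4*k), _mm_set1_ps(b[k])));
      _mm_storeu_ps(r0, _mm_add_ps(_mm_loadu_ps(r0), C0));
    }

    int main()
    {
      const int depth = 4;
      float a[16], b[4], r0[4] = {1,1,1,1};
      for (int i = 0; i < 16; ++i) a[i] = 1.f;
      for (int k = 0; k < 4;  ++k) b[k] = 2.f;
      accumulate_column_sketch(r0, a, b, depth);
      std::printf("%g %g %g %g\n", r0[0], r0[1], r0[2], r0[3]);  // expect 9 9 9 9
      return 0;
    }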
// (==GEMM_VAR1) @@ -111,7 +113,7 @@ static void run(int rows, int cols, int depth, } ei_aligned_stack_delete(Scalar, blockA, kc*mc); - ei_aligned_stack_delete(Scalar, blockB, kc*cols*Blocking::PacketSize); + ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB); } }; diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index 89cbc3ac0..785045db4 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -95,14 +95,14 @@ struct ei_symm_pack_rhs { for(int k=k2; k(Blocking::Max_mc,rows); // cache block size along the M direction Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize); + std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols; + Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); + Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr; ei_gebp_kernel > gebp_kernel; @@ -292,7 +294,7 @@ struct ei_product_selfadjoint_matrix(Blocking::Max_mc,rows); // cache block size along the M direction Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize); + std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols; + Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); + Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr; ei_gebp_kernel > gebp_kernel; @@ -346,7 +350,7 @@ struct ei_product_selfadjoint_matrix(Blocking::Max_mc,rows); // cache block size along the M direction Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize); + std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols; + Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); +// Scalar* allocatedBlockB = new Scalar[sizeB]; + Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr; Matrix triangularBuffer; triangularBuffer.setZero(); @@ -155,7 +158,7 @@ struct ei_product_triangular_matrix_matrix GEBP with the micro triangular block // The trick is to pack this micro block while filling the opposite triangular part with zeros. 
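Reading the allocation hunks above, the rhs buffer now appears to hold a scratch area of kc*PacketSize*nr scalars (into which the kernel spreads one rhs panel at a time, recovered as blockB - strideB*nr*PacketSize) immediately followed by the packed rhs of kc*cols scalars, instead of packing the whole rhs with per-coefficient duplication as before. A rough sketch of that layout arithmetic, with illustrative sizes:

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    int main()
    {
      const std::size_t kc = 256, cols = 512;     // cache-block depth and rhs columns
      const std::size_t PacketSize = 4, nr = 4;   // e.g. 4 floats per SSE packet, 4 rhs columns per panel

      const std::size_t scratch = kc * PacketSize * nr;   // room for one spread/unpacked panel
      const std::size_t sizeB   = scratch + kc * cols;    // scratch area + packed rhs

      std::vector<float> allocatedBlockB(sizeB);
      float* blockB    = allocatedBlockB.data() + scratch; // packed rhs starts after the scratch area
      float* unpackedB = blockB - scratch;                 // what the kernel reconstructs

      std::printf("total %zu scalars, packed rhs at offset %td\n",
                  sizeB, blockB - allocatedBlockB.data());
      (void)unpackedB;
      return 0;
    }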
- // To this end we do an extra triangular copy to small temporary buffer + // To this end we do an extra triangular copy to a small temporary buffer for (int k=0;k0) @@ -176,7 +179,7 @@ struct ei_product_triangular_matrix_matrix(Blocking::Max_mc,rows); // cache block size along the M direction Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize); + std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols; + Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar,sizeB); + Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr; Matrix triangularBuffer; triangularBuffer.setZero(); @@ -252,7 +258,7 @@ struct ei_product_triangular_matrix_matrix(Blocking::Max_mc,size); // cache block size along the M direction Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize); + std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols; + Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); + Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr; ei_conj_if conj; ei_gebp_kernel > gebp_kernel; @@ -146,7 +148,7 @@ struct ei_triangular_solve_matrix(Blocking::Max_mc,size); // cache block size along the M direction Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - Scalar* blockB = ei_aligned_stack_new(Scalar, kc*size*Blocking::PacketSize); + std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*size; + Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); + Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr; ei_conj_if conj; ei_gebp_kernel > gebp_kernel; @@ -215,7 +219,7 @@ struct ei_triangular_solve_matrix0) pack_rhs(geb, &rhs(actual_k2,startPanel), triStride, -1, actual_kc, rs); @@ -230,7 +234,7 @@ struct ei_triangular_solve_matrix0) - pack_rhs_panel(blockB+j2*actual_kc*Blocking::PacketSize, + pack_rhs_panel(blockB+j2*actual_kc, &rhs(actual_k2+panelOffset, actual_j2), triStride, -1, panelLength, actualPanelWidth, actual_kc, panelOffset); @@ -260,10 +264,10 @@ struct ei_triangular_solve_matrix0) { gebp_kernel(&lhs(i2,absolute_j2), otherStride, - blockA, blockB+j2*actual_kc*Blocking::PacketSize, + blockA, blockB+j2*actual_kc, actual_mc, panelLength, actualPanelWidth, actual_kc, actual_kc, // strides - panelOffset, panelOffset*Blocking::PacketSize); // offsets + panelOffset, panelOffset); // offsets } // unblocked triangular solve @@ -298,7 +302,7 @@ struct ei_triangular_solve_matrix(a.data()),&lda, + const_cast(b.data()),&ldb,&fone, + c.data(),&ldc); +} + +void blas_gemm(const MatrixXd& a, const MatrixXd& b, MatrixXd& c) +{ int M = c.rows(); int N = c.cols(); int K = a.cols(); @@ -40,16 +55,16 @@ void blas_gemm(const MatrixXf& a, const MatrixXf& b, MatrixXf& c) int ldb = b.rows(); int ldc = c.rows(); - sgemm_(¬rans,¬rans,&M,&N,&K,&fone, - const_cast(a.data()),&lda, - const_cast(b.data()),&ldb,&fzero, + dgemm_(¬rans,¬rans,&M,&N,&K,&done, + const_cast(a.data()),&lda, + const_cast(b.data()),&ldb,&done, c.data(),&ldc); } int main(int argc, char **argv) { - int rep = 2; - int s = 1024; + int rep = 1; + int s = 2048; int m = s; int n = s; int p = s;
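For comparison, a hedged reconstruction of what the new double-precision blas_gemm overload in bench_gemm_blas.cpp does, assuming a Fortran BLAS is linked and that the dgemm_ symbol follows the usual trailing-underscore convention: a column-major call with alpha = beta = 1, i.e. c += a*b, matching the Eigen expression benchmarked in bench_gemm.cpp. The const-qualified prototype below is an assumption; the benchmark itself declares non-const pointers and uses const_cast.

    #include <Eigen/Core>
    using namespace Eigen;

    extern "C" void dgemm_(const char* transa, const char* transb,
                           const int* m, const int* n, const int* k,
                           const double* alpha, const double* a, const int* lda,
                           const double* b, const int* ldb,
                           const double* beta, double* c, const int* ldc);

    // c += a*b via BLAS, Eigen matrices being column-major by default
    void blas_gemm_sketch(const MatrixXd& a, const MatrixXd& b, MatrixXd& c)
    {
      const char notrans = 'N';
      const double done = 1.0;
      int M = c.rows(), N = c.cols(), K = a.cols();
      int lda = a.rows(), ldb = b.rows(), ldc = c.rows();
      dgemm_(&notrans, &notrans, &M, &N, &K, &done,
             a.data(), &lda, b.data(), &ldb, &done, c.data(), &ldc);
    }

    int main()
    {
      MatrixXd a = MatrixXd::Random(64,64), b = MatrixXd::Random(64,64), c = MatrixXd::Zero(64,64);
      blas_gemm_sketch(a, b, c);
      return (c - a*b).norm() < 1e-9 ? 0 : 1;   // sanity check of the beta = 1 semantics
    }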