From c6d06c22ac6f4b1c2923f050a3b8087f7b717ab0 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 24 Jul 2009 10:53:31 +0200 Subject: [PATCH] some cleaning --- Eigen/src/Core/products/GeneralMatrixMatrix.h | 122 +++---- .../Core/products/SelfadjointMatrixMatrix.h | 39 +-- Eigen/src/Core/products/SelfadjointProduct.h | 327 ++---------------- 3 files changed, 108 insertions(+), 380 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 9166921fe..776b7c5d6 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -78,9 +78,6 @@ static void run(int rows, int cols, int depth, Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize); - // number of columns which can be processed by packet of nr columns - int packet_cols = (cols/Blocking::nr) * Blocking::nr; - // => GEMM_VAR1 for(int k2=0; k2()(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, packet_cols, cols); + ei_gemm_pack_rhs()(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols); // => GEPP_VAR1 for(int i2=0; i2()(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc); ei_gebp_kernel >() - (res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols); + (res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols); } } @@ -113,19 +110,20 @@ static void run(int rows, int cols, int depth, template struct ei_gebp_kernel { - void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int actual_mc, int actual_kc, int packet_cols, int i2, int cols) + void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int rows, int depth, int cols) { typedef typename ei_packet_traits::type PacketType; enum { PacketSize = ei_packet_traits::size }; Conj cj; - const int peeled_mc = (actual_mc/mr)*mr; + int packet_cols = (cols/nr) * nr; + const int peeled_mc = (rows/mr)*mr; // loops on each cache friendly block of the result/rhs for(int j2=0; j2 struct ei_gemm_pack_lhs { - void operator()(Scalar* blockA, const EIGEN_RESTRICT Scalar* _lhs, int lhsStride, int actual_kc, int actual_mc) + void operator()(Scalar* blockA, const EIGEN_RESTRICT Scalar* _lhs, int lhsStride, int depth, int rows) { ei_conj_if::IsComplex && Conjugate> cj; ei_const_blas_data_mapper lhs(_lhs,lhsStride); int count = 0; - const int peeled_mc = (actual_mc/mr)*mr; + const int peeled_mc = (rows/mr)*mr; for(int i=0; i struct ei_gemm_pack_rhs { enum { PacketSize = ei_packet_traits::size }; - void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int actual_kc, int packet_cols, int cols) + void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols) { bool hasAlpha = alpha != Scalar(1); + int packet_cols = (cols/nr) * nr; int count = 0; for(int j2=0; j2 const Scalar* b3 = &rhs[(j2+3)*rhsStride]; if (hasAlpha) { - for(int k=0; k } else { - for(int k=0; k const Scalar* b0 = &rhs[(j2+0)*rhsStride]; if (hasAlpha) { - for(int k=0; k } else { - for(int k=0; k struct ei_gemm_pack_rhs { enum { PacketSize = ei_packet_traits::size }; - void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int actual_kc, int packet_cols, int cols) + void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols) { bool hasAlpha = alpha != Scalar(1); + int packet_cols = (cols/nr) * nr; int count = 0; for(int j2=0; j2 } else { - for(int k=0; k for(int j2=packet_cols; j2 struct ei_symm_pack_lhs { - void operator()(Scalar* blockA, const Scalar* _lhs, int lhsStride, int actual_kc, int actual_mc) + void operator()(Scalar* blockA, const Scalar* _lhs, int lhsStride, int cols, int rows) { ei_const_blas_data_mapper lhs(_lhs,lhsStride); int count = 0; - const int peeled_mc = (actual_mc/mr)*mr; + const int peeled_mc = (rows/mr)*mr; for(int i=0; i struct ei_symm_pack_rhs { enum { PacketSize = ei_packet_traits::size }; - void operator()(Scalar* blockB, const Scalar* _rhs, int rhsStride, Scalar alpha, int actual_kc, int packet_cols, int cols, int k2) + void operator()(Scalar* blockB, const Scalar* _rhs, int rhsStride, Scalar alpha, int rows, int cols, int k2) { - int end_k = k2 + actual_kc; + int end_k = k2 + rows; int count = 0; ei_const_blas_data_mapper rhs(_rhs,rhsStride); + int packet_cols = (cols/nr)*nr; // first part: normal case for(int j2=0; j2 > gebp_kernel; for(int k2=0; k2() - (blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, packet_cols, cols); + (blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols); // the select lhs's panel has to be split in three different parts: // 1 - the transposed panel above the diagonal block => transposed packed copy @@ -259,7 +257,7 @@ struct ei_product_selfadjoint_matrix() (blockA, &lhs(k2, i2), lhsStride, actual_kc, actual_mc); - gebp_kernel(res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols); + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols); } // the block diagonal { @@ -268,7 +266,7 @@ struct ei_product_selfadjoint_matrix() (blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc); - gebp_kernel(res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, k2, cols); + gebp_kernel(res+k2, resStride, blockA, blockB, actual_mc, actual_kc, cols); } for(int i2=k2+kc; i2() (blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc); - gebp_kernel(res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols); + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols); } } @@ -316,9 +314,6 @@ struct ei_product_selfadjoint_matrix > gebp_kernel; for(int k2=0; k2() - (blockB, _rhs, rhsStride, alpha, actual_kc, packet_cols, cols, k2); + (blockB, _rhs, rhsStride, alpha, actual_kc, cols, k2); // => GEPP for(int i2=0; i2() (blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc); - gebp_kernel(res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols); + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols); } } diff --git a/Eigen/src/Core/products/SelfadjointProduct.h b/Eigen/src/Core/products/SelfadjointProduct.h index facde7252..d971720d6 100644 --- a/Eigen/src/Core/products/SelfadjointProduct.h +++ b/Eigen/src/Core/products/SelfadjointProduct.h @@ -70,7 +70,7 @@ struct ei_selfadjoint_product typedef ei_product_blocking_traits Blocking; - int kc = std::min(Blocking::Max_kc,depth); // cache block size along the K direction + int kc = std::min(Blocking::Max_kc,depth); // cache block size along the K direction int mc = std::min(Blocking::Max_mc,size); // cache block size along the M direction Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); @@ -90,7 +90,7 @@ struct ei_selfadjoint_product // note that the actual rhs is the transpose/adjoint of mat ei_gemm_pack_rhs() - (blockB, &mat(0,k2), matStride, alpha, actual_kc, packet_cols, size); + (blockB, &mat(0,k2), matStride, alpha, actual_kc, size); for(int i2=0; i2 // 2 - the actual_mc x actual_mc symmetric block => processed with a special kernel // 3 - after the diagonal => processed with gebp or skipped if (UpLo==LowerTriangular) - gebp_kernel(res, resStride, blockA, blockB, actual_mc, actual_kc, std::min(packet_cols,i2), i2, std::min(size,i2)); + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, std::min(size,i2)); ei_sybb_kernel() - (res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*Blocking::PacketSize*i2, actual_mc, actual_kc, std::min(actual_mc,std::max(packet_cols-i2,0))); + (res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*Blocking::PacketSize*i2, actual_mc, actual_kc); if (UpLo==UpperTriangular) { int j2 = i2+actual_mc; - gebp_kernel(res+resStride*j2, resStride, blockA, blockB+actual_kc*Blocking::PacketSize*j2, actual_mc, actual_kc, - std::max(0,packet_cols-j2), i2, std::max(0,size-j2)); + gebp_kernel(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*Blocking::PacketSize*j2, actual_mc, actual_kc, std::max(0,size-j2)); } } } @@ -149,323 +148,57 @@ SelfAdjointView& SelfAdjointView } -// optimized SYmmetric packed Block * packed Block product kernel +// Optimized SYmmetric packed Block * packed Block product kernel. +// This kernel is built on top of the gebp kernel: +// - the current selfadjoint block (res) is processed per panel of actual_mc x BlockSize +// where BlockSize is set to the minimal value allowing gebp to be as fast as possible +// - then, as usual, each panel is split into three parts along the diagonal, +// the sub blocks above and below the diagonal are processed as usual, +// while the selfadjoint block overlapping the diagonal is evaluated into a +// small temporary buffer which is then accumulated into the result using a +// triangular traversal. template struct ei_sybb_kernel { enum { PacketSize = ei_packet_traits::size, - BlockSize = EIGEN_ENUM_MAX(mr,nr) + BlockSize = EIGEN_ENUM_MAX(mr,nr) }; - void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int actual_mc, int actual_kc, int packet_cols) + void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int size, int depth) { ei_gebp_kernel gebp_kernel; Matrix buffer; - + // let's process the block per panel of actual_mc x BlockSize, // again, each is split into three parts, etc. - for (int j=0; j(BlockSize,size - j); + const Scalar* actual_b = blockB+j*depth*PacketSize; + if(UpLo==UpperTriangular) - gebp_kernel(res+j*resStride, resStride, blockA, actual_b, - j, actual_kc, actualPacketCols, 0, actualBlockSize); + gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize); // selfadjoint micro block - // => use the gebp kernel on a temporary buffer, - // and then perform a triangular copy { int i = j; buffer.setZero(); - gebp_kernel(buffer.data(), BlockSize, blockA+actual_kc*i, actual_b, - actualBlockSize, actual_kc, actualPacketCols, 0, actualBlockSize); + // 1 - apply the kernel on the temporary buffer + gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize); + // 2 - triangular accumulation for(int j1=0; j1 TODO find a way to factorize the two kernels in a single one -template -struct ei_sybb_kernel_ -{ - void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int actual_mc, int actual_kc, int packet_cols) - { - typedef typename ei_packet_traits::type PacketType; - enum { PacketSize = ei_packet_traits::size }; - Conj cj; - const int peeled_mc = (actual_mc/mr)*mr; - // loops on each cache friendly block of the result/rhs - for(int j2=0; j2=j2+nr : i+mr<=j2) - { - ei_pstoreu(&res[(j2+0)*resStride + i], C0); - ei_pstoreu(&res[(j2+1)*resStride + i], C1); - if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i], C2); - if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i], C3); - ei_pstoreu(&res[(j2+0)*resStride + i + PacketSize], C4); - ei_pstoreu(&res[(j2+1)*resStride + i + PacketSize], C5); - if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i + PacketSize], C6); - if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i + PacketSize], C7); - } - else - { - Scalar buf[mr*nr]; - // overlap => copy to a temporary mr x nr buffer and then triangular copy - ei_pstore(&buf[0*mr], C0); - ei_pstore(&buf[1*mr], C1); - if(nr==4) ei_pstore(&buf[2*mr], C2); - if(nr==4) ei_pstore(&buf[3*mr], C3); - ei_pstore(&buf[0*mr + PacketSize], C4); - ei_pstore(&buf[1*mr + PacketSize], C5); - if(nr==4) ei_pstore(&buf[2*mr + PacketSize], C6); - if(nr==4) ei_pstore(&buf[3*mr + PacketSize], C7); - - for(int j1=0; j1= j2+j1 : i+i1 <= j2+j1) - res[(j2+j1)*resStride + i+i1] = buf[i1 + j1 * mr]; - } - } - } - for(int i=std::max(start_i,peeled_mc); i=j2+nr : i+mr<=j2) { - res[(j2+0)*resStride + i] += C0; - res[(j2+1)*resStride + i] += C1; - if(nr==4) res[(j2+2)*resStride + i] += C2; - if(nr==4) res[(j2+3)*resStride + i] += C3; - } - else - { - if(UpLo==LowerTriangular ? i>=j2+0 : i<=j2+0) res[(j2+0)*resStride + i] += C0; - if(UpLo==LowerTriangular ? i>=j2+1 : i<=j2+1) res[(j2+1)*resStride + i] += C1; - if(nr==4) if(UpLo==LowerTriangular ? i>=j2+2 : i<=j2+2) res[(j2+2)*resStride + i] += C2; - if(nr==4) if(UpLo==LowerTriangular ? i>=j2+3 : i<=j2+3) res[(j2+3)*resStride + i] += C3; - } - } - } - - // process remaining rhs/res columns one at a time - // => do the same but with nr==1 - for(int j2=packet_cols; j2=j2 : i<=j2) ei_pstoreu(&res[(j2+0)*resStride + i], C0); - if(UpLo==LowerTriangular ? i+PacketSize>=j2 : i+PacketSize<=j2) ei_pstoreu(&res[(j2+0)*resStride + i + PacketSize], C4); - } - if(UpLo==LowerTriangular) - start_i = j2; - for(int i=std::max(start_i,peeled_mc); i