diff --git a/Eigen/src/Core/products/SelfadjointProduct.h b/Eigen/src/Core/products/SelfadjointProduct.h index 8967f62be..b10b009e8 100644 --- a/Eigen/src/Core/products/SelfadjointProduct.h +++ b/Eigen/src/Core/products/SelfadjointProduct.h @@ -74,47 +74,51 @@ struct ei_selfadjoint_product int mc = std::min(Blocking::Max_mc,size); // cache block size along the M direction Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - Scalar* blockB = ei_aligned_stack_new(Scalar, kc*size*Blocking::PacketSize); - + std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*size; + Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); + Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr; + // note that the actual rhs is the transpose/adjoint of mat typedef ei_conj_helper::IsComplex && !AAT, NumTraits::IsComplex && AAT> Conj; ei_gebp_kernel gebp_kernel; + ei_gemm_pack_rhs pack_rhs; + ei_gemm_pack_lhs pack_lhs; + ei_sybb_kernel sybb; for(int k2=0; k2() - (blockB, &mat(0,k2), matStride, alpha, actual_kc, size); + pack_rhs(blockB, &mat(0,k2), matStride, alpha, actual_kc, size); for(int i2=0; i2() - (blockA, &mat(i2, k2), matStride, actual_kc, actual_mc); + pack_lhs(blockA, &mat(i2, k2), matStride, actual_kc, actual_mc); // the selected actual_mc * size panel of res is split into three different part: // 1 - before the diagonal => processed with gebp or skipped // 2 - the actual_mc x actual_mc symmetric block => processed with a special kernel // 3 - after the diagonal => processed with gebp or skipped if (UpLo==Lower) - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, std::min(size,i2)); + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, std::min(size,i2), + -1, -1, 0, 0, allocatedBlockB); - ei_sybb_kernel() - (res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*Blocking::PacketSize*i2, actual_mc, actual_kc); + sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, allocatedBlockB); if (UpLo==Upper) { int j2 = i2+actual_mc; - gebp_kernel(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*Blocking::PacketSize*j2, actual_mc, actual_kc, std::max(0,size-j2)); + gebp_kernel(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, std::max(0,size-j2), + -1, -1, 0, 0, allocatedBlockB); } } } ei_aligned_stack_delete(Scalar, blockA, kc*mc); - ei_aligned_stack_delete(Scalar, blockB, kc*size*Blocking::PacketSize); + ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB); } }; @@ -161,7 +165,7 @@ struct ei_sybb_kernel PacketSize = ei_packet_traits::size, BlockSize = EIGEN_ENUM_MAX(mr,nr) }; - void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int size, int depth) + void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int size, int depth, Scalar* workspace) { ei_gebp_kernel gebp_kernel; Matrix buffer; @@ -171,7 +175,7 @@ struct ei_sybb_kernel for (int j=0; j(BlockSize,size - j); - const Scalar* actual_b = blockB+j*depth*PacketSize; + const Scalar* actual_b = blockB+j*depth; if(UpLo==Upper) gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize); @@ -181,7 +185,8 @@ struct ei_sybb_kernel int i = j; buffer.setZero(); // 1 - apply the kernel on the temporary buffer - gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize); + gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, + -1, -1, 0, 0, workspace); // 2 - triangular accumulation for(int j1=0; j1