diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index baa25907c..8c0fd9a7f 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -277,13 +277,9 @@ template<> EIGEN_STRONG_INLINE Packet1cd ei_pmul(const Packet1cd& a, { // TODO optimize it for SSE3 and 4 #ifdef EIGEN_VECTORIZE_SSE3 -// return Packet1cd(_mm_addsub_pd(_mm_mul_pd(a.v, b.v), -// _mm_mul_pd(a.v, b.v/*ei_vec2d_swizzle1(b.v, 1, 0)*/))); - return Packet1cd(_mm_add_pd(_mm_mul_pd(a.v, b.v), - _mm_mul_pd(a.v, ei_vec2d_swizzle1(b.v, 1, 0)))); -// return Packet1cd(_mm_addsub_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v), -// _mm_mul_pd(ei_vec2d_swizzle1(a.v, 1, 1), -// ei_vec2d_swizzle1(b.v, 1, 0)))); + return Packet1cd(_mm_addsub_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v), + _mm_mul_pd(ei_vec2d_swizzle1(a.v, 1, 1), + ei_vec2d_swizzle1(b.v, 1, 0)))); #else const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0)); return Packet1cd(_mm_add_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v), diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index a5297a31f..6824bc64d 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -25,6 +25,9 @@ #ifndef EIGEN_GENERAL_BLOCK_PANEL_H #define EIGEN_GENERAL_BLOCK_PANEL_H +template +class ei_gebp_traits; + /** \internal */ inline void ei_manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0) { @@ -97,7 +100,7 @@ inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2) * for matrix products and related algorithms. The blocking sizes depends on various * parameters: * - the L1 and L2 cache sizes, - * - the register level blocking sizes defined by ei_product_blocking_traits, + * - the register level blocking sizes defined by ei_gebp_traits, * - the number of scalars that fit into a packet (when vectorization is enabled). * * \sa setCpuCacheSizes */ @@ -114,9 +117,9 @@ void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrd std::ptrdiff_t l1, l2; enum { - kdiv = KcFactor * 2 * ei_product_blocking_traits::nr + kdiv = KcFactor * 2 * ei_gebp_traits::nr * ei_packet_traits::size * sizeof(RhsScalar), - mr = ei_product_blocking_traits::mr, + mr = ei_gebp_traits::mr, mr_mask = (0xffffffff/mr)*mr }; @@ -184,10 +187,20 @@ public: enum { ConjLhs = _ConjLhs, ConjRhs = _ConjRhs, - Vectorizable = ei_product_blocking_traits::Vectorizable, + Vectorizable = ei_packet_traits::Vectorizable && ei_packet_traits::Vectorizable, LhsPacketSize = Vectorizable ? ei_packet_traits::size : 1, RhsPacketSize = Vectorizable ? ei_packet_traits::size : 1, ResPacketSize = Vectorizable ? ei_packet_traits::size : 1, + + NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, + + // register block size along the N direction (must be either 2 or 4) + nr = NumberOfRegisters/4, + + // register block size along the M direction (currently, this one cannot be modified) + mr = 2 * LhsPacketSize, + + WorkSpaceFactor = nr * RhsPacketSize, LhsProgress = LhsPacketSize, RhsProgress = RhsPacketSize @@ -250,10 +263,15 @@ public: enum { ConjLhs = _ConjLhs, ConjRhs = false, - Vectorizable = ei_product_blocking_traits::Vectorizable, + Vectorizable = ei_packet_traits::Vectorizable && ei_packet_traits::Vectorizable, LhsPacketSize = Vectorizable ? ei_packet_traits::size : 1, RhsPacketSize = Vectorizable ? ei_packet_traits::size : 1, ResPacketSize = Vectorizable ? ei_packet_traits::size : 1, + + NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, + nr = NumberOfRegisters/4, + mr = 2 * LhsPacketSize, + WorkSpaceFactor = nr*RhsPacketSize, LhsProgress = LhsPacketSize, RhsProgress = RhsPacketSize @@ -307,12 +325,11 @@ public: EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const { - r = ei_pmadd(c,alpha,r); + r = cj.pmadd(c,alpha,r); } protected: -// ei_conj_helper cj; -// ei_conj_helper pcj; + ei_conj_helper cj; }; template @@ -331,6 +348,10 @@ public: && ei_packet_traits::Vectorizable, RealPacketSize = Vectorizable ? ei_packet_traits::size : 1, ResPacketSize = Vectorizable ? ei_packet_traits::size : 1, + + nr = 2, + mr = 2 * ResPacketSize, + WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr, LhsProgress = ResPacketSize, RhsProgress = Vectorizable ? 2*ResPacketSize : 1 @@ -424,7 +445,7 @@ public: else if((ConjLhs)&&(ConjRhs)) { tmp = ei_pcplxflip(ResPacket(c.second)); - tmp = ei_padd(ei_pconj(ResPacket(c.first)),tmp); + tmp = ei_psub(ei_pconj(ResPacket(c.first)),tmp); } r = ei_pmadd(tmp,alpha,r); @@ -434,21 +455,6 @@ protected: ei_conj_helper cj; }; -/* Specialization for real * complex. - * The only subtility is how the lhs coefficients are loaded. - */ -template -struct ei_product_blocking_traits > -{ - typedef std::complex Scalar; - enum { - Vectorizable = ei_packet_traits::Vectorizable, - PacketSize = ei_packet_traits::size, - nr = 4, - mr = 2*PacketSize - }; -}; - template class ei_gebp_traits, false, _ConjRhs > { @@ -465,7 +471,12 @@ public: && ei_packet_traits::Vectorizable, LhsPacketSize = Vectorizable ? ei_packet_traits::size : 1, RhsPacketSize = Vectorizable ? ei_packet_traits::size : 1, - ResPacketSize = Vectorizable ? ei_packet_traits::size : 1, + ResPacketSize = Vectorizable ? ei_packet_traits::size : 1, + + NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, + nr = 4, + mr = 2*ResPacketSize, + WorkSpaceFactor = nr*RhsPacketSize, LhsProgress = ResPacketSize, RhsProgress = ResPacketSize @@ -500,7 +511,6 @@ public: EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { dest = ei_ploaddup(a); -// dest = ei_pload(a); } EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const @@ -878,7 +888,7 @@ EIGEN_ASM_COMMENT("mybegin4"); traits.madd(A0,B1,C1,B1); traits.loadRhs(&blB[13*RhsProgress], B1); traits.madd(A0,B2,C2,B2); - traits.loadRhs(&blB[4*RhsProgress], B2); + traits.loadRhs(&blB[14*RhsProgress], B2); traits.madd(A0,B3,C3,B3); traits.loadLhs(&blA[3*LhsProgress], A0); diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 09c7fe24c..27535a980 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -73,22 +73,15 @@ static void run(Index rows, Index cols, Index depth, ei_const_blas_data_mapper lhs(_lhs,lhsStride); ei_const_blas_data_mapper rhs(_rhs,rhsStride); - typedef ei_product_blocking_traits Blocking; + typedef ei_gebp_traits Traits; Index kc = blocking.kc(); // cache block size along the K direction Index mc = std::min(rows,blocking.mc()); // cache block size along the M direction //Index nc = blocking.nc(); // cache block size along the N direction - // FIXME starting from SSE3, normal complex product cannot be optimized as well as - // conjugate product, therefore it is better to conjugate during the copies. - // With SSE2, this is the other way round. - ei_gemm_pack_lhs pack_lhs; - ei_gemm_pack_rhs pack_rhs; - ei_gebp_kernel gebp; - -// ei_gemm_pack_lhs pack_lhs; -// ei_gemm_pack_rhs pack_rhs; -// ei_gebp_kernel gebp; + ei_gemm_pack_lhs pack_lhs; + ei_gemm_pack_rhs pack_rhs; + ei_gebp_kernel gebp; #ifdef EIGEN_HAS_OPENMP if(info) @@ -98,7 +91,7 @@ static void run(Index rows, Index cols, Index depth, Index threads = omp_get_num_threads(); Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - std::size_t sizeW = kc*Blocking::PacketSize*Blocking::nr*8; + std::size_t sizeW = kc*Traits::WorkSpaceFactor; Scalar* w = ei_aligned_stack_new(Scalar, sizeW); Scalar* blockB = blocking.blockB(); ei_internal_assert(blockB!=0); @@ -170,10 +163,10 @@ static void run(Index rows, Index cols, Index depth, // this is the sequential version! std::size_t sizeA = kc*mc; std::size_t sizeB = kc*cols; - std::size_t sizeW = kc*ei_packet_traits::size*Blocking::nr*2; + std::size_t sizeW = kc*Traits::WorkSpaceFactor; LhsScalar *blockA = blocking.blockA()==0 ? ei_aligned_stack_new(LhsScalar, sizeA) : blocking.blockA(); RhsScalar *blockB = blocking.blockB()==0 ? ei_aligned_stack_new(RhsScalar, sizeB) : blocking.blockB(); - RhsScalar *blockW = blocking.blockW()==0 ? ei_aligned_stack_new(RhsScalar, sizeW) : blocking.blockW(); + RhsScalar *blockW = /*blocking.blockW()==0 ?*/ ei_aligned_stack_new(RhsScalar, sizeW) /*: blocking.blockW()*/; // For each horizontal panel of the rhs, and corresponding panel of the lhs... // (==GEMM_VAR1) @@ -302,11 +295,11 @@ class ei_gemm_blocking_space::ret LhsScalar; typedef typename ei_meta_if::ret RhsScalar; - typedef ei_product_blocking_traits Blocking; + typedef ei_gebp_traits Traits; enum { SizeA = ActualRows * MaxDepth, SizeB = ActualCols * MaxDepth, - SizeW = MaxDepth * Blocking::nr * ei_packet_traits::size + SizeW = MaxDepth * Traits::nr * ei_packet_traits::size }; EIGEN_ALIGN16 LhsScalar m_staticA[SizeA]; @@ -342,7 +335,7 @@ class ei_gemm_blocking_space::ret LhsScalar; typedef typename ei_meta_if::ret RhsScalar; - typedef ei_product_blocking_traits Blocking; + typedef ei_gebp_traits Traits; DenseIndex m_sizeA; DenseIndex m_sizeB; @@ -359,7 +352,7 @@ class ei_gemm_blocking_space(this->m_kc, this->m_mc, this->m_nc); m_sizeA = this->m_mc * this->m_kc; m_sizeB = this->m_kc * this->m_nc; - m_sizeW = this->m_kc*ei_packet_traits::size*Blocking::nr; + m_sizeW = this->m_kc*Traits::WorkSpaceFactor; } void allocateA() diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index 0488114b9..be5248752 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -67,7 +67,7 @@ struct ei_symm_pack_lhs if(rows-peeled_mc>=PacketSize) { - pack(blockA, lhs, cols, peeled_mc, count); + pack(blockA, lhs, cols, peeled_mc, count); peeled_mc += PacketSize; } @@ -253,9 +253,9 @@ struct ei_product_selfadjoint_matrix lhs(_lhs,lhsStride); ei_const_blas_data_mapper rhs(_rhs,rhsStride); - typedef ei_product_blocking_traits Blocking; + typedef ei_gebp_traits Traits; - Index kc = size; // cache block size along the K direction + Index kc = size; // cache block size along the K direction Index mc = rows; // cache block size along the M direction Index nc = cols; // cache block size along the N direction computeProductBlockingSizes(kc, mc, nc); @@ -263,14 +263,15 @@ struct ei_product_selfadjoint_matrix::size*Blocking::nr + kc*cols; + std::size_t sizeW = kc*Traits::WorkSpaceFactor; + std::size_t sizeB = sizeW + kc*cols; Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); - Scalar* blockB = allocatedBlockB + kc*ei_packet_traits::size*Blocking::nr; + Scalar* blockB = allocatedBlockB + sizeW; - ei_gebp_kernel gebp_kernel; - ei_symm_pack_lhs pack_lhs; - ei_gemm_pack_rhs pack_rhs; - ei_gemm_pack_lhs pack_lhs_transposed; + ei_gebp_kernel gebp_kernel; + ei_symm_pack_lhs pack_lhs; + ei_gemm_pack_rhs pack_rhs; + ei_gemm_pack_lhs pack_lhs_transposed; for(Index k2=0; k2() + ei_gemm_pack_lhs() (blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc); gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); @@ -335,7 +336,7 @@ struct ei_product_selfadjoint_matrix lhs(_lhs,lhsStride); - typedef ei_product_blocking_traits Blocking; + typedef ei_gebp_traits Traits; Index kc = size; // cache block size along the K direction Index mc = rows; // cache block size along the M direction @@ -343,13 +344,14 @@ struct ei_product_selfadjoint_matrix(kc, mc, nc); Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - std::size_t sizeB = kc*ei_packet_traits::size*Blocking::nr + kc*cols; + std::size_t sizeW = kc*Traits::WorkSpaceFactor; + std::size_t sizeB = sizeW + kc*cols; Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); - Scalar* blockB = allocatedBlockB + kc*ei_packet_traits::size*Blocking::nr; + Scalar* blockB = allocatedBlockB + sizeW; - ei_gebp_kernel gebp_kernel; - ei_gemm_pack_lhs pack_lhs; - ei_symm_pack_rhs pack_rhs; + ei_gebp_kernel gebp_kernel; + ei_gemm_pack_lhs pack_lhs; + ei_symm_pack_rhs pack_rhs; for(Index k2=0; k2 Blocking; + typedef ei_gebp_traits Traits; Index kc = depth; // cache block size along the K direction Index mc = size; // cache block size along the M direction Index nc = size; // cache block size along the N direction computeProductBlockingSizes(kc, mc, nc); // !!! mc must be a multiple of nr: - if(mc>Blocking::nr) - mc = (mc/Blocking::nr)*Blocking::nr; + if(mc>Traits::nr) + mc = (mc/Traits::nr)*Traits::nr; Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - std::size_t sizeB = kc*ei_packet_traits::size*Blocking::nr + kc*size; + std::size_t sizeW = kc*Traits::WorkSpaceFactor; + std::size_t sizeB = sizeW + kc*size; Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); - Scalar* blockB = allocatedBlockB + kc*ei_packet_traits::size*Blocking::nr; + Scalar* blockB = allocatedBlockB + sizeW; // note that the actual rhs is the transpose/adjoint of mat enum { @@ -89,10 +90,10 @@ struct ei_selfadjoint_product::IsComplex && AAT }; - ei_gebp_kernel gebp_kernel; - ei_gemm_pack_rhs pack_rhs; - ei_gemm_pack_lhs pack_lhs; - ei_sybb_kernel sybb; + ei_gebp_kernel gebp_kernel; + ei_gemm_pack_rhs pack_rhs; + ei_gemm_pack_lhs pack_lhs; + ei_sybb_kernel sybb; for(Index k2=0; k2 lhs(_lhs,lhsStride); ei_const_blas_data_mapper rhs(_rhs,rhsStride); - typedef ei_product_blocking_traits Blocking; + typedef ei_gebp_traits Traits; enum { - SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Blocking::mr,Blocking::nr), + SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), IsLower = (Mode&Lower) == Lower }; @@ -117,18 +117,19 @@ struct ei_product_triangular_matrix_matrix(kc, mc, nc); Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - std::size_t sizeB = kc*ei_packet_traits::size*Blocking::nr + kc*cols; + std::size_t sizeW = kc*Traits::WorkSpaceFactor; + std::size_t sizeB = sizeW + kc*cols; Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); // Scalar* allocatedBlockB = new Scalar[sizeB]; - Scalar* blockB = allocatedBlockB + kc*ei_packet_traits::size*Blocking::nr; + Scalar* blockB = allocatedBlockB + sizeW; Matrix triangularBuffer; triangularBuffer.setZero(); triangularBuffer.diagonal().setOnes(); - ei_gebp_kernel gebp_kernel; - ei_gemm_pack_lhs pack_lhs; - ei_gemm_pack_rhs pack_rhs; + ei_gebp_kernel gebp_kernel; + ei_gemm_pack_lhs pack_lhs; + ei_gemm_pack_rhs pack_rhs; for(Index k2=IsLower ? depth : 0; IsLower ? k2>0 : k2() + ei_gemm_pack_lhs() (blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc); gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); @@ -228,9 +229,9 @@ struct ei_product_triangular_matrix_matrix lhs(_lhs,lhsStride); ei_const_blas_data_mapper rhs(_rhs,rhsStride); - typedef ei_product_blocking_traits Blocking; + typedef ei_gebp_traits Traits; enum { - SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Blocking::mr,Blocking::nr), + SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), IsLower = (Mode&Lower) == Lower }; @@ -240,18 +241,19 @@ struct ei_product_triangular_matrix_matrix(kc, mc, nc); Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - std::size_t sizeB = kc*ei_packet_traits::size*Blocking::nr + kc*cols; + std::size_t sizeW = kc*Traits::WorkSpaceFactor; + std::size_t sizeB = sizeW + kc*cols; Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar,sizeB); - Scalar* blockB = allocatedBlockB + kc*ei_packet_traits::size*Blocking::nr; + Scalar* blockB = allocatedBlockB + sizeW; Matrix triangularBuffer; triangularBuffer.setZero(); triangularBuffer.diagonal().setOnes(); - ei_gebp_kernel gebp_kernel; - ei_gemm_pack_lhs pack_lhs; - ei_gemm_pack_rhs pack_rhs; - ei_gemm_pack_rhs pack_rhs_panel; + ei_gebp_kernel gebp_kernel; + ei_gemm_pack_lhs pack_lhs; + ei_gemm_pack_rhs pack_rhs; + ei_gemm_pack_rhs pack_rhs_panel; for(Index k2=IsLower ? 0 : depth; IsLower ? k20; diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h index 08ad12938..1ceeca79b 100644 --- a/Eigen/src/Core/products/TriangularSolverMatrix.h +++ b/Eigen/src/Core/products/TriangularSolverMatrix.h @@ -57,9 +57,9 @@ struct ei_triangular_solve_matrix tri(_tri,triStride); ei_blas_data_mapper other(_other,otherStride); - typedef ei_product_blocking_traits Blocking; + typedef ei_gebp_traits Traits; enum { - SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Blocking::mr,Blocking::nr), + SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), IsLower = (Mode&Lower) == Lower }; @@ -69,14 +69,15 @@ struct ei_triangular_solve_matrix(kc, mc, nc); Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - std::size_t sizeB = kc*ei_packet_traits::size*Blocking::nr + kc*cols; + std::size_t sizeW = kc*Traits::WorkSpaceFactor; + std::size_t sizeB = sizeW + kc*cols; Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); - Scalar* blockB = allocatedBlockB + kc*ei_packet_traits::size*Blocking::nr; + Scalar* blockB = allocatedBlockB + sizeW; ei_conj_if conj; - ei_gebp_kernel gebp_kernel; - ei_gemm_pack_lhs pack_lhs; - ei_gemm_pack_rhs pack_rhs; + ei_gebp_kernel gebp_kernel; + ei_gemm_pack_lhs pack_lhs; + ei_gemm_pack_rhs pack_rhs; for(Index k2=IsLower ? 0 : size; IsLower ? k20; @@ -191,15 +192,15 @@ struct ei_triangular_solve_matrix rhs(_tri,triStride); ei_blas_data_mapper lhs(_other,otherStride); - typedef ei_product_blocking_traits Blocking; + typedef ei_gebp_traits Traits; enum { RhsStorageOrder = TriStorageOrder, - SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Blocking::mr,Blocking::nr), + SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), IsLower = (Mode&Lower) == Lower }; -// Index kc = std::min(Blocking::Max_kc/4,size); // cache block size along the K direction -// Index mc = std::min(Blocking::Max_mc,size); // cache block size along the M direction +// Index kc = std::min(Traits::Max_kc/4,size); // cache block size along the K direction +// Index mc = std::min(Traits::Max_mc,size); // cache block size along the M direction // check that !!!! Index kc = size; // cache block size along the K direction Index mc = size; // cache block size along the M direction @@ -207,15 +208,16 @@ struct ei_triangular_solve_matrix(kc, mc, nc); Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - std::size_t sizeB = kc*ei_packet_traits::size*Blocking::nr + kc*size; + std::size_t sizeW = kc*Traits::WorkSpaceFactor; + std::size_t sizeB = sizeW + kc*size; Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); - Scalar* blockB = allocatedBlockB + kc*ei_packet_traits::size*Blocking::nr; + Scalar* blockB = allocatedBlockB + sizeW; ei_conj_if conj; - ei_gebp_kernel gebp_kernel; - ei_gemm_pack_rhs pack_rhs; - ei_gemm_pack_rhs pack_rhs_panel; - ei_gemm_pack_lhs pack_lhs_panel; + ei_gebp_kernel gebp_kernel; + ei_gemm_pack_rhs pack_rhs; + ei_gemm_pack_rhs pack_rhs_panel; + ei_gemm_pack_lhs pack_lhs_panel; for(Index k2=IsLower ? size : 0; IsLower ? k2>0 : k2 struct ei_product_blocking_traits; - -template -struct ei_product_blocking_traits -{ - enum { - Vectorizable = ei_packet_traits::Vectorizable - && ei_packet_traits::Vectorizable - /*&& (ei_is_same_type::ret - || (NumTraits::IsComplex && !NumTraits::IsComplex))*/, - LhsPacketSize = Vectorizable ? ei_packet_traits::size : 1, - NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, - - // register block size along the N direction (must be either 2 or 4) - nr = NumberOfRegisters/4, - - // register block size along the M direction (currently, this one cannot be modified) - mr = 2 * LhsPacketSize - }; -}; - -template -struct ei_product_blocking_traits, std::complex > -{ - typedef std::complex Scalar; - enum { - Vectorizable = ei_packet_traits::Vectorizable, - PacketSize = ei_packet_traits::size, - nr = 2, - mr = 2 * PacketSize - }; -}; /* Helper class to analyze the factors of a Product expression. * In particular it allows to pop out operator-, scalar multiples,