From 2a564695f0e9391eb3a0125bd5731c17aabdb680 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 19 Mar 2014 13:28:50 +0100 Subject: [PATCH 1/8] Simpler and hopefully more future-proof fix for bug #503 (aligned_allocator with c++11) --- Eigen/src/Core/util/Memory.h | 110 +++++++++-------------------------- 1 file changed, 27 insertions(+), 83 deletions(-) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 286e963d2..0f8ab065a 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -700,98 +700,42 @@ template class aligned_stack_memory_handler * \sa \ref TopicStlContainers. */ template -class aligned_allocator +class aligned_allocator : public std::allocator { public: - typedef size_t size_type; - typedef std::ptrdiff_t difference_type; - typedef T* pointer; - typedef const T* const_pointer; - typedef T& reference; - typedef const T& const_reference; - typedef T value_type; + typedef size_t size_type; + typedef std::ptrdiff_t difference_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef T value_type; - template - struct rebind - { - typedef aligned_allocator other; - }; + template + struct rebind + { + typedef aligned_allocator other; + }; - pointer address( reference value ) const - { - return &value; - } + aligned_allocator() : std::allocator() {} - const_pointer address( const_reference value ) const - { - return &value; - } + aligned_allocator(const aligned_allocator& other) : std::allocator(other) {} - aligned_allocator() - { - } + template + aligned_allocator(const aligned_allocator& other) : std::allocator(other) {} - aligned_allocator( const aligned_allocator& ) - { - } + ~aligned_allocator() {} - template - aligned_allocator( const aligned_allocator& ) - { - } + pointer allocate(size_type num, const void* /*hint*/ = 0) + { + internal::check_size_for_overflow(num); + return static_cast( internal::aligned_malloc(num * sizeof(T)) ); + } - ~aligned_allocator() - { - } - - size_type max_size() const - { - return (std::numeric_limits::max)(); - } - - pointer allocate( size_type num, const void* hint = 0 ) - { - EIGEN_UNUSED_VARIABLE(hint); - internal::check_size_for_overflow(num); - return static_cast( internal::aligned_malloc( num * sizeof(T) ) ); - } - - void construct( pointer p, const T& value ) - { - ::new( p ) T( value ); - } - -#if (__cplusplus >= 201103L) - template - void construct( U* u, Args&&... args) - { - ::new( static_cast(u) ) U( std::forward( args )... ); - } -#endif - - void destroy( pointer p ) - { - p->~T(); - } - -#if (__cplusplus >= 201103L) - template - void destroy( U* u ) - { - u->~U(); - } -#endif - - void deallocate( pointer p, size_type /*num*/ ) - { - internal::aligned_free( p ); - } - - bool operator!=(const aligned_allocator& ) const - { return false; } - - bool operator==(const aligned_allocator& ) const - { return true; } + void deallocate(pointer p, size_type /*num*/) + { + internal::aligned_free(p); + } }; //---------- Cache sizes ---------- From c39a3fa7a1808233ad6556e169e0c08d3bc979e1 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 20 Mar 2014 10:14:26 +0100 Subject: [PATCH 2/8] Makes gcc to generate a pshufd instruction for pset1 --- Eigen/src/Core/arch/SSE/PacketMath.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index f5a3dab52..ea14111e3 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -110,7 +110,20 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { re template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return _mm_set_pd(from,from); } template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return _mm_set_epi32(from,from,from,from); } #else -template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return _mm_set1_ps(from); } + +// GCC generates a shufps instruction for set1_ps instead of the more efficient pshufd instruction. +// However, with AVX, we want it to generate a vbroadcastss. +// Moreover, we cannot use intrinsics here because then gcc generates crappy code in some cases (see bug 203) +#if (defined __GNUC__) && (!defined __INTEL_COMPILER) && (!defined __clang__) && (!defined __AVX__) + template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { + Packet4f res; + asm("pshufd $0, %[a], %[b]" : [b] "=x" (res) : [a] "x" (from)); + return res; + } +#else + template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return _mm_set_ps1(from); } +#endif + template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return _mm_set1_pd(from); } template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return _mm_set1_epi32(from); } #endif From cfd3d6ce9c3d1bf5b899b8d15547a81a794a8af3 Mon Sep 17 00:00:00 2001 From: Bo Li Date: Thu, 20 Mar 2014 22:05:40 +0800 Subject: [PATCH 3/8] fixed a template type conversion bug in AngleAxis found by Pei Luo --- Eigen/src/Geometry/AngleAxis.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Geometry/AngleAxis.h b/Eigen/src/Geometry/AngleAxis.h index f424e6d7d..b42048c55 100644 --- a/Eigen/src/Geometry/AngleAxis.h +++ b/Eigen/src/Geometry/AngleAxis.h @@ -165,8 +165,8 @@ AngleAxis& AngleAxis::operator=(const QuaternionBase::dummy_precision()*NumTraits::dummy_precision()) { - m_angle = 0; - m_axis << 1, 0, 0; + m_angle = Scalar(0); + m_axis << Scalar(1), Scalar(0), Scalar(0); } else { From 01fd880424f0e937af7841202af67e6e4ee6fc07 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 20 Mar 2014 16:03:46 +0100 Subject: [PATCH 4/8] Revert previous change and introduce a new workaround regarding gcc generating a shufps instruction instead of the more efficient pshufd instruction. The trick consists in introducing a new pload1 function to be used in low level product kernels for which bug #203 does not apply. Indeed, it turned out that using inline assembly prevents gcc of doing a good job at instructtion reordering. --- Eigen/src/Core/GenericPacketMath.h | 4 ++++ Eigen/src/Core/arch/SSE/PacketMath.h | 26 ++++++++++++-------------- 2 files changed, 16 insertions(+), 14 deletions(-) mode change 100644 => 100755 Eigen/src/Core/GenericPacketMath.h mode change 100644 => 100755 Eigen/src/Core/arch/SSE/PacketMath.h diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h old mode 100644 new mode 100755 index b0469fa1e..538ab53b2 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -169,6 +169,10 @@ ploaddup(const typename unpacket_traits::type* from) { return *from; } template EIGEN_DEVICE_FUNC inline Packet pset1(const typename unpacket_traits::type& a) { return a; } +/** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */ +template EIGEN_DEVICE_FUNC inline Packet +pload1(const typename unpacket_traits::type *a) { return pset1(*a); } + /** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */ template inline typename packet_traits::type plset(const Scalar& a) { return a; } diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h old mode 100644 new mode 100755 index ea14111e3..293fb83e4 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -110,24 +110,22 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { re template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return _mm_set_pd(from,from); } template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return _mm_set_epi32(from,from,from,from); } #else - -// GCC generates a shufps instruction for set1_ps instead of the more efficient pshufd instruction. -// However, with AVX, we want it to generate a vbroadcastss. -// Moreover, we cannot use intrinsics here because then gcc generates crappy code in some cases (see bug 203) -#if (defined __GNUC__) && (!defined __INTEL_COMPILER) && (!defined __clang__) && (!defined __AVX__) - template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { - Packet4f res; - asm("pshufd $0, %[a], %[b]" : [b] "=x" (res) : [a] "x" (from)); - return res; - } -#else - template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return _mm_set_ps1(from); } -#endif - +template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return _mm_set_ps1(from); } template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return _mm_set1_pd(from); } template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return _mm_set1_epi32(from); } #endif +// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction. +// However, using inrinsics for pset1 makes gcc to generate crappy code in some cases (see bug 203) +// Using inline assembly is also not an option because then gcc fails to reorder properly the instructions. +// Therefore, we introduced the pload1 functions to be used in product kernels for which bug 203 does not apply. +// Also note that with AVX, we want it to generate a vbroadcastss. +#if (defined __GNUC__) && (!defined __INTEL_COMPILER) && (!defined __clang__) && (!defined __AVX__) +template<> EIGEN_STRONG_INLINE Packet4f pload1(const float *from) { + return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0); +} +#endif + template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return _mm_add_ps(pset1(a), _mm_set_ps(3,2,1,0)); } template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) { return _mm_add_pd(pset1(a),_mm_set_pd(1,0)); } template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return _mm_add_epi32(pset1(a),_mm_set_epi32(3,2,1,0)); } From 60cd361ebea62d973d81436250268e4fd1b86f49 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Wed, 26 Mar 2014 17:48:30 +0100 Subject: [PATCH 5/8] Fix bug #222. Make temporary matrix column-major independently of EIGEN_DEFAULT_TO_ROW_MAJOR --- Eigen/src/Householder/BlockHouseholder.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Householder/BlockHouseholder.h b/Eigen/src/Householder/BlockHouseholder.h index 1991c6527..60dbea5f5 100644 --- a/Eigen/src/Householder/BlockHouseholder.h +++ b/Eigen/src/Householder/BlockHouseholder.h @@ -48,7 +48,7 @@ void apply_block_householder_on_the_left(MatrixType& mat, const VectorsType& vec typedef typename MatrixType::Index Index; enum { TFactorSize = MatrixType::ColsAtCompileTime }; Index nbVecs = vectors.cols(); - Matrix T(nbVecs,nbVecs); + Matrix T(nbVecs,nbVecs); make_block_householder_triangular_factor(T, vectors, hCoeffs); const TriangularView& V(vectors); From c8c81c1e7454dd824607132c78997adee62101fd Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 2 Jan 2014 16:18:32 -0800 Subject: [PATCH 6/8] Improved the efficiency if the block-panel matrix multiplication code: the change reduces the pressure on the L1 cache by removing the calls to gebp_traits::unpackRhs(). Instead the packetization of the rhs blocks is done on the fly in gebp_traits::loadRhs(). This adds numerous calls to pset1 (since we're packetizing on the fly in the inner loop) but this is more than compensated by the fact that we're decreasing the memory transfers by a factor RhsPacketSize. --- .../Core/products/GeneralBlockPanelKernel.h | 76 +++++-------------- Eigen/src/Core/products/GeneralMatrixMatrix.h | 10 +-- .../products/GeneralMatrixMatrixTriangular.h | 19 ++--- .../Core/products/TriangularMatrixMatrix.h | 15 ++-- .../Core/products/TriangularSolverMatrix.h | 13 +--- 5 files changed, 38 insertions(+), 95 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 686ff84f1..ba6fad246 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -10,6 +10,7 @@ #ifndef EIGEN_GENERAL_BLOCK_PANEL_H #define EIGEN_GENERAL_BLOCK_PANEL_H + namespace Eigen { namespace internal { @@ -169,7 +170,7 @@ public: WorkSpaceFactor = nr * RhsPacketSize, LhsProgress = LhsPacketSize, - RhsProgress = RhsPacketSize + RhsProgress = 1 }; typedef typename packet_traits::type _LhsPacket; @@ -187,15 +188,9 @@ public: p = pset1(ResScalar(0)); } - EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) - { - for(DenseIndex k=0; k(&b[k*RhsPacketSize], rhs[k]); - } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { - dest = pload(b); + dest = pset1(*b); } EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const @@ -240,7 +235,7 @@ public: WorkSpaceFactor = nr*RhsPacketSize, LhsProgress = LhsPacketSize, - RhsProgress = RhsPacketSize + RhsProgress = 1 }; typedef typename packet_traits::type _LhsPacket; @@ -258,15 +253,9 @@ public: p = pset1(ResScalar(0)); } - EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) - { - for(DenseIndex k=0; k(&b[k*RhsPacketSize], rhs[k]); - } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { - dest = pload(b); + dest = pset1(*b); } EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const @@ -320,7 +309,7 @@ public: WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr, LhsProgress = ResPacketSize, - RhsProgress = Vectorizable ? 2*ResPacketSize : 1 + RhsProgress = 1 }; typedef typename packet_traits::type RealPacket; @@ -344,30 +333,15 @@ public: p.second = pset1(RealScalar(0)); } - /* Unpack the rhs coeff such that each complex coefficient is spread into - * two packects containing respectively the real and imaginary coefficient - * duplicated as many time as needed: (x+iy) => [x, ..., x] [y, ..., y] - */ - EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const Scalar* rhs, Scalar* b) + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { - for(DenseIndex k=0; k((RealScalar*)&b[k*ResPacketSize*2+0], real(rhs[k])); - pstore1((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], imag(rhs[k])); - } - else - b[k] = rhs[k]; - } + dest = pset1(*b); } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { dest = *b; } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const { - dest.first = pload((const RealScalar*)b); - dest.second = pload((const RealScalar*)(b+ResPacketSize)); + dest.first = pset1(real(*b)); + dest.second = pset1(imag(*b)); } // nothing special here @@ -445,7 +419,7 @@ public: WorkSpaceFactor = nr*RhsPacketSize, LhsProgress = ResPacketSize, - RhsProgress = ResPacketSize + RhsProgress = 1 }; typedef typename packet_traits::type _LhsPacket; @@ -463,15 +437,9 @@ public: p = pset1(ResScalar(0)); } - EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) - { - for(DenseIndex k=0; k(&b[k*RhsPacketSize], rhs[k]); - } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { - dest = pload(b); + dest = pset1(*b); } EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const @@ -529,14 +497,14 @@ struct gebp_kernel EIGEN_DONT_INLINE void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, - Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB=0); + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); }; template EIGEN_DONT_INLINE void gebp_kernel ::operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, - Index strideA, Index strideB, Index offsetA, Index offsetB, RhsScalar* unpackedB) + Index strideA, Index strideB, Index offsetA, Index offsetB) { Traits traits; @@ -550,14 +518,9 @@ void gebp_kernel const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsProgress ? LhsProgress : 0); const Index peeled_kc = (depth/4)*4; - if(unpackedB==0) - unpackedB = const_cast(blockB - strideB * nr * RhsProgress); - // loops on each micro vertical panel of rhs (depth x nr) for(Index j2=0; j2 we select a mr x nr micro block of res which is entirely // stored into mr/packet_size x nr registers. @@ -590,7 +553,7 @@ void gebp_kernel // performs "inner" product // TODO let's check wether the folowing peeled loop could not be // optimized via optimal prefetching from one loop to the other - const RhsScalar* blB = unpackedB; + const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; for(Index k=0; k do the same but with nr==1 for(Index j2=packet_cols; j20) while(info[j].sync!=k) {} - gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*actual_kc, mc, actual_kc, info[j].rhs_length, alpha, -1,-1,0,0, w); + gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*actual_kc, mc, actual_kc, info[j].rhs_length, alpha, -1,-1,0,0); } // Then keep going as usual with the remaining A' @@ -134,7 +132,7 @@ static void run(Index rows, Index cols, Index depth, pack_lhs(blockA, &lhs(i,k), lhsStride, actual_kc, actual_mc); // C_i += A' * B' - gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1,-1,0,0, w); + gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1,-1,0,0); } // Release all the sub blocks B'_j of B' for the current thread, @@ -152,11 +150,9 @@ static void run(Index rows, Index cols, Index depth, // this is the sequential version! std::size_t sizeA = kc*mc; std::size_t sizeB = kc*cols; - std::size_t sizeW = kc*Traits::WorkSpaceFactor; ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA()); ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB()); - ei_declare_aligned_stack_constructed_variable(RhsScalar, blockW, sizeW, blocking.blockW()); // For each horizontal panel of the rhs, and corresponding panel of the lhs... // (==GEMM_VAR1) @@ -182,7 +178,7 @@ static void run(Index rows, Index cols, Index depth, pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc); // Everything is packed, we can now call the block * panel kernel: - gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW); + gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0); } } } diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index 5c3763909..ffa871cae 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -73,11 +73,8 @@ struct general_matrix_matrix_triangular_product Traits::nr) mc = (mc/Traits::nr)*Traits::nr; - std::size_t sizeW = kc*Traits::WorkSpaceFactor; - std::size_t sizeB = sizeW + kc*size; ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0); - ei_declare_aligned_stack_constructed_variable(RhsScalar, allocatedBlockB, sizeB, 0); - RhsScalar* blockB = allocatedBlockB + sizeW; + ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, kc*size, 0); gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; @@ -103,15 +100,15 @@ struct general_matrix_matrix_triangular_product processed with gebp or skipped if (UpLo==Lower) gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha, - -1, -1, 0, 0, allocatedBlockB); + -1, -1, 0, 0); - sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha, allocatedBlockB); + sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha); if (UpLo==Upper) { Index j2 = i2+actual_mc; gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha, - -1, -1, 0, 0, allocatedBlockB); + -1, -1, 0, 0); } } } @@ -136,7 +133,7 @@ struct tribb_kernel enum { BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr) }; - void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha, RhsScalar* workspace) + void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) { gebp_kernel gebp_kernel; Matrix buffer; @@ -150,7 +147,7 @@ struct tribb_kernel if(UpLo==Upper) gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha, - -1, -1, 0, 0, workspace); + -1, -1, 0, 0); // selfadjoint micro block { @@ -158,7 +155,7 @@ struct tribb_kernel buffer.setZero(); // 1 - apply the kernel on the temporary buffer gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, - -1, -1, 0, 0, workspace); + -1, -1, 0, 0); // 2 - triangular accumulation for(Index j1=0; j1 triangularBuffer; triangularBuffer.setZero(); @@ -187,7 +185,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix0) @@ -197,7 +195,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix() (blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW); + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0); } } } @@ -266,11 +264,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix triangularBuffer; triangularBuffer.setZero(); @@ -357,14 +353,13 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix conj; gebp_kernel gebp_kernel; @@ -158,7 +156,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix conj; gebp_kernel gebp_kernel; @@ -285,8 +281,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix0) gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb, actual_mc, actual_kc, rs, Scalar(-1), - -1, -1, 0, 0, blockW); + -1, -1, 0, 0); } } } From b286a1e75c6bd451c27da8c5ebed0b0fb86dfc2a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 26 Mar 2014 16:46:36 +0100 Subject: [PATCH 7/8] add pbroadcast2/4 generic intrinsics --- Eigen/src/Core/GenericPacketMath.h | 34 ++++++++++++++++++++++++++++ Eigen/src/Core/arch/SSE/PacketMath.h | 32 ++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 538ab53b2..d07541285 100755 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -173,6 +173,40 @@ pset1(const typename unpacket_traits::type& a) { return a; } template EIGEN_DEVICE_FUNC inline Packet pload1(const typename unpacket_traits::type *a) { return pset1(*a); } +/** \internal equivalent to + * \code + * a0 = pload1(a+0); + * a1 = pload1(a+1); + * a2 = pload1(a+2); + * a3 = pload1(a+3); + * \endcode + * \sa pset1, pload1, ploaddup, pbroadcast2 + */ +template EIGEN_DEVICE_FUNC +inline void pbroadcast4(const typename unpacket_traits::type *a, + Packet& a0, Packet& a1, Packet& a2, Packet& a3) +{ + a0 = pload1(a+0); + a1 = pload1(a+1); + a2 = pload1(a+2); + a3 = pload1(a+3); +} + +/** \internal equivalent to + * \code + * a0 = pload1(a+0); + * a1 = pload1(a+1); + * \endcode + * \sa pset1, pload1, ploaddup, pbroadcast4 + */ +template EIGEN_DEVICE_FUNC +inline void pbroadcast2(const typename unpacket_traits::type *a, + Packet& a0, Packet& a1) +{ + a0 = pload1(a+0); + a1 = pload1(a+1); +} + /** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */ template inline typename packet_traits::type plset(const Scalar& a) { return a; } diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 293fb83e4..9f81a4623 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -391,6 +391,38 @@ template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) #endif } +// with AVX, the default implementations based on pload1 are faster +#ifndef __AVX__ +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const float *a, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) +{ + a3 = pload(a); + a0 = vec4f_swizzle1(a3, 0,0,0,0); + a1 = vec4f_swizzle1(a3, 1,1,1,1); + a2 = vec4f_swizzle1(a3, 2,2,2,2); + a3 = vec4f_swizzle1(a3, 3,3,3,3); +} +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const double *a, + Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) +{ +#ifdef EIGEN_VECTORIZE_SSE3 + a0 = _mm_loaddup_pd(a+0); + a1 = _mm_loaddup_pd(a+1); + a2 = _mm_loaddup_pd(a+2); + a3 = _mm_loaddup_pd(a+3); +#else + a1 = pload(a); + a0 = vec2d_swizzle1(a1, 0,0); + a1 = vec2d_swizzle1(a1, 1,1); + a3 = pload(a+2); + a2 = vec2d_swizzle1(a3, 0,0); + a3 = vec2d_swizzle1(a3, 1,1); +#endif +} +#endif + EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs) { vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55)); From bc401eb6fa9c4c14c7fb32acfe70b304c1850283 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 26 Mar 2014 18:53:00 +0100 Subject: [PATCH 8/8] Implement new 1 packet x 8 gebp kernel --- Eigen/src/Core/arch/SSE/PacketMath.h | 6 +- .../Core/products/GeneralBlockPanelKernel.h | 679 +++++++----------- .../Core/products/SelfadjointMatrixMatrix.h | 48 +- 3 files changed, 297 insertions(+), 436 deletions(-) diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 9f81a4623..b1f7b7717 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -397,7 +397,7 @@ template<> EIGEN_STRONG_INLINE void pbroadcast4(const float *a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) { - a3 = pload(a); + a3 = ploadu(a); a0 = vec4f_swizzle1(a3, 0,0,0,0); a1 = vec4f_swizzle1(a3, 1,1,1,1); a2 = vec4f_swizzle1(a3, 2,2,2,2); @@ -413,10 +413,10 @@ pbroadcast4(const double *a, a2 = _mm_loaddup_pd(a+2); a3 = _mm_loaddup_pd(a+3); #else - a1 = pload(a); + a1 = ploadu(a); a0 = vec2d_swizzle1(a1, 0,0); a1 = vec2d_swizzle1(a1, 1,1); - a3 = pload(a+2); + a3 = ploadu(a+2); a2 = vec2d_swizzle1(a3, 0,0); a3 = vec2d_swizzle1(a3, 1,1); #endif diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index ba6fad246..8a398d912 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -161,11 +161,11 @@ public: NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, - // register block size along the N direction (must be either 2 or 4) - nr = NumberOfRegisters/4, + // register block size along the N direction (must be either 4 or 8) + nr = NumberOfRegisters/2, // register block size along the M direction (currently, this one cannot be modified) - mr = 2 * LhsPacketSize, + mr = LhsPacketSize, WorkSpaceFactor = nr * RhsPacketSize, @@ -187,6 +187,16 @@ public: { p = pset1(ResScalar(0)); } + + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + { + pbroadcast4(b, b0, b1, b2, b3); + } + + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) + { + pbroadcast2(b, b0, b1); + } EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { @@ -230,8 +240,8 @@ public: ResPacketSize = Vectorizable ? packet_traits::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, - nr = NumberOfRegisters/4, - mr = 2 * LhsPacketSize, + nr = NumberOfRegisters/2, + mr = LhsPacketSize, WorkSpaceFactor = nr*RhsPacketSize, LhsProgress = LhsPacketSize, @@ -262,6 +272,16 @@ public: { dest = pload(a); } + + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + { + pbroadcast4(b, b0, b1, b2, b3); + } + + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) + { + pbroadcast2(b, b0, b1); + } EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const { @@ -304,8 +324,9 @@ public: RealPacketSize = Vectorizable ? packet_traits::size : 1, ResPacketSize = Vectorizable ? packet_traits::size : 1, - nr = 2, - mr = 2 * ResPacketSize, + // FIXME: should depend on NumberOfRegisters + nr = 4, + mr = ResPacketSize, WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr, LhsProgress = ResPacketSize, @@ -333,16 +354,37 @@ public: p.second = pset1(RealScalar(0)); } + // Scalar path EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { dest = pset1(*b); } + // Vectorized path EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const { dest.first = pset1(real(*b)); dest.second = pset1(imag(*b)); } + + // linking error if instantiated without being optimized out: + void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3); + + // Vectorized path + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacket& b0, DoublePacket& b1) + { + // FIXME not sure that's the best way to implement it! + loadRhs(b+0, b0); + loadRhs(b+1, b1); + } + + // Scalar path + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1) + { + // FIXME not sure that's the best way to implement it! + loadRhs(b+0, b0); + loadRhs(b+1, b1); + } // nothing special here EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const @@ -414,8 +456,9 @@ public: ResPacketSize = Vectorizable ? packet_traits::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, + // FIXME: should depend on NumberOfRegisters nr = 4, - mr = 2*ResPacketSize, + mr = ResPacketSize, WorkSpaceFactor = nr*RhsPacketSize, LhsProgress = ResPacketSize, @@ -441,6 +484,16 @@ public: { dest = pset1(*b); } + + // linking error if instantiated without being optimized out: + void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3); + + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) + { + // FIXME not sure that's the best way to implement it! + b0 = pload1(b+0); + b1 = pload1(b+1); + } EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { @@ -511,11 +564,9 @@ void gebp_kernel if(strideA==-1) strideA = depth; if(strideB==-1) strideB = depth; conj_helper cj; -// conj_helper pcj; Index packet_cols = (cols/nr) * nr; + // Here we assume that mr==LhsProgress const Index peeled_mc = (rows/mr)*mr; - // FIXME: - const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsProgress ? LhsProgress : 0); const Index peeled_kc = (depth/4)*4; // loops on each micro vertical panel of rhs (depth x nr) @@ -527,144 +578,88 @@ void gebp_kernel for(Index i=0; i(alpha); - R0 = ploadu(r0); - R1 = ploadu(r1); - R2 = ploadu(r2); - R3 = ploadu(r3); - R4 = ploadu(r0 + ResPacketSize); - R5 = ploadu(r1 + ResPacketSize); - R6 = ploadu(r2 + ResPacketSize); + R0 = ploadu(r0+0*resStride); + R1 = ploadu(r0+1*resStride); + R2 = ploadu(r0+2*resStride); + R3 = ploadu(r0+3*resStride); + R4 = ploadu(r0+4*resStride); + R5 = ploadu(r0+5*resStride); + R6 = ploadu(r0+6*resStride); traits.acc(C0, alphav, R0); - pstoreu(r0, R0); - R0 = ploadu(r3 + ResPacketSize); + pstoreu(r0+0*resStride, R0); + R0 = ploadu(r0+7*resStride); traits.acc(C1, alphav, R1); traits.acc(C2, alphav, R2); @@ -739,232 +711,107 @@ EIGEN_ASM_COMMENT("mybegin4"); traits.acc(C6, alphav, R6); traits.acc(C7, alphav, R0); - pstoreu(r1, R1); - pstoreu(r2, R2); - pstoreu(r3, R3); - pstoreu(r0 + ResPacketSize, R4); - pstoreu(r1 + ResPacketSize, R5); - pstoreu(r2 + ResPacketSize, R6); - pstoreu(r3 + ResPacketSize, R0); + pstoreu(r0+1*resStride, R1); + pstoreu(r0+2*resStride, R2); + pstoreu(r0+3*resStride, R3); + pstoreu(r0+4*resStride, R4); + pstoreu(r0+5*resStride, R5); + pstoreu(r0+6*resStride, R6); + pstoreu(r0+7*resStride, R0); } - else + else // nr==4 { - ResPacket R0, R1, R4; + ResPacket R0, R1, R2; ResPacket alphav = pset1(alpha); - R0 = ploadu(r0); - R1 = ploadu(r1); - R4 = ploadu(r0 + ResPacketSize); + R0 = ploadu(r0+0*resStride); + R1 = ploadu(r0+1*resStride); + R2 = ploadu(r0+2*resStride); traits.acc(C0, alphav, R0); - pstoreu(r0, R0); - R0 = ploadu(r1 + ResPacketSize); + pstoreu(r0+0*resStride, R0); + R0 = ploadu(r0+3*resStride); + traits.acc(C1, alphav, R1); - traits.acc(C4, alphav, R4); - traits.acc(C5, alphav, R0); - pstoreu(r1, R1); - pstoreu(r0 + ResPacketSize, R4); - pstoreu(r1 + ResPacketSize, R0); + traits.acc(C2, alphav, R2); + traits.acc(C3, alphav, R0); + + pstoreu(r0+1*resStride, R1); + pstoreu(r0+2*resStride, R2); + pstoreu(r0+3*resStride, R0); } } - if(rows-peeled_mc>=LhsProgress) - { - Index i = peeled_mc; - const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress]; - prefetch(&blA[0]); - - // gets res block as register - AccPacket C0, C1, C2, C3; - traits.initAcc(C0); - traits.initAcc(C1); - if(nr==4) traits.initAcc(C2); - if(nr==4) traits.initAcc(C3); - - // performs "inner" product - const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; - for(Index k=0; k(alpha); - - ResScalar* r0 = &res[(j2+0)*resStride + i]; - ResScalar* r1 = r0 + resStride; - ResScalar* r2 = r1 + resStride; - ResScalar* r3 = r2 + resStride; - - R0 = ploadu(r0); - R1 = ploadu(r1); - if(nr==4) R2 = ploadu(r2); - if(nr==4) R3 = ploadu(r3); - - traits.acc(C0, alphav, R0); - traits.acc(C1, alphav, R1); - if(nr==4) traits.acc(C2, alphav, R2); - if(nr==4) traits.acc(C3, alphav, R3); - - pstoreu(r0, R0); - pstoreu(r1, R1); - if(nr==4) pstoreu(r2, R2); - if(nr==4) pstoreu(r3, R3); - } - for(Index i=peeled_mc2; i do the same but with nr==1 for(Index j2=packet_cols; j2(alpha); - - ResScalar* r0 = &res[(j2+0)*resStride + i]; - - R0 = ploadu(r0); - R4 = ploadu(r0+ResPacketSize); - - traits.acc(C0, alphav, R0); - traits.acc(C4, alphav, R4); - - pstoreu(r0, R0); - pstoreu(r0+ResPacketSize, R4); - } - if(rows-peeled_mc>=LhsProgress) - { - Index i = peeled_mc; - const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress]; - prefetch(&blA[0]); - AccPacket C0; traits.initAcc(C0); @@ -1025,19 +832,23 @@ EIGEN_ASM_COMMENT("mybegin4"); { LhsPacket A0; RhsPacket B_0; - traits.loadLhs(blA, A0); - traits.loadRhs(blB, B_0); - traits.madd(A0, B_0, C0, B_0); + RhsPacket T0; + + traits.loadLhs(&blA[0*LhsProgress], A0); + traits.loadRhs(&blB[0*RhsProgress], B_0); + traits.madd(A0,B_0,C0,T0); + blB += RhsProgress; blA += LhsProgress; } - + ResPacket R0; ResPacket alphav = pset1(alpha); - ResPacket R0 = ploadu(&res[(j2+0)*resStride + i]); + ResScalar* r0 = &res[(j2+0)*resStride + i]; + R0 = ploadu(r0); traits.acc(C0, alphav, R0); - pstoreu(&res[(j2+0)*resStride + i], R0); + pstoreu(r0, R0); } - for(Index i=peeled_mc2; i=depth && offset<=stride)); - eigen_assert( (StorageOrder==RowMajor) || ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) ); + eigen_assert( (StorageOrder==RowMajor) || ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) ); conj_if::IsComplex && Conjugate> cj; const_blas_data_mapper lhs(_lhs,lhsStride); Index count = 0; @@ -1104,15 +915,25 @@ EIGEN_DONT_INLINE void gemm_pack_lhs=1*PacketSize) A = ploadu(&lhs(i+0*PacketSize, k)); - if(Pack1>=2*PacketSize) B = ploadu(&lhs(i+1*PacketSize, k)); - if(Pack1>=3*PacketSize) C = ploadu(&lhs(i+2*PacketSize, k)); - if(Pack1>=4*PacketSize) D = ploadu(&lhs(i+3*PacketSize, k)); - if(Pack1>=1*PacketSize) { pstore(blockA+count, cj.pconj(A)); count+=PacketSize; } - if(Pack1>=2*PacketSize) { pstore(blockA+count, cj.pconj(B)); count+=PacketSize; } - if(Pack1>=3*PacketSize) { pstore(blockA+count, cj.pconj(C)); count+=PacketSize; } - if(Pack1>=4*PacketSize) { pstore(blockA+count, cj.pconj(D)); count+=PacketSize; } + if((Pack1%PacketSize)==0) + { + Packet A, B, C, D; + if(Pack1>=1*PacketSize) A = ploadu(&lhs(i+0*PacketSize, k)); + if(Pack1>=2*PacketSize) B = ploadu(&lhs(i+1*PacketSize, k)); + if(Pack1>=3*PacketSize) C = ploadu(&lhs(i+2*PacketSize, k)); + if(Pack1>=4*PacketSize) D = ploadu(&lhs(i+3*PacketSize, k)); + if(Pack1>=1*PacketSize) { pstore(blockA+count, cj.pconj(A)); count+=PacketSize; } + if(Pack1>=2*PacketSize) { pstore(blockA+count, cj.pconj(B)); count+=PacketSize; } + if(Pack1>=3*PacketSize) { pstore(blockA+count, cj.pconj(C)); count+=PacketSize; } + if(Pack1>=4*PacketSize) { pstore(blockA+count, cj.pconj(D)); count+=PacketSize; } + } + else + { + if(Pack1>=1) blockA[count++] = cj(lhs(i+0, k)); + if(Pack1>=2) blockA[count++] = cj(lhs(i+1, k)); + if(Pack1>=3) blockA[count++] = cj(lhs(i+2, k)); + if(Pack1>=4) blockA[count++] = cj(lhs(i+3, k)); + } } } else @@ -1191,12 +1012,20 @@ EIGEN_DONT_INLINE void gemm_pack_rhs=4) blockB[count+2] = cj(b2[k]); + if(nr>=4) blockB[count+3] = cj(b3[k]); + if(nr>=8) blockB[count+4] = cj(b4[k]); + if(nr>=8) blockB[count+5] = cj(b5[k]); + if(nr>=8) blockB[count+6] = cj(b6[k]); + if(nr>=8) blockB[count+7] = cj(b7[k]); count += nr; } // skip what we have after @@ -1251,8 +1080,12 @@ EIGEN_DONT_INLINE void gemm_pack_rhs=4) blockB[count+2] = cj(b0[2]); + if(nr>=4) blockB[count+3] = cj(b0[3]); + if(nr>=8) blockB[count+4] = cj(b0[4]); + if(nr>=8) blockB[count+5] = cj(b0[5]); + if(nr>=8) blockB[count+6] = cj(b0[6]); + if(nr>=8) blockB[count+7] = cj(b0[7]); count += nr; } } diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index 99cf9e0ae..d9fd9f556 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -63,7 +63,7 @@ struct symm_pack_lhs for(Index i=peeled_mc; i=4) { blockB[count+2] = rhs(k,j2+2); blockB[count+3] = rhs(k,j2+3); } + if (nr>=8) + { + blockB[count+4] = rhs(k,j2+4); + blockB[count+5] = rhs(k,j2+5); + blockB[count+6] = rhs(k,j2+6); + blockB[count+7] = rhs(k,j2+7); + } count += nr; } } @@ -109,11 +116,18 @@ struct symm_pack_rhs { blockB[count+0] = numext::conj(rhs(j2+0,k)); blockB[count+1] = numext::conj(rhs(j2+1,k)); - if (nr==4) + if (nr>=4) { blockB[count+2] = numext::conj(rhs(j2+2,k)); blockB[count+3] = numext::conj(rhs(j2+3,k)); } + if (nr>=8) + { + blockB[count+4] = numext::conj(rhs(j2+4,k)); + blockB[count+5] = numext::conj(rhs(j2+5,k)); + blockB[count+6] = numext::conj(rhs(j2+6,k)); + blockB[count+7] = numext::conj(rhs(j2+7,k)); + } count += nr; } // symmetric @@ -137,11 +151,18 @@ struct symm_pack_rhs { blockB[count+0] = rhs(k,j2+0); blockB[count+1] = rhs(k,j2+1); - if (nr==4) + if (nr>=4) { blockB[count+2] = rhs(k,j2+2); blockB[count+3] = rhs(k,j2+3); } + if (nr>=8) + { + blockB[count+4] = rhs(k,j2+4); + blockB[count+5] = rhs(k,j2+5); + blockB[count+6] = rhs(k,j2+6); + blockB[count+7] = rhs(k,j2+7); + } count += nr; } } @@ -153,11 +174,18 @@ struct symm_pack_rhs { blockB[count+0] = numext::conj(rhs(j2+0,k)); blockB[count+1] = numext::conj(rhs(j2+1,k)); - if (nr==4) + if (nr>=4) { blockB[count+2] = numext::conj(rhs(j2+2,k)); blockB[count+3] = numext::conj(rhs(j2+3,k)); } + if (nr>=8) + { + blockB[count+4] = numext::conj(rhs(j2+4,k)); + blockB[count+5] = numext::conj(rhs(j2+5,k)); + blockB[count+6] = numext::conj(rhs(j2+6,k)); + blockB[count+7] = numext::conj(rhs(j2+7,k)); + } count += nr; } } @@ -422,11 +450,11 @@ struct SelfadjointProductMatrix NumTraits::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)), internal::traits::Flags&RowMajorBit ? RowMajor : ColMajor> ::run( - lhs.rows(), rhs.cols(), // sizes - &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info - &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info - &dst.coeffRef(0,0), dst.outerStride(), // result info - actualAlpha // alpha + lhs.rows(), rhs.cols(), // sizes + &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info + &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info + &dst.coeffRef(0,0), dst.outerStride(), // result info + actualAlpha // alpha ); } };