From 2a564695f0e9391eb3a0125bd5731c17aabdb680 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 19 Mar 2014 13:28:50 +0100 Subject: [PATCH 001/158] Simpler and hopefully more future-proof fix for bug #503 (aligned_allocator with c++11) --- Eigen/src/Core/util/Memory.h | 110 +++++++++-------------------------- 1 file changed, 27 insertions(+), 83 deletions(-) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 286e963d2..0f8ab065a 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -700,98 +700,42 @@ template class aligned_stack_memory_handler * \sa \ref TopicStlContainers. */ template -class aligned_allocator +class aligned_allocator : public std::allocator { public: - typedef size_t size_type; - typedef std::ptrdiff_t difference_type; - typedef T* pointer; - typedef const T* const_pointer; - typedef T& reference; - typedef const T& const_reference; - typedef T value_type; + typedef size_t size_type; + typedef std::ptrdiff_t difference_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef T value_type; - template - struct rebind - { - typedef aligned_allocator other; - }; + template + struct rebind + { + typedef aligned_allocator other; + }; - pointer address( reference value ) const - { - return &value; - } + aligned_allocator() : std::allocator() {} - const_pointer address( const_reference value ) const - { - return &value; - } + aligned_allocator(const aligned_allocator& other) : std::allocator(other) {} - aligned_allocator() - { - } + template + aligned_allocator(const aligned_allocator& other) : std::allocator(other) {} - aligned_allocator( const aligned_allocator& ) - { - } + ~aligned_allocator() {} - template - aligned_allocator( const aligned_allocator& ) - { - } + pointer allocate(size_type num, const void* /*hint*/ = 0) + { + internal::check_size_for_overflow(num); + return static_cast( internal::aligned_malloc(num * sizeof(T)) ); + } - ~aligned_allocator() - { - } - - size_type max_size() const - { - return (std::numeric_limits::max)(); - } - - pointer allocate( size_type num, const void* hint = 0 ) - { - EIGEN_UNUSED_VARIABLE(hint); - internal::check_size_for_overflow(num); - return static_cast( internal::aligned_malloc( num * sizeof(T) ) ); - } - - void construct( pointer p, const T& value ) - { - ::new( p ) T( value ); - } - -#if (__cplusplus >= 201103L) - template - void construct( U* u, Args&&... args) - { - ::new( static_cast(u) ) U( std::forward( args )... 
);
-  }
-#endif
-
-  void destroy( pointer p )
-  {
-    p->~T();
-  }
-
-#if (__cplusplus >= 201103L)
-  template
-  void destroy( U* u )
-  {
-    u->~U();
-  }
-#endif
-
-  void deallocate( pointer p, size_type /*num*/ )
-  {
-    internal::aligned_free( p );
-  }
-
-  bool operator!=(const aligned_allocator& ) const
-  { return false; }
-
-  bool operator==(const aligned_allocator& ) const
-  { return true; }
+  void deallocate(pointer p, size_type /*num*/)
+  {
+    internal::aligned_free(p);
+  }
 };

 //---------- Cache sizes ----------

From c39a3fa7a1808233ad6556e169e0c08d3bc979e1 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Thu, 20 Mar 2014 10:14:26 +0100
Subject: [PATCH 002/158] Make gcc generate a pshufd instruction for pset1

---
 Eigen/src/Core/arch/SSE/PacketMath.h | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index f5a3dab52..ea14111e3 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -110,7 +110,20 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { re
 template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return _mm_set_pd(from,from); }
 template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return _mm_set_epi32(from,from,from,from); }
 #else
-template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return _mm_set1_ps(from); }
+
+// GCC generates a shufps instruction for set1_ps instead of the more efficient pshufd instruction.
+// However, with AVX, we want it to generate a vbroadcastss.
+// Moreover, we cannot use intrinsics here because then gcc generates crappy code in some cases (see bug 203)
+#if (defined __GNUC__) && (!defined __INTEL_COMPILER) && (!defined __clang__) && (!defined __AVX__)
+  template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) {
+    Packet4f res;
+    asm("pshufd $0, %[a], %[b]" : [b] "=x" (res) : [a] "x" (from));
+    return res;
+  }
+#else
+  template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return _mm_set_ps1(from); }
+#endif
+
 template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return _mm_set1_pd(from); }
 template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return _mm_set1_epi32(from); }
 #endif

From cfd3d6ce9c3d1bf5b899b8d15547a81a794a8af3 Mon Sep 17 00:00:00 2001
From: Bo Li
Date: Thu, 20 Mar 2014 22:05:40 +0800
Subject: [PATCH 003/158] Fixed a template type conversion bug in AngleAxis
 found by Pei Luo

---
 Eigen/src/Geometry/AngleAxis.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Geometry/AngleAxis.h b/Eigen/src/Geometry/AngleAxis.h
index f424e6d7d..b42048c55 100644
--- a/Eigen/src/Geometry/AngleAxis.h
+++ b/Eigen/src/Geometry/AngleAxis.h
@@ -165,8 +165,8 @@ AngleAxis& AngleAxis::operator=(const QuaternionBase::dummy_precision()*NumTraits::dummy_precision())
   {
-    m_angle = 0;
-    m_axis << 1, 0, 0;
+    m_angle = Scalar(0);
+    m_axis << Scalar(1), Scalar(0), Scalar(0);
   }
   else
   {

From 01fd880424f0e937af7841202af67e6e4ee6fc07 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Thu, 20 Mar 2014 16:03:46 +0100
Subject: [PATCH 004/158] Revert previous change and introduce a new workaround
 regarding gcc generating a shufps instruction instead of the more efficient
 pshufd instruction. The trick consists in introducing a new pload1 function
 to be used in low level product kernels for which bug #203 does not apply.
Indeed, it turned out that using inline assembly prevents gcc from doing a
good job at instruction reordering.
---
 Eigen/src/Core/GenericPacketMath.h   |  4 ++++
 Eigen/src/Core/arch/SSE/PacketMath.h | 26 ++++++++++++--------------
 2 files changed, 16 insertions(+), 14 deletions(-)
 mode change 100644 => 100755 Eigen/src/Core/GenericPacketMath.h
 mode change 100644 => 100755 Eigen/src/Core/arch/SSE/PacketMath.h

diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
old mode 100644
new mode 100755
index b0469fa1e..538ab53b2
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -169,6 +169,10 @@ ploaddup(const typename unpacket_traits::type* from) { return *from; }
 template EIGEN_DEVICE_FUNC inline Packet
 pset1(const typename unpacket_traits::type& a) { return a; }

+/** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */
+template EIGEN_DEVICE_FUNC inline Packet
+pload1(const typename unpacket_traits::type *a) { return pset1(*a); }
+
 /** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */
 template inline typename packet_traits::type plset(const Scalar& a) { return a; }

diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
old mode 100644
new mode 100755
index ea14111e3..293fb83e4
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -110,24 +110,22 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { re
 template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return _mm_set_pd(from,from); }
 template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return _mm_set_epi32(from,from,from,from); }
 #else
-
-// GCC generates a shufps instruction for set1_ps instead of the more efficient pshufd instruction.
-// However, with AVX, we want it to generate a vbroadcastss.
-// Moreover, we cannot use intrinsics here because then gcc generates crappy code in some cases (see bug 203)
-#if (defined __GNUC__) && (!defined __INTEL_COMPILER) && (!defined __clang__) && (!defined __AVX__)
-  template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) {
-    Packet4f res;
-    asm("pshufd $0, %[a], %[b]" : [b] "=x" (res) : [a] "x" (from));
-    return res;
-  }
-#else
-  template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return _mm_set_ps1(from); }
-#endif
-
+template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return _mm_set_ps1(from); }
 template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return _mm_set1_pd(from); }
 template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return _mm_set1_epi32(from); }
 #endif

+// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction.
+// However, using intrinsics for pset1 makes gcc generate crappy code in some cases (see bug 203)
+// Using inline assembly is also not an option because then gcc fails to properly reorder the instructions.
+// Therefore, we introduced the pload1 functions to be used in product kernels for which bug 203 does not apply.
+// Also note that with AVX, we want it to generate a vbroadcastss.
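+// (With the pload1 below, gcc should emit a movss load followed by a pshufd $0
+// broadcast, since vec4f_swizzle1 is a thin wrapper around _mm_shuffle_epi32.)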
+#if (defined __GNUC__) && (!defined __INTEL_COMPILER) && (!defined __clang__) && (!defined __AVX__)
+template<> EIGEN_STRONG_INLINE Packet4f pload1(const float *from) {
+  return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0);
+}
+#endif
+
 template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return _mm_add_ps(pset1(a), _mm_set_ps(3,2,1,0)); }
 template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) { return _mm_add_pd(pset1(a),_mm_set_pd(1,0)); }
 template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return _mm_add_epi32(pset1(a),_mm_set_epi32(3,2,1,0)); }

From 60cd361ebea62d973d81436250268e4fd1b86f49 Mon Sep 17 00:00:00 2001
From: Christoph Hertzberg
Date: Wed, 26 Mar 2014 17:48:30 +0100
Subject: [PATCH 005/158] Fix bug #222. Make temporary matrix column-major
 independently of EIGEN_DEFAULT_TO_ROW_MAJOR

---
 Eigen/src/Householder/BlockHouseholder.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/Householder/BlockHouseholder.h b/Eigen/src/Householder/BlockHouseholder.h
index 1991c6527..60dbea5f5 100644
--- a/Eigen/src/Householder/BlockHouseholder.h
+++ b/Eigen/src/Householder/BlockHouseholder.h
@@ -48,7 +48,7 @@ void apply_block_householder_on_the_left(MatrixType& mat, const VectorsType& vec
   typedef typename MatrixType::Index Index;
   enum { TFactorSize = MatrixType::ColsAtCompileTime };
   Index nbVecs = vectors.cols();
-  Matrix T(nbVecs,nbVecs);
+  Matrix T(nbVecs,nbVecs);
   make_block_householder_triangular_factor(T, vectors, hCoeffs);

   const TriangularView& V(vectors);

From c8c81c1e7454dd824607132c78997adee62101fd Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Thu, 2 Jan 2014 16:18:32 -0800
Subject: [PATCH 006/158] Improved the efficiency of the block-panel matrix
 multiplication code: the change reduces the pressure on the L1 cache by
 removing the calls to gebp_traits::unpackRhs(). Instead, the packetization
 of the rhs blocks is done on the fly in gebp_traits::loadRhs(). This adds
 numerous calls to pset1 (since we're packetizing on the fly in the inner
 loop), but this is more than compensated by the fact that we're decreasing
 the memory transfers by a factor of RhsPacketSize.
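To make the trade-off concrete, here is a stand-alone sketch of the two
strategies (illustrative only, not Eigen code: unpack_rhs and load_rhs are
made-up names, and 4-wide SSE packets stand in for RhsPacket):

    #include <xmmintrin.h>
    #include <cstddef>

    // Before: every rhs coefficient was pre-broadcast into a scratch buffer
    // (one full packet per coefficient, assuming `buffer` is 16-byte aligned),
    // which the kernel then re-read with aligned loads. The scratch buffer
    // multiplies the rhs traffic through L1 by the packet size.
    void unpack_rhs(const float* rhs, float* buffer, std::size_t n)
    {
      for (std::size_t k = 0; k < n; ++k)
        _mm_store_ps(&buffer[4*k], _mm_set1_ps(rhs[k]));
    }

    // After: loadRhs broadcasts each coefficient on the fly, so only the
    // compact rhs block has to stream through the cache.
    __m128 load_rhs(const float* b)
    {
      return _mm_set1_ps(*b);  // what pset1 does on SSE
    }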
--- .../Core/products/GeneralBlockPanelKernel.h | 76 +++++-------------- Eigen/src/Core/products/GeneralMatrixMatrix.h | 10 +-- .../products/GeneralMatrixMatrixTriangular.h | 19 ++--- .../Core/products/TriangularMatrixMatrix.h | 15 ++-- .../Core/products/TriangularSolverMatrix.h | 13 +--- 5 files changed, 38 insertions(+), 95 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 686ff84f1..ba6fad246 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -10,6 +10,7 @@ #ifndef EIGEN_GENERAL_BLOCK_PANEL_H #define EIGEN_GENERAL_BLOCK_PANEL_H + namespace Eigen { namespace internal { @@ -169,7 +170,7 @@ public: WorkSpaceFactor = nr * RhsPacketSize, LhsProgress = LhsPacketSize, - RhsProgress = RhsPacketSize + RhsProgress = 1 }; typedef typename packet_traits::type _LhsPacket; @@ -187,15 +188,9 @@ public: p = pset1(ResScalar(0)); } - EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) - { - for(DenseIndex k=0; k(&b[k*RhsPacketSize], rhs[k]); - } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { - dest = pload(b); + dest = pset1(*b); } EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const @@ -240,7 +235,7 @@ public: WorkSpaceFactor = nr*RhsPacketSize, LhsProgress = LhsPacketSize, - RhsProgress = RhsPacketSize + RhsProgress = 1 }; typedef typename packet_traits::type _LhsPacket; @@ -258,15 +253,9 @@ public: p = pset1(ResScalar(0)); } - EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) - { - for(DenseIndex k=0; k(&b[k*RhsPacketSize], rhs[k]); - } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { - dest = pload(b); + dest = pset1(*b); } EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const @@ -320,7 +309,7 @@ public: WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr, LhsProgress = ResPacketSize, - RhsProgress = Vectorizable ? 
2*ResPacketSize : 1 + RhsProgress = 1 }; typedef typename packet_traits::type RealPacket; @@ -344,30 +333,15 @@ public: p.second = pset1(RealScalar(0)); } - /* Unpack the rhs coeff such that each complex coefficient is spread into - * two packects containing respectively the real and imaginary coefficient - * duplicated as many time as needed: (x+iy) => [x, ..., x] [y, ..., y] - */ - EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const Scalar* rhs, Scalar* b) + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { - for(DenseIndex k=0; k((RealScalar*)&b[k*ResPacketSize*2+0], real(rhs[k])); - pstore1((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], imag(rhs[k])); - } - else - b[k] = rhs[k]; - } + dest = pset1(*b); } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { dest = *b; } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const { - dest.first = pload((const RealScalar*)b); - dest.second = pload((const RealScalar*)(b+ResPacketSize)); + dest.first = pset1(real(*b)); + dest.second = pset1(imag(*b)); } // nothing special here @@ -445,7 +419,7 @@ public: WorkSpaceFactor = nr*RhsPacketSize, LhsProgress = ResPacketSize, - RhsProgress = ResPacketSize + RhsProgress = 1 }; typedef typename packet_traits::type _LhsPacket; @@ -463,15 +437,9 @@ public: p = pset1(ResScalar(0)); } - EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) - { - for(DenseIndex k=0; k(&b[k*RhsPacketSize], rhs[k]); - } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { - dest = pload(b); + dest = pset1(*b); } EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const @@ -529,14 +497,14 @@ struct gebp_kernel EIGEN_DONT_INLINE void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, - Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB=0); + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); }; template EIGEN_DONT_INLINE void gebp_kernel ::operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, - Index strideA, Index strideB, Index offsetA, Index offsetB, RhsScalar* unpackedB) + Index strideA, Index strideB, Index offsetA, Index offsetB) { Traits traits; @@ -550,14 +518,9 @@ void gebp_kernel const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsProgress ? LhsProgress : 0); const Index peeled_kc = (depth/4)*4; - if(unpackedB==0) - unpackedB = const_cast(blockB - strideB * nr * RhsProgress); - // loops on each micro vertical panel of rhs (depth x nr) for(Index j2=0; j2 we select a mr x nr micro block of res which is entirely // stored into mr/packet_size x nr registers. 
@@ -590,7 +553,7 @@ void gebp_kernel // performs "inner" product // TODO let's check wether the folowing peeled loop could not be // optimized via optimal prefetching from one loop to the other - const RhsScalar* blB = unpackedB; + const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; for(Index k=0; k do the same but with nr==1 for(Index j2=packet_cols; j20) while(info[j].sync!=k) {} - gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*actual_kc, mc, actual_kc, info[j].rhs_length, alpha, -1,-1,0,0, w); + gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*actual_kc, mc, actual_kc, info[j].rhs_length, alpha, -1,-1,0,0); } // Then keep going as usual with the remaining A' @@ -134,7 +132,7 @@ static void run(Index rows, Index cols, Index depth, pack_lhs(blockA, &lhs(i,k), lhsStride, actual_kc, actual_mc); // C_i += A' * B' - gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1,-1,0,0, w); + gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1,-1,0,0); } // Release all the sub blocks B'_j of B' for the current thread, @@ -152,11 +150,9 @@ static void run(Index rows, Index cols, Index depth, // this is the sequential version! std::size_t sizeA = kc*mc; std::size_t sizeB = kc*cols; - std::size_t sizeW = kc*Traits::WorkSpaceFactor; ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA()); ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB()); - ei_declare_aligned_stack_constructed_variable(RhsScalar, blockW, sizeW, blocking.blockW()); // For each horizontal panel of the rhs, and corresponding panel of the lhs... // (==GEMM_VAR1) @@ -182,7 +178,7 @@ static void run(Index rows, Index cols, Index depth, pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc); // Everything is packed, we can now call the block * panel kernel: - gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW); + gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0); } } } diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index 5c3763909..ffa871cae 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -73,11 +73,8 @@ struct general_matrix_matrix_triangular_product Traits::nr) mc = (mc/Traits::nr)*Traits::nr; - std::size_t sizeW = kc*Traits::WorkSpaceFactor; - std::size_t sizeB = sizeW + kc*size; ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0); - ei_declare_aligned_stack_constructed_variable(RhsScalar, allocatedBlockB, sizeB, 0); - RhsScalar* blockB = allocatedBlockB + sizeW; + ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, kc*size, 0); gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; @@ -103,15 +100,15 @@ struct general_matrix_matrix_triangular_product processed with gebp or skipped if (UpLo==Lower) gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha, - -1, -1, 0, 0, allocatedBlockB); + -1, -1, 0, 0); - sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha, allocatedBlockB); + sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha); if (UpLo==Upper) { Index j2 = i2+actual_mc; gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, 
actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha, - -1, -1, 0, 0, allocatedBlockB); + -1, -1, 0, 0); } } } @@ -136,7 +133,7 @@ struct tribb_kernel enum { BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr) }; - void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha, RhsScalar* workspace) + void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) { gebp_kernel gebp_kernel; Matrix buffer; @@ -150,7 +147,7 @@ struct tribb_kernel if(UpLo==Upper) gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha, - -1, -1, 0, 0, workspace); + -1, -1, 0, 0); // selfadjoint micro block { @@ -158,7 +155,7 @@ struct tribb_kernel buffer.setZero(); // 1 - apply the kernel on the temporary buffer gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, - -1, -1, 0, 0, workspace); + -1, -1, 0, 0); // 2 - triangular accumulation for(Index j1=0; j1 triangularBuffer; triangularBuffer.setZero(); @@ -187,7 +185,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix0) @@ -197,7 +195,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix() (blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW); + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0); } } } @@ -266,11 +264,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix triangularBuffer; triangularBuffer.setZero(); @@ -357,14 +353,13 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix conj; gebp_kernel gebp_kernel; @@ -158,7 +156,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix conj; gebp_kernel gebp_kernel; @@ -285,8 +281,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix0) gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb, actual_mc, actual_kc, rs, Scalar(-1), - -1, -1, 0, 0, blockW); + -1, -1, 0, 0); } } } From 64a85800bd3573a7da7a396fde9707dce87a58d9 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 29 Jan 2014 11:43:05 -0800 Subject: [PATCH 007/158] Added support for AVX to Eigen. 
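As a quick way to see the packet width change, a stand-alone check (not part
of the patch; compile once with -mavx and once without to compare):

    #include <immintrin.h>
    #include <cstdio>

    int main()
    {
    #ifdef __AVX__
      // With -mavx, __AVX__ is defined and the float packet is the 8-wide __m256.
      float out[8];
      __m256 v = _mm256_set1_ps(1.5f);
      _mm256_storeu_ps(out, _mm256_add_ps(v, v));
      std::printf("AVX: 8 lanes, out[0]=%g\n", out[0]);
    #else
      // Otherwise the 4-wide SSE __m128 packet is used.
      float out[4];
      __m128 v = _mm_set1_ps(1.5f);
      _mm_storeu_ps(out, _mm_add_ps(v, v));
      std::printf("SSE: 4 lanes, out[0]=%g\n", out[0]);
    #endif
      return 0;
    }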
--- CMakeLists.txt | 6 + Eigen/Core | 22 +- Eigen/Geometry | 4 +- Eigen/LU | 4 +- Eigen/src/Core/DenseStorage.h | 4 +- Eigen/src/Core/Redux.h | 13 +- Eigen/src/Core/arch/AVX/CMakeLists.txt | 6 + Eigen/src/Core/arch/AVX/Complex.h | 443 ++++++++++++++++++ Eigen/src/Core/arch/AVX/PacketMath.h | 423 +++++++++++++++++ Eigen/src/Core/arch/SSE/PacketMath.h | 8 + Eigen/src/Core/products/GeneralMatrixMatrix.h | 6 +- Eigen/src/Core/products/GeneralMatrixVector.h | 11 +- Eigen/src/Core/util/Macros.h | 9 +- Eigen/src/Core/util/Memory.h | 38 +- cmake/EigenTesting.cmake | 6 + test/packetmath.cpp | 35 +- test/unalignedcount.cpp | 9 +- 17 files changed, 993 insertions(+), 54 deletions(-) create mode 100644 Eigen/src/Core/arch/AVX/CMakeLists.txt create mode 100644 Eigen/src/Core/arch/AVX/Complex.h create mode 100644 Eigen/src/Core/arch/AVX/PacketMath.h diff --git a/CMakeLists.txt b/CMakeLists.txt index a4731c8c1..16a9b5bcb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -196,6 +196,12 @@ if(NOT MSVC) message(STATUS "Enabling SSE4.2 in tests/examples") endif() + option(EIGEN_TEST_AVX "Enable/Disable AVX in tests/examples" ON) + if(EIGEN_TEST_AVX) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx") + message(STATUS "Enabling AVX in tests/examples") + endif() + option(EIGEN_TEST_ALTIVEC "Enable/Disable AltiVec in tests/examples" OFF) if(EIGEN_TEST_ALTIVEC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec -mabi=altivec") diff --git a/Eigen/Core b/Eigen/Core index 468ae0c76..bd20d5ac5 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -110,7 +110,13 @@ #ifdef __SSE4_2__ #define EIGEN_VECTORIZE_SSE4_2 #endif - + #ifdef __AVX__ + #define EIGEN_VECTORIZE_AVX + #define EIGEN_VECTORIZE_SSE3 + #define EIGEN_VECTORIZE_SSSE3 + #define EIGEN_VECTORIZE_SSE4_1 + #define EIGEN_VECTORIZE_SSE4_2 + #endif // include files // This extern "C" works around a MINGW-w64 compilation issue @@ -140,6 +146,9 @@ #ifdef EIGEN_VECTORIZE_SSE4_2 #include #endif + #ifdef EIGEN_VECTORIZE_AVX + #include + #endif #endif } // end extern "C" #elif defined __ALTIVEC__ @@ -209,7 +218,9 @@ namespace Eigen { inline static const char *SimdInstructionSetsInUse(void) { -#if defined(EIGEN_VECTORIZE_SSE4_2) +#if defined(EIGEN_VECTORIZE_AVX) + return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; +#elif defined(EIGEN_VECTORIZE_SSE4_2) return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; #elif defined(EIGEN_VECTORIZE_SSE4_1) return "SSE, SSE2, SSE3, SSSE3, SSE4.1"; @@ -287,7 +298,12 @@ using std::ptrdiff_t; #include "src/Core/MathFunctions.h" #include "src/Core/GenericPacketMath.h" -#if defined EIGEN_VECTORIZE_SSE +#if defined EIGEN_VECTORIZE_AVX + // Use AVX for floats and doubles, SSE for integers + #include "src/Core/arch/AVX/PacketMath.h" + #include "src/Core/arch/AVX/Complex.h" + #include "src/Core/arch/SSE/PacketMath.h" // For integers +#elif defined EIGEN_VECTORIZE_SSE #include "src/Core/arch/SSE/PacketMath.h" #include "src/Core/arch/SSE/MathFunctions.h" #include "src/Core/arch/SSE/Complex.h" diff --git a/Eigen/Geometry b/Eigen/Geometry index efd9d4504..f9bc6fc57 100644 --- a/Eigen/Geometry +++ b/Eigen/Geometry @@ -47,7 +47,9 @@ #include "src/Geometry/AlignedBox.h" #include "src/Geometry/Umeyama.h" - #if defined EIGEN_VECTORIZE_SSE + // Use the SSE optimized version whenever possible. 
At the moment the + // SSE version doesn't compile when AVX is enabled + #if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX #include "src/Geometry/arch/Geometry_SSE.h" #endif #endif diff --git a/Eigen/LU b/Eigen/LU index db5795504..e5c3f32f7 100644 --- a/Eigen/LU +++ b/Eigen/LU @@ -27,7 +27,9 @@ #include "src/LU/Determinant.h" #include "src/LU/Inverse.h" -#if defined EIGEN_VECTORIZE_SSE +// Use the SSE optimized version whenever possible. At the moment the +// SSE version doesn't compile when AVX is enabled +#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX #include "src/LU/arch/Inverse_SSE.h" #endif diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h index b9dd75ade..2342b08a1 100644 --- a/Eigen/src/Core/DenseStorage.h +++ b/Eigen/src/Core/DenseStorage.h @@ -83,7 +83,7 @@ struct plain_array template struct plain_array { - EIGEN_USER_ALIGN16 T array[Size]; + EIGEN_USER_ALIGN32 T array[Size]; EIGEN_DEVICE_FUNC plain_array() @@ -102,7 +102,7 @@ struct plain_array template struct plain_array { - EIGEN_USER_ALIGN16 T array[1]; + EIGEN_USER_ALIGN32 T array[1]; EIGEN_DEVICE_FUNC plain_array() {} EIGEN_DEVICE_FUNC plain_array(constructor_without_unaligned_array_assert) {} }; diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h index b2c775d90..5b82c9a65 100644 --- a/Eigen/src/Core/Redux.h +++ b/Eigen/src/Core/Redux.h @@ -303,10 +303,15 @@ struct redux_impl static EIGEN_STRONG_INLINE Scalar run(const Derived& mat, const Func& func) { eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix"); - Scalar res = func.predux(redux_vec_unroller::run(mat,func)); - if (VectorizedSize != Size) - res = func(res,redux_novec_unroller::run(mat,func)); - return res; + if (VectorizedSize > 0) { + Scalar res = func.predux(redux_vec_unroller::run(mat,func)); + if (VectorizedSize != Size) + res = func(res,redux_novec_unroller::run(mat,func)); + return res; + } + else { + return redux_novec_unroller::run(mat,func); + } } }; diff --git a/Eigen/src/Core/arch/AVX/CMakeLists.txt b/Eigen/src/Core/arch/AVX/CMakeLists.txt new file mode 100644 index 000000000..bdb71ab99 --- /dev/null +++ b/Eigen/src/Core/arch/AVX/CMakeLists.txt @@ -0,0 +1,6 @@ +FILE(GLOB Eigen_Core_arch_AVX_SRCS "*.h") + +INSTALL(FILES + ${Eigen_Core_arch_AVX_SRCS} + DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/AVX COMPONENT Devel +) diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h new file mode 100644 index 000000000..9fb44ecab --- /dev/null +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -0,0 +1,443 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_COMPLEX_AVX_H +#define EIGEN_COMPLEX_AVX_H + +namespace Eigen { + +namespace internal { + +//---------- float ---------- +struct Packet4cf +{ + EIGEN_STRONG_INLINE Packet4cf() {} + EIGEN_STRONG_INLINE explicit Packet4cf(const __m256& a) : v(a) {} + __m256 v; +}; + +template<> struct packet_traits > : default_packet_traits +{ + typedef Packet4cf type; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; + +template<> struct unpacket_traits { typedef std::complex type; enum {size=4}; }; + +template<> EIGEN_STRONG_INLINE Packet4cf padd(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf psub(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf pnegate(const Packet4cf& a) +{ + return Packet4cf(pnegate(a.v)); +} +template<> EIGEN_STRONG_INLINE Packet4cf pconj(const Packet4cf& a) +{ + const __m256 mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000)); + return Packet4cf(_mm256_xor_ps(a.v,mask)); +} + +template<> EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) +{ + __m256 tmp1 = _mm256_mul_ps(_mm256_moveldup_ps(a.v), b.v); + __m256 tmp2 = _mm256_mul_ps(_mm256_movehdup_ps(a.v), _mm256_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1))); + __m256 result = _mm256_addsub_ps(tmp1, tmp2); + return Packet4cf(result); +} + +template<> EIGEN_STRONG_INLINE Packet4cf pand (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf por (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf pxor (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf pandnot(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(a.v,b.v)); } + +template<> EIGEN_STRONG_INLINE Packet4cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload(&numext::real_ref(*from))); } +template<> EIGEN_STRONG_INLINE Packet4cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu(&numext::real_ref(*from))); } + + +template<> EIGEN_STRONG_INLINE Packet4cf pset1(const std::complex& from) +{ + __m256 result; + for (int i = 0; i < 8; i+=2) { + result[i] = std::real(from); + result[i+1] = std::imag(from); + } + return Packet4cf(result); +} + +template<> EIGEN_STRONG_INLINE Packet4cf ploaddup(const std::complex* from) +{ + __m256 result; + for (int i = 0; i < 2; ++i) { + result[4*i] = std::real(from[i]); + result[4*i+1] = std::imag(from[i]); + result[4*i+2] = std::real(from[i]); + result[4*i+3] = std::imag(from[i]); + } + return Packet4cf(result); +} + +template<> EIGEN_STRONG_INLINE void pstore >(std::complex* to, const Packet4cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex* to, const Packet4cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); } + +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } + + +template<> EIGEN_STRONG_INLINE 
std::complex pfirst(const Packet4cf& a) +{ + return std::complex(a.v[0], a.v[1]); +} + +template<> EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) { + __m256 result; + result[0] = a.v[6]; + result[1] = a.v[7]; + result[2] = a.v[4]; + result[3] = a.v[5]; + result[4] = a.v[2]; + result[5] = a.v[3]; + result[6] = a.v[0]; + result[7] = a.v[1]; + return Packet4cf(result); +} + +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet4cf& a) +{ + return std::complex(a.v[0]+a.v[2]+a.v[4]+a.v[6], a.v[1]+a.v[3]+a.v[5]+a.v[7]); +} + +template<> EIGEN_STRONG_INLINE Packet4cf preduxp(const Packet4cf* vecs) +{ + __m256 result = _mm256_setzero_ps(); + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 8; j+=2) { + result[2*i] += vecs[i].v[j]; + result[2*i+1] += vecs[i].v[j+1]; + } + } + return Packet4cf(result); +} + +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet4cf& a) +{ + std::complex result(a.v[0], a.v[1]); + for (int i = 2; i < 8; i+=2) { + result *= std::complex(a.v[i], a.v[i+1]); + } + return result; +} + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4cf& first, const Packet4cf& second) + { + if (Offset==0) return; + for (int i = 0; i < 4-Offset; ++i) + { + first.v[2*i] = first.v[2*(i+Offset)]; + first.v[2*i+1] = first.v[2*(i+Offset)+1]; + } + for (int i = 4-Offset; i < 4; ++i) + { + first.v[2*i] = second.v[2*(i-4+Offset)]; + first.v[2*i+1] = second.v[2*(i-4+Offset)+1]; + } + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const + { + return internal::pmul(a, pconj(b)); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const + { + return internal::pmul(pconj(a), b); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const + { + return pconj(internal::pmul(a, b)); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const + { return padd(c, pmul(x,y)); } + + EIGEN_STRONG_INLINE Packet4cf pmul(const Packet8f& x, const Packet4cf& y) const + { return Packet4cf(Eigen::internal::pmul(x, y.v)); } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const + { return padd(c, pmul(x,y)); } + + EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const + { return Packet4cf(Eigen::internal::pmul(x.v, y)); } +}; + +template<> EIGEN_STRONG_INLINE Packet4cf pdiv(const Packet4cf& a, const Packet4cf& b) +{ + Packet4cf res; + for (int i = 0; i < 8; i+=2) { + std::complex result = std::complex(a.v[i], a.v[i+1]) / std::complex(b.v[i], b.v[i+1]); + res.v[i] = std::real(result); + res.v[i+1] = std::imag(result); + } + return res; +} + +template<> EIGEN_STRONG_INLINE Packet4cf pcplxflip(const Packet4cf& x) +{ + Packet4cf res; + for (int i = 0; i < 8; i+=2) { + res.v[i] = x.v[i+1]; + res.v[i+1] = x.v[i]; + } + return res; +} + +//---------- 
double ---------- +struct Packet2cd +{ + EIGEN_STRONG_INLINE Packet2cd() {} + EIGEN_STRONG_INLINE explicit Packet2cd(const __m256d& a) : v(a) {} + __m256d v; +}; + +template<> struct packet_traits > : default_packet_traits +{ + typedef Packet2cd type; + enum { + Vectorizable = 1, + AlignedOnScalar = 0, + size = 2, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; + +template<> struct unpacket_traits { typedef std::complex type; enum {size=2}; }; + +template<> EIGEN_STRONG_INLINE Packet2cd padd(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd psub(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pnegate(const Packet2cd& a) { return Packet2cd(pnegate(a.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pconj(const Packet2cd& a) +{ + const __m256d mask = _mm256_castsi256_pd(_mm256_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0)); + return Packet2cd(_mm256_xor_pd(a.v,mask)); +} + +template<> EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) +{ + __m256d tmp1 = _mm256_mul_pd(_mm256_permute_pd(a.v, 0), b.v); + // FIXME: _mm256_permute_pd(b.v, _MM_SHUFFLE2(1,0) won't work as expected, figure out an alternative. + __m256d op = {b.v[1], b.v[0], b.v[3], b.v[2]}; + __m256d tmp2 = _mm256_mul_pd(_mm256_permute_pd(a.v, 15), op); + __m256d result = _mm256_addsub_pd(tmp1, tmp2); + + return Packet2cd(result); +} + +template<> EIGEN_STRONG_INLINE Packet2cd pand (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd por (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pxor (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pandnot(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(a.v,b.v)); } + +template<> EIGEN_STRONG_INLINE Packet2cd pload (const std::complex* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet2cd ploadu(const std::complex* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cd(ploadu((const double*)from)); } + +template<> EIGEN_STRONG_INLINE Packet2cd pset1(const std::complex& from) +{ + __m256d result; + for (int i = 0; i < 4; i+=2) { + result[i] = std::real(from); + result[i+1] = std::imag(from); + } + return Packet2cd(result); +} + +template<> EIGEN_STRONG_INLINE Packet2cd ploaddup(const std::complex* from) { return pset1(*from); } + +template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } + +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } + +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cd& a) +{ + return std::complex(a.v[0],a.v[1]); +} + +template<> EIGEN_STRONG_INLINE Packet2cd preverse(const Packet2cd& a) { + __m256d result; + result[0] = a.v[2]; + result[1] = a.v[3]; + result[2] = a.v[0]; + result[3] = a.v[1]; + return Packet2cd(result); 
+} + +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cd& a) +{ + return std::complex(a.v[0]+a.v[2], a.v[1]+a.v[3]); +} + +template<> EIGEN_STRONG_INLINE Packet2cd preduxp(const Packet2cd* vecs) +{ + __m256d result = _mm256_setzero_pd(); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 4; j+=2) { + result[2*i] += vecs[i].v[j]; + result[2*i+1] += vecs[i].v[j+1]; + } + } + return Packet2cd(result); +} + +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cd& a) +{ + return std::complex(a.v[0], a.v[1]) * std::complex(a.v[2], a.v[3]); +} + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet2cd& first, const Packet2cd& second) + { + if (Offset==0) return; + first.v[0] = first.v[2]; + first.v[1] = first.v[3]; + first.v[2] = second.v[0]; + first.v[3] = second.v[1]; + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const + { + return internal::pmul(a, pconj(b)); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const + { + return internal::pmul(pconj(a), b); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const + { + return pconj(internal::pmul(a, b)); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet4d& x, const Packet2cd& y, const Packet2cd& c) const + { return padd(c, pmul(x,y)); } + + EIGEN_STRONG_INLINE Packet2cd pmul(const Packet4d& x, const Packet2cd& y) const + { return Packet2cd(Eigen::internal::pmul(x, y.v)); } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet4d& y, const Packet2cd& c) const + { return padd(c, pmul(x,y)); } + + EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& x, const Packet4d& y) const + { return Packet2cd(Eigen::internal::pmul(x.v, y)); } +}; + +template<> EIGEN_STRONG_INLINE Packet2cd pdiv(const Packet2cd& a, const Packet2cd& b) +{ + Packet2cd res; + for (int i = 0; i < 4; i+=2) { + std::complex result = std::complex(a.v[i], a.v[i+1]) / std::complex(b.v[i], b.v[i+1]); + res.v[i] = std::real(result); + res.v[i+1] = std::imag(result); + } + return res; +} + +template<> EIGEN_STRONG_INLINE Packet2cd pcplxflip(const Packet2cd& x) +{ + Packet2cd res; + for (int i = 0; i < 4; i+=2) { + res.v[i] = x.v[i+1]; + res.v[i+1] = x.v[i]; + } + return res; +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_COMPLEX_AVX_H diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h new file mode 100644 index 000000000..244e63e74 --- /dev/null +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -0,0 +1,423 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET_MATH_AVX_H +#define EIGEN_PACKET_MATH_AVX_H + +namespace Eigen { + +namespace internal { + +#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 +#endif + +#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) +#endif + +typedef __m256 Packet8f; +typedef __m256i Packet8i; +typedef __m256d Packet4d; + +template<> struct is_arithmetic<__m256> { enum { value = true }; }; +template<> struct is_arithmetic<__m256i> { enum { value = true }; }; +template<> struct is_arithmetic<__m256d> { enum { value = true }; }; + +#define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \ + const Packet8f p8f_##NAME = pset1(X) + +#define _EIGEN_DECLARE_CONST_Packet4d(NAME,X) \ + const Packet4d p4d_##NAME = pset1(X) + + +template<> struct packet_traits : default_packet_traits +{ + typedef Packet8f type; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=8, + + HasDiv = 1, + HasSin = 0, + HasCos = 0, + HasLog = 0, + HasExp = 0, + HasSqrt = 0 + }; + }; +template<> struct packet_traits : default_packet_traits +{ + typedef Packet4d type; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=4, + + HasDiv = 1, + HasExp = 0 + }; +}; + +/* Proper support for integers is only provided by AVX2. In the meantime, we'll + use SSE instructions and packets to deal with integers. +template<> struct packet_traits : default_packet_traits +{ + typedef Packet8i type; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=8 + }; +}; +*/ + +template<> struct unpacket_traits { typedef float type; enum {size=8}; }; +template<> struct unpacket_traits { typedef double type; enum {size=4}; }; +template<> struct unpacket_traits { typedef int type; enum {size=8}; }; + +template<> EIGEN_STRONG_INLINE Packet8f pset1(const float& from) { return _mm256_set1_ps(from); } +template<> EIGEN_STRONG_INLINE Packet4d pset1(const double& from) { return _mm256_set1_pd(from); } +template<> EIGEN_STRONG_INLINE Packet8i pset1(const int& from) { return _mm256_set1_epi32(from); } + +template<> EIGEN_STRONG_INLINE Packet8f plset(const float& a) { return _mm256_add_ps(_mm256_set1_ps(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); } +template<> EIGEN_STRONG_INLINE Packet4d plset(const double& a) { return _mm256_add_pd(_mm256_set1_pd(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); } + +template<> EIGEN_STRONG_INLINE Packet8f padd(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d padd(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet8f psub(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d psub(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) +{ + return _mm256_sub_ps(_mm256_set1_ps(0.0),a); +} +template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a) +{ + return _mm256_sub_pd(_mm256_set1_pd(0.0),a); +} + +template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet8f pmul(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); } 
+template<> EIGEN_STRONG_INLINE Packet4d pmul(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); }
+
+
+template<> EIGEN_STRONG_INLINE Packet8f pdiv(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pdiv(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i pdiv(const Packet8i& /*a*/, const Packet8i& /*b*/)
+{ eigen_assert(false && "packet integer division is not supported by AVX");
+  return pset1(0);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pmin(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pmin(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pmax(const Packet8f& a, const Packet8f& b) { return _mm256_max_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pmax(const Packet4d& a, const Packet4d& b) { return _mm256_max_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pand(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pand(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f por(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d por(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pxor(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pxor(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pandnot(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pandnot(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet4d pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet8i pload(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast(from)); }
+
+template<> EIGEN_STRONG_INLINE Packet8f ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet4d ploadu(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet8i ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast(from)); }
+
+// Loads 4 floats from memory and returns the packet {a0, a0, a1, a1, a2, a2, a3, a3}
+template<> EIGEN_STRONG_INLINE Packet8f ploaddup(const float* from)
+{
+  Packet8f tmp = ploadu(from);
+  Packet8f tmp1 = _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2));
+  Packet8f tmp2 = _mm256_permute_ps(tmp, _MM_SHUFFLE(1,1,0,0));
+  return _mm256_blend_ps(_mm256_permute2f128_ps(tmp1,tmp1,1),tmp2,15);
+}
+// Loads 2 doubles from memory and returns the packet {a0, a0, a1, a1}
+template<> EIGEN_STRONG_INLINE Packet4d ploaddup(const double* from)
+{
+  Packet4d tmp = ploadu(from);
+  Packet4d tmp1 = _mm256_permute_pd(tmp,0);
+  Packet4d tmp2 = _mm256_permute_pd(tmp,3);
+  return _mm256_blend_pd(tmp1,_mm256_permute2f128_pd(tmp2,tmp2,1),12);
+}
+
+template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet8f& from) {
EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); }
+template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); }
+template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from); }
+
+template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet8f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
+
+template<> EIGEN_STRONG_INLINE void pstore1(float* to, const float& a)
+{
+  Packet8f pa = pset1(a);
+  pstore(to, pa);
+}
+template<> EIGEN_STRONG_INLINE void pstore1(double* to, const double& a)
+{
+  Packet4d pa = pset1(a);
+  pstore(to, pa);
+}
+template<> EIGEN_STRONG_INLINE void pstore1(int* to, const int& a)
+{
+  Packet8i pa = pset1(a);
+  pstore(to, pa);
+}
+
+template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+
+template<> EIGEN_STRONG_INLINE float pfirst(const Packet8f& a) {
+  return _mm_cvtss_f32(_mm256_castps256_ps128(a));
+}
+template<> EIGEN_STRONG_INLINE double pfirst(const Packet4d& a) {
+  return _mm_cvtsd_f64(_mm256_castpd256_pd128(a));
+}
+template<> EIGEN_STRONG_INLINE int pfirst(const Packet8i& a) {
+  return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
+}
+
+
+template<> EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a)
+{
+  __m256 tmp = _mm256_shuffle_ps(a,a,0x1b);
+  return _mm256_permute2f128_ps(tmp, tmp, 1);
+}
+template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a)
+{
+  // reverse the two doubles within each 128-bit half, then swap the halves
+  // (equivalently: swap the halves first, then permute within each half)
+  __m256d tmp = _mm256_shuffle_pd(a,a,5);
+  return _mm256_permute2f128_pd(tmp, tmp, 1);
+}
+
+// pabs should be ok
+template<> EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a)
+{
+  const Packet8f mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
+  return _mm256_and_ps(a,mask);
+}
+template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a)
+{
+  const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
+  return _mm256_and_pd(a,mask);
+}
+
+// preduxp should be ok
+// FIXME: why is this ok? why isn't the simple implementation working as expected?
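+// (For context: preduxp must return a packet whose i-th lane is the sum of the
+// eight lanes of vecs[i]; the hadd/permute2f128/blend ladder below is one way
+// to do that 8x8 horizontal reduction without leaving the AVX registers.)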
+template<> EIGEN_STRONG_INLINE Packet8f preduxp(const Packet8f* vecs) +{ + __m256 hsum1 = _mm256_hadd_ps(vecs[0], vecs[1]); + __m256 hsum2 = _mm256_hadd_ps(vecs[2], vecs[3]); + __m256 hsum3 = _mm256_hadd_ps(vecs[4], vecs[5]); + __m256 hsum4 = _mm256_hadd_ps(vecs[6], vecs[7]); + + __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1); + __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2); + __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3); + __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4); + + __m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23); + __m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); + __m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); + __m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); + + __m256 sum1 = _mm256_add_ps(perm1, hsum5); + __m256 sum2 = _mm256_add_ps(perm2, hsum6); + __m256 sum3 = _mm256_add_ps(perm3, hsum7); + __m256 sum4 = _mm256_add_ps(perm4, hsum8); + + __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc); + __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc); + + __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0); + return final; +} +template<> EIGEN_STRONG_INLINE Packet4d preduxp(const Packet4d* vecs) +{ + Packet4d tmp0, tmp1; + + tmp0 = _mm256_hadd_pd(vecs[0], vecs[1]); + tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1)); + + tmp1 = _mm256_hadd_pd(vecs[2], vecs[3]); + tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1)); + + return _mm256_blend_pd(tmp0, tmp1, 0xC); +} + +template<> EIGEN_STRONG_INLINE float predux(const Packet8f& a) +{ + Packet8f tmp0 = _mm256_hadd_ps(a,_mm256_permute2f128_ps(a,a,1)); + tmp0 = _mm256_hadd_ps(tmp0,tmp0); + return pfirst(_mm256_hadd_ps(tmp0, tmp0)); +} +template<> EIGEN_STRONG_INLINE double predux(const Packet4d& a) +{ + Packet4d tmp0 = _mm256_hadd_pd(a,_mm256_permute2f128_pd(a,a,1)); + return pfirst(_mm256_hadd_pd(tmp0,tmp0)); +} + +template<> EIGEN_STRONG_INLINE float predux_mul(const Packet8f& a) +{ + Packet8f tmp; + tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a,a,1)); + tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2))); + return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1))); +} +template<> EIGEN_STRONG_INLINE double predux_mul(const Packet4d& a) +{ + Packet4d tmp; + tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a,a,1)); + return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp,tmp,1))); +} + +template<> EIGEN_STRONG_INLINE float predux_min(const Packet8f& a) +{ + float result = a[0]; + for (int i = 1; i < 8; ++i) { + if (a[i] < result) result = a[i]; + } + return result; +} +template<> EIGEN_STRONG_INLINE double predux_min(const Packet4d& a) +{ + double result = a[0]; + for (int i = 1; i < 4; ++i) { + if (a[i] < result) result = a[i]; + } + return result; +} + +template<> EIGEN_STRONG_INLINE float predux_max(const Packet8f& a) +{ + float result = a[0]; + for (int i = 1; i < 8; ++i) { + if (a[i] > result) result = a[i]; + } + return result; +} + +template<> EIGEN_STRONG_INLINE double predux_max(const Packet4d& a) +{ + double result = a[0]; + for (int i = 1; i < 4; ++i) { + if (a[i] > result) result = a[i]; + } + return result; +} + + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet8f& first, const Packet8f& second) + { + if (Offset==1) + { + first = _mm256_blend_ps(first, second, 1); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1)); + first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0x88); + } + else if (Offset==2) + { + first = _mm256_blend_ps(first, second, 3); + Packet8f tmp = 
_mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2)); + first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0xcc); + } + else if (Offset==3) + { + first = _mm256_blend_ps(first, second, 7); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3)); + first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0xee); + } + else if (Offset==4) + { + first = _mm256_blend_ps(first, second, 15); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0)); + first = _mm256_permute_ps(_mm256_permute2f128_ps (tmp, tmp, 1), _MM_SHUFFLE(3,2,1,0)); + } + else if (Offset==5) + { + first = _mm256_blend_ps(first, second, 31); + first = _mm256_permute2f128_ps(first, first, 1); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1)); + first = _mm256_permute2f128_ps(tmp, tmp, 1); + first = _mm256_blend_ps(tmp, first, 0x88); + } + else if (Offset==6) + { + first = _mm256_blend_ps(first, second, 63); + first = _mm256_permute2f128_ps(first, first, 1); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2)); + first = _mm256_permute2f128_ps(tmp, tmp, 1); + first = _mm256_blend_ps(tmp, first, 0xcc); + } + else if (Offset==7) + { + first = _mm256_blend_ps(first, second, 127); + first = _mm256_permute2f128_ps(first, first, 1); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3)); + first = _mm256_permute2f128_ps(tmp, tmp, 1); + first = _mm256_blend_ps(tmp, first, 0xee); + } + } +}; + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4d& first, const Packet4d& second) + { + if (Offset==1) + { + first = _mm256_blend_pd(first, second, 1); + __m256d tmp = _mm256_permute_pd(first, 5); + first = _mm256_permute2f128_pd(tmp, tmp, 1); + first = _mm256_blend_pd(tmp, first, 0xA); + } + else if (Offset==2) + { + first = _mm256_blend_pd(first, second, 3); + first = _mm256_permute2f128_pd(first, first, 1); + } + else if (Offset==3) + { + first = _mm256_blend_pd(first, second, 7); + __m256d tmp = _mm256_permute_pd(first, 5); + first = _mm256_permute2f128_pd(tmp, tmp, 1); + first = _mm256_blend_pd(tmp, first, 5); + } + } +}; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_PACKET_MATH_AVX_H diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index f5a3dab52..e913af650 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -58,6 +58,9 @@ template<> struct is_arithmetic<__m128d> { enum { value = true }; }; const Packet4i p4i_##NAME = pset1(X) +// Use the packet_traits defined in AVX/PacketMath.h instead if we're going +// to leverage AVX instructions. 
+#ifndef EIGEN_VECTORIZE_AVX template<> struct packet_traits : default_packet_traits { typedef Packet4f type; @@ -87,6 +90,7 @@ template<> struct packet_traits : default_packet_traits HasSqrt = 1 }; }; +#endif template<> struct packet_traits : default_packet_traits { typedef Packet4i type; @@ -115,8 +119,10 @@ template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { re template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return _mm_set1_epi32(from); } #endif +#ifndef EIGEN_VECTORIZE_AVX template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return _mm_add_ps(pset1(a), _mm_set_ps(3,2,1,0)); } template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) { return _mm_add_pd(pset1(a),_mm_set_pd(1,0)); } +#endif template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return _mm_add_epi32(pset1(a),_mm_set_epi32(3,2,1,0)); } template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); } @@ -331,9 +337,11 @@ template<> EIGEN_STRONG_INLINE void pstore1(double* to, const double& pstore(to, vec2d_swizzle1(pa,0,0)); } +#ifndef EIGEN_VECTORIZE_AVX template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +#endif #if defined(_MSC_VER) && defined(_WIN64) && !defined(__INTEL_COMPILER) // The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010 diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 3f5ffcf51..eb399a824 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -286,9 +286,9 @@ class gemm_blocking_space 4. + if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) || LhsPacketSize > 4) { alignedSize = 0; alignedStart = 0; @@ -404,7 +405,9 @@ EIGEN_DONT_INLINE void general_matrix_vector_product 4. + if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) || + (LhsPacketSize > 4)) { alignedSize = 0; alignedStart = 0; @@ -446,7 +449,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product(tmp0); const LhsScalar* lhs0 = lhs + i*lhsStride; // process first unaligned result's coeffs diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index debc04f3f..787f800b8 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -72,7 +72,11 @@ #endif #define EIGEN_ALIGN 0 #else - #define EIGEN_ALIGN 1 + #if !defined(EIGEN_DONT_VECTORIZE) && defined(__AVX__) + #define EIGEN_ALIGN 32 + #else + #define EIGEN_ALIGN 16 + #endif #endif // EIGEN_ALIGN_STATICALLY is the true test whether we want to align arrays on the stack or not. 
It takes into account both the user choice to explicitly disable @@ -281,13 +285,16 @@ #endif #define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16) +#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32) #if EIGEN_ALIGN_STATICALLY #define EIGEN_USER_ALIGN_TO_BOUNDARY(n) EIGEN_ALIGN_TO_BOUNDARY(n) #define EIGEN_USER_ALIGN16 EIGEN_ALIGN16 +#define EIGEN_USER_ALIGN32 EIGEN_ALIGN32 #else #define EIGEN_USER_ALIGN_TO_BOUNDARY(n) #define EIGEN_USER_ALIGN16 +#define EIGEN_USER_ALIGN32 #endif #ifdef EIGEN_DONT_USE_RESTRICT_KEYWORD diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index d177e8b5a..76bdb6cfc 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -32,7 +32,7 @@ // page 114, "[The] LP64 model [...] is used by all 64-bit UNIX ports" so it's indeed // quite safe, at least within the context of glibc, to equate 64-bit with LP64. #if defined(__GLIBC__) && ((__GLIBC__>=2 && __GLIBC_MINOR__ >= 8) || __GLIBC__>2) \ - && defined(__LP64__) && ! defined( __SANITIZE_ADDRESS__ ) + && defined(__LP64__) && ! defined( __SANITIZE_ADDRESS__ ) && (EIGEN_ALIGN == 16) #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 1 #else #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 0 @@ -42,14 +42,14 @@ // See http://svn.freebsd.org/viewvc/base/stable/6/lib/libc/stdlib/malloc.c?view=markup // FreeBSD 7 seems to have 16-byte aligned malloc except on ARM and MIPS architectures // See http://svn.freebsd.org/viewvc/base/stable/7/lib/libc/stdlib/malloc.c?view=markup -#if defined(__FreeBSD__) && !defined(__arm__) && !defined(__mips__) +#if defined(__FreeBSD__) && !defined(__arm__) && !defined(__mips__) && (EIGEN_ALIGN == 16) #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 1 #else #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 0 #endif -#if defined(__APPLE__) \ - || defined(_WIN64) \ +#if (defined(__APPLE__) && (EIGEN_ALIGN == 16)) \ + || (defined(_WIN64) && (EIGEN_ALIGN == 16)) \ || EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED \ || EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED #define EIGEN_MALLOC_ALREADY_ALIGNED 1 @@ -73,7 +73,7 @@ #define EIGEN_HAS_POSIX_MEMALIGN 0 #endif -#ifdef EIGEN_VECTORIZE_SSE +#if defined EIGEN_VECTORIZE_SSE || defined EIGEN_VECTORIZE_AVX #define EIGEN_HAS_MM_MALLOC 1 #else #define EIGEN_HAS_MM_MALLOC 0 @@ -105,9 +105,9 @@ inline void throw_std_bad_alloc() */ inline void* handmade_aligned_malloc(std::size_t size) { - void *original = std::malloc(size+16); + void *original = std::malloc(size+EIGEN_ALIGN); if (original == 0) return 0; - void *aligned = reinterpret_cast((reinterpret_cast(original) & ~(std::size_t(15))) + 16); + void *aligned = reinterpret_cast((reinterpret_cast(original) & ~(std::size_t(EIGEN_ALIGN-1))) + EIGEN_ALIGN); *(reinterpret_cast(aligned) - 1) = original; return aligned; } @@ -128,9 +128,9 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t = if (ptr == 0) return handmade_aligned_malloc(size); void *original = *(reinterpret_cast(ptr) - 1); std::ptrdiff_t previous_offset = static_cast(ptr)-static_cast(original); - original = std::realloc(original,size+16); + original = std::realloc(original,size+EIGEN_ALIGN); if (original == 0) return 0; - void *aligned = reinterpret_cast((reinterpret_cast(original) & ~(std::size_t(15))) + 16); + void *aligned = reinterpret_cast((reinterpret_cast(original) & ~(std::size_t(EIGEN_ALIGN-1))) + EIGEN_ALIGN); void *previous_aligned = static_cast(original)+previous_offset; if(aligned!=previous_aligned) std::memmove(aligned, previous_aligned, size); @@ -208,7 +208,7 @@ inline void check_that_malloc_is_allowed() {} 
#endif -/** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 bytes alignment. +/** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 or 32 bytes alignment depending on the requirements. * On allocation error, the returned pointer is null, and std::bad_alloc is thrown. */ inline void* aligned_malloc(size_t size) @@ -221,11 +221,11 @@ inline void* aligned_malloc(size_t size) #elif EIGEN_MALLOC_ALREADY_ALIGNED result = std::malloc(size); #elif EIGEN_HAS_POSIX_MEMALIGN - if(posix_memalign(&result, 16, size)) result = 0; + if(posix_memalign(&result, EIGEN_ALIGN, size)) result = 0; #elif EIGEN_HAS_MM_MALLOC - result = _mm_malloc(size, 16); + result = _mm_malloc(size, EIGEN_ALIGN); #elif defined(_MSC_VER) && (!defined(_WIN32_WCE)) - result = _aligned_malloc(size, 16); + result = _aligned_malloc(size, EIGEN_ALIGN); #else result = handmade_aligned_malloc(size); #endif @@ -275,12 +275,12 @@ inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size) // implements _mm_malloc/_mm_free based on the corresponding _aligned_ // functions. This may not always be the case and we just try to be safe. #if defined(_MSC_VER) && defined(_mm_free) - result = _aligned_realloc(ptr,new_size,16); + result = _aligned_realloc(ptr,new_size,EIGEN_ALIGN); #else result = generic_aligned_realloc(ptr,new_size,old_size); #endif #elif defined(_MSC_VER) - result = _aligned_realloc(ptr,new_size,16); + result = _aligned_realloc(ptr,new_size,EIGEN_ALIGN); #else result = handmade_aligned_realloc(ptr,new_size,old_size); #endif @@ -607,9 +607,9 @@ template class aligned_stack_memory_handler * The underlying stack allocation function can controlled with the EIGEN_ALLOCA preprocessor token. */ #ifdef EIGEN_ALLOCA - - #ifdef __arm__ - #define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast((reinterpret_cast(EIGEN_ALLOCA(SIZE+16)) & ~(size_t(15))) + 16) + // The native alloca() that comes with llvm aligns buffer on 16 bytes even when AVX is enabled. 
+ #if defined(__arm__) || EIGEN_ALIGN > 16 + #define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast((reinterpret_cast(EIGEN_ALLOCA(SIZE+EIGEN_ALIGN)) & ~(size_t(EIGEN_ALIGN-1))) + EIGEN_ALIGN) #else #define EIGEN_ALIGNED_ALLOCA EIGEN_ALLOCA #endif @@ -679,7 +679,7 @@ template class aligned_stack_memory_handler #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(true) #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \ - EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%16==0))) + EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%EIGEN_ALIGN==0))) /****************************************************************************/ diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index fdc166bf8..d1ea4b9d2 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -264,6 +264,12 @@ macro(ei_testing_print_summary) message(STATUS "SSE4.2: Using architecture defaults") endif() + if(EIGEN_TEST_AVX) + message(STATUS "AVX: ON") + else() + message(STATUS "AVX: Using architecture defaults") + endif() + if(EIGEN_TEST_ALTIVEC) message(STATUS "Altivec: ON") else() diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 2c0519c41..d7c336c22 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -104,11 +104,12 @@ template void packetmath() const int PacketSize = internal::packet_traits::size; typedef typename NumTraits::Real RealScalar; - const int size = PacketSize*4; - EIGEN_ALIGN16 Scalar data1[internal::packet_traits::size*4]; - EIGEN_ALIGN16 Scalar data2[internal::packet_traits::size*4]; - EIGEN_ALIGN16 Packet packets[PacketSize*2]; - EIGEN_ALIGN16 Scalar ref[internal::packet_traits::size*4]; + const int max_size = PacketSize > 4 ? 
PacketSize : 4; + const int size = PacketSize*max_size; + EIGEN_ALIGN32 Scalar data1[size]; + EIGEN_ALIGN32 Scalar data2[size]; + EIGEN_ALIGN32 Packet packets[PacketSize*2]; + EIGEN_ALIGN32 Scalar ref[size]; RealScalar refvalue = 0; for (int i=0; i void packetmath() else if (offset==1) internal::palign<1>(packets[0], packets[1]); else if (offset==2) internal::palign<2>(packets[0], packets[1]); else if (offset==3) internal::palign<3>(packets[0], packets[1]); + else if (offset==4) internal::palign<4>(packets[0], packets[1]); + else if (offset==5) internal::palign<5>(packets[0], packets[1]); + else if (offset==6) internal::palign<6>(packets[0], packets[1]); + else if (offset==7) internal::palign<7>(packets[0], packets[1]); internal::pstore(data2, packets[0]); for (int i=0; i void packetmath_real() const int PacketSize = internal::packet_traits::size; const int size = PacketSize*4; - EIGEN_ALIGN16 Scalar data1[internal::packet_traits::size*4]; - EIGEN_ALIGN16 Scalar data2[internal::packet_traits::size*4]; - EIGEN_ALIGN16 Scalar ref[internal::packet_traits::size*4]; + EIGEN_ALIGN32 Scalar data1[internal::packet_traits::size*4]; + EIGEN_ALIGN32 Scalar data2[internal::packet_traits::size*4]; + EIGEN_ALIGN32 Scalar ref[internal::packet_traits::size*4]; for (int i=0; i void packetmath_notcomplex() typedef typename internal::packet_traits::type Packet; const int PacketSize = internal::packet_traits::size; - EIGEN_ALIGN16 Scalar data1[internal::packet_traits::size*4]; - EIGEN_ALIGN16 Scalar data2[internal::packet_traits::size*4]; - EIGEN_ALIGN16 Scalar ref[internal::packet_traits::size*4]; + EIGEN_ALIGN32 Scalar data1[internal::packet_traits::size*4]; + EIGEN_ALIGN32 Scalar data2[internal::packet_traits::size*4]; + EIGEN_ALIGN32 Scalar ref[internal::packet_traits::size*4]; Array::Map(data1, internal::packet_traits::size*4).setRandom(); @@ -317,10 +322,10 @@ template void packetmath_complex() const int PacketSize = internal::packet_traits::size; const int size = PacketSize*4; - EIGEN_ALIGN16 Scalar data1[PacketSize*4]; - EIGEN_ALIGN16 Scalar data2[PacketSize*4]; - EIGEN_ALIGN16 Scalar ref[PacketSize*4]; - EIGEN_ALIGN16 Scalar pval[PacketSize*4]; + EIGEN_ALIGN32 Scalar data1[PacketSize*4]; + EIGEN_ALIGN32 Scalar data2[PacketSize*4]; + EIGEN_ALIGN32 Scalar ref[PacketSize*4]; + EIGEN_ALIGN32 Scalar pval[PacketSize*4]; for (int i=0; i Date: Tue, 18 Feb 2014 18:06:44 -0800 Subject: [PATCH 008/158] Reverted the definition of the EIGEN_ALIGN to its former meaning (i.e. a boolean). Created a new EIGEN_ALIGN_BYTES define to encode how the data should be aligned. Fixed a few remaining alignment issues exposed when the Eigen code is compiled with AVX enabled. Created a new EIGEN_ALIGN_DEFAULT define, which is set to the minimum alignment value required for the chosen instruction set. Use this value instead of EIGEN_ALIGN32 to preserve the existing alignment on SSE/Altivec/Neon.
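To make the division of labor between the three macros concrete, here is a minimal sketch (not part of the patch; the buffer name is hypothetical) of how they compose after this change:

    // EIGEN_ALIGN         : boolean, is alignment enabled at all?
    // EIGEN_ALIGN_BYTES   : boundary in bytes (16 for SSE/Altivec/NEON, 32 for AVX)
    // EIGEN_ALIGN_DEFAULT : alignment attribute requesting that default boundary
    #if EIGEN_ALIGN
      EIGEN_ALIGN_DEFAULT float buffer[8];  // 32-byte aligned in an AVX build, 16-byte otherwise
    #else
      float buffer[8];                      // EIGEN_DONT_ALIGN: no special alignment
    #endif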
--- Eigen/src/Core/Block.h | 2 +- Eigen/src/Core/DenseStorage.h | 4 +-- Eigen/src/Core/GeneralProduct.h | 2 +- Eigen/src/Core/Map.h | 2 +- Eigen/src/Core/MapBase.h | 2 +- Eigen/src/Core/products/GeneralMatrixMatrix.h | 6 ++-- Eigen/src/Core/products/GeneralMatrixVector.h | 4 +-- Eigen/src/Core/util/Macros.h | 18 ++++++++--- Eigen/src/Core/util/Memory.h | 32 +++++++++---------- Eigen/src/Core/util/XprHelper.h | 2 +- test/geo_parametrizedline.cpp | 6 ++-- test/mapped_matrix.cpp | 6 ++-- test/packetmath.cpp | 28 ++++++++-------- 13 files changed, 61 insertions(+), 53 deletions(-) diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index 31cd5c72c..e948e14aa 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h @@ -83,7 +83,7 @@ struct traits > : traits::size) == 0) && (InnerStrideAtCompileTime == 1) ? PacketAccessBit : 0, - MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % 16) == 0)) ? AlignedBit : 0, + MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % EIGEN_ALIGN_BYTES) == 0)) ? AlignedBit : 0, FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? LinearAccessBit : 0, FlagsLvalueBit = is_lvalue::value ? LvalueBit : 0, FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0, diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h index 2342b08a1..7264b44c7 100644 --- a/Eigen/src/Core/DenseStorage.h +++ b/Eigen/src/Core/DenseStorage.h @@ -40,7 +40,7 @@ void check_static_allocation_size() */ template struct plain_array { @@ -81,7 +81,7 @@ struct plain_array #endif template -struct plain_array +struct plain_array { EIGEN_USER_ALIGN32 T array[Size]; diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index e3a165ac6..adda6f784 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -397,7 +397,7 @@ struct gemv_static_vector_if internal::plain_array m_data; EIGEN_STRONG_INLINE Scalar* data() { return ForceAlignment - ? reinterpret_cast((reinterpret_cast(m_data.array) & ~(size_t(15))) + 16) + ? reinterpret_cast((reinterpret_cast(m_data.array) & ~(size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES) : m_data.array; } #endif diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h index 8ea13cfb7..c75a5e95f 100644 --- a/Eigen/src/Core/Map.h +++ b/Eigen/src/Core/Map.h @@ -88,7 +88,7 @@ struct traits > && ( bool(IsDynamicSize) || HasNoOuterStride || ( OuterStrideAtCompileTime!=Dynamic - && ((static_cast(sizeof(Scalar))*OuterStrideAtCompileTime)%16)==0 ) ), + && ((static_cast(sizeof(Scalar))*OuterStrideAtCompileTime)%EIGEN_ALIGN_BYTES)==0 ) ), Flags0 = TraitsBase::Flags & (~NestByRefBit), Flags1 = IsAligned ? 
(int(Flags0) | AlignedBit) : (int(Flags0) & ~AlignedBit), Flags2 = (bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime)) diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h index ffa1371c2..a45a0b374 100644 --- a/Eigen/src/Core/MapBase.h +++ b/Eigen/src/Core/MapBase.h @@ -164,7 +164,7 @@ template class MapBase EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(internal::traits::Flags&PacketAccessBit, internal::inner_stride_at_compile_time::ret==1), PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1); - eigen_assert(EIGEN_IMPLIES(internal::traits::Flags&AlignedBit, (size_t(m_data) % 16) == 0) + eigen_assert(EIGEN_IMPLIES(internal::traits::Flags&AlignedBit, (size_t(m_data) % EIGEN_ALIGN_BYTES) == 0) && "data is not aligned"); } diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index eb399a824..3dfd239c1 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -286,9 +286,9 @@ class gemm_blocking_space(tmp0); const LhsScalar* lhs0 = lhs + i*lhsStride; // process first unaligned result's coeffs diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 787f800b8..d6d5bfa23 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -66,17 +66,22 @@ #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0 #endif +// Defined the boundary (in bytes) on which the data needs to be aligned. Note +// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be +// aligned at all regardless of the value of this #define. +#define EIGEN_ALIGN_BYTES 16 + #ifdef EIGEN_DONT_ALIGN #ifndef EIGEN_DONT_ALIGN_STATICALLY #define EIGEN_DONT_ALIGN_STATICALLY #endif #define EIGEN_ALIGN 0 -#else - #if !defined(EIGEN_DONT_VECTORIZE) && defined(__AVX__) - #define EIGEN_ALIGN 32 - #else - #define EIGEN_ALIGN 16 +#elif !defined(EIGEN_DONT_VECTORIZE) + #if defined(__AVX__) + #undef EIGEN_ALIGN_BYTES + #define EIGEN_ALIGN_BYTES 32 #endif + #define EIGEN_ALIGN 1 #endif // EIGEN_ALIGN_STATICALLY is the true test whether we want to align arrays on the stack or not. It takes into account both the user choice to explicitly disable @@ -286,15 +291,18 @@ #define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16) #define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32) +#define EIGEN_ALIGN_DEFAULT EIGEN_ALIGN_TO_BOUNDARY(EIGEN_ALIGN_BYTES) #if EIGEN_ALIGN_STATICALLY #define EIGEN_USER_ALIGN_TO_BOUNDARY(n) EIGEN_ALIGN_TO_BOUNDARY(n) #define EIGEN_USER_ALIGN16 EIGEN_ALIGN16 #define EIGEN_USER_ALIGN32 EIGEN_ALIGN32 +#define EIGEN_USER_ALIGN_DEFAULT EIGEN_ALIGN_DEFAULT #else #define EIGEN_USER_ALIGN_TO_BOUNDARY(n) #define EIGEN_USER_ALIGN16 #define EIGEN_USER_ALIGN32 +#define EIGEN_USER_ALIGN_DEFAULT #endif #ifdef EIGEN_DONT_USE_RESTRICT_KEYWORD diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 76bdb6cfc..2f2398bbf 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -32,7 +32,7 @@ // page 114, "[The] LP64 model [...] is used by all 64-bit UNIX ports" so it's indeed // quite safe, at least within the context of glibc, to equate 64-bit with LP64. #if defined(__GLIBC__) && ((__GLIBC__>=2 && __GLIBC_MINOR__ >= 8) || __GLIBC__>2) \ - && defined(__LP64__) && ! defined( __SANITIZE_ADDRESS__ ) && (EIGEN_ALIGN == 16) + && defined(__LP64__) && ! 
defined( __SANITIZE_ADDRESS__ ) && (EIGEN_ALIGN_BYTES == 16) #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 1 #else #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 0 @@ -42,14 +42,14 @@ // See http://svn.freebsd.org/viewvc/base/stable/6/lib/libc/stdlib/malloc.c?view=markup // FreeBSD 7 seems to have 16-byte aligned malloc except on ARM and MIPS architectures // See http://svn.freebsd.org/viewvc/base/stable/7/lib/libc/stdlib/malloc.c?view=markup -#if defined(__FreeBSD__) && !defined(__arm__) && !defined(__mips__) && (EIGEN_ALIGN == 16) +#if defined(__FreeBSD__) && !defined(__arm__) && !defined(__mips__) && (EIGEN_ALIGN_BYTES == 16) #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 1 #else #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 0 #endif -#if (defined(__APPLE__) && (EIGEN_ALIGN == 16)) \ - || (defined(_WIN64) && (EIGEN_ALIGN == 16)) \ +#if (defined(__APPLE__) && (EIGEN_ALIGN_BYTES == 16)) \ + || (defined(_WIN64) && (EIGEN_ALIGN_BYTES == 16)) \ || EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED \ || EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED #define EIGEN_MALLOC_ALREADY_ALIGNED 1 @@ -105,9 +105,9 @@ inline void throw_std_bad_alloc() */ inline void* handmade_aligned_malloc(std::size_t size) { - void *original = std::malloc(size+EIGEN_ALIGN); + void *original = std::malloc(size+EIGEN_ALIGN_BYTES); if (original == 0) return 0; - void *aligned = reinterpret_cast((reinterpret_cast(original) & ~(std::size_t(EIGEN_ALIGN-1))) + EIGEN_ALIGN); + void *aligned = reinterpret_cast((reinterpret_cast(original) & ~(std::size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES); *(reinterpret_cast(aligned) - 1) = original; return aligned; } @@ -128,9 +128,9 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t = if (ptr == 0) return handmade_aligned_malloc(size); void *original = *(reinterpret_cast(ptr) - 1); std::ptrdiff_t previous_offset = static_cast(ptr)-static_cast(original); - original = std::realloc(original,size+EIGEN_ALIGN); + original = std::realloc(original,size+EIGEN_ALIGN_BYTES); if (original == 0) return 0; - void *aligned = reinterpret_cast((reinterpret_cast(original) & ~(std::size_t(EIGEN_ALIGN-1))) + EIGEN_ALIGN); + void *aligned = reinterpret_cast((reinterpret_cast(original) & ~(std::size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES); void *previous_aligned = static_cast(original)+previous_offset; if(aligned!=previous_aligned) std::memmove(aligned, previous_aligned, size); @@ -221,11 +221,11 @@ inline void* aligned_malloc(size_t size) #elif EIGEN_MALLOC_ALREADY_ALIGNED result = std::malloc(size); #elif EIGEN_HAS_POSIX_MEMALIGN - if(posix_memalign(&result, EIGEN_ALIGN, size)) result = 0; + if(posix_memalign(&result, EIGEN_ALIGN_BYTES, size)) result = 0; #elif EIGEN_HAS_MM_MALLOC - result = _mm_malloc(size, EIGEN_ALIGN); + result = _mm_malloc(size, EIGEN_ALIGN_BYTES); #elif defined(_MSC_VER) && (!defined(_WIN32_WCE)) - result = _aligned_malloc(size, EIGEN_ALIGN); + result = _aligned_malloc(size, EIGEN_ALIGN_BYTES); #else result = handmade_aligned_malloc(size); #endif @@ -275,12 +275,12 @@ inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size) // implements _mm_malloc/_mm_free based on the corresponding _aligned_ // functions. This may not always be the case and we just try to be safe. 
#if defined(_MSC_VER) && defined(_mm_free) - result = _aligned_realloc(ptr,new_size,EIGEN_ALIGN); + result = _aligned_realloc(ptr,new_size,EIGEN_ALIGN_BYTES); #else result = generic_aligned_realloc(ptr,new_size,old_size); #endif #elif defined(_MSC_VER) - result = _aligned_realloc(ptr,new_size,EIGEN_ALIGN); + result = _aligned_realloc(ptr,new_size,EIGEN_ALIGN_BYTES); #else result = handmade_aligned_realloc(ptr,new_size,old_size); #endif @@ -608,8 +608,8 @@ template class aligned_stack_memory_handler */ #ifdef EIGEN_ALLOCA // The native alloca() that comes with llvm aligns buffer on 16 bytes even when AVX is enabled. - #if defined(__arm__) || EIGEN_ALIGN > 16 - #define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast((reinterpret_cast(EIGEN_ALLOCA(SIZE+EIGEN_ALIGN)) & ~(size_t(EIGEN_ALIGN-1))) + EIGEN_ALIGN) + #if defined(__arm__) || EIGEN_ALIGN_BYTES > 16 + #define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast((reinterpret_cast(EIGEN_ALLOCA(SIZE+EIGEN_ALIGN_BYTES)) & ~(size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES) #else #define EIGEN_ALIGNED_ALLOCA EIGEN_ALLOCA #endif @@ -679,7 +679,7 @@ template class aligned_stack_memory_handler #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(true) #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \ - EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%EIGEN_ALIGN==0))) + EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%EIGEN_ALIGN_BYTES==0))) /****************************************************************************/ diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 195d9e2e1..a08538aff 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -136,7 +136,7 @@ class compute_matrix_flags ((Options&DontAlign)==0) && ( #if EIGEN_ALIGN_STATICALLY - ((!is_dynamic_size_storage) && (((MaxCols*MaxRows*int(sizeof(Scalar))) % 16) == 0)) + ((!is_dynamic_size_storage) && (((MaxCols*MaxRows*int(sizeof(Scalar))) % EIGEN_ALIGN_BYTES) == 0)) #else 0 #endif diff --git a/test/geo_parametrizedline.cpp b/test/geo_parametrizedline.cpp index f0462d40a..5a72b3575 100644 --- a/test/geo_parametrizedline.cpp +++ b/test/geo_parametrizedline.cpp @@ -66,9 +66,9 @@ template void parametrizedline_alignment() typedef ParametrizedLine Line4a; typedef ParametrizedLine Line4u; - EIGEN_ALIGN16 Scalar array1[8]; - EIGEN_ALIGN16 Scalar array2[8]; - EIGEN_ALIGN16 Scalar array3[8+1]; + EIGEN_ALIGN_DEFAULT Scalar array1[8]; + EIGEN_ALIGN_DEFAULT Scalar array2[8]; + EIGEN_ALIGN_DEFAULT Scalar array3[8+1]; Scalar* array3u = array3+1; Line4a *p1 = ::new(reinterpret_cast(array1)) Line4a; diff --git a/test/mapped_matrix.cpp b/test/mapped_matrix.cpp index c18e687a5..5eba3ecb3 100644 --- a/test/mapped_matrix.cpp +++ b/test/mapped_matrix.cpp @@ -26,7 +26,7 @@ template void map_class_vector(const VectorType& m) Scalar* array1 = internal::aligned_new(size); Scalar* array2 = internal::aligned_new(size); Scalar* array3 = new Scalar[size+1]; - Scalar* array3unaligned = size_t(array3)%16 == 0 ? array3+1 : array3; + Scalar* array3unaligned = size_t(array3)%EIGEN_ALIGN_BYTES == 0 ? 
array3+1 : array3; Scalar array4[EIGEN_TESTMAP_MAX_SIZE]; Map(array1, size) = VectorType::Random(size); @@ -64,7 +64,7 @@ template void map_class_matrix(const MatrixType& m) for(int i = 0; i < size; i++) array2[i] = Scalar(1); Scalar* array3 = new Scalar[size+1]; for(int i = 0; i < size+1; i++) array3[i] = Scalar(1); - Scalar* array3unaligned = size_t(array3)%16 == 0 ? array3+1 : array3; + Scalar* array3unaligned = size_t(array3)%EIGEN_ALIGN_BYTES == 0 ? array3+1 : array3; Map(array1, rows, cols) = MatrixType::Ones(rows,cols); Map(array2, rows, cols) = Map(array1, rows, cols); Map(array3unaligned, rows, cols) = Map(array1, rows, cols); @@ -90,7 +90,7 @@ template void map_static_methods(const VectorType& m) Scalar* array1 = internal::aligned_new(size); Scalar* array2 = internal::aligned_new(size); Scalar* array3 = new Scalar[size+1]; - Scalar* array3unaligned = size_t(array3)%16 == 0 ? array3+1 : array3; + Scalar* array3unaligned = size_t(array3)%EIGEN_ALIGN_BYTES == 0 ? array3+1 : array3; VectorType::MapAligned(array1, size) = VectorType::Random(size); VectorType::Map(array2, size) = VectorType::Map(array1, size); diff --git a/test/packetmath.cpp b/test/packetmath.cpp index d7c336c22..5a680d1ee 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -106,10 +106,10 @@ template void packetmath() const int max_size = PacketSize > 4 ? PacketSize : 4; const int size = PacketSize*max_size; - EIGEN_ALIGN32 Scalar data1[size]; - EIGEN_ALIGN32 Scalar data2[size]; - EIGEN_ALIGN32 Packet packets[PacketSize*2]; - EIGEN_ALIGN32 Scalar ref[size]; + EIGEN_ALIGN_DEFAULT Scalar data1[size]; + EIGEN_ALIGN_DEFAULT Scalar data2[size]; + EIGEN_ALIGN_DEFAULT Packet packets[PacketSize*2]; + EIGEN_ALIGN_DEFAULT Scalar ref[size]; RealScalar refvalue = 0; for (int i=0; i void packetmath_real() const int PacketSize = internal::packet_traits::size; const int size = PacketSize*4; - EIGEN_ALIGN32 Scalar data1[internal::packet_traits::size*4]; - EIGEN_ALIGN32 Scalar data2[internal::packet_traits::size*4]; - EIGEN_ALIGN32 Scalar ref[internal::packet_traits::size*4]; + EIGEN_ALIGN_DEFAULT Scalar data1[internal::packet_traits::size*4]; + EIGEN_ALIGN_DEFAULT Scalar data2[internal::packet_traits::size*4]; + EIGEN_ALIGN_DEFAULT Scalar ref[internal::packet_traits::size*4]; for (int i=0; i void packetmath_notcomplex() typedef typename internal::packet_traits::type Packet; const int PacketSize = internal::packet_traits::size; - EIGEN_ALIGN32 Scalar data1[internal::packet_traits::size*4]; - EIGEN_ALIGN32 Scalar data2[internal::packet_traits::size*4]; - EIGEN_ALIGN32 Scalar ref[internal::packet_traits::size*4]; + EIGEN_ALIGN_DEFAULT Scalar data1[internal::packet_traits::size*4]; + EIGEN_ALIGN_DEFAULT Scalar data2[internal::packet_traits::size*4]; + EIGEN_ALIGN_DEFAULT Scalar ref[internal::packet_traits::size*4]; Array::Map(data1, internal::packet_traits::size*4).setRandom(); @@ -322,10 +322,10 @@ template void packetmath_complex() const int PacketSize = internal::packet_traits::size; const int size = PacketSize*4; - EIGEN_ALIGN32 Scalar data1[PacketSize*4]; - EIGEN_ALIGN32 Scalar data2[PacketSize*4]; - EIGEN_ALIGN32 Scalar ref[PacketSize*4]; - EIGEN_ALIGN32 Scalar pval[PacketSize*4]; + EIGEN_ALIGN_DEFAULT Scalar data1[PacketSize*4]; + EIGEN_ALIGN_DEFAULT Scalar data2[PacketSize*4]; + EIGEN_ALIGN_DEFAULT Scalar ref[PacketSize*4]; + EIGEN_ALIGN_DEFAULT Scalar pval[PacketSize*4]; for (int i=0; i Date: Mon, 24 Feb 2014 13:45:32 -0800 Subject: [PATCH 009/158] Added support for FMA instructions --- Eigen/Core | 3 +++ 
Eigen/src/Core/arch/AVX/PacketMath.h | 5 +++++ Eigen/src/Core/arch/SSE/PacketMath.h | 4 ++++ .../src/Core/products/GeneralBlockPanelKernel.h | 16 ++++++++++++++++ 4 files changed, 28 insertions(+) diff --git a/Eigen/Core b/Eigen/Core index bd20d5ac5..dbe68586d 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -117,6 +117,9 @@ #define EIGEN_VECTORIZE_SSE4_1 #define EIGEN_VECTORIZE_SSE4_2 #endif + #ifdef __FMA__ + #define EIGEN_VECTORIZE_FMA + #endif // include files // This extern "C" works around a MINGW-w64 compilation issue diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 244e63e74..d1a134087 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -120,6 +120,11 @@ template<> EIGEN_STRONG_INLINE Packet8i pdiv(const Packet8i& /*a*/, co return pset1(0); } +#ifdef EIGEN_VECTORIZE_FMA +template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) { return _mm256_fmadd_ps(a,b,c); } +template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) { return _mm256_fmadd_pd(a,b,c); } +#endif + template<> EIGEN_STRONG_INLINE Packet8f pmin(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d pmin(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); } diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index e913af650..a35f9ce98 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -179,6 +179,10 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& /*a*/, co // for some weird raisons, it has to be overloaded for packet of integers template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); } +#ifdef EIGEN_VECTORIZE_FMA +template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); } +template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); } +#endif template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); } diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 780fa74d3..9caa15081 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -205,7 +205,15 @@ public: EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, AccPacket& tmp) const { + // It would be a lot cleaner to call pmadd all the time. Unfortunately if we + // let gcc allocate the register in which to store the result of the pmul + // (in the case where there is no FMA) gcc fails to figure out how to avoid + // spilling register. 
+#ifdef EIGEN_VECTORIZE_FMA + c = pmadd(a,b,c); +#else tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp); +#endif } EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const @@ -281,7 +289,11 @@ public: EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const { +#ifdef EIGEN_VECTORIZE_FMA + c.v = pmadd(a.v,b,c.v); +#else tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp); +#endif } EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const @@ -486,7 +498,11 @@ public: EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const { +#ifdef EIGEN_VECTORIZE_FMA + c = pmadd(a,b,c); +#else tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp); +#endif } EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const From 847d801a4c91d3770fa06eb56772b64ed576ce1c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 12 Mar 2014 21:33:45 +0100 Subject: [PATCH 010/158] Fix bug #760: complete Eigen's lapack interface with default Lapack for SPQR if there is no fortran compiler. --- test/CMakeLists.txt | 65 +++++++++++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 26 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e1bff179d..62cbedae7 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -13,11 +13,26 @@ if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h) endforeach() endif() +# check if we have a Fortran compiler +include("../cmake/language_support.cmake") + +workaround_9220(Fortran EIGEN_Fortran_COMPILER_WORKS) + +if(EIGEN_Fortran_COMPILER_WORKS) + enable_language(Fortran OPTIONAL) + if(NOT CMAKE_Fortran_COMPILER) + set(EIGEN_Fortran_COMPILER_WORKS OFF) + endif() +endif() + +if(NOT EIGEN_Fortran_COMPILER_WORKS) + # search for a default Lapack library to complete Eigen's one + find_package(LAPACK) +endif() + # configure blas/lapack (use Eigen's ones) -set(BLAS_FOUND TRUE) -set(LAPACK_FOUND TRUE) -set(BLAS_LIBRARIES eigen_blas) -set(LAPACK_LIBRARIES eigen_lapack) +set(EIGEN_BLAS_LIBRARIES eigen_blas) +set(EIGEN_LAPACK_LIBRARIES eigen_lapack) set(EIGEN_TEST_MATRIX_DIR "" CACHE STRING "Enable testing of realword sparse matrices contained in the specified path") if(EIGEN_TEST_MATRIX_DIR) @@ -32,33 +47,33 @@ endif(EIGEN_TEST_MATRIX_DIR) set(SPARSE_LIBS " ") find_package(Cholmod) -if(CHOLMOD_FOUND AND BLAS_FOUND AND LAPACK_FOUND) +if(CHOLMOD_FOUND) add_definitions("-DEIGEN_CHOLMOD_SUPPORT") include_directories(${CHOLMOD_INCLUDES}) - set(SPARSE_LIBS ${SPARSE_LIBS} ${CHOLMOD_LIBRARIES} ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) - set(CHOLMOD_ALL_LIBS ${CHOLMOD_LIBRARIES} ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) + set(SPARSE_LIBS ${SPARSE_LIBS} ${CHOLMOD_LIBRARIES} ${EIGEN_BLAS_LIBRARIES} ${EIGEN_LAPACK_LIBRARIES}) + set(CHOLMOD_ALL_LIBS ${CHOLMOD_LIBRARIES} ${EIGEN_BLAS_LIBRARIES} ${EIGEN_LAPACK_LIBRARIES}) ei_add_property(EIGEN_TESTED_BACKENDS "Cholmod, ") else() ei_add_property(EIGEN_MISSING_BACKENDS "Cholmod, ") endif() find_package(Umfpack) -if(UMFPACK_FOUND AND BLAS_FOUND) +if(UMFPACK_FOUND) add_definitions("-DEIGEN_UMFPACK_SUPPORT") include_directories(${UMFPACK_INCLUDES}) - set(SPARSE_LIBS ${SPARSE_LIBS} ${UMFPACK_LIBRARIES} ${BLAS_LIBRARIES}) - set(UMFPACK_ALL_LIBS ${UMFPACK_LIBRARIES} ${BLAS_LIBRARIES}) + set(SPARSE_LIBS 
${SPARSE_LIBS} ${UMFPACK_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) + set(UMFPACK_ALL_LIBS ${UMFPACK_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) ei_add_property(EIGEN_TESTED_BACKENDS "UmfPack, ") else() ei_add_property(EIGEN_MISSING_BACKENDS "UmfPack, ") endif() find_package(SuperLU) -if(SUPERLU_FOUND AND BLAS_FOUND) +if(SUPERLU_FOUND) add_definitions("-DEIGEN_SUPERLU_SUPPORT") include_directories(${SUPERLU_INCLUDES}) - set(SPARSE_LIBS ${SPARSE_LIBS} ${SUPERLU_LIBRARIES} ${BLAS_LIBRARIES}) - set(SUPERLU_ALL_LIBS ${SUPERLU_LIBRARIES} ${BLAS_LIBRARIES}) + set(SPARSE_LIBS ${SPARSE_LIBS} ${SUPERLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) + set(SUPERLU_ALL_LIBS ${SUPERLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) ei_add_property(EIGEN_TESTED_BACKENDS "SuperLU, ") else() ei_add_property(EIGEN_MISSING_BACKENDS "SuperLU, ") @@ -68,7 +83,7 @@ endif() find_package(Pastix) find_package(Scotch) find_package(Metis) -if(PASTIX_FOUND AND BLAS_FOUND) +if(PASTIX_FOUND) add_definitions("-DEIGEN_PASTIX_SUPPORT") include_directories(${PASTIX_INCLUDES}) if(SCOTCH_FOUND) @@ -80,8 +95,8 @@ if(PASTIX_FOUND AND BLAS_FOUND) else(SCOTCH_FOUND) ei_add_property(EIGEN_MISSING_BACKENDS "PaStiX, ") endif(SCOTCH_FOUND) - set(SPARSE_LIBS ${SPARSE_LIBS} ${PASTIX_LIBRARIES} ${ORDERING_LIBRARIES} ${BLAS_LIBRARIES}) - set(PASTIX_ALL_LIBS ${PASTIX_LIBRARIES} ${BLAS_LIBRARIES}) + set(SPARSE_LIBS ${SPARSE_LIBS} ${PASTIX_LIBRARIES} ${ORDERING_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) + set(PASTIX_ALL_LIBS ${PASTIX_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) ei_add_property(EIGEN_TESTED_BACKENDS "PaStiX, ") else() ei_add_property(EIGEN_MISSING_BACKENDS "PaStiX, ") @@ -96,16 +111,14 @@ else() endif() find_package(SPQR) -if(SPQR_FOUND AND BLAS_FOUND AND LAPACK_FOUND) - if(CHOLMOD_FOUND) - add_definitions("-DEIGEN_SPQR_SUPPORT") - include_directories(${SPQR_INCLUDES}) - set(SPQR_ALL_LIBS ${SPQR_LIBRARIES} ${CHOLMOD_LIBRARIES} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) - set(SPARSE_LIBS ${SPARSE_LIBS} ${SPQR_ALL_LIBS}) - ei_add_property(EIGEN_TESTED_BACKENDS "SPQR, ") - else(CHOLMOD_FOUND) - ei_add_property(EIGEN_MISSING_BACKENDS "SPQR, ") - endif(CHOLMOD_FOUND) +if(SPQR_FOUND AND CHOLMOD_FOUND AND (EIGEN_Fortran_COMPILER_WORKS OR LAPACK_FOUND) ) + add_definitions("-DEIGEN_SPQR_SUPPORT") + include_directories(${SPQR_INCLUDES}) + set(SPQR_ALL_LIBS ${SPQR_LIBRARIES} ${CHOLMOD_LIBRARIES} ${EIGEN_LAPACK_LIBRARIES} ${EIGEN_BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) + set(SPARSE_LIBS ${SPARSE_LIBS} ${SPQR_ALL_LIBS}) + ei_add_property(EIGEN_TESTED_BACKENDS "SPQR, ") +else() + ei_add_property(EIGEN_MISSING_BACKENDS "SPQR, ") endif() option(EIGEN_TEST_NOQT "Disable Qt support in unit tests" OFF) From 2db792852ffe8d03b23375da75330c90f490e035 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Thu, 13 Mar 2014 12:58:57 +0100 Subject: [PATCH 011/158] Silence stupid parenthesis warnings for old GCC versions (<= 4.6.x) --- Eigen/src/SparseCore/SparseBlock.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h index 6f615e250..5b95cc33f 100644 --- a/Eigen/src/SparseCore/SparseBlock.h +++ b/Eigen/src/SparseCore/SparseBlock.h @@ -338,7 +338,10 @@ const Block SparseMatrixBase::inner namespace internal { template< typename XprType, int BlockRows, int BlockCols, bool InnerPanel, - bool OuterVector = (BlockCols==1 && XprType::IsRowMajor) || (BlockRows==1 && !XprType::IsRowMajor)> + bool OuterVector = (BlockCols==1 && XprType::IsRowMajor) + | // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest 
parentheses around &&". + // revert to || as soon as not needed anymore. + (BlockRows==1 && !XprType::IsRowMajor)> class GenericSparseBlockInnerIteratorImpl; } From bb4b67cf39b2a0aae2c912e1fbad50c1cf3b1ab6 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 13 Mar 2014 18:04:19 +0100 Subject: [PATCH 012/158] Relax Ref such that Ref accepts a RowVectorXf which can be seen as a degenerate MatrixXf(1,N) --- Eigen/src/Core/Ref.h | 10 +++-- test/ref.cpp | 104 ++++++++++++++++++++++++++----------------- 2 files changed, 69 insertions(+), 45 deletions(-) diff --git a/Eigen/src/Core/Ref.h b/Eigen/src/Core/Ref.h index 00d9e6d2b..cd6d949c4 100644 --- a/Eigen/src/Core/Ref.h +++ b/Eigen/src/Core/Ref.h @@ -101,7 +101,7 @@ struct traits > template struct match { enum { HasDirectAccess = internal::has_direct_access::ret, - StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)), + StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)), InnerStrideMatch = int(StrideType::InnerStrideAtCompileTime)==int(Dynamic) || int(StrideType::InnerStrideAtCompileTime)==int(Derived::InnerStrideAtCompileTime) || (int(StrideType::InnerStrideAtCompileTime)==0 && int(Derived::InnerStrideAtCompileTime)==1), @@ -172,8 +172,12 @@ protected: } else ::new (static_cast(this)) Base(expr.data(), expr.rows(), expr.cols()); - ::new (&m_stride) StrideBase(StrideType::OuterStrideAtCompileTime==0?0:expr.outerStride(), - StrideType::InnerStrideAtCompileTime==0?0:expr.innerStride()); + + if(Expression::IsVectorAtCompileTime && (!PlainObjectType::IsVectorAtCompileTime) && ((Expression::Flags&RowMajorBit)!=(PlainObjectType::Flags&RowMajorBit))) + ::new (&m_stride) StrideBase(expr.innerStride(), StrideType::InnerStrideAtCompileTime==0?0:1); + else + ::new (&m_stride) StrideBase(StrideType::OuterStrideAtCompileTime==0?0:expr.outerStride(), + StrideType::InnerStrideAtCompileTime==0?0:expr.innerStride()); } StrideBase m_stride; diff --git a/test/ref.cpp b/test/ref.cpp index f639d900b..19e81549c 100644 --- a/test/ref.cpp +++ b/test/ref.cpp @@ -154,59 +154,79 @@ template void check_const_correctness(const PlainObjec VERIFY( !(Ref::Flags & LvalueBit) ); } -EIGEN_DONT_INLINE void call_ref_1(Ref ) { } -EIGEN_DONT_INLINE void call_ref_2(const Ref& ) { } -EIGEN_DONT_INLINE void call_ref_3(Ref > ) { } -EIGEN_DONT_INLINE void call_ref_4(const Ref >& ) { } -EIGEN_DONT_INLINE void call_ref_5(Ref > ) { } -EIGEN_DONT_INLINE void call_ref_6(const Ref >& ) { } +template +EIGEN_DONT_INLINE void call_ref_1(Ref a, const B &b) { VERIFY_IS_EQUAL(a,b); } +template +EIGEN_DONT_INLINE void call_ref_2(const Ref& a, const B &b) { VERIFY_IS_EQUAL(a,b); } +template +EIGEN_DONT_INLINE void call_ref_3(Ref > a, const B &b) { VERIFY_IS_EQUAL(a,b); } +template +EIGEN_DONT_INLINE void call_ref_4(const Ref >& a, const B &b) { VERIFY_IS_EQUAL(a,b); } +template +EIGEN_DONT_INLINE void call_ref_5(Ref > a, const B &b) { VERIFY_IS_EQUAL(a,b); } +template +EIGEN_DONT_INLINE void call_ref_6(const Ref >& a, const B &b) { VERIFY_IS_EQUAL(a,b); } +template +EIGEN_DONT_INLINE void call_ref_7(Ref > a, const B &b) { VERIFY_IS_EQUAL(a,b); } void call_ref() { - VectorXcf ca(10); - VectorXf a(10); + VectorXcf ca = VectorXcf::Random(10); + VectorXf a = VectorXf::Random(10); + RowVectorXf b = RowVectorXf::Random(10); + MatrixXf A = MatrixXf::Random(10,10); + RowVector3f c = RowVector3f::Random(); const 
VectorXf& ac(a); VectorBlock ab(a,0,3); - MatrixXf A(10,10); const VectorBlock abc(a,0,3); + - VERIFY_EVALUATION_COUNT( call_ref_1(a), 0); - //call_ref_1(ac); // does not compile because ac is const - VERIFY_EVALUATION_COUNT( call_ref_1(ab), 0); - VERIFY_EVALUATION_COUNT( call_ref_1(a.head(4)), 0); - VERIFY_EVALUATION_COUNT( call_ref_1(abc), 0); - VERIFY_EVALUATION_COUNT( call_ref_1(A.col(3)), 0); - // call_ref_1(A.row(3)); // does not compile because innerstride!=1 - VERIFY_EVALUATION_COUNT( call_ref_3(A.row(3)), 0); - VERIFY_EVALUATION_COUNT( call_ref_4(A.row(3)), 0); - //call_ref_1(a+a); // does not compile for obvious reason + VERIFY_EVALUATION_COUNT( call_ref_1(a,a), 0); + VERIFY_EVALUATION_COUNT( call_ref_1(b,b.transpose()), 0); +// call_ref_1(ac); // does not compile because ac is const + VERIFY_EVALUATION_COUNT( call_ref_1(ab,ab), 0); + VERIFY_EVALUATION_COUNT( call_ref_1(a.head(4),a.head(4)), 0); + VERIFY_EVALUATION_COUNT( call_ref_1(abc,abc), 0); + VERIFY_EVALUATION_COUNT( call_ref_1(A.col(3),A.col(3)), 0); +// call_ref_1(A.row(3)); // does not compile because innerstride!=1 + VERIFY_EVALUATION_COUNT( call_ref_3(A.row(3),A.row(3).transpose()), 0); + VERIFY_EVALUATION_COUNT( call_ref_4(A.row(3),A.row(3).transpose()), 0); +// call_ref_1(a+a); // does not compile for obvious reason - VERIFY_EVALUATION_COUNT( call_ref_2(A*A.col(1)), 1); // evaluated into a temp - VERIFY_EVALUATION_COUNT( call_ref_2(ac.head(5)), 0); - VERIFY_EVALUATION_COUNT( call_ref_2(ac), 0); - VERIFY_EVALUATION_COUNT( call_ref_2(a), 0); - VERIFY_EVALUATION_COUNT( call_ref_2(ab), 0); - VERIFY_EVALUATION_COUNT( call_ref_2(a.head(4)), 0); - VERIFY_EVALUATION_COUNT( call_ref_2(a+a), 1); // evaluated into a temp - VERIFY_EVALUATION_COUNT( call_ref_2(ca.imag()), 1); // evaluated into a temp + MatrixXf tmp = A*A.col(1); + VERIFY_EVALUATION_COUNT( call_ref_2(A*A.col(1), tmp), 1); // evaluated into a temp + VERIFY_EVALUATION_COUNT( call_ref_2(ac.head(5),ac.head(5)), 0); + VERIFY_EVALUATION_COUNT( call_ref_2(ac,ac), 0); + VERIFY_EVALUATION_COUNT( call_ref_2(a,a), 0); + VERIFY_EVALUATION_COUNT( call_ref_2(ab,ab), 0); + VERIFY_EVALUATION_COUNT( call_ref_2(a.head(4),a.head(4)), 0); + tmp = a+a; + VERIFY_EVALUATION_COUNT( call_ref_2(a+a,tmp), 1); // evaluated into a temp + VERIFY_EVALUATION_COUNT( call_ref_2(ca.imag(),ca.imag()), 1); // evaluated into a temp - VERIFY_EVALUATION_COUNT( call_ref_4(ac.head(5)), 0); - VERIFY_EVALUATION_COUNT( call_ref_4(a+a), 1); // evaluated into a temp - VERIFY_EVALUATION_COUNT( call_ref_4(ca.imag()), 0); + VERIFY_EVALUATION_COUNT( call_ref_4(ac.head(5),ac.head(5)), 0); + tmp = a+a; + VERIFY_EVALUATION_COUNT( call_ref_4(a+a,tmp), 1); // evaluated into a temp + VERIFY_EVALUATION_COUNT( call_ref_4(ca.imag(),ca.imag()), 0); - VERIFY_EVALUATION_COUNT( call_ref_5(a), 0); - VERIFY_EVALUATION_COUNT( call_ref_5(a.head(3)), 0); - VERIFY_EVALUATION_COUNT( call_ref_5(A), 0); - // call_ref_5(A.transpose()); // does not compile - VERIFY_EVALUATION_COUNT( call_ref_5(A.block(1,1,2,2)), 0); + VERIFY_EVALUATION_COUNT( call_ref_5(a,a), 0); + VERIFY_EVALUATION_COUNT( call_ref_5(a.head(3),a.head(3)), 0); + VERIFY_EVALUATION_COUNT( call_ref_5(A,A), 0); +// call_ref_5(A.transpose()); // does not compile + VERIFY_EVALUATION_COUNT( call_ref_5(A.block(1,1,2,2),A.block(1,1,2,2)), 0); + VERIFY_EVALUATION_COUNT( call_ref_5(b,b), 0); // storage order do not match, but this is a degenerate case that should work + VERIFY_EVALUATION_COUNT( call_ref_5(a.row(3),a.row(3)), 0); - VERIFY_EVALUATION_COUNT( call_ref_6(a), 0); - 
VERIFY_EVALUATION_COUNT( call_ref_6(a.head(3)), 0); - VERIFY_EVALUATION_COUNT( call_ref_6(A.row(3)), 1); // evaluated into a temp thouth it could be avoided by viewing it as a 1xn matrix - VERIFY_EVALUATION_COUNT( call_ref_6(A+A), 1); // evaluated into a temp - VERIFY_EVALUATION_COUNT( call_ref_6(A), 0); - VERIFY_EVALUATION_COUNT( call_ref_6(A.transpose()), 1); // evaluated into a temp because the storage orders do not match - VERIFY_EVALUATION_COUNT( call_ref_6(A.block(1,1,2,2)), 0); + VERIFY_EVALUATION_COUNT( call_ref_6(a,a), 0); + VERIFY_EVALUATION_COUNT( call_ref_6(a.head(3),a.head(3)), 0); + VERIFY_EVALUATION_COUNT( call_ref_6(A.row(3),A.row(3)), 1); // evaluated into a temp thouth it could be avoided by viewing it as a 1xn matrix + tmp = A+A; + VERIFY_EVALUATION_COUNT( call_ref_6(A+A,tmp), 1); // evaluated into a temp + VERIFY_EVALUATION_COUNT( call_ref_6(A,A), 0); + VERIFY_EVALUATION_COUNT( call_ref_6(A.transpose(),A.transpose()), 1); // evaluated into a temp because the storage orders do not match + VERIFY_EVALUATION_COUNT( call_ref_6(A.block(1,1,2,2),A.block(1,1,2,2)), 0); + + VERIFY_EVALUATION_COUNT( call_ref_7(c,c), 0); } void test_ref() From 35a2c9cde7e26ef958883297cefd44db57b1a0bb Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Fri, 14 Mar 2014 16:48:29 +0100 Subject: [PATCH 013/158] clang does not accept this without template keyword --- Eigen/src/Core/AssignEvaluator.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index 5b5d29ca9..5451a138f 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -574,13 +574,13 @@ public: template void assignPacket(Index row, Index col) { - m_functor.assignPacket(&m_dst.coeffRef(row,col), m_src.template packet(row,col)); + m_functor.template assignPacket(&m_dst.coeffRef(row,col), m_src.template packet(row,col)); } template void assignPacket(Index index) { - m_functor.assignPacket(&m_dst.coeffRef(index), m_src.template packet(index)); + m_functor.template assignPacket(&m_dst.coeffRef(index), m_src.template packet(index)); } template From 4fe56a0e0207e69e652dded5263bd0066fc0e4e8 Mon Sep 17 00:00:00 2001 From: Bo Li Date: Sat, 15 Mar 2014 08:42:20 +0800 Subject: [PATCH 014/158] fix Spline constructor --- unsupported/Eigen/src/Splines/Spline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/src/Splines/Spline.h b/unsupported/Eigen/src/Splines/Spline.h index 771f10432..1b47992d6 100644 --- a/unsupported/Eigen/src/Splines/Spline.h +++ b/unsupported/Eigen/src/Splines/Spline.h @@ -57,7 +57,7 @@ namespace Eigen **/ Spline() : m_knots(1, (Degree==Dynamic ? 2 : 2*Degree+2)) - , m_ctrls(ControlPointVectorType::Zero(2,(Degree==Dynamic ? 1 : Degree+1))) + , m_ctrls(ControlPointVectorType::Zero(Dimension,(Degree==Dynamic ? 1 : Degree+1))) { // in theory this code can go to the initializer list but it will get pretty // much unreadable ... From 3e42b775ead02a2b389840b1b8fd7c28121fb387 Mon Sep 17 00:00:00 2001 From: giacomo po Date: Mon, 17 Mar 2014 16:33:52 -0700 Subject: [PATCH 015/158] MINRES, bug #715: add support for zero rhs, and remove square test. 
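A minimal sketch of the new zero-rhs path (matrix size and values are illustrative; the check mirrors the early return added below): with b = 0 the solver now returns x = 0 after zero iterations instead of dividing by the zero norm of b.

    #include <Eigen/SparseCore>
    #include <unsupported/Eigen/IterativeSolvers>

    int main()
    {
      Eigen::SparseMatrix<double> A(3,3);
      A.setIdentity();                               // trivially SPD
      Eigen::VectorXd b = Eigen::VectorXd::Zero(3);  // zero right-hand side
      Eigen::MINRES<Eigen::SparseMatrix<double>, Eigen::Lower> minres(A);
      Eigen::VectorXd x = minres.solve(b);
      // Expected per the early return: x == 0 and minres.iterations() == 0.
      return (x.isZero() && minres.iterations() == 0) ? 0 : 1;
    }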
--- .../Eigen/src/IterativeSolvers/MINRES.h | 49 ++++++++++++------- unsupported/test/minres.cpp | 30 ++++++++---- 2 files changed, 52 insertions(+), 27 deletions(-) diff --git a/unsupported/Eigen/src/IterativeSolvers/MINRES.h b/unsupported/Eigen/src/IterativeSolvers/MINRES.h index 0e56342a8..98f9ecc17 100644 --- a/unsupported/Eigen/src/IterativeSolvers/MINRES.h +++ b/unsupported/Eigen/src/IterativeSolvers/MINRES.h @@ -37,22 +37,31 @@ namespace Eigen { typedef typename Dest::Scalar Scalar; typedef Matrix VectorType; + // Check for zero rhs + const RealScalar rhsNorm2(rhs.squaredNorm()); + if(rhsNorm2 == 0) + { + x.setZero(); + iters = 0; + tol_error = 0; + return; + } + // initialize const int maxIters(iters); // initialize maxIters to iters const int N(mat.cols()); // the size of the matrix - const RealScalar rhsNorm2(rhs.squaredNorm()); const RealScalar threshold2(tol_error*tol_error*rhsNorm2); // convergence threshold (compared to residualNorm2) // Initialize preconditioned Lanczos -// VectorType v_old(N); // will be initialized inside loop + VectorType v_old(N); // will be initialized inside loop VectorType v( VectorType::Zero(N) ); //initialize v VectorType v_new(rhs-mat*x); //initialize v_new RealScalar residualNorm2(v_new.squaredNorm()); -// VectorType w(N); // will be initialized inside loop + VectorType w(N); // will be initialized inside loop VectorType w_new(precond.solve(v_new)); // initialize w_new // RealScalar beta; // will be initialized inside loop RealScalar beta_new2(v_new.dot(w_new)); - eigen_assert(beta_new2 >= 0 && "PRECONDITIONER IS NOT POSITIVE DEFINITE"); + eigen_assert(beta_new2 >= 0.0 && "PRECONDITIONER IS NOT POSITIVE DEFINITE"); RealScalar beta_new(sqrt(beta_new2)); const RealScalar beta_one(beta_new); v_new /= beta_new; @@ -62,14 +71,14 @@ namespace Eigen { RealScalar c_old(1.0); RealScalar s(0.0); // the sine of the Givens rotation RealScalar s_old(0.0); // the sine of the Givens rotation -// VectorType p_oold(N); // will be initialized in loop + VectorType p_oold(N); // will be initialized in loop VectorType p_old(VectorType::Zero(N)); // initialize p_old=0 VectorType p(p_old); // initialize p=0 RealScalar eta(1.0); iters = 0; // reset iters - while ( iters < maxIters ){ - + while ( iters < maxIters ) + { // Preconditioned Lanczos /* Note that there are 4 variants on the Lanczos algorithm. These are * described in Paige, C. C. (1972). Computational variants of @@ -81,17 +90,17 @@ namespace Eigen { * A. Greenbaum, Iterative Methods for Solving Linear Systems, SIAM (1987). 
*/ const RealScalar beta(beta_new); -// v_old = v; // update: at first time step, this makes v_old = 0 so value of beta doesn't matter - const VectorType v_old(v); // NOT SURE IF CREATING v_old EVERY ITERATION IS EFFICIENT + v_old = v; // update: at first time step, this makes v_old = 0 so value of beta doesn't matter +// const VectorType v_old(v); // NOT SURE IF CREATING v_old EVERY ITERATION IS EFFICIENT v = v_new; // update -// w = w_new; // update - const VectorType w(w_new); // NOT SURE IF CREATING w EVERY ITERATION IS EFFICIENT + w = w_new; // update +// const VectorType w(w_new); // NOT SURE IF CREATING w EVERY ITERATION IS EFFICIENT v_new.noalias() = mat*w - beta*v_old; // compute v_new const RealScalar alpha = v_new.dot(w); v_new -= alpha*v; // overwrite v_new w_new = precond.solve(v_new); // overwrite w_new beta_new2 = v_new.dot(w_new); // compute beta_new - eigen_assert(beta_new2 >= 0 && "PRECONDITIONER IS NOT POSITIVE DEFINITE"); + eigen_assert(beta_new2 >= 0.0 && "PRECONDITIONER IS NOT POSITIVE DEFINITE"); beta_new = sqrt(beta_new2); // compute beta_new v_new /= beta_new; // overwrite v_new for next iteration w_new /= beta_new; // overwrite w_new for next iteration @@ -107,28 +116,34 @@ namespace Eigen { s=beta_new/r1; // new sine // Update solution -// p_oold = p_old; - const VectorType p_oold(p_old); // NOT SURE IF CREATING p_oold EVERY ITERATION IS EFFICIENT + p_oold = p_old; +// const VectorType p_oold(p_old); // NOT SURE IF CREATING p_oold EVERY ITERATION IS EFFICIENT p_old = p; p.noalias()=(w-r2*p_old-r3*p_oold) /r1; // IS NOALIAS REQUIRED? x += beta_one*c*eta*p; + + /* Update the squared residual. Note that this is the estimated residual. + The real residual |Ax-b|^2 may be slightly larger */ residualNorm2 *= s*s; - if ( residualNorm2 < threshold2){ + if ( residualNorm2 < threshold2) + { break; } eta=-s*eta; // update eta iters++; // increment iteration number (for output purposes) } - tol_error = std::sqrt(residualNorm2 / rhsNorm2); // return error. Note that this is the estimated error. The real error |Ax-b|/|b| may be slightly larger + + /* Compute error. Note that this is the estimated error. The real + error |Ax-b|/|b| may be slightly larger */ + tol_error = std::sqrt(residualNorm2 / rhsNorm2); } } template< typename _MatrixType, int _UpLo=Lower, typename _Preconditioner = IdentityPreconditioner> -// typename _Preconditioner = IdentityPreconditioner > // preconditioner must be positive definite class MINRES; namespace internal { diff --git a/unsupported/test/minres.cpp b/unsupported/test/minres.cpp index fd12da548..81b762c37 100644 --- a/unsupported/test/minres.cpp +++ b/unsupported/test/minres.cpp @@ -1,8 +1,8 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2011 Gael Guennebaud // Copyright (C) 2012 Giacomo Po +// Copyright (C) 2011 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. 
If a copy of the MPL was not distributed @@ -14,19 +14,29 @@ template void test_minres_T() { - MINRES, Lower, DiagonalPreconditioner > minres_colmajor_diag; - MINRES, Lower, IdentityPreconditioner > minres_colmajor_I; -// MINRES, Lower, IncompleteLUT > minres_colmajor_ilut; - //minres, SSORPreconditioner > minres_colmajor_ssor; + // Identity preconditioner + MINRES, Lower, IdentityPreconditioner > minres_colmajor_lower_I; + MINRES, Upper, IdentityPreconditioner > minres_colmajor_upper_I; + + // Diagonal preconditioner + MINRES, Lower, DiagonalPreconditioner > minres_colmajor_lower_diag; + MINRES, Upper, DiagonalPreconditioner > minres_colmajor_upper_diag; + + // call tests for SPD matrix + CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_lower_I) ); + CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_upper_I) ); + + CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_lower_diag) ); + CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_upper_diag) ); + + // TO DO: symmetric semi-definite matrix + // TO DO: symmetric indefinite matrix - CALL_SUBTEST( check_sparse_square_solving(minres_colmajor_diag) ); - CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_I) ); - // CALL_SUBTEST( check_sparse_square_solving(minres_colmajor_ilut) ); - //CALL_SUBTEST( check_sparse_square_solving(minres_colmajor_ssor) ); } void test_minres() { CALL_SUBTEST_1(test_minres_T()); -// CALL_SUBTEST_2(test_minres_T >()); +// CALL_SUBTEST_2(test_minres_T >()); + } From 72707a86641ad0bd4c4e5cc45c4b8ced64b499ef Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 21 Mar 2014 11:40:29 -0700 Subject: [PATCH 016/158] Made sure that EIGEN_ALIGN is defined when EIGEN_DONT_VECTORIZE is set to true to prevent build failures when vectorization is disabled. --- Eigen/src/Core/util/Macros.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 733d3403e..bfd6ba7de 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -82,6 +82,8 @@ #define EIGEN_ALIGN_BYTES 32 #endif #define EIGEN_ALIGN 1 +#else + #define EIGEN_ALIGN 0 #endif // EIGEN_ALIGN_STATICALLY is the true test whether we want to align arrays on the stack or not. It takes into account both the user choice to explicitly disable From 08f7b3221d58e480d8ede105bd70b09a2104c5fb Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 24 Mar 2014 09:52:45 -0700 Subject: [PATCH 017/158] Added proper support for AVX and FMA in the makefiles. 
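For reference, an illustrative configure line using the two new switches (the source path is a placeholder): running

    cmake -DEIGEN_TEST_AVX=ON -DEIGEN_TEST_FMA=ON /path/to/eigen

appends -mavx and -mfma to CMAKE_CXX_FLAGS for the tests and examples. Since FMA instructions are VEX-encoded they presuppose AVX support in the target, so the two options are typically enabled together.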
---
 CMakeLists.txt           |  8 +++++++-
 cmake/EigenTesting.cmake | 10 ++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 16a9b5bcb..fb13769f4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -196,12 +196,18 @@ if(NOT MSVC)
     message(STATUS "Enabling SSE4.2 in tests/examples")
   endif()

-  option(EIGEN_TEST_AVX "Enable/Disable AVX in tests/examples" ON)
+  option(EIGEN_TEST_AVX "Enable/Disable AVX in tests/examples" OFF)
   if(EIGEN_TEST_AVX)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx")
     message(STATUS "Enabling AVX in tests/examples")
   endif()

+  option(EIGEN_TEST_FMA "Enable/Disable FMA in tests/examples" OFF)
+  if(EIGEN_TEST_FMA)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfma")
+    message(STATUS "Enabling FMA in tests/examples")
+  endif()
+
   option(EIGEN_TEST_ALTIVEC "Enable/Disable AltiVec in tests/examples" OFF)
   if(EIGEN_TEST_ALTIVEC)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec -mabi=altivec")
diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake
index d1ea4b9d2..9b9776894 100644
--- a/cmake/EigenTesting.cmake
+++ b/cmake/EigenTesting.cmake
@@ -270,6 +270,12 @@ macro(ei_testing_print_summary)
       message(STATUS "AVX:               Using architecture defaults")
     endif()

+    if(EIGEN_TEST_FMA)
+      message(STATUS "FMA:               ON")
+    else()
+      message(STATUS "FMA:               Using architecture defaults")
+    endif()
+
     if(EIGEN_TEST_ALTIVEC)
       message(STATUS "Altivec:           ON")
     else()
@@ -414,6 +420,10 @@ macro(ei_get_cxxflags VAR)
     set(${VAR} NEON)
   elseif(EIGEN_TEST_ALTIVEC)
     set(${VAR} ALVEC)
+  elseif(EIGEN_TEST_FMA)
+    set(${VAR} FMA)
+  elseif(EIGEN_TEST_AVX)
+    set(${VAR} AVX)
   elseif(EIGEN_TEST_SSE4_2)
     set(${VAR} SSE42)
   elseif(EIGEN_TEST_SSE4_1)

From 7ae9b0805dbb218506a462c06263bd67f046366b Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Mon, 24 Mar 2014 13:33:40 -0700
Subject: [PATCH 018/158] Used AVX instructions to vectorize the
 predux_min<Packet8f>, predux_min<Packet4d>, predux_max<Packet8f>, and
 predux_max<Packet4d> packet primitives.
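The new implementations replace the element-by-element loops with a logarithmic reduction
tree: fold the two 128-bit halves onto each other, then fold within a lane. The same idea in
standalone form for the float minimum, assuming only AVX intrinsics (hmin8 is an illustrative
name, not an Eigen primitive):

  #include <immintrin.h>

  // Horizontal minimum of 8 floats: 3 vector min operations
  // instead of 7 scalar comparisons.
  static inline float hmin8(__m256 a)
  {
    __m256 t = _mm256_min_ps(a, _mm256_permute2f128_ps(a, a, 1));        // fold upper 128-bit half onto lower
    t = _mm256_min_ps(t, _mm256_shuffle_ps(t, t, _MM_SHUFFLE(1,0,3,2))); // fold element pairs
    t = _mm256_min_ps(t, _mm256_shuffle_ps(t, t, _MM_SHUFFLE(2,3,0,1))); // fold the last pair
    return _mm_cvtss_f32(_mm256_castps256_ps128(t));                     // extract element 0
  }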
--- Eigen/src/Core/arch/AVX/PacketMath.h | 30 ++++++++++------------------ 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index d1a134087..26cc996db 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -304,37 +304,27 @@ template<> EIGEN_STRONG_INLINE double predux_mul(const Packet4d& a) template<> EIGEN_STRONG_INLINE float predux_min(const Packet8f& a) { - float result = a[0]; - for (int i = 1; i < 8; ++i) { - if (a[i] < result) result = a[i]; - } - return result; + Packet8f tmp = _mm256_min_ps(a, _mm256_permute2f128_ps(a,a,1)); + tmp = _mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2))); + return pfirst(_mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1))); } template<> EIGEN_STRONG_INLINE double predux_min(const Packet4d& a) { - double result = a[0]; - for (int i = 1; i < 4; ++i) { - if (a[i] < result) result = a[i]; - } - return result; + Packet4d tmp = _mm256_min_pd(a, _mm256_permute2f128_pd(a,a,1)); + return pfirst(_mm256_min_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1))); } template<> EIGEN_STRONG_INLINE float predux_max(const Packet8f& a) { - float result = a[0]; - for (int i = 1; i < 8; ++i) { - if (a[i] > result) result = a[i]; - } - return result; + Packet8f tmp = _mm256_max_ps(a, _mm256_permute2f128_ps(a,a,1)); + tmp = _mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2))); + return pfirst(_mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1))); } template<> EIGEN_STRONG_INLINE double predux_max(const Packet4d& a) { - double result = a[0]; - for (int i = 1; i < 4; ++i) { - if (a[i] > result) result = a[i]; - } - return result; + Packet4d tmp = _mm256_max_pd(a, _mm256_permute2f128_pd(a,a,1)); + return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1))); } From 6bf3cc2732eebff73dd7fadcd8ac421f22381baf Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 25 Mar 2014 09:00:43 -0700 Subject: [PATCH 019/158] Use AVX instructions to vectorize pset1, pset1, preverse, and preverse --- Eigen/src/Core/arch/AVX/Complex.h | 40 +++++++++++++------------------ 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 9fb44ecab..17c32d79c 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -76,11 +76,9 @@ template<> EIGEN_STRONG_INLINE Packet4cf ploadu(const std::complex EIGEN_STRONG_INLINE Packet4cf pset1(const std::complex& from) { - __m256 result; - for (int i = 0; i < 8; i+=2) { - result[i] = std::real(from); - result[i+1] = std::imag(from); - } + const float r = std::real(from); + const float i = std::imag(from); + const __m256 result = _mm256_set_ps(i, r, i, r, i, r, i, r); return Packet4cf(result); } @@ -108,15 +106,15 @@ template<> EIGEN_STRONG_INLINE std::complex pfirst(const Pack } template<> EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) { + __m128 low = _mm256_extractf128_ps(a.v, 0); + __m128 high = _mm256_extractf128_ps(a.v, 1); + __m128d lowd = _mm_castps_pd(low); + __m128d highd = _mm_castps_pd(high); + low = _mm_castpd_ps(_mm_shuffle_pd(lowd,lowd,0x1)); + high = _mm_castpd_ps(_mm_shuffle_pd(highd,highd,0x1)); __m256 result; - result[0] = a.v[6]; - result[1] = a.v[7]; - result[2] = a.v[4]; - result[3] = a.v[5]; - result[4] = a.v[2]; - result[5] = a.v[3]; - result[6] = a.v[0]; - result[7] = a.v[1]; + result = _mm256_insertf128_ps(result, low, 1); + result = _mm256_insertf128_ps(result, high, 0); 
return Packet4cf(result); } @@ -298,13 +296,11 @@ template<> EIGEN_STRONG_INLINE Packet2cd pload (const std::complex EIGEN_STRONG_INLINE Packet2cd ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cd(ploadu((const double*)from)); } -template<> EIGEN_STRONG_INLINE Packet2cd pset1(const std::complex& from) +template<> EIGEN_STRONG_INLINE Packet2cd pset1(const std::complex& from) { - __m256d result; - for (int i = 0; i < 4; i+=2) { - result[i] = std::real(from); - result[i+1] = std::imag(from); - } + const double r = std::real(from); + const double i = std::imag(from); + const __m256d result = _mm256_set_pd(i, r, i, r); return Packet2cd(result); } @@ -321,11 +317,7 @@ template<> EIGEN_STRONG_INLINE std::complex pfirst(const Pac } template<> EIGEN_STRONG_INLINE Packet2cd preverse(const Packet2cd& a) { - __m256d result; - result[0] = a.v[2]; - result[1] = a.v[3]; - result[2] = a.v[0]; - result[3] = a.v[1]; + __m256d result = _mm256_permute2f128_pd(a.v, a.v, 1); return Packet2cd(result); } From b286a1e75c6bd451c27da8c5ebed0b0fb86dfc2a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 26 Mar 2014 16:46:36 +0100 Subject: [PATCH 020/158] add pbroadcast2/4 generic intrinsics --- Eigen/src/Core/GenericPacketMath.h | 34 ++++++++++++++++++++++++++++ Eigen/src/Core/arch/SSE/PacketMath.h | 32 ++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 538ab53b2..d07541285 100755 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -173,6 +173,40 @@ pset1(const typename unpacket_traits::type& a) { return a; } template EIGEN_DEVICE_FUNC inline Packet pload1(const typename unpacket_traits::type *a) { return pset1(*a); } +/** \internal equivalent to + * \code + * a0 = pload1(a+0); + * a1 = pload1(a+1); + * a2 = pload1(a+2); + * a3 = pload1(a+3); + * \endcode + * \sa pset1, pload1, ploaddup, pbroadcast2 + */ +template EIGEN_DEVICE_FUNC +inline void pbroadcast4(const typename unpacket_traits::type *a, + Packet& a0, Packet& a1, Packet& a2, Packet& a3) +{ + a0 = pload1(a+0); + a1 = pload1(a+1); + a2 = pload1(a+2); + a3 = pload1(a+3); +} + +/** \internal equivalent to + * \code + * a0 = pload1(a+0); + * a1 = pload1(a+1); + * \endcode + * \sa pset1, pload1, ploaddup, pbroadcast4 + */ +template EIGEN_DEVICE_FUNC +inline void pbroadcast2(const typename unpacket_traits::type *a, + Packet& a0, Packet& a1) +{ + a0 = pload1(a+0); + a1 = pload1(a+1); +} + /** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). 
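pbroadcast4/pbroadcast2 load N consecutive scalars and replicate each one across its own
register; the SSE specializations do this with a single 128-bit load followed by swizzles
rather than four one-element broadcasts. The same trick in standalone form, assuming raw SSE
intrinsics (broadcast4 is an illustrative name):

  #include <xmmintrin.h>

  // Replicate a[0], a[1], a[2], a[3] into four registers from one load.
  static inline void broadcast4(const float* a,
                                __m128& a0, __m128& a1, __m128& a2, __m128& a3)
  {
    const __m128 v = _mm_loadu_ps(a); // v = [a0 a1 a2 a3]
    a0 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0,0,0,0));
    a1 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1,1,1,1));
    a2 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2,2,2,2));
    a3 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3,3,3,3));
  }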
*/ template inline typename packet_traits::type plset(const Scalar& a) { return a; } diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 293fb83e4..9f81a4623 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -391,6 +391,38 @@ template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) #endif } +// with AVX, the default implementations based on pload1 are faster +#ifndef __AVX__ +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const float *a, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) +{ + a3 = pload(a); + a0 = vec4f_swizzle1(a3, 0,0,0,0); + a1 = vec4f_swizzle1(a3, 1,1,1,1); + a2 = vec4f_swizzle1(a3, 2,2,2,2); + a3 = vec4f_swizzle1(a3, 3,3,3,3); +} +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const double *a, + Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) +{ +#ifdef EIGEN_VECTORIZE_SSE3 + a0 = _mm_loaddup_pd(a+0); + a1 = _mm_loaddup_pd(a+1); + a2 = _mm_loaddup_pd(a+2); + a3 = _mm_loaddup_pd(a+3); +#else + a1 = pload(a); + a0 = vec2d_swizzle1(a1, 0,0); + a1 = vec2d_swizzle1(a1, 1,1); + a3 = pload(a+2); + a2 = vec2d_swizzle1(a3, 0,0); + a3 = vec2d_swizzle1(a3, 1,1); +#endif +} +#endif + EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs) { vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55)); From bc401eb6fa9c4c14c7fb32acfe70b304c1850283 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 26 Mar 2014 18:53:00 +0100 Subject: [PATCH 021/158] Implement new 1 packet x 8 gebp kernel --- Eigen/src/Core/arch/SSE/PacketMath.h | 6 +- .../Core/products/GeneralBlockPanelKernel.h | 679 +++++++----------- .../Core/products/SelfadjointMatrixMatrix.h | 48 +- 3 files changed, 297 insertions(+), 436 deletions(-) diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 9f81a4623..b1f7b7717 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -397,7 +397,7 @@ template<> EIGEN_STRONG_INLINE void pbroadcast4(const float *a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) { - a3 = pload(a); + a3 = ploadu(a); a0 = vec4f_swizzle1(a3, 0,0,0,0); a1 = vec4f_swizzle1(a3, 1,1,1,1); a2 = vec4f_swizzle1(a3, 2,2,2,2); @@ -413,10 +413,10 @@ pbroadcast4(const double *a, a2 = _mm_loaddup_pd(a+2); a3 = _mm_loaddup_pd(a+3); #else - a1 = pload(a); + a1 = ploadu(a); a0 = vec2d_swizzle1(a1, 0,0); a1 = vec2d_swizzle1(a1, 1,1); - a3 = pload(a+2); + a3 = ploadu(a+2); a2 = vec2d_swizzle1(a3, 0,0); a3 = vec2d_swizzle1(a3, 1,1); #endif diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index ba6fad246..8a398d912 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -161,11 +161,11 @@ public: NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, - // register block size along the N direction (must be either 2 or 4) - nr = NumberOfRegisters/4, + // register block size along the N direction (must be either 4 or 8) + nr = NumberOfRegisters/2, // register block size along the M direction (currently, this one cannot be modified) - mr = 2 * LhsPacketSize, + mr = LhsPacketSize, WorkSpaceFactor = nr * RhsPacketSize, @@ -187,6 +187,16 @@ public: { p = pset1(ResScalar(0)); } + + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + { + pbroadcast4(b, b0, b1, b2, b3); + } + + EIGEN_STRONG_INLINE void 
broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) + { + pbroadcast2(b, b0, b1); + } EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { @@ -230,8 +240,8 @@ public: ResPacketSize = Vectorizable ? packet_traits::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, - nr = NumberOfRegisters/4, - mr = 2 * LhsPacketSize, + nr = NumberOfRegisters/2, + mr = LhsPacketSize, WorkSpaceFactor = nr*RhsPacketSize, LhsProgress = LhsPacketSize, @@ -262,6 +272,16 @@ public: { dest = pload(a); } + + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + { + pbroadcast4(b, b0, b1, b2, b3); + } + + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) + { + pbroadcast2(b, b0, b1); + } EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const { @@ -304,8 +324,9 @@ public: RealPacketSize = Vectorizable ? packet_traits::size : 1, ResPacketSize = Vectorizable ? packet_traits::size : 1, - nr = 2, - mr = 2 * ResPacketSize, + // FIXME: should depend on NumberOfRegisters + nr = 4, + mr = ResPacketSize, WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr, LhsProgress = ResPacketSize, @@ -333,16 +354,37 @@ public: p.second = pset1(RealScalar(0)); } + // Scalar path EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { dest = pset1(*b); } + // Vectorized path EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const { dest.first = pset1(real(*b)); dest.second = pset1(imag(*b)); } + + // linking error if instantiated without being optimized out: + void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3); + + // Vectorized path + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacket& b0, DoublePacket& b1) + { + // FIXME not sure that's the best way to implement it! + loadRhs(b+0, b0); + loadRhs(b+1, b1); + } + + // Scalar path + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1) + { + // FIXME not sure that's the best way to implement it! + loadRhs(b+0, b0); + loadRhs(b+1, b1); + } // nothing special here EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const @@ -414,8 +456,9 @@ public: ResPacketSize = Vectorizable ? packet_traits::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, + // FIXME: should depend on NumberOfRegisters nr = 4, - mr = 2*ResPacketSize, + mr = ResPacketSize, WorkSpaceFactor = nr*RhsPacketSize, LhsProgress = ResPacketSize, @@ -441,6 +484,16 @@ public: { dest = pset1(*b); } + + // linking error if instantiated without being optimized out: + void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3); + + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) + { + // FIXME not sure that's the best way to implement it! + b0 = pload1(b+0); + b1 = pload1(b+1); + } EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { @@ -511,11 +564,9 @@ void gebp_kernel if(strideA==-1) strideA = depth; if(strideB==-1) strideB = depth; conj_helper cj; -// conj_helper pcj; Index packet_cols = (cols/nr) * nr; + // Here we assume that mr==LhsProgress const Index peeled_mc = (rows/mr)*mr; - // FIXME: - const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsProgress ? 
LhsProgress : 0); const Index peeled_kc = (depth/4)*4; // loops on each micro vertical panel of rhs (depth x nr) @@ -527,144 +578,88 @@ void gebp_kernel for(Index i=0; i(alpha); - R0 = ploadu(r0); - R1 = ploadu(r1); - R2 = ploadu(r2); - R3 = ploadu(r3); - R4 = ploadu(r0 + ResPacketSize); - R5 = ploadu(r1 + ResPacketSize); - R6 = ploadu(r2 + ResPacketSize); + R0 = ploadu(r0+0*resStride); + R1 = ploadu(r0+1*resStride); + R2 = ploadu(r0+2*resStride); + R3 = ploadu(r0+3*resStride); + R4 = ploadu(r0+4*resStride); + R5 = ploadu(r0+5*resStride); + R6 = ploadu(r0+6*resStride); traits.acc(C0, alphav, R0); - pstoreu(r0, R0); - R0 = ploadu(r3 + ResPacketSize); + pstoreu(r0+0*resStride, R0); + R0 = ploadu(r0+7*resStride); traits.acc(C1, alphav, R1); traits.acc(C2, alphav, R2); @@ -739,232 +711,107 @@ EIGEN_ASM_COMMENT("mybegin4"); traits.acc(C6, alphav, R6); traits.acc(C7, alphav, R0); - pstoreu(r1, R1); - pstoreu(r2, R2); - pstoreu(r3, R3); - pstoreu(r0 + ResPacketSize, R4); - pstoreu(r1 + ResPacketSize, R5); - pstoreu(r2 + ResPacketSize, R6); - pstoreu(r3 + ResPacketSize, R0); + pstoreu(r0+1*resStride, R1); + pstoreu(r0+2*resStride, R2); + pstoreu(r0+3*resStride, R3); + pstoreu(r0+4*resStride, R4); + pstoreu(r0+5*resStride, R5); + pstoreu(r0+6*resStride, R6); + pstoreu(r0+7*resStride, R0); } - else + else // nr==4 { - ResPacket R0, R1, R4; + ResPacket R0, R1, R2; ResPacket alphav = pset1(alpha); - R0 = ploadu(r0); - R1 = ploadu(r1); - R4 = ploadu(r0 + ResPacketSize); + R0 = ploadu(r0+0*resStride); + R1 = ploadu(r0+1*resStride); + R2 = ploadu(r0+2*resStride); traits.acc(C0, alphav, R0); - pstoreu(r0, R0); - R0 = ploadu(r1 + ResPacketSize); + pstoreu(r0+0*resStride, R0); + R0 = ploadu(r0+3*resStride); + traits.acc(C1, alphav, R1); - traits.acc(C4, alphav, R4); - traits.acc(C5, alphav, R0); - pstoreu(r1, R1); - pstoreu(r0 + ResPacketSize, R4); - pstoreu(r1 + ResPacketSize, R0); + traits.acc(C2, alphav, R2); + traits.acc(C3, alphav, R0); + + pstoreu(r0+1*resStride, R1); + pstoreu(r0+2*resStride, R2); + pstoreu(r0+3*resStride, R0); } } - if(rows-peeled_mc>=LhsProgress) - { - Index i = peeled_mc; - const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress]; - prefetch(&blA[0]); - - // gets res block as register - AccPacket C0, C1, C2, C3; - traits.initAcc(C0); - traits.initAcc(C1); - if(nr==4) traits.initAcc(C2); - if(nr==4) traits.initAcc(C3); - - // performs "inner" product - const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; - for(Index k=0; k(alpha); - - ResScalar* r0 = &res[(j2+0)*resStride + i]; - ResScalar* r1 = r0 + resStride; - ResScalar* r2 = r1 + resStride; - ResScalar* r3 = r2 + resStride; - - R0 = ploadu(r0); - R1 = ploadu(r1); - if(nr==4) R2 = ploadu(r2); - if(nr==4) R3 = ploadu(r3); - - traits.acc(C0, alphav, R0); - traits.acc(C1, alphav, R1); - if(nr==4) traits.acc(C2, alphav, R2); - if(nr==4) traits.acc(C3, alphav, R3); - - pstoreu(r0, R0); - pstoreu(r1, R1); - if(nr==4) pstoreu(r2, R2); - if(nr==4) pstoreu(r3, R3); - } - for(Index i=peeled_mc2; i do the same but with nr==1 for(Index j2=packet_cols; j2(alpha); - - ResScalar* r0 = &res[(j2+0)*resStride + i]; - - R0 = ploadu(r0); - R4 = ploadu(r0+ResPacketSize); - - traits.acc(C0, alphav, R0); - traits.acc(C4, alphav, R4); - - pstoreu(r0, R0); - pstoreu(r0+ResPacketSize, R4); - } - if(rows-peeled_mc>=LhsProgress) - { - Index i = peeled_mc; - const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress]; - prefetch(&blA[0]); - AccPacket C0; traits.initAcc(C0); @@ -1025,19 +832,23 @@ EIGEN_ASM_COMMENT("mybegin4"); { LhsPacket A0; 
RhsPacket B_0; - traits.loadLhs(blA, A0); - traits.loadRhs(blB, B_0); - traits.madd(A0, B_0, C0, B_0); + RhsPacket T0; + + traits.loadLhs(&blA[0*LhsProgress], A0); + traits.loadRhs(&blB[0*RhsProgress], B_0); + traits.madd(A0,B_0,C0,T0); + blB += RhsProgress; blA += LhsProgress; } - + ResPacket R0; ResPacket alphav = pset1(alpha); - ResPacket R0 = ploadu(&res[(j2+0)*resStride + i]); + ResScalar* r0 = &res[(j2+0)*resStride + i]; + R0 = ploadu(r0); traits.acc(C0, alphav, R0); - pstoreu(&res[(j2+0)*resStride + i], R0); + pstoreu(r0, R0); } - for(Index i=peeled_mc2; i=depth && offset<=stride)); - eigen_assert( (StorageOrder==RowMajor) || ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) ); + eigen_assert( (StorageOrder==RowMajor) || ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) ); conj_if::IsComplex && Conjugate> cj; const_blas_data_mapper lhs(_lhs,lhsStride); Index count = 0; @@ -1104,15 +915,25 @@ EIGEN_DONT_INLINE void gemm_pack_lhs=1*PacketSize) A = ploadu(&lhs(i+0*PacketSize, k)); - if(Pack1>=2*PacketSize) B = ploadu(&lhs(i+1*PacketSize, k)); - if(Pack1>=3*PacketSize) C = ploadu(&lhs(i+2*PacketSize, k)); - if(Pack1>=4*PacketSize) D = ploadu(&lhs(i+3*PacketSize, k)); - if(Pack1>=1*PacketSize) { pstore(blockA+count, cj.pconj(A)); count+=PacketSize; } - if(Pack1>=2*PacketSize) { pstore(blockA+count, cj.pconj(B)); count+=PacketSize; } - if(Pack1>=3*PacketSize) { pstore(blockA+count, cj.pconj(C)); count+=PacketSize; } - if(Pack1>=4*PacketSize) { pstore(blockA+count, cj.pconj(D)); count+=PacketSize; } + if((Pack1%PacketSize)==0) + { + Packet A, B, C, D; + if(Pack1>=1*PacketSize) A = ploadu(&lhs(i+0*PacketSize, k)); + if(Pack1>=2*PacketSize) B = ploadu(&lhs(i+1*PacketSize, k)); + if(Pack1>=3*PacketSize) C = ploadu(&lhs(i+2*PacketSize, k)); + if(Pack1>=4*PacketSize) D = ploadu(&lhs(i+3*PacketSize, k)); + if(Pack1>=1*PacketSize) { pstore(blockA+count, cj.pconj(A)); count+=PacketSize; } + if(Pack1>=2*PacketSize) { pstore(blockA+count, cj.pconj(B)); count+=PacketSize; } + if(Pack1>=3*PacketSize) { pstore(blockA+count, cj.pconj(C)); count+=PacketSize; } + if(Pack1>=4*PacketSize) { pstore(blockA+count, cj.pconj(D)); count+=PacketSize; } + } + else + { + if(Pack1>=1) blockA[count++] = cj(lhs(i+0, k)); + if(Pack1>=2) blockA[count++] = cj(lhs(i+1, k)); + if(Pack1>=3) blockA[count++] = cj(lhs(i+2, k)); + if(Pack1>=4) blockA[count++] = cj(lhs(i+3, k)); + } } } else @@ -1191,12 +1012,20 @@ EIGEN_DONT_INLINE void gemm_pack_rhs=4) blockB[count+2] = cj(b2[k]); + if(nr>=4) blockB[count+3] = cj(b3[k]); + if(nr>=8) blockB[count+4] = cj(b4[k]); + if(nr>=8) blockB[count+5] = cj(b5[k]); + if(nr>=8) blockB[count+6] = cj(b6[k]); + if(nr>=8) blockB[count+7] = cj(b7[k]); count += nr; } // skip what we have after @@ -1251,8 +1080,12 @@ EIGEN_DONT_INLINE void gemm_pack_rhs=4) blockB[count+2] = cj(b0[2]); + if(nr>=4) blockB[count+3] = cj(b0[3]); + if(nr>=8) blockB[count+4] = cj(b0[4]); + if(nr>=8) blockB[count+5] = cj(b0[5]); + if(nr>=8) blockB[count+6] = cj(b0[6]); + if(nr>=8) blockB[count+7] = cj(b0[7]); count += nr; } } diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index 99cf9e0ae..d9fd9f556 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -63,7 +63,7 @@ struct symm_pack_lhs for(Index i=peeled_mc; i=4) { blockB[count+2] = rhs(k,j2+2); blockB[count+3] = rhs(k,j2+3); } + if (nr>=8) + { + blockB[count+4] = rhs(k,j2+4); + blockB[count+5] = rhs(k,j2+5); + blockB[count+6] = 
rhs(k,j2+6); + blockB[count+7] = rhs(k,j2+7); + } count += nr; } } @@ -109,11 +116,18 @@ struct symm_pack_rhs { blockB[count+0] = numext::conj(rhs(j2+0,k)); blockB[count+1] = numext::conj(rhs(j2+1,k)); - if (nr==4) + if (nr>=4) { blockB[count+2] = numext::conj(rhs(j2+2,k)); blockB[count+3] = numext::conj(rhs(j2+3,k)); } + if (nr>=8) + { + blockB[count+4] = numext::conj(rhs(j2+4,k)); + blockB[count+5] = numext::conj(rhs(j2+5,k)); + blockB[count+6] = numext::conj(rhs(j2+6,k)); + blockB[count+7] = numext::conj(rhs(j2+7,k)); + } count += nr; } // symmetric @@ -137,11 +151,18 @@ struct symm_pack_rhs { blockB[count+0] = rhs(k,j2+0); blockB[count+1] = rhs(k,j2+1); - if (nr==4) + if (nr>=4) { blockB[count+2] = rhs(k,j2+2); blockB[count+3] = rhs(k,j2+3); } + if (nr>=8) + { + blockB[count+4] = rhs(k,j2+4); + blockB[count+5] = rhs(k,j2+5); + blockB[count+6] = rhs(k,j2+6); + blockB[count+7] = rhs(k,j2+7); + } count += nr; } } @@ -153,11 +174,18 @@ struct symm_pack_rhs { blockB[count+0] = numext::conj(rhs(j2+0,k)); blockB[count+1] = numext::conj(rhs(j2+1,k)); - if (nr==4) + if (nr>=4) { blockB[count+2] = numext::conj(rhs(j2+2,k)); blockB[count+3] = numext::conj(rhs(j2+3,k)); } + if (nr>=8) + { + blockB[count+4] = numext::conj(rhs(j2+4,k)); + blockB[count+5] = numext::conj(rhs(j2+5,k)); + blockB[count+6] = numext::conj(rhs(j2+6,k)); + blockB[count+7] = numext::conj(rhs(j2+7,k)); + } count += nr; } } @@ -422,11 +450,11 @@ struct SelfadjointProductMatrix NumTraits::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)), internal::traits::Flags&RowMajorBit ? RowMajor : ColMajor> ::run( - lhs.rows(), rhs.cols(), // sizes - &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info - &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info - &dst.coeffRef(0,0), dst.outerStride(), // result info - actualAlpha // alpha + lhs.rows(), rhs.cols(), // sizes + &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info + &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info + &dst.coeffRef(0,0), dst.outerStride(), // result info + actualAlpha // alpha ); } }; From cf1a7bfbe146e232bd091a0ff2f186ea8803564e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 26 Mar 2014 12:03:31 -0700 Subject: [PATCH 022/158] Used AVX instructions to vectorize the complex version of the pfirst and ploaddup packet primitives. Silenced a few compilation warnings. --- Eigen/src/Core/arch/AVX/Complex.h | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 17c32d79c..a7a1b7fda 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -78,19 +78,21 @@ template<> EIGEN_STRONG_INLINE Packet4cf pset1(const std::complex EIGEN_STRONG_INLINE Packet4cf ploaddup(const std::complex* from) { - __m256 result; - for (int i = 0; i < 2; ++i) { - result[4*i] = std::real(from[i]); - result[4*i+1] = std::imag(from[i]); - result[4*i+2] = std::real(from[i]); - result[4*i+3] = std::imag(from[i]); - } + // This should be optimized. 
+ __m128 complex1 = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)from); + complex1 = _mm_movelh_ps(complex1, complex1); + __m128 complex2 = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from+1)); + complex2 = _mm_movelh_ps(complex2, complex2); + __m256 result = _mm256_setzero_ps(); + result = _mm256_insertf128_ps(result, complex1, 0); + result = _mm256_insertf128_ps(result, complex2, 1); return Packet4cf(result); } @@ -102,7 +104,10 @@ template<> EIGEN_STRONG_INLINE void prefetch >(const std::co template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet4cf& a) { - return std::complex(a.v[0], a.v[1]); + __m128 low = _mm256_extractf128_ps(a.v, 0); + std::complex res; + _mm_storel_pi((__m64*)&res, low); + return res; } template<> EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) { @@ -112,7 +117,7 @@ template<> EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) { __m128d highd = _mm_castps_pd(high); low = _mm_castpd_ps(_mm_shuffle_pd(lowd,lowd,0x1)); high = _mm_castpd_ps(_mm_shuffle_pd(highd,highd,0x1)); - __m256 result; + __m256 result = _mm256_setzero_ps(); result = _mm256_insertf128_ps(result, low, 1); result = _mm256_insertf128_ps(result, high, 0); return Packet4cf(result); @@ -300,6 +305,7 @@ template<> EIGEN_STRONG_INLINE Packet2cd pset1(const std::complex EIGEN_STRONG_INLINE void prefetch >(const std::c template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cd& a) { - return std::complex(a.v[0],a.v[1]); + __m128d low = _mm256_extractf128_pd(a.v, 0); + EIGEN_ALIGN16 double res[2]; + _mm_store_pd(res, low); + return std::complex(res[0],res[1]); } template<> EIGEN_STRONG_INLINE Packet2cd preverse(const Packet2cd& a) { From a078f442a382be64cc22f315ad300c353891a814 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 26 Mar 2014 15:11:18 -0700 Subject: [PATCH 023/158] Vectorized the multiplication and division of complex numbers using AVX instructions. --- Eigen/src/Core/arch/AVX/Complex.h | 36 +++++++++++++------------------ 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index a7a1b7fda..f63dfe6cf 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -221,13 +221,11 @@ template<> struct conj_helper template<> EIGEN_STRONG_INLINE Packet4cf pdiv(const Packet4cf& a, const Packet4cf& b) { - Packet4cf res; - for (int i = 0; i < 8; i+=2) { - std::complex result = std::complex(a.v[i], a.v[i+1]) / std::complex(b.v[i], b.v[i+1]); - res.v[i] = std::real(result); - res.v[i+1] = std::imag(result); - } - return res; + Packet4cf num = pmul(a, pconj(b)); + __m256 tmp = _mm256_mul_ps(b.v, b.v); + __m256 tmp2 = _mm256_shuffle_ps(tmp,tmp,0xB1); + __m256 denom = _mm256_add_ps(tmp, tmp2); + return Packet4cf(_mm256_div_ps(num.v, denom)); } template<> EIGEN_STRONG_INLINE Packet4cf pcplxflip(const Packet4cf& x) @@ -282,13 +280,12 @@ template<> EIGEN_STRONG_INLINE Packet2cd pconj(const Packet2cd& a) template<> EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) { - __m256d tmp1 = _mm256_mul_pd(_mm256_permute_pd(a.v, 0), b.v); - // FIXME: _mm256_permute_pd(b.v, _MM_SHUFFLE2(1,0) won't work as expected, figure out an alternative. 
-    __m256d op = {b.v[1], b.v[0], b.v[3], b.v[2]};
-    __m256d tmp2 = _mm256_mul_pd(_mm256_permute_pd(a.v, 15), op);
-    __m256d result = _mm256_addsub_pd(tmp1, tmp2);
-
-    return Packet2cd(result);
+  __m256d tmp1 = _mm256_shuffle_pd(a.v,a.v,0x0);
+  __m256d even = _mm256_mul_pd(tmp1, b.v);
+  __m256d tmp2 = _mm256_shuffle_pd(a.v,a.v,0xF);
+  __m256d tmp3 = _mm256_shuffle_pd(b.v,b.v,0x5);
+  __m256d odd = _mm256_mul_pd(tmp2, tmp3);
+  return Packet2cd(_mm256_addsub_pd(even, odd));
 }

 template<> EIGEN_STRONG_INLINE Packet2cd pand (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); }
@@ -418,13 +415,10 @@ template<> struct conj_helper

 template<> EIGEN_STRONG_INLINE Packet2cd pdiv(const Packet2cd& a, const Packet2cd& b)
 {
-  Packet2cd res;
-  for (int i = 0; i < 4; i+=2) {
-    std::complex result = std::complex(a.v[i], a.v[i+1]) / std::complex(b.v[i], b.v[i+1]);
-    res.v[i] = std::real(result);
-    res.v[i+1] = std::imag(result);
-  }
-  return res;
+  Packet2cd num = pmul(a, pconj(b));
+  __m256d tmp = _mm256_mul_pd(b.v, b.v);
+  __m256d denom = _mm256_hadd_pd(tmp, tmp);
+  return Packet2cd(_mm256_div_pd(num.v, denom));
 }

 template<> EIGEN_STRONG_INLINE Packet2cd pcplxflip(const Packet2cd& x)

From 8be011e7765cf5d30cad86edd55850b0b1277741 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Wed, 26 Mar 2014 23:14:44 +0100
Subject: [PATCH 024/158] Remove remaining bits of the dead working buffer

---
 Eigen/src/Core/products/GeneralMatrixMatrix.h | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)

diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index 6776cf5a8..19991fa3f 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -278,13 +278,11 @@ class gemm_blocking_space Traits;
     enum {
       SizeA = ActualRows * MaxDepth,
-      SizeB = ActualCols * MaxDepth,
-      SizeW = MaxDepth * Traits::WorkSpaceFactor
+      SizeB = ActualCols * MaxDepth
     };

     EIGEN_ALIGN16 LhsScalar m_staticA[SizeA];
     EIGEN_ALIGN16 RhsScalar m_staticB[SizeB];
-    EIGEN_ALIGN16 RhsScalar m_staticW[SizeW];

   public:

@@ -295,12 +293,10 @@ class gemm_blocking_spacem_kc = MaxDepth;
       this->m_blockA = m_staticA;
       this->m_blockB = m_staticB;
-      this->m_blockW = m_staticW;
     }

     inline void allocateA() {}
     inline void allocateB() {}
-    inline void allocateW() {}
     inline void allocateAll() {}
 };

@@ -319,7 +315,6 @@ class gemm_blocking_space(this->m_kc, this->m_mc, this->m_nc);
     m_sizeA = this->m_mc * this->m_kc;
     m_sizeB = this->m_kc * this->m_nc;
-    m_sizeW = this->m_kc*Traits::WorkSpaceFactor;
   }

   void allocateA()
@@ -347,24 +341,16 @@ class gemm_blocking_spacem_blockB = aligned_new(m_sizeB);
   }

-  void allocateW()
-  {
-    if(this->m_blockW==0)
-      this->m_blockW = aligned_new(m_sizeW);
-  }
-
   void allocateAll()
   {
     allocateA();
     allocateB();
-    allocateW();
   }

   ~gemm_blocking_space()
   {
     aligned_delete(this->m_blockA, m_sizeA);
     aligned_delete(this->m_blockB, m_sizeB);
-    aligned_delete(this->m_blockW, m_sizeW);
   }
 };

From f0a4c9d5abe94703d8b4e5ec49cba20df014d0ca Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Wed, 26 Mar 2014 23:22:36 +0100
Subject: [PATCH 025/158] Update gebp kernel to process a panel of 4 columns at
 once for the remaining ones.
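The column loop is now split into up to three passes. Stripped of the packet arithmetic, the
control flow reduces to the outline below (a scalar sketch only; the kernel bodies and all
names inside them are elided):

  // Column blocking with nr==8: full panels of 8 columns, then at most one
  // panel of 4, then the leftover columns one at a time.
  void process_columns(long cols)
  {
    const long packet_cols8 = (cols/8)*8;
    const long packet_cols4 = (cols/4)*4;
    for(long j2=0; j2<packet_cols8; j2+=8)            { /* depth x 8 micro-kernel */ }
    for(long j2=packet_cols8; j2<packet_cols4; j2+=4) { /* depth x 4 micro-kernel */ }
    for(long j2=packet_cols4; j2<cols; ++j2)          { /* one column at a time   */ }
  }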
--- .../Core/products/GeneralBlockPanelKernel.h | 464 +++++++++++------- .../Core/products/SelfadjointMatrixMatrix.h | 153 +++--- 2 files changed, 380 insertions(+), 237 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 8a398d912..2b520383b 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -167,8 +167,6 @@ public: // register block size along the M direction (currently, this one cannot be modified) mr = LhsPacketSize, - WorkSpaceFactor = nr * RhsPacketSize, - LhsProgress = LhsPacketSize, RhsProgress = 1 }; @@ -242,7 +240,6 @@ public: NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, nr = NumberOfRegisters/2, mr = LhsPacketSize, - WorkSpaceFactor = nr*RhsPacketSize, LhsProgress = LhsPacketSize, RhsProgress = 1 @@ -327,7 +324,6 @@ public: // FIXME: should depend on NumberOfRegisters nr = 4, mr = ResPacketSize, - WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr, LhsProgress = ResPacketSize, RhsProgress = 1 @@ -459,7 +455,6 @@ public: // FIXME: should depend on NumberOfRegisters nr = 4, mr = ResPacketSize, - WorkSpaceFactor = nr*RhsPacketSize, LhsProgress = ResPacketSize, RhsProgress = 1 @@ -564,63 +559,45 @@ void gebp_kernel if(strideA==-1) strideA = depth; if(strideB==-1) strideB = depth; conj_helper cj; - Index packet_cols = (cols/nr) * nr; + Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; + Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0; // Here we assume that mr==LhsProgress const Index peeled_mc = (rows/mr)*mr; const Index peeled_kc = (depth/4)*4; // loops on each micro vertical panel of rhs (depth x nr) - for(Index j2=0; j2=8) { - // loops on each largest micro horizontal panel of lhs (mr x depth) - // => we select a mr x nr micro block of res which is entirely - // stored into mr/packet_size x nr registers. - for(Index i=0; i we select a mr x nr micro block of res which is entirely + // stored into mr/packet_size x nr registers. 
+ for(Index i=0; i EIGEN_GEBGP_ONESTEP8(1,A0,A1); EIGEN_GEBGP_ONESTEP8(2,A1,A0); EIGEN_GEBGP_ONESTEP8(3,A0,A1); - } - blB += 4*nr*RhsProgress; - blA += 4*mr; - } - // process remaining peeled loop - for(Index k=peeled_kc; k(alpha); @@ -718,60 +683,20 @@ void gebp_kernel pstoreu(r0+5*resStride, R5); pstoreu(r0+6*resStride, R6); pstoreu(r0+7*resStride, R0); - } - else // nr==4 - { - ResPacket R0, R1, R2; - ResPacket alphav = pset1(alpha); - - R0 = ploadu(r0+0*resStride); - R1 = ploadu(r0+1*resStride); - R2 = ploadu(r0+2*resStride); - traits.acc(C0, alphav, R0); - pstoreu(r0+0*resStride, R0); - R0 = ploadu(r0+3*resStride); - - traits.acc(C1, alphav, R1); - traits.acc(C2, alphav, R2); - traits.acc(C3, alphav, R0); - pstoreu(r0+1*resStride, R1); - pstoreu(r0+2*resStride, R2); - pstoreu(r0+3*resStride, R0); } - } - - for(Index i=peeled_mc; i B_1 = blB[7]; MADD(cj,A0,B_0,C6, B_0); MADD(cj,A0,B_1,C7, B_1); - } - blB += nr; + blB += 8; + } + res[(j2+0)*resStride + i] += alpha*C0; + res[(j2+1)*resStride + i] += alpha*C1; + res[(j2+2)*resStride + i] += alpha*C2; + res[(j2+3)*resStride + i] += alpha*C3; + res[(j2+4)*resStride + i] += alpha*C4; + res[(j2+5)*resStride + i] += alpha*C5; + res[(j2+6)*resStride + i] += alpha*C6; + res[(j2+7)*resStride + i] += alpha*C7; + } + } + } + + // Second pass using depth x 4 panels + // If nr==8, then we have at most one such panel + if(nr>=4) + { + for(Index j2=packet_cols8; j2 we select a mr x 4 micro block of res which is entirely + // stored into mr/packet_size x 4 registers. + for(Index i=0; i(alpha); + + R0 = ploadu(r0+0*resStride); + R1 = ploadu(r0+1*resStride); + R2 = ploadu(r0+2*resStride); + traits.acc(C0, alphav, R0); + pstoreu(r0+0*resStride, R0); + R0 = ploadu(r0+3*resStride); + + traits.acc(C1, alphav, R1); + traits.acc(C2, alphav, R2); + traits.acc(C3, alphav, R0); + + pstoreu(r0+1*resStride, R1); + pstoreu(r0+2*resStride, R2); + pstoreu(r0+3*resStride, R0); + } + + for(Index i=peeled_mc; i do the same but with nr==1 - for(Index j2=packet_cols; j2=depth && offset<=stride)); conj_if::IsComplex && Conjugate> cj; - Index packet_cols = (cols/nr) * nr; + Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; + Index packet_cols4 = nr>=4 ? 
(cols/4) * 4 : 0; Index count = 0; - for(Index j2=0; j2=8) { - // skip what we have before - if(PanelMode) count += nr * offset; - const Scalar* b0 = &rhs[(j2+0)*rhsStride]; - const Scalar* b1 = &rhs[(j2+1)*rhsStride]; - const Scalar* b2 = &rhs[(j2+2)*rhsStride]; - const Scalar* b3 = &rhs[(j2+3)*rhsStride]; - const Scalar* b4 = &rhs[(j2+4)*rhsStride]; - const Scalar* b5 = &rhs[(j2+5)*rhsStride]; - const Scalar* b6 = &rhs[(j2+6)*rhsStride]; - const Scalar* b7 = &rhs[(j2+7)*rhsStride]; - for(Index k=0; k=4) blockB[count+2] = cj(b2[k]); - if(nr>=4) blockB[count+3] = cj(b3[k]); - if(nr>=8) blockB[count+4] = cj(b4[k]); - if(nr>=8) blockB[count+5] = cj(b5[k]); - if(nr>=8) blockB[count+6] = cj(b6[k]); - if(nr>=8) blockB[count+7] = cj(b7[k]); - count += nr; + // skip what we have before + if(PanelMode) count += 8 * offset; + const Scalar* b0 = &rhs[(j2+0)*rhsStride]; + const Scalar* b1 = &rhs[(j2+1)*rhsStride]; + const Scalar* b2 = &rhs[(j2+2)*rhsStride]; + const Scalar* b3 = &rhs[(j2+3)*rhsStride]; + const Scalar* b4 = &rhs[(j2+4)*rhsStride]; + const Scalar* b5 = &rhs[(j2+5)*rhsStride]; + const Scalar* b6 = &rhs[(j2+6)*rhsStride]; + const Scalar* b7 = &rhs[(j2+7)*rhsStride]; + for(Index k=0; k=4) + { + for(Index j2=packet_cols8; j2=depth && offset<=stride)); conj_if::IsComplex && Conjugate> cj; - Index packet_cols = (cols/nr) * nr; + Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; + Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0; Index count = 0; - for(Index j2=0; j2=8) { - // skip what we have before - if(PanelMode) count += nr * offset; - for(Index k=0; k(&rhs[k*rhsStride + j2]); - pstoreu(blockB+count, cj.pconj(A)); - count += PacketSize; - } else { - const Scalar* b0 = &rhs[k*rhsStride + j2]; - blockB[count+0] = cj(b0[0]); - blockB[count+1] = cj(b0[1]); - if(nr>=4) blockB[count+2] = cj(b0[2]); - if(nr>=4) blockB[count+3] = cj(b0[3]); - if(nr>=8) blockB[count+4] = cj(b0[4]); - if(nr>=8) blockB[count+5] = cj(b0[5]); - if(nr>=8) blockB[count+6] = cj(b0[6]); - if(nr>=8) blockB[count+7] = cj(b0[7]); - count += nr; + // skip what we have before + if(PanelMode) count += 8 * offset; + for(Index k=0; k(&rhs[k*rhsStride + j2]); + pstoreu(blockB+count, cj.pconj(A)); + count += PacketSize; + } else { + const Scalar* b0 = &rhs[k*rhsStride + j2]; + blockB[count+0] = cj(b0[0]); + blockB[count+1] = cj(b0[1]); + blockB[count+2] = cj(b0[2]); + blockB[count+3] = cj(b0[3]); + blockB[count+4] = cj(b0[4]); + blockB[count+5] = cj(b0[5]); + blockB[count+6] = cj(b0[6]); + blockB[count+7] = cj(b0[7]); + count += 8; + } } + // skip what we have after + if(PanelMode) count += 8 * (stride-offset-depth); + } + } + if(nr>=4) + { + for(Index j2=packet_cols8; j2(&rhs[k*rhsStride + j2]); + pstoreu(blockB+count, cj.pconj(A)); + count += PacketSize; + } else { + const Scalar* b0 = &rhs[k*rhsStride + j2]; + blockB[count+0] = cj(b0[0]); + blockB[count+1] = cj(b0[1]); + blockB[count+2] = cj(b0[2]); + blockB[count+3] = cj(b0[3]); + count += 4; + } + } + // skip what we have after + if(PanelMode) count += 4 * (stride-offset-depth); } - // skip what we have after - if(PanelMode) count += nr * (stride-offset-depth); } // copy the remaining columns one at a time (nr==1) - for(Index j2=packet_cols; j2 rhs(_rhs,rhsStride); - Index packet_cols = (cols/nr)*nr; + Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; + Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0; // first part: normal case for(Index j2=0; j2=8 ? 
(std::min)(k2+rows,packet_cols8) : k2; + if(nr>=8) { - // again we can split vertically in three different parts (transpose, symmetric, normal) - // transpose - for(Index k=k2; k=4) + // again we can split vertically in three different parts (transpose, symmetric, normal) + // transpose + for(Index k=k2; k=8) - { blockB[count+4] = numext::conj(rhs(j2+4,k)); blockB[count+5] = numext::conj(rhs(j2+5,k)); blockB[count+6] = numext::conj(rhs(j2+6,k)); blockB[count+7] = numext::conj(rhs(j2+7,k)); + count += 8; } - count += nr; - } - // symmetric - Index h = 0; - for(Index k=j2; k=4) + // symmetric + Index h = 0; + for(Index k=j2; k=8) - { blockB[count+4] = rhs(k,j2+4); blockB[count+5] = rhs(k,j2+5); blockB[count+6] = rhs(k,j2+6); blockB[count+7] = rhs(k,j2+7); + count += 8; + } + } + } + if(nr>=4) + { + for(Index j2=end8; j2<(std::min)(k2+rows,packet_cols4); j2+=4) + { + // again we can split vertically in three different parts (transpose, symmetric, normal) + // transpose + for(Index k=k2; k=8) { - for(Index k=k2; k=4) + for(Index k=k2; k=8) - { blockB[count+4] = numext::conj(rhs(j2+4,k)); blockB[count+5] = numext::conj(rhs(j2+5,k)); blockB[count+6] = numext::conj(rhs(j2+6,k)); blockB[count+7] = numext::conj(rhs(j2+7,k)); + count += 8; + } + } + } + if(nr>=4) + { + for(Index j2=(std::max)(packet_cols8,k2+rows); j2 the same with nr==1) - for(Index j2=packet_cols; j2 gebp_kernel; symm_pack_lhs pack_lhs; @@ -376,11 +420,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix(kc, mc, nc); - std::size_t sizeW = kc*Traits::WorkSpaceFactor; - std::size_t sizeB = sizeW + kc*cols; + std::size_t sizeB = kc*cols; ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); - Scalar* blockB = allocatedBlockB + sizeW; + Scalar* blockB = allocatedBlockB; gebp_kernel gebp_kernel; gemm_pack_lhs pack_lhs; From e45a6bed4548fb2821efd3d78471bf1bd2a7d4ef Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 26 Mar 2014 15:58:13 -0700 Subject: [PATCH 026/158] Specialized the pload1 packet primitive for Packet8f and Packet4d in order to take advantage of the vbroadcastss and vbroadcastsd instructions whenever possible. 
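With AVX, broadcasting a scalar from memory is a single instruction, so pload1 should not go
through pset1's load-then-splat path. A standalone sketch of the mapping, assuming AVX
intrinsics (the wrapper names are illustrative):

  #include <immintrin.h>

  // _mm256_broadcast_ss/_mm256_broadcast_sd compile to vbroadcastss/vbroadcastsd:
  // one instruction instead of a load followed by a shuffle.
  static inline __m256  load1_ps(const float* p)  { return _mm256_broadcast_ss(p); }
  static inline __m256d load1_pd(const double* p) { return _mm256_broadcast_sd(p); }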
--- Eigen/src/Core/arch/AVX/PacketMath.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 26cc996db..98f8c5416 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -87,6 +87,9 @@ template<> EIGEN_STRONG_INLINE Packet8f pset1(const float& from) { re template<> EIGEN_STRONG_INLINE Packet4d pset1(const double& from) { return _mm256_set1_pd(from); } template<> EIGEN_STRONG_INLINE Packet8i pset1(const int& from) { return _mm256_set1_epi32(from); } +template<> EIGEN_STRONG_INLINE Packet8f pload1(const float* from) { return _mm256_broadcast_ss(from); } +template<> EIGEN_STRONG_INLINE Packet4d pload1(const double* from) { return _mm256_broadcast_sd(from); } + template<> EIGEN_STRONG_INLINE Packet8f plset(const float& a) { return _mm256_add_ps(_mm256_set1_ps(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); } template<> EIGEN_STRONG_INLINE Packet4d plset(const double& a) { return _mm256_add_pd(_mm256_set1_pd(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); } From 14bc4b9704b7e347ffcfe3c52588790e27e5118b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 26 Mar 2014 17:35:18 -0700 Subject: [PATCH 027/158] Made sure that the version of gemm_pack_rhs specialized for row major matrices is vectorized when nr == 2*PacketSize (which is the case for SSE when compiling in 64bit mode). --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 569cfea71..d17752489 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1091,7 +1091,11 @@ EIGEN_DONT_INLINE void gemm_pack_rhs(&rhs[k*rhsStride + j2]); pstoreu(blockB+count, cj.pconj(A)); - count += PacketSize; + } else if (nr == 2*PacketSize) { + Packet A = ploadu(&rhs[k*rhsStride + j2]); + Packet B = ploadu(&rhs[k*rhsStride + j2 + PacketSize]); + pstoreu(blockB+count, cj.pconj(A)); + pstoreu(blockB+count+PacketSize, cj.pconj(B)); } else { const Scalar* b0 = &rhs[k*rhsStride + j2]; blockB[count+0] = cj(b0[0]); @@ -1102,8 +1106,8 @@ EIGEN_DONT_INLINE void gemm_pack_rhs=8) blockB[count+5] = cj(b0[5]); if(nr>=8) blockB[count+6] = cj(b0[6]); if(nr>=8) blockB[count+7] = cj(b0[7]); - count += nr; } + count += nr; } // skip what we have after if(PanelMode) count += nr * (stride-offset-depth); From ba3457cab2349dd03585435571ff1a8f90cf7403 Mon Sep 17 00:00:00 2001 From: Abhijit Kundu Date: Wed, 26 Mar 2014 22:02:48 -0400 Subject: [PATCH 028/158] Fixed compilation error due to obsolete internal::abs and internal::sqrt function calls --- demos/opengl/quaternion_demo.cpp | 2 +- demos/opengl/trackball.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/demos/opengl/quaternion_demo.cpp b/demos/opengl/quaternion_demo.cpp index 04165619b..dd323a4c9 100644 --- a/demos/opengl/quaternion_demo.cpp +++ b/demos/opengl/quaternion_demo.cpp @@ -234,7 +234,7 @@ void RenderingWidget::drawScene() gpu.drawVector(Vector3f::Zero(), length*Vector3f::UnitZ(), Color(0,0,1,1)); // draw the fractal object - float sqrt3 = internal::sqrt(3.); + float sqrt3 = std::sqrt(3.); glLightfv(GL_LIGHT0, GL_AMBIENT, Vector4f(0.5,0.5,0.5,1).data()); glLightfv(GL_LIGHT0, GL_DIFFUSE, Vector4f(0.5,1,0.5,1).data()); glLightfv(GL_LIGHT0, GL_SPECULAR, Vector4f(1,1,1,1).data()); diff --git a/demos/opengl/trackball.cpp 
b/demos/opengl/trackball.cpp index 77ac790c8..7c2da8e96 100644 --- a/demos/opengl/trackball.cpp +++ b/demos/opengl/trackball.cpp @@ -23,7 +23,7 @@ void Trackball::track(const Vector2i& point2D) { Vector3f axis = mLastPoint3D.cross(newPoint3D).normalized(); float cos_angle = mLastPoint3D.dot(newPoint3D); - if ( internal::abs(cos_angle) < 1.0 ) + if ( std::abs(cos_angle) < 1.0 ) { float angle = 2. * acos(cos_angle); if (mMode==Around) From a419cea4a0ff545f3221020119d5eb6ab4cd3e48 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 26 Mar 2014 19:03:07 -0700 Subject: [PATCH 029/158] Created the ptranspose packet primitive that can transpose an array of N packets, where N is the number of words in each packet. This primitive will be used to complete the vectorization of the gemm_pack_lhs and gemm_pack_rhs functions. Implemented the primitive using SSE instructions. --- Eigen/src/Core/GenericPacketMath.h | 15 ++++++++++++++- Eigen/src/Core/arch/SSE/Complex.h | 10 ++++++++++ Eigen/src/Core/arch/SSE/PacketMath.h | 25 +++++++++++++++++++++++++ test/packetmath.cpp | 12 ++++++++++++ 4 files changed, 61 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index d07541285..f9ddf4718 100755 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -386,9 +386,22 @@ template<> inline std::complex pmul(const std::complex& a, const #endif + +/*************************************************************************** + * Kernel, that is a collection of N packets where N is the number of words + * in the packet. +***************************************************************************/ +template struct Kernel { + Packet packet[unpacket_traits::size]; +}; + +template EIGEN_DEVICE_FUNC inline void +ptranspose(Kernel& /*kernel*/) { + // Nothing to do in the scalar case, i.e. a 1x1 matrix. 
+} + } // end namespace internal } // end namespace Eigen #endif // EIGEN_GENERIC_PACKET_MATH_H - diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 91bba5e38..2dce66819 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -435,6 +435,16 @@ EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) return Packet1cd(preverse(x.v)); } +template<> EIGEN_DEVICE_FUNC inline void +ptranspose(Kernel& kernel) { + __m128d w1 = _mm_castps_pd(kernel.packet[0].v); + __m128d w2 = _mm_castps_pd(kernel.packet[1].v); + + __m128 tmp = _mm_castpd_ps(_mm_unpackhi_pd(w1, w2)); + kernel.packet[0].v = _mm_castpd_ps(_mm_unpacklo_pd(w1, w2)); + kernel.packet[1].v = tmp; +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 9d8faa7d6..937f63f88 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -707,6 +707,31 @@ struct palign_impl }; #endif +template<> EIGEN_DEVICE_FUNC inline void +ptranspose(Kernel& kernel) { + _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]); +} + +template<> EIGEN_DEVICE_FUNC inline void +ptranspose(Kernel& kernel) { + __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]); + kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]); + kernel.packet[1] = tmp; +} + +template<> EIGEN_DEVICE_FUNC inline void +ptranspose(Kernel& kernel) { + __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]); + __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]); + __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]); + __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]); + + kernel.packet[0] = _mm_unpacklo_epi64(T0, T1); + kernel.packet[1] = _mm_unpackhi_epi64(T0, T1); + kernel.packet[2] = _mm_unpacklo_epi64(T2, T3); + kernel.packet[3] = _mm_unpackhi_epi64(T2, T3); +} + } // end namespace internal } // end namespace Eigen diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 5a680d1ee..735af7017 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -208,6 +208,18 @@ template void packetmath() ref[i] = data1[PacketSize-i-1]; internal::pstore(data2, internal::preverse(internal::pload(data1))); VERIFY(areApprox(ref, data2, PacketSize) && "internal::preverse"); + + internal::Kernel kernel; + for (int i=0; i(data1+i*PacketSize); + } + ptranspose(kernel); + for (int i=0; i void packetmath_real() From 9ce0d785137c4034f19193c6a6781bbbb38e29fb Mon Sep 17 00:00:00 2001 From: Mark Borgerding Date: Wed, 26 Mar 2014 22:26:07 -0400 Subject: [PATCH 030/158] immintrin.h did not come until intel version 11 --- Eigen/Core | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/Core b/Eigen/Core index 468ae0c76..412409497 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -123,7 +123,7 @@ extern "C" { // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly. // Doing so triggers some issues with ICC. 
However old gcc versions seems to not have this file, thus:
-  #ifdef __INTEL_COMPILER
+  #if defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1110
     #include <immintrin.h>
   #else
     #include

From fb03b56647e299428c03e3b1c3a01f3318e1af7e Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Thu, 27 Mar 2014 11:38:35 +0100
Subject: [PATCH 031/158] Fix warning

---
 Eigen/src/Core/products/GeneralBlockPanelKernel.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 2b520383b..cba76edfe 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -594,8 +594,9 @@ void gebp_kernel
       // performs "inner" products
       const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
-      LhsPacket A0, A1;
+      LhsPacket A0; // uncomment for register prefetching
+      // LhsPacket A1;
       // traits.loadLhs(blA, A0);
       for(Index k=0; k

      // performs "inner" products
      const RhsScalar* blB = &blockB[j2*strideB+offsetB*4];
-      LhsPacket A0, A1;
+      LhsPacket A0; // uncomment for register prefetching
+      // LhsPacket A1;
       // traits.loadLhs(blA, A0);
       for(Index k=0; k

From bc401eb6fa9c4c14c7fb32acfe70b304c1850283 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Thu, 27 Mar 2014 14:47:00 +0100
Subject: [PATCH 032/158] Implement pcplxflip, palign, predux and the likes from
 AVX/complexes

---
 Eigen/Core                        |  3 +-
 Eigen/src/Core/arch/AVX/Complex.h | 81 +++++++++++--------------------
 Eigen/src/Core/arch/SSE/Complex.h |  8 +++
 3 files changed, 37 insertions(+), 55 deletions(-)

diff --git a/Eigen/Core b/Eigen/Core
index dbe68586d..39cae5d40 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -303,9 +303,10 @@ using std::ptrdiff_t;

 #if defined EIGEN_VECTORIZE_AVX
   // Use AVX for floats and doubles, SSE for integers
+  #include "src/Core/arch/SSE/PacketMath.h"
+  #include "src/Core/arch/SSE/Complex.h"
   #include "src/Core/arch/AVX/PacketMath.h"
   #include "src/Core/arch/AVX/Complex.h"
-  #include "src/Core/arch/SSE/PacketMath.h" // For integers
 #elif defined EIGEN_VECTORIZE_SSE
   #include "src/Core/arch/SSE/PacketMath.h"
   #include "src/Core/arch/SSE/MathFunctions.h"
diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h
index f63dfe6cf..4f4f6eb52 100644
--- a/Eigen/src/Core/arch/AVX/Complex.h
+++ b/Eigen/src/Core/arch/AVX/Complex.h
@@ -99,8 +99,6 @@ template<> EIGEN_STRONG_INLINE Packet4cf ploaddup(const std::complex<
 template<> EIGEN_STRONG_INLINE void pstore >(std::complex* to, const Packet4cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
 template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex* to, const Packet4cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }

-template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-
 template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet4cf& a)
 {
@@ -125,28 +123,29 @@ template<> EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) {

 template<> EIGEN_STRONG_INLINE std::complex predux(const Packet4cf& a)
 {
-  return std::complex(a.v[0]+a.v[2]+a.v[4]+a.v[6], a.v[1]+a.v[3]+a.v[5]+a.v[7]);
+  return predux(padd(Packet2cf(_mm256_extractf128_ps(a.v,0)),
+                     Packet2cf(_mm256_extractf128_ps(a.v,1))));
 }

 template<> EIGEN_STRONG_INLINE Packet4cf preduxp(const Packet4cf* vecs)
 {
-  __m256 result = _mm256_setzero_ps();
-  for (int i = 0; i < 4; ++i) {
-    for (int j = 0; j < 8; j+=2) {
-      result[2*i] += vecs[i].v[j];
-      result[2*i+1] += vecs[i].v[j+1];
-    }
-  }
-  return Packet4cf(result);
+  Packet8f t0 =
_mm256_shuffle_ps(vecs[0].v, vecs[0].v, _MM_SHUFFLE(3, 1, 2 ,0)); + Packet8f t1 = _mm256_shuffle_ps(vecs[1].v, vecs[1].v, _MM_SHUFFLE(3, 1, 2 ,0)); + t0 = _mm256_hadd_ps(t0,t1); + Packet8f t2 = _mm256_shuffle_ps(vecs[2].v, vecs[2].v, _MM_SHUFFLE(3, 1, 2 ,0)); + Packet8f t3 = _mm256_shuffle_ps(vecs[3].v, vecs[3].v, _MM_SHUFFLE(3, 1, 2 ,0)); + t2 = _mm256_hadd_ps(t2,t3); + + t1 = _mm256_permute2f128_ps(t0,t2, 0 + (2<<4)); + t3 = _mm256_permute2f128_ps(t0,t2, 1 + (3<<4)); + + return Packet4cf(_mm256_add_ps(t1,t3)); } template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet4cf& a) { - std::complex result(a.v[0], a.v[1]); - for (int i = 2; i < 8; i+=2) { - result *= std::complex(a.v[i], a.v[i+1]); - } - return result; + return predux_mul(pmul(Packet2cf(_mm256_extractf128_ps(a.v, 0)), + Packet2cf(_mm256_extractf128_ps(a.v, 1)))); } template @@ -155,16 +154,7 @@ struct palign_impl static EIGEN_STRONG_INLINE void run(Packet4cf& first, const Packet4cf& second) { if (Offset==0) return; - for (int i = 0; i < 4-Offset; ++i) - { - first.v[2*i] = first.v[2*(i+Offset)]; - first.v[2*i+1] = first.v[2*(i+Offset)+1]; - } - for (int i = 4-Offset; i < 4; ++i) - { - first.v[2*i] = second.v[2*(i-4+Offset)]; - first.v[2*i+1] = second.v[2*(i-4+Offset)+1]; - } + palign_impl::run(first.v, second.v); } }; @@ -230,12 +220,7 @@ template<> EIGEN_STRONG_INLINE Packet4cf pdiv(const Packet4cf& a, con template<> EIGEN_STRONG_INLINE Packet4cf pcplxflip(const Packet4cf& x) { - Packet4cf res; - for (int i = 0; i < 8; i+=2) { - res.v[i] = x.v[i+1]; - res.v[i+1] = x.v[i]; - } - return res; + return Packet4cf(_mm256_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0 ,1))); } //---------- double ---------- @@ -312,8 +297,6 @@ template<> EIGEN_STRONG_INLINE Packet2cd ploaddup(const std::complex< template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } - template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cd& a) { __m128d low = _mm256_extractf128_pd(a.v, 0); @@ -329,24 +312,22 @@ template<> EIGEN_STRONG_INLINE Packet2cd preverse(const Packet2cd& a) { template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cd& a) { - return std::complex(a.v[0]+a.v[2], a.v[1]+a.v[3]); + return predux(padd(Packet1cd(_mm256_extractf128_pd(a.v,0)), + Packet1cd(_mm256_extractf128_pd(a.v,1)))); } template<> EIGEN_STRONG_INLINE Packet2cd preduxp(const Packet2cd* vecs) { - __m256d result = _mm256_setzero_pd(); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 4; j+=2) { - result[2*i] += vecs[i].v[j]; - result[2*i+1] += vecs[i].v[j+1]; - } - } - return Packet2cd(result); + Packet4d t0 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 0 + (2<<4)); + Packet4d t1 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 1 + (3<<4)); + + return Packet2cd(_mm256_add_pd(t0,t1)); } template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cd& a) { - return std::complex(a.v[0], a.v[1]) * std::complex(a.v[2], a.v[3]); + return predux(pmul(Packet1cd(_mm256_extractf128_pd(a.v,0)), + Packet1cd(_mm256_extractf128_pd(a.v,1)))); } template @@ -355,10 +336,7 @@ struct palign_impl static EIGEN_STRONG_INLINE void run(Packet2cd& first, const Packet2cd& second) { if (Offset==0) return; - first.v[0] = 
first.v[2]; - first.v[1] = first.v[3]; - first.v[2] = second.v[0]; - first.v[3] = second.v[1]; + palign_impl::run(first.v, second.v); } }; @@ -423,12 +401,7 @@ template<> EIGEN_STRONG_INLINE Packet2cd pdiv(const Packet2cd& a, con template<> EIGEN_STRONG_INLINE Packet2cd pcplxflip(const Packet2cd& x) { - Packet2cd res; - for (int i = 0; i < 4; i+=2) { - res.v[i] = x.v[i+1]; - res.v[i+1] = x.v[i]; - } - return res; + return Packet2cd(_mm256_shuffle_pd(x.v, x.v, 0x5)); } } // end namespace internal diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 2dce66819..b92919b75 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -22,6 +22,9 @@ struct Packet2cf __m128 v; }; +// Use the packet_traits defined in AVX/PacketMath.h instead if we're going +// to leverage AVX instructions. +#ifndef EIGEN_VECTORIZE_AVX template<> struct packet_traits > : default_packet_traits { typedef Packet2cf type; @@ -42,6 +45,7 @@ template<> struct packet_traits > : default_packet_traits HasSetLinear = 0 }; }; +#endif template<> struct unpacket_traits { typedef std::complex type; enum {size=2}; }; @@ -248,6 +252,9 @@ struct Packet1cd __m128d v; }; +// Use the packet_traits defined in AVX/PacketMath.h instead if we're going +// to leverage AVX instructions. +#ifndef EIGEN_VECTORIZE_AVX template<> struct packet_traits > : default_packet_traits { typedef Packet1cd type; @@ -268,6 +275,7 @@ template<> struct packet_traits > : default_packet_traits HasSetLinear = 0 }; }; +#endif template<> struct unpacket_traits { typedef std::complex type; enum {size=1}; }; From 6f123d209e65a8cf324df53a9a7e87d8e4c47e9f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 27 Mar 2014 15:29:56 +0100 Subject: [PATCH 033/158] Fix geo_* unit tests with respect to AVX --- test/geo_hyperplane.cpp | 8 ++++---- test/geo_parametrizedline.cpp | 2 +- test/geo_quaternion.cpp | 14 +++++++------- test/geo_transformations.cpp | 6 +++--- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/test/geo_hyperplane.cpp b/test/geo_hyperplane.cpp index befd7d483..ed5928f10 100644 --- a/test/geo_hyperplane.cpp +++ b/test/geo_hyperplane.cpp @@ -129,9 +129,9 @@ template void hyperplane_alignment() typedef Hyperplane Plane3a; typedef Hyperplane Plane3u; - EIGEN_ALIGN16 Scalar array1[4]; - EIGEN_ALIGN16 Scalar array2[4]; - EIGEN_ALIGN16 Scalar array3[4+1]; + EIGEN_ALIGN_DEFAULT Scalar array1[4]; + EIGEN_ALIGN_DEFAULT Scalar array2[4]; + EIGEN_ALIGN_DEFAULT Scalar array3[4+1]; Scalar* array3u = array3+1; Plane3a *p1 = ::new(reinterpret_cast(array1)) Plane3a; @@ -146,7 +146,7 @@ template void hyperplane_alignment() VERIFY_IS_APPROX(p1->coeffs(), p3->coeffs()); #if defined(EIGEN_VECTORIZE) && EIGEN_ALIGN_STATICALLY - if(internal::packet_traits::Vectorizable) + if(internal::packet_traits::Vectorizable && internal::packet_traits::size<=4) VERIFY_RAISES_ASSERT((::new(reinterpret_cast(array3u)) Plane3a)); #endif } diff --git a/test/geo_parametrizedline.cpp b/test/geo_parametrizedline.cpp index 5a72b3575..427327cd7 100644 --- a/test/geo_parametrizedline.cpp +++ b/test/geo_parametrizedline.cpp @@ -86,7 +86,7 @@ template void parametrizedline_alignment() VERIFY_IS_APPROX(p1->direction(), p3->direction()); #if defined(EIGEN_VECTORIZE) && EIGEN_ALIGN_STATICALLY - if(internal::packet_traits::Vectorizable) + if(internal::packet_traits::Vectorizable && internal::packet_traits::size<=4) VERIFY_RAISES_ASSERT((::new(reinterpret_cast(array3u)) Line4a)); #endif } diff --git a/test/geo_quaternion.cpp 
b/test/geo_quaternion.cpp index 1694b32c7..de0f2aeda 100644 --- a/test/geo_quaternion.cpp +++ b/test/geo_quaternion.cpp @@ -181,9 +181,9 @@ template void mapQuaternion(void){ v1 = Vector3::Random(); Scalar a = internal::random(-Scalar(M_PI), Scalar(M_PI)); - EIGEN_ALIGN16 Scalar array1[4]; - EIGEN_ALIGN16 Scalar array2[4]; - EIGEN_ALIGN16 Scalar array3[4+1]; + EIGEN_ALIGN_DEFAULT Scalar array1[4]; + EIGEN_ALIGN_DEFAULT Scalar array2[4]; + EIGEN_ALIGN_DEFAULT Scalar array3[4+1]; Scalar* array3unaligned = array3+1; MQuaternionA mq1(array1); @@ -232,9 +232,9 @@ template void quaternionAlignment(void){ typedef Quaternion QuaternionA; typedef Quaternion QuaternionUA; - EIGEN_ALIGN16 Scalar array1[4]; - EIGEN_ALIGN16 Scalar array2[4]; - EIGEN_ALIGN16 Scalar array3[4+1]; + EIGEN_ALIGN_DEFAULT Scalar array1[4]; + EIGEN_ALIGN_DEFAULT Scalar array2[4]; + EIGEN_ALIGN_DEFAULT Scalar array3[4+1]; Scalar* arrayunaligned = array3+1; QuaternionA *q1 = ::new(reinterpret_cast(array1)) QuaternionA; @@ -248,7 +248,7 @@ template void quaternionAlignment(void){ VERIFY_IS_APPROX(q1->coeffs(), q2->coeffs()); VERIFY_IS_APPROX(q1->coeffs(), q3->coeffs()); #if defined(EIGEN_VECTORIZE) && EIGEN_ALIGN_STATICALLY - if(internal::packet_traits::Vectorizable) + if(internal::packet_traits::Vectorizable && internal::packet_traits::size<=4) VERIFY_RAISES_ASSERT((::new(reinterpret_cast(arrayunaligned)) QuaternionA)); #endif } diff --git a/test/geo_transformations.cpp b/test/geo_transformations.cpp index ee3030b5d..7d9080333 100644 --- a/test/geo_transformations.cpp +++ b/test/geo_transformations.cpp @@ -404,9 +404,9 @@ template void transform_alignment() typedef Transform Projective3a; typedef Transform Projective3u; - EIGEN_ALIGN16 Scalar array1[16]; - EIGEN_ALIGN16 Scalar array2[16]; - EIGEN_ALIGN16 Scalar array3[16+1]; + EIGEN_ALIGN_DEFAULT Scalar array1[16]; + EIGEN_ALIGN_DEFAULT Scalar array2[16]; + EIGEN_ALIGN_DEFAULT Scalar array3[16+1]; Scalar* array3u = array3+1; Projective3a *p1 = ::new(reinterpret_cast(array1)) Projective3a; From 7d73c7f18be20407ffd28f72b3275da233658f80 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 27 Mar 2014 15:38:40 +0100 Subject: [PATCH 034/158] Change abi version when enabling AVX with GCC --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index fb13769f4..838a41b79 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -199,6 +199,9 @@ if(NOT MSVC) option(EIGEN_TEST_AVX "Enable/Disable AVX in tests/examples" OFF) if(EIGEN_TEST_AVX) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx") + if(CMAKE_COMPILER_IS_GNUCXX) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fabi-version=6") + endif() message(STATUS "Enabling AVX in tests/examples") endif() From c4902a3d0182dfc9ac02a24ec2a52cd567ac0104 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 27 Mar 2014 09:34:51 -0700 Subject: [PATCH 035/158] Implemented the AVX version of the ptranspose packet primitive. 
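For context: ptranspose performs an in-register transpose of a kernel of N packets of N elements each, so that packet i ends up holding what was element i of every input packet. A scalar reference of the intended semantics (an illustrative sketch only, not the committed intrinsics path; ptranspose_ref is a hypothetical name):

    #include <algorithm>

    // Reference: transpose an N x N tile held as N rows of N scalars,
    // mirroring what the packet version does on N registers.
    template <typename Scalar, int N>
    void ptranspose_ref(Scalar kernel[N][N])
    {
      for (int i = 0; i < N; ++i)
        for (int j = i + 1; j < N; ++j)
          std::swap(kernel[i][j], kernel[j][i]);
    }

The AVX specializations below obtain the same result with unpack/shuffle/permute instructions instead of scalar swaps.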
--- Eigen/src/Core/arch/AVX/Complex.h | 25 +++++++++++++++++ Eigen/src/Core/arch/AVX/PacketMath.h | 41 ++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 4f4f6eb52..0121cec86 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -404,6 +404,31 @@ template<> EIGEN_STRONG_INLINE Packet2cd pcplxflip(const Packet2cd& x return Packet2cd(_mm256_shuffle_pd(x.v, x.v, 0x5)); } +template<> EIGEN_DEVICE_FUNC inline void +ptranspose(Kernel& kernel) { + __m256d P0 = _mm256_castps_pd(kernel.packet[0].v); + __m256d P1 = _mm256_castps_pd(kernel.packet[1].v); + __m256d P2 = _mm256_castps_pd(kernel.packet[2].v); + __m256d P3 = _mm256_castps_pd(kernel.packet[3].v); + + __m256d T0 = _mm256_shuffle_pd(P0, P1, 15); + __m256d T1 = _mm256_shuffle_pd(P0, P1, 0); + __m256d T2 = _mm256_shuffle_pd(P2, P3, 15); + __m256d T3 = _mm256_shuffle_pd(P2, P3, 0); + + kernel.packet[1].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 32)); + kernel.packet[3].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 49)); + kernel.packet[0].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 32)); + kernel.packet[2].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 49)); +} + +template<> EIGEN_DEVICE_FUNC inline void +ptranspose(Kernel& kernel) { + __m256d tmp = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 0+(2<<4)); + kernel.packet[1].v = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 1+(3<<4)); + kernel.packet[0].v = tmp; +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 98f8c5416..96a4bc08c 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -414,6 +414,47 @@ struct palign_impl } }; +template<> EIGEN_DEVICE_FUNC inline void +ptranspose(Kernel& kernel) { + __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]); + __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]); + __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]); + __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]); + __m256 T4 = _mm256_unpacklo_ps(kernel.packet[4], kernel.packet[5]); + __m256 T5 = _mm256_unpackhi_ps(kernel.packet[4], kernel.packet[5]); + __m256 T6 = _mm256_unpacklo_ps(kernel.packet[6], kernel.packet[7]); + __m256 T7 = _mm256_unpackhi_ps(kernel.packet[6], kernel.packet[7]); + __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0)); + __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2)); + __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0)); + __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2)); + __m256 S4 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(1,0,1,0)); + __m256 S5 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(3,2,3,2)); + __m256 S6 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(1,0,1,0)); + __m256 S7 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(3,2,3,2)); + kernel.packet[0] = _mm256_permute2f128_ps(S0, S4, 0x20); + kernel.packet[1] = _mm256_permute2f128_ps(S1, S5, 0x20); + kernel.packet[2] = _mm256_permute2f128_ps(S2, S6, 0x20); + kernel.packet[3] = _mm256_permute2f128_ps(S3, S7, 0x20); + kernel.packet[4] = _mm256_permute2f128_ps(S0, S4, 0x31); + kernel.packet[5] = _mm256_permute2f128_ps(S1, S5, 0x31); + kernel.packet[6] = _mm256_permute2f128_ps(S2, S6, 0x31); + kernel.packet[7] = _mm256_permute2f128_ps(S3, S7, 0x31); +} + +template<> EIGEN_DEVICE_FUNC inline void +ptranspose(Kernel& 
kernel) { + __m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15); + __m256d T1 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 0); + __m256d T2 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 15); + __m256d T3 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 0); + + kernel.packet[1] = _mm256_permute2f128_pd(T0, T2, 32); + kernel.packet[3] = _mm256_permute2f128_pd(T0, T2, 49); + kernel.packet[0] = _mm256_permute2f128_pd(T1, T3, 32); + kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49); +} + } // end namespace internal } // end namespace Eigen From b776458ccbd0b8dbde56d0d2dd0a683c6b4b0692 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 27 Mar 2014 10:02:24 -0700 Subject: [PATCH 036/158] Vectorized the packing of a row-major matrix used as the left hand side argument in a matrix-matrix product. --- .../Core/products/GeneralBlockPanelKernel.h | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index d17752489..eeeb5290f 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -954,9 +954,22 @@ EIGEN_DONT_INLINE void gemm_pack_lhs kernel; + for (int p = 0; p < PacketSize; ++p) { + kernel.packet[p] = ploadu(&lhs(i+p+m, k)); + } + ptranspose(kernel); + for (int p = 0; p < PacketSize; ++p) { + pstore(blockA+count+m+Pack1*p, cj.pconj(kernel.packet[p])); + } + } + count += PacketSize*Pack1; + } + for(; k Date: Thu, 27 Mar 2014 10:38:41 -0700 Subject: [PATCH 037/158] Vectorized the packing of a col-major matrix used as the right hand side argument in a matrix-matrix product when AVX instructions are used. No vectorization takes place when SSE instructions are used, however this doesn't seem to impact performance. --- .../Core/products/GeneralBlockPanelKernel.h | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index eeeb5290f..28c2a913e 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1033,6 +1033,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs::IsComplex && Conjugate> cj; Index packet_cols = (cols/nr) * nr; Index count = 0; + const Index peeled_k = (depth/PacketSize)*PacketSize; for(Index j2=0; j2 kernel; + for (int p = 0; p < PacketSize; ++p) { + kernel.packet[p] = ploadu(&rhs[(j2+p)*rhsStride+k]); + } + ptranspose(kernel); + for (int p = 0; p < PacketSize; ++p) { + pstoreu(blockB+count, cj.pconj(kernel.packet[p])); + count+=PacketSize; + } + } + } + for(; k Date: Thu, 27 Mar 2014 11:00:47 -0700 Subject: [PATCH 038/158] Silenced "unused variable" warnings when compiling with FMA. --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 28c2a913e..897b6a914 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -215,6 +215,7 @@ public: // (in the case where there is no FMA) gcc fails to figure out how to avoid // spilling register. 
#ifdef EIGEN_VECTORIZE_FMA
+    EIGEN_UNUSED_VARIABLE(tmp);
     c = pmadd(a,b,c);
 #else
     tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
@@ -299,6 +300,7 @@ public:
   EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
   {
 #ifdef EIGEN_VECTORIZE_FMA
+    EIGEN_UNUSED_VARIABLE(tmp);
     c.v = pmadd(a.v,b,c.v);
 #else
     tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
@@ -520,6 +522,7 @@ public:
   EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
   {
 #ifdef EIGEN_VECTORIZE_FMA
+    EIGEN_UNUSED_VARIABLE(tmp);
     c = pmadd(a,b,c);
 #else
     tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);

From 729363114f0c7ea8ff3f8ddd8f6a83335b0f3909 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Thu, 27 Mar 2014 11:20:41 -0700
Subject: [PATCH 039/158] Fixed compilation error when FMA instructions are
 enabled.

---
 Eigen/src/Core/products/GeneralBlockPanelKernel.h | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 897b6a914..cd89aed1f 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -521,12 +521,7 @@ public:
   EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
   {
-#ifdef EIGEN_VECTORIZE_FMA
-    EIGEN_UNUSED_VARIABLE(tmp);
-    c = pmadd(a,b,c);
-#else
     tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
-#endif
   }

   EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const

From 58fe2fc2b21401365ace575738d878dad21eb184 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Thu, 27 Mar 2014 23:38:50 +0100
Subject: [PATCH 040/158] enforce the use of vfmadd231ps for pmadd (gcc and
 clang stupidly generate the other fmadd variants plus some register
 moves...)

---
 Eigen/src/Core/arch/AVX/PacketMath.h | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 96a4bc08c..dceddb518 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -124,7 +124,19 @@ template<> EIGEN_STRONG_INLINE Packet8i pdiv(const Packet8i& /*a*/, co
 }

 #ifdef EIGEN_VECTORIZE_FMA
-template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) { return _mm256_fmadd_ps(a,b,c); }
+template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
+#if defined(__clang__) || defined(__GNUC__)
+  // clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
+  // and gcc stupidly generates a vfmadd132ps instruction,
+  // so let's enforce it to generate a vfmadd231ps instruction since the most common use case is to accumulate
+  // the result of the product.
+  Packet8f res = c;
+  asm("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
+  return res;
+#else
+  return _mm256_fmadd_ps(a,b,c);
+#endif
+}
 template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) { return _mm256_fmadd_pd(a,b,c); }
 #endif

From ee866790967ab4ab11a62987dd21bac66237cba9 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Thu, 27 Mar 2014 16:03:03 -0700
Subject: [PATCH 041/158] Introduced pscatter/pgather packet primitives.
They will be used to optimize the loop peeling code of the block-panel matrix multiplication kernel. --- Eigen/src/Core/GenericPacketMath.h | 6 +++++ test/packetmath.cpp | 39 ++++++++++++++++++++++++++++-- 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index f9ddf4718..03e7f410c 100755 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -217,6 +217,12 @@ template EIGEN_DEVICE_FUNC inline void pstore( /** \internal copy the packet \a from to \a *to, (un-aligned store) */ template EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from) +{ (*to) = from; } + + template EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, int /*stride*/) + { return ploadu(from); } + + template EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, int /*stride*/) { (*to) = from; } /** \internal tries to do cache prefetching of \a addr */ diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 735af7017..7deefe890 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -356,8 +356,38 @@ template void packetmath_complex() internal::pstore(pval,internal::pcplxflip(internal::pload(data1))); VERIFY(areApprox(ref, pval, PacketSize) && "pcplxflip"); } - - +} + +template void packetmath_scatter_gather() { + typedef typename internal::packet_traits::type Packet; + typedef typename NumTraits::Real RealScalar; + const int PacketSize = internal::packet_traits::size; + Scalar data1[PacketSize]; + RealScalar refvalue = 0; + for (int i=0; i()/RealScalar(PacketSize); + } + Scalar buffer[PacketSize*11]; + memset(buffer, 0, 11*sizeof(Packet)); + Packet packet = internal::pload(data1); + internal::pscatter(buffer, packet, 11); + + for (int i = 0; i < PacketSize*11; ++i) { + if ((i%11) == 0) { + VERIFY(isApproxAbs(buffer[i], data1[i/11], refvalue)); + } else { + VERIFY(isApproxAbs(buffer[i], Scalar(0), refvalue)); + } + } + + for (int i=0; i()/RealScalar(PacketSize); + } + packet = internal::pgather(buffer, 7); + internal::pstore(data1, packet); + for (int i = 0; i < PacketSize; ++i) { + VERIFY(isApproxAbs(data1[i], buffer[i*7], refvalue)); + } } void test_packetmath() @@ -378,5 +408,10 @@ void test_packetmath() CALL_SUBTEST_1( packetmath_complex >() ); CALL_SUBTEST_2( packetmath_complex >() ); + + CALL_SUBTEST_1( packetmath_scatter_gather() ); + CALL_SUBTEST_2( packetmath_scatter_gather() ); + CALL_SUBTEST_3( packetmath_scatter_gather >() ); + CALL_SUBTEST_3( packetmath_scatter_gather >() ); } } From 7f3162f7071db63bdbdc21f4c101543df00e4661 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 27 Mar 2014 17:42:25 -0700 Subject: [PATCH 042/158] Implemented the AVX version of the gather and scatter packet primitives. 
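In scalar form, the contract of the two primitives is the following (a reference sketch matching the generic fallbacks introduced in the previous patch; pgather_ref and pscatter_ref are illustrative names, not part of the API):

    // pgather packs PacketSize scalars spaced `stride` elements apart into a
    // contiguous packet; pscatter spreads a packet back out the same way.
    // With stride==1 they degenerate to an unaligned load / aligned store,
    // which is exactly what the generic fallback does.
    template <typename Scalar, int PacketSize>
    void pgather_ref(const Scalar* from, Scalar* packet, int stride)
    {
      for (int i = 0; i < PacketSize; ++i)
        packet[i] = from[i * stride];
    }

    template <typename Scalar, int PacketSize>
    void pscatter_ref(Scalar* to, const Scalar* packet, int stride)
    {
      for (int i = 0; i < PacketSize; ++i)
        to[i * stride] = packet[i];
    }

The AVX versions below emulate this with set/extract/shuffle sequences; using the true hardware gathers is left as a TODO for AVX2.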
--- Eigen/src/Core/arch/AVX/Complex.h | 39 +++++++++++++++++++++++++++- Eigen/src/Core/arch/AVX/PacketMath.h | 35 +++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 0121cec86..ec9c861f9 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -99,6 +99,29 @@ template<> EIGEN_STRONG_INLINE Packet4cf ploaddup(const std::complex< template<> EIGEN_STRONG_INLINE void pstore >(std::complex* to, const Packet4cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); } template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex* to, const Packet4cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); } +template<> EIGEN_DEVICE_FUNC inline Packet4cf pgather, Packet4cf>(const std::complex* from, int stride) +{ + return Packet4cf(_mm256_set_ps(std::imag(from[3*stride]), std::real(from[3*stride]), + std::imag(from[2*stride]), std::real(from[2*stride]), + std::imag(from[1*stride]), std::real(from[1*stride]), + std::imag(from[0*stride]), std::real(from[0*stride]))); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet4cf>(std::complex* to, const Packet4cf& from, int stride) +{ + __m128 low = _mm256_extractf128_ps(from.v, 0); + to[stride*0] = std::complex(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 0)), + _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1))); + to[stride*1] = std::complex(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 2)), + _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3))); + + __m128 high = _mm256_extractf128_ps(from.v, 1); + to[stride*2] = std::complex(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 0)), + _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1))); + to[stride*3] = std::complex(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 2)), + _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3))); + +} template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet4cf& a) { @@ -297,7 +320,21 @@ template<> EIGEN_STRONG_INLINE Packet2cd ploaddup(const std::complex< template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } -template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cd& a) +template<> EIGEN_DEVICE_FUNC inline Packet2cd pgather, Packet2cd>(const std::complex* from, int stride) +{ + return Packet2cd(_mm256_set_pd(std::imag(from[1*stride]), std::real(from[1*stride]), + std::imag(from[0*stride]), std::real(from[0*stride]))); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cd>(std::complex* to, const Packet2cd& from, int stride) +{ + __m128d low = _mm256_extractf128_pd(from.v, 0); + to[stride*0] = std::complex(_mm_cvtsd_f64(low), _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1))); + __m128d high = _mm256_extractf128_pd(from.v, 1); + to[stride*1] = std::complex(_mm_cvtsd_f64(high), _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1))); +} + +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cd& a) { __m128d low = _mm256_extractf128_pd(a.v, 0); EIGEN_ALIGN16 double res[2]; diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 96a4bc08c..aa2ac3b0b 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -179,6 +179,41 @@ template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet8f& template<> 
EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); } template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); } +// TODO: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available +template<> EIGEN_DEVICE_FUNC inline Packet8f pgather(const float* from, int stride) +{ + return _mm256_set_ps(from[7*stride], from[6*stride], from[5*stride], from[4*stride], + from[3*stride], from[2*stride], from[1*stride], from[0*stride]); +} +template<> EIGEN_DEVICE_FUNC inline Packet4d pgather(const double* from, int stride) +{ + return _mm256_set_pd(from[3*stride], from[2*stride], from[1*stride], from[0*stride]); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet8f& from, int stride) +{ + __m128 low = _mm256_extractf128_ps(from, 0); + to[stride*0] = _mm_cvtss_f32(low); + to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1)); + to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 2)); + to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3)); + + __m128 high = _mm256_extractf128_ps(from, 1); + to[stride*4] = _mm_cvtss_f32(high); + to[stride*5] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1)); + to[stride*6] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 2)); + to[stride*7] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3)); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet4d& from, int stride) +{ + __m128d low = _mm256_extractf128_pd(from, 0); + to[stride*0] = _mm_cvtsd_f64(low); + to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1)); + __m128d high = _mm256_extractf128_pd(from, 1); + to[stride*2] = _mm_cvtsd_f64(high); + to[stride*3] = _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1)); +} + template<> EIGEN_STRONG_INLINE void pstore1(float* to, const float& a) { Packet8f pa = pset1(a); From 8a94cb3edde854b89031a0e985c524f2f6bf799d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 27 Mar 2014 18:29:01 -0700 Subject: [PATCH 043/158] Implemented the SSE version of the gather and scatter packet primitives. 
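To illustrate how these primitives are meant to be used (a hypothetical helper mirroring the packetmath test, not committed code):

    #include <Eigen/Core>
    using namespace Eigen;

    // Scale every stride-th element of p by s, packet-wise where possible.
    void scale_strided(float* p, int n, int stride, float s)
    {
      typedef internal::packet_traits<float>::type Packet; // Packet4f with SSE
      const int ps = internal::packet_traits<float>::size;
      const Packet sv = internal::pset1<Packet>(s);
      int i = 0;
      for (; i + ps <= n; i += ps)
      {
        Packet v = internal::pgather<float, Packet>(p + i * stride, stride);
        internal::pscatter<float, Packet>(p + i * stride, internal::pmul(v, sv), stride);
      }
      for (; i < n; ++i) // scalar tail
        p[i * stride] *= s;
    }

On plain SSE the gather/scatter are emulated with set/extract sequences, so this mainly pays off when the per-element work is heavier than in this toy example.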
--- Eigen/src/Core/GenericPacketMath.h | 2 +- Eigen/src/Core/arch/SSE/Complex.h | 18 +++++++++++++++ Eigen/src/Core/arch/SSE/PacketMath.h | 33 ++++++++++++++++++++++++++++ test/packetmath.cpp | 1 + 4 files changed, 53 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 03e7f410c..82eeeed4a 100755 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -223,7 +223,7 @@ template EIGEN_DEVICE_FUNC inline void pstoreu { return ploadu(from); } template EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, int /*stride*/) -{ (*to) = from; } + { pstore(to, from); } /** \internal tries to do cache prefetching of \a addr */ template inline void prefetch(const Scalar* addr) diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index b92919b75..86b90b2ee 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -111,6 +111,24 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex< template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); } template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); } + +template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, int stride) +{ + return Packet2cf(_mm_set_ps(std::imag(from[1*stride]), std::real(from[1*stride]), + std::imag(from[0*stride]), std::real(from[0*stride]))); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, int stride) +{ + /* for (int i = 0; i < 2; i+=2) { + to[stride*i] = std::complex(from.v[i], from.v[i+1]); + }*/ + to[stride*0] = std::complex(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 0)), + _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 1))); + to[stride*1] = std::complex(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 2)), + _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3))); +} + template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 937f63f88..a98ca6bea 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -339,6 +339,39 @@ template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast(to), _mm_castps_pd(from)); } template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast(to), _mm_castsi128_pd(from)); } +template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, int stride) +{ + return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]); +} +template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, int stride) +{ + return _mm_set_pd(from[1*stride], from[0*stride]); +} +template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, int stride) +{ + return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]); + } + +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, int 
stride) +{ + to[stride*0] = _mm_cvtss_f32(from); + to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1)); + to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2)); + to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3)); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, int stride) +{ + to[stride*0] = _mm_cvtsd_f64(from); + to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1)); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet4i& from, int stride) +{ + to[stride*0] = _mm_cvtsi128_si32(from); + to[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1)); + to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2)); + to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3)); +} + // some compilers might be tempted to perform multiple moves instead of using a vector path. template<> EIGEN_STRONG_INLINE void pstore1(float* to, const float& a) { diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 7deefe890..317b1c8fe 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -411,6 +411,7 @@ void test_packetmath() CALL_SUBTEST_1( packetmath_scatter_gather() ); CALL_SUBTEST_2( packetmath_scatter_gather() ); + CALL_SUBTEST_3( packetmath_scatter_gather() ); CALL_SUBTEST_3( packetmath_scatter_gather >() ); CALL_SUBTEST_3( packetmath_scatter_gather >() ); } From c94fde118a2b6b2b415a7860ba066ba240bc8ac3 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 28 Mar 2014 09:11:06 +0100 Subject: [PATCH 044/158] Enable vectorization of gemv for PacketSize>4 through unaligned loads (still better than no vectorization) --- Eigen/src/Core/products/GeneralMatrixVector.h | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index 59fc54c4e..340c51394 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -136,12 +136,17 @@ EIGEN_DONT_INLINE void general_matrix_vector_product 4. - if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) || LhsPacketSize > 4) + if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) ) { alignedSize = 0; alignedStart = 0; } + else if(LhsPacketSize > 4) + { + // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4. + // Currently, it seems to be better to perform unaligned loads anyway + alignmentPattern = NoneAligned; + } else if (LhsPacketSize>1) { eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size 4. - if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) || - (LhsPacketSize > 4)) + if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) ) { alignedSize = 0; alignedStart = 0; } + else if(LhsPacketSize > 4) + { + // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4. 
+    alignmentPattern = NoneAligned;
+  }
   else if (LhsPacketSize>1)
   {
     eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth Date: Fri, 28 Mar 2014 10:18:04 +0100
Subject: [PATCH 045/158] Add a mechanism to recursively access half-size
 packet types

---
 Eigen/src/Core/GenericPacketMath.h       | 6 +++++-
 Eigen/src/Core/arch/AVX/Complex.h        | 4 ++++
 Eigen/src/Core/arch/AVX/PacketMath.h     | 4 ++++
 Eigen/src/Core/arch/AltiVec/Complex.h    | 1 +
 Eigen/src/Core/arch/AltiVec/PacketMath.h | 2 ++
 Eigen/src/Core/arch/NEON/Complex.h       | 1 +
 Eigen/src/Core/arch/NEON/PacketMath.h    | 2 ++
 Eigen/src/Core/arch/SSE/Complex.h        | 4 ++++
 Eigen/src/Core/arch/SSE/PacketMath.h     | 5 +++++
 9 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 82eeeed4a..3e5db1a88 100755
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -42,6 +42,8 @@ namespace internal {
 struct default_packet_traits
 {
   enum {
+    HasHalfPacket = 0,
+
     HasAdd = 1,
     HasSub = 1,
     HasMul = 1,
@@ -71,10 +73,12 @@ struct default_packet_traits
 template struct packet_traits : default_packet_traits
 {
   typedef T type;
+  typedef T half;
   enum {
     Vectorizable = 0,
     size = 1,
-    AlignedOnScalar = 0
+    AlignedOnScalar = 0,
+    HasHalfPacket = 0
   };
   enum {
     HasAdd = 0,

diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h
index ec9c861f9..7c1947a4f 100644
--- a/Eigen/src/Core/arch/AVX/Complex.h
+++ b/Eigen/src/Core/arch/AVX/Complex.h
@@ -25,10 +25,12 @@ struct Packet4cf
 template<> struct packet_traits > : default_packet_traits
 {
   typedef Packet4cf type;
+  typedef Packet2cf half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 4,
+    HasHalfPacket = 1,

     HasAdd = 1,
     HasSub = 1,
@@ -257,10 +259,12 @@ struct Packet2cd
 template<> struct packet_traits > : default_packet_traits
 {
   typedef Packet2cd type;
+  typedef Packet1cd half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 0,
     size = 2,
+    HasHalfPacket = 1,

     HasAdd = 1,
     HasSub = 1,

diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 6d5a5f53f..132c1abe3 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -40,10 +40,12 @@ template<> struct is_arithmetic<__m256d> { enum { value = true }; };
 template<> struct packet_traits : default_packet_traits
 {
   typedef Packet8f type;
+  typedef Packet4f half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size=8,
+    HasHalfPacket = 1,

     HasDiv = 1,
     HasSin = 0,
@@ -56,10 +58,12 @@ template<> struct packet_traits : default_packet_traits
 template<> struct packet_traits : default_packet_traits
 {
   typedef Packet4d type;
+  typedef Packet2d half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size=4,
+    HasHalfPacket = 1,

     HasDiv = 1,
     HasExp = 0

diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h
index 68d9a2bff..db52074f4 100644
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -33,6 +33,7 @@ struct Packet2cf
 template<> struct packet_traits > : default_packet_traits
 {
   typedef Packet2cf type;
+  typedef Packet2cf half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,

diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index 45d1954f7..5d7a16f5c 100755
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -73,6 +73,7 @@ static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)
 template<> struct
packet_traits : default_packet_traits { typedef Packet4f type; + typedef Packet4f half; enum { Vectorizable = 1, AlignedOnScalar = 1, @@ -89,6 +90,7 @@ template<> struct packet_traits : default_packet_traits template<> struct packet_traits : default_packet_traits { typedef Packet4i type; + typedef Packet4i half; enum { // FIXME check the Has* Vectorizable = 1, diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index 8d9255eef..e49c1a873 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -28,6 +28,7 @@ struct Packet2cf template<> struct packet_traits > : default_packet_traits { typedef Packet2cf type; + typedef Packet2cf half; enum { Vectorizable = 1, AlignedOnScalar = 1, diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 05e891df2..fae7b55fc 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -66,6 +66,7 @@ typedef uint32x4_t Packet4ui; template<> struct packet_traits : default_packet_traits { typedef Packet4f type; + typedef Packet4f half; enum { Vectorizable = 1, AlignedOnScalar = 1, @@ -83,6 +84,7 @@ template<> struct packet_traits : default_packet_traits template<> struct packet_traits : default_packet_traits { typedef Packet4i type; + typedef Packet4i half; enum { Vectorizable = 1, AlignedOnScalar = 1, diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 86b90b2ee..694979e19 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -28,10 +28,12 @@ struct Packet2cf template<> struct packet_traits > : default_packet_traits { typedef Packet2cf type; + typedef Packet2cf half; enum { Vectorizable = 1, AlignedOnScalar = 1, size = 2, + HasHalfPacket = 0, HasAdd = 1, HasSub = 1, @@ -276,10 +278,12 @@ struct Packet1cd template<> struct packet_traits > : default_packet_traits { typedef Packet1cd type; + typedef Packet1cd half; enum { Vectorizable = 1, AlignedOnScalar = 0, size = 1, + HasHalfPacket = 0, HasAdd = 1, HasSub = 1, diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index a98ca6bea..ea05a3415 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -64,10 +64,12 @@ template<> struct is_arithmetic<__m128d> { enum { value = true }; }; template<> struct packet_traits : default_packet_traits { typedef Packet4f type; + typedef Packet4f half; enum { Vectorizable = 1, AlignedOnScalar = 1, size=4, + HasHalfPacket = 0, HasDiv = 1, HasSin = EIGEN_FAST_MATH, @@ -80,10 +82,12 @@ template<> struct packet_traits : default_packet_traits template<> struct packet_traits : default_packet_traits { typedef Packet2d type; + typedef Packet2d half; enum { Vectorizable = 1, AlignedOnScalar = 1, size=2, + HasHalfPacket = 0, HasDiv = 1, HasExp = 1, @@ -94,6 +98,7 @@ template<> struct packet_traits : default_packet_traits template<> struct packet_traits : default_packet_traits { typedef Packet4i type; + typedef Packet4i half; enum { // FIXME check the Has* Vectorizable = 1, From 39bfbd43f05691874a78a5a6bf4504cf0e6ff452 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 28 Mar 2014 12:00:08 -0700 Subject: [PATCH 046/158] Properly align the input data to prevent false failures of the packetmath.cpp test. 
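The root cause, sketched (illustrative, not committed code): internal::pload maps to an aligned load, e.g. _mm256_load_ps under AVX, which requires the pointer to be aligned on the packet size. A plain stack array, or one declared with EIGEN_ALIGN16, only guarantees up to 16-byte alignment, so a 32-byte AVX load from it can fault or silently misbehave:

    EIGEN_ALIGN_DEFAULT float good[8];  // aligned on the widest packet size (32 bytes with AVX)
    float risky[8];                     // alignment unspecified beyond sizeof(float)
    typedef internal::packet_traits<float>::type Packet;
    // internal::pload<Packet>(good);   // fine
    // internal::pload<Packet>(risky);  // undefined behavior when Packet is Packet8f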
--- test/packetmath.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index 317b1c8fe..08566faf8 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -362,12 +362,12 @@ template void packetmath_scatter_gather() {
   typedef typename internal::packet_traits::type Packet;
   typedef typename NumTraits::Real RealScalar;
   const int PacketSize = internal::packet_traits::size;
-  Scalar data1[PacketSize];
+  EIGEN_ALIGN_DEFAULT Scalar data1[PacketSize];
   RealScalar refvalue = 0;
   for (int i=0; i()/RealScalar(PacketSize);
   }
-  Scalar buffer[PacketSize*11];
+  EIGEN_ALIGN_DEFAULT Scalar buffer[PacketSize*11];
   memset(buffer, 0, 11*sizeof(Packet));
   Packet packet = internal::pload(data1);
   internal::pscatter(buffer, packet, 11);

From ad59ade116969ca7b18409d690caf00c0b1c34c7 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Fri, 28 Mar 2014 12:11:23 -0700
Subject: [PATCH 047/158] Vectorized the loop peeling of the inner loop of the
 block-panel matrix multiplication code. This speeds up the multiplication of
 matrices whose size is not a multiple of the packet size.

---
 .../Core/products/GeneralBlockPanelKernel.h   | 222 ++++++++++++------
 1 file changed, 156 insertions(+), 66 deletions(-)

diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 0f47f6de5..3ed1fc5a3 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -206,6 +206,11 @@ public:
     dest = pload(a);
   }

+  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
+  {
+    dest = ploadu(a);
+  }
+
   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, AccPacket& tmp) const
   {
     // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
@@ -278,7 +283,12 @@ public:
   {
     dest = pload(a);
   }
-
+  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
+  {
+    dest = ploadu(a);
+  }
+
   EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
   {
     pbroadcast4(b, b0, b1, b2, b3);
@@ -334,7 +344,9 @@ public:
       && packet_traits::Vectorizable,
     RealPacketSize = Vectorizable ? packet_traits::size : 1,
     ResPacketSize = Vectorizable ? packet_traits::size : 1,
-
+    LhsPacketSize = Vectorizable ? packet_traits::size : 1,
+    RhsPacketSize = Vectorizable ? packet_traits::size : 1,
+
     // FIXME: should depend on NumberOfRegisters
     nr = 4,
     mr = ResPacketSize,
@@ -402,6 +414,11 @@ public:
     dest = pload((const typename unpacket_traits::type*)(a));
   }

+  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
+  {
+    dest = ploadu((const typename unpacket_traits::type*)(a));
+  }
+
   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacket& c, RhsPacket& /*tmp*/) const
   {
     c.first = padd(pmul(a,b.first), c.first);
@@ -509,6 +526,11 @@ public:
     dest = ploaddup(a);
   }

+  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
+  {
+    dest = ploaddup(a);
+  }
+
   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
   {
     madd_impl(a, b, c, tmp, typename conditional::type());
@@ -706,49 +728,84 @@ void gebp_kernel
         const LhsScalar* blA = &blockA[i*strideA+offsetA];
         prefetch(&blA[0]);

-        // gets a 1 x 8 res block as registers
-        ResScalar C0(0), C1(0), C2(0), C3(0), C4(0), C5(0), C6(0), C7(0);
         // FIXME directly use blockB ???
const RhsScalar* blB = &blockB[j2*strideB+offsetB*8];
-        // TODO peel this loop
-        for(Index k=0; k SwappedTraits;
+          typedef typename SwappedTraits::ResScalar SResScalar;
+          typedef typename SwappedTraits::LhsPacket SLhsPacket;
+          typedef typename SwappedTraits::RhsPacket SRhsPacket;
+          typedef typename SwappedTraits::ResPacket SResPacket;
+          typedef typename SwappedTraits::AccPacket SAccPacket;
+          SwappedTraits straits;
+
+          SAccPacket C0;
+          straits.initAcc(C0);
+          for(Index k=0; k(&res[j2*resStride + i], resStride);
+          SResPacket alphav = pset1(alpha);
+          straits.acc(C0, alphav, R);
+          pscatter(&res[j2*resStride + i], R, resStride);
+
+          EIGEN_ASM_COMMENT("end_vectorized_multiplication_of_last_rows");
+        }
+        else
+        {
+          // gets a 1 x 8 res block as registers
+          ResScalar C0(0), C1(0), C2(0), C3(0), C4(0), C5(0), C6(0), C7(0);
+
+          for(Index k=0; k
        const LhsScalar* blA = &blockA[i*strideA+offsetA];
        prefetch(&blA[0]);

-        // gets a 1 x 4 res block as registers
-        ResScalar C0(0), C1(0), C2(0), C3(0);
        // FIXME directly use blockB ???
        const RhsScalar* blB = &blockB[j2*strideB+offsetB*4];
-        // TODO peel this loop
-        for(Index k=0; k SwappedTraits;
+          typedef typename SwappedTraits::ResScalar SResScalar;
+          typedef typename SwappedTraits::LhsPacket SLhsPacket;
+          typedef typename SwappedTraits::RhsPacket SRhsPacket;
+          typedef typename SwappedTraits::ResPacket SResPacket;
+          typedef typename SwappedTraits::AccPacket SAccPacket;
+          SwappedTraits straits;
+
+          SAccPacket C0;
+          straits.initAcc(C0);
+          for(Index k=0; k(&res[j2*resStride + i], resStride);
+          SResPacket alphav = pset1(alpha);
+          straits.acc(C0, alphav, R);
+          pscatter(&res[j2*resStride + i], R, resStride);
+
+          EIGEN_ASM_COMMENT("end_vectorized_multiplication_of_last_rows");
+        } else {
+          // gets a 1 x 4 res block as registers
+          ResScalar C0(0), C1(0), C2(0), C3(0);
+
+          for(Index k=0; k Date: Sun, 30 Mar 2014 21:57:05 +0200
Subject: [PATCH 048/158] Optimize gebp kernel: 1 - increase peeling level
 along the depth dimension (+5% for large matrices, i.e., >1000) 2 - improve
 pipelining when dealing with the last rows of the lhs

---
 .../Core/products/GeneralBlockPanelKernel.h | 454 +++++++++++-------
 1 file changed, 271 insertions(+), 183 deletions(-)

diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 3ed1fc5a3..a9e42c8aa 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -95,6 +95,9 @@ void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
   k = std::min(k, l1/kdiv);
   SizeType _m = k>0 ?
l2/(4 * sizeof(LhsScalar) * k) : 0; if(_m @@ -328,6 +331,22 @@ protected: conj_helper cj; }; +template +struct DoublePacket +{ + Packet first; + Packet second; +}; + +template +DoublePacket padd(const DoublePacket &a, const DoublePacket &b) +{ + DoublePacket res; + res.first = padd(a.first, b.first); + res.second = padd(a.second,b.second); + return res; +} + template class gebp_traits, std::complex, _ConjLhs, _ConjRhs > { @@ -357,20 +376,16 @@ public: typedef typename packet_traits::type RealPacket; typedef typename packet_traits::type ScalarPacket; - struct DoublePacket - { - RealPacket first; - RealPacket second; - }; + typedef DoublePacket DoublePacketType; typedef typename conditional::type LhsPacket; - typedef typename conditional::type RhsPacket; + typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; - typedef typename conditional::type AccPacket; + typedef typename conditional::type AccPacket; EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); } - EIGEN_STRONG_INLINE void initAcc(DoublePacket& p) + EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p) { p.first = pset1(RealScalar(0)); p.second = pset1(RealScalar(0)); @@ -383,7 +398,7 @@ public: } // Vectorized path - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const { dest.first = pset1(real(*b)); dest.second = pset1(imag(*b)); @@ -393,7 +408,7 @@ public: void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3); // Vectorized path - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacket& b0, DoublePacket& b1) + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1) { // FIXME not sure that's the best way to implement it! loadRhs(b+0, b0); @@ -419,7 +434,7 @@ public: dest = ploadu((const typename unpacket_traits::type*)(a)); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacket& c, RhsPacket& /*tmp*/) const + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const { c.first = padd(pmul(a,b.first), c.first); c.second = padd(pmul(a,b.second),c.second); @@ -432,7 +447,7 @@ public: EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; } - EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacket& alpha, ResPacket& r) const + EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const { // assemble c ResPacket tmp; @@ -571,6 +586,14 @@ struct gebp_kernel typedef typename Traits::RhsPacket RhsPacket; typedef typename Traits::ResPacket ResPacket; typedef typename Traits::AccPacket AccPacket; + + typedef gebp_traits SwappedTraits; + typedef typename SwappedTraits::ResScalar SResScalar; + typedef typename SwappedTraits::LhsPacket SLhsPacket; + typedef typename SwappedTraits::RhsPacket SRhsPacket; + typedef typename SwappedTraits::ResPacket SResPacket; + typedef typename SwappedTraits::AccPacket SAccPacket; + enum { Vectorizable = Traits::Vectorizable, @@ -591,6 +614,7 @@ void gebp_kernel Index strideA, Index strideB, Index offsetA, Index offsetB) { Traits traits; + SwappedTraits straits; if(strideA==-1) strideA = depth; if(strideB==-1) strideB = depth; @@ -599,7 +623,9 @@ void gebp_kernel Index packet_cols4 = nr>=4 ? 
(cols/4) * 4 : 0; // Here we assume that mr==LhsProgress const Index peeled_mc = (rows/mr)*mr; - const Index peeled_kc = (depth/4)*4; + enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell) + const Index peeled_kc = depth & ~(pk-1); + const Index depth2 = depth & ~1; // loops on each micro vertical panel of rhs (depth x nr) // First pass using depth x 8 panels @@ -634,14 +660,14 @@ void gebp_kernel // uncomment for register prefetching // LhsPacket A1; // traits.loadLhs(blA, A0); - for(Index k=0; k EIGEN_GEBGP_ONESTEP8(1,A0,A1); EIGEN_GEBGP_ONESTEP8(2,A1,A0); EIGEN_GEBGP_ONESTEP8(3,A0,A1); + EIGEN_GEBGP_ONESTEP8(4,A1,A0); + EIGEN_GEBGP_ONESTEP8(5,A0,A1); + EIGEN_GEBGP_ONESTEP8(6,A1,A0); + EIGEN_GEBGP_ONESTEP8(7,A0,A1); - blB += 4*8*RhsProgress; - blA += 4*mr; + blB += pk*8*RhsProgress; + blA += pk*mr; } // process remaining peeled loop for(Index k=peeled_kc; k pstoreu(r0+5*resStride, R5); pstoreu(r0+6*resStride, R6); pstoreu(r0+7*resStride, R0); - } - for(Index i=peeled_mc; i SwappedTraits; - typedef typename SwappedTraits::ResScalar SResScalar; - typedef typename SwappedTraits::LhsPacket SLhsPacket; - typedef typename SwappedTraits::RhsPacket SRhsPacket; - typedef typename SwappedTraits::ResPacket SResPacket; - typedef typename SwappedTraits::AccPacket SAccPacket; - SwappedTraits straits; - - SAccPacket C0; - straits.initAcc(C0); - for(Index k=0; k(&res[j2*resStride + i], resStride); - SResPacket alphav = pset1(alpha); - straits.acc(C0, alphav, R); - pscatter(&res[j2*resStride + i], R, resStride); - - EIGEN_ASM_COMMENT("end_vectorized_multiplication_of_last_rows"); - } - else - { - // gets a 1 x 8 res block as registers - ResScalar C0(0), C1(0), C2(0), C3(0), C4(0), C5(0), C6(0), C7(0); - - for(Index k=0; k SwappedTraits; + typedef typename SwappedTraits::ResScalar SResScalar; + typedef typename SwappedTraits::LhsPacket SLhsPacket; + typedef typename SwappedTraits::RhsPacket SRhsPacket; + typedef typename SwappedTraits::ResPacket SResPacket; + typedef typename SwappedTraits::AccPacket SAccPacket; + SwappedTraits straits; + + Index rows2 = (rows & ~1); + for(Index i=peeled_mc; i(&res[j2*resStride + i], resStride); + SResPacket alphav = pset1(alpha); + straits.acc(padd(C0,C1), alphav, R); + pscatter(&res[j2*resStride + i], R, resStride); + + R = pgather(&res[j2*resStride + i + 1], resStride); + straits.acc(padd(C2,C3), alphav, R); + pscatter(&res[j2*resStride + i + 1], R, resStride); + + EIGEN_ASM_COMMENT("end_vectorized_multiplication_of_last_rows 8"); + } + if(rows2!=rows) + { + Index i = rows-1; + const LhsScalar* blA = &blockA[i*strideA+offsetA]; + const RhsScalar* blB = &blockB[j2*strideB+offsetB*8]; + + EIGEN_ASM_COMMENT("begin_vectorized_multiplication_of_last_rows 8"); + + SAccPacket C0,C1; + straits.initAcc(C0); // even + straits.initAcc(C1); // odd + + for(Index k=0; k(&res[j2*resStride + i], resStride); + SResPacket alphav = pset1(alpha); + straits.acc(padd(C0,C1), alphav, R); + pscatter(&res[j2*resStride + i], R, resStride); + } + } + else + { + // Pure scalar path + for(Index i=peeled_mc; i=4) { for(Index j2=packet_cols8; j2 for(Index i=0; i // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB*4]; LhsPacket A0; - // uncomment for register prefetching - // LhsPacket A1; - // traits.loadLhs(blA, A0); - for(Index k=0; k traits.broadcastRhs(&blB[2+4*K*RhsProgress], B_0, B1); \ traits.madd(A0, B_0,C2, B_0); \ traits.madd(A0, B1, C3, B1) - + EIGEN_GEBGP_ONESTEP4(0); EIGEN_GEBGP_ONESTEP4(1); 
EIGEN_GEBGP_ONESTEP4(2); EIGEN_GEBGP_ONESTEP4(3); + EIGEN_GEBGP_ONESTEP4(4); + EIGEN_GEBGP_ONESTEP4(5); + EIGEN_GEBGP_ONESTEP4(6); + EIGEN_GEBGP_ONESTEP4(7); - blB += 4*4*RhsProgress; - blA += 4*mr; + blB += pk*4*RhsProgress; + blA += pk*mr; } - // process remaining peeled loop + // process remaining of peeled loop for(Index k=peeled_kc; k for(Index i=peeled_mc; i SwappedTraits; - typedef typename SwappedTraits::ResScalar SResScalar; - typedef typename SwappedTraits::LhsPacket SLhsPacket; - typedef typename SwappedTraits::RhsPacket SRhsPacket; - typedef typename SwappedTraits::ResPacket SResPacket; - typedef typename SwappedTraits::AccPacket SAccPacket; - SwappedTraits straits; - - SAccPacket C0; - straits.initAcc(C0); - for(Index k=0; k(&res[j2*resStride + i], resStride); - SResPacket alphav = pset1(alpha); - straits.acc(C0, alphav, R); - pscatter(&res[j2*resStride + i], R, resStride); - - EIGEN_ASM_COMMENT("end_vectorized_multiplication_of_last_rows"); - } else { - // gets a 1 x 4 res block as registers - ResScalar C0(0), C1(0), C2(0), C3(0); + SLhsPacket A0; + straits.loadLhsUnaligned(blB, A0); + SRhsPacket B_0; + straits.loadRhs(&blA[k], B_0); + SRhsPacket T0; + straits.madd(A0,B_0,C0,T0); + blB += 4; + } + SResPacket R = pgather(&res[j2*resStride + i], resStride); + SResPacket alphav = pset1(alpha); + straits.acc(C0, alphav, R); + pscatter(&res[j2*resStride + i], R, resStride); + + EIGEN_ASM_COMMENT("end_vectorized_multiplication_of_last_rows 1x4"); + } + else + { + // Pure scalar path + // gets a 1 x 4 res block as registers + ResScalar C0(0), C1(0), C2(0), C3(0); - for(Index k=0; k do the same but with nr==1 for(Index j2=packet_cols4; j2 traits.acc(C0, alphav, R0); pstoreu(r0, R0); } + // pure scalar path for(Index i=peeled_mc; i Date: Sun, 30 Mar 2014 22:43:47 +0200 Subject: [PATCH 049/158] Workaround alignment warnings --- Eigen/src/Core/arch/AVX/Complex.h | 32 +++++++++++----------------- Eigen/src/Core/arch/AVX/PacketMath.h | 7 ++++-- Eigen/src/Core/arch/SSE/Complex.h | 9 +++----- 3 files changed, 20 insertions(+), 28 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 7c1947a4f..cb16180c5 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -87,15 +87,10 @@ template<> EIGEN_STRONG_INLINE Packet4cf pset1(const std::complex EIGEN_STRONG_INLINE Packet4cf ploaddup(const std::complex* from) { - // This should be optimized. 
- __m128 complex1 = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)from); - complex1 = _mm_movelh_ps(complex1, complex1); - __m128 complex2 = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from+1)); - complex2 = _mm_movelh_ps(complex2, complex2); - __m256 result = _mm256_setzero_ps(); - result = _mm256_insertf128_ps(result, complex1, 0); - result = _mm256_insertf128_ps(result, complex2, 1); - return Packet4cf(result); + // FIXME The following might be optimized using _mm256_movedup_pd + Packet2cf a = ploaddup(from); + Packet2cf b = ploaddup(from+1); + return Packet4cf(_mm256_insertf128_ps(_mm256_castps128_ps256(a.v), b.v, 1)); } template<> EIGEN_STRONG_INLINE void pstore >(std::complex* to, const Packet4cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); } @@ -104,33 +99,30 @@ template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex EIGEN_DEVICE_FUNC inline Packet4cf pgather, Packet4cf>(const std::complex* from, int stride) { return Packet4cf(_mm256_set_ps(std::imag(from[3*stride]), std::real(from[3*stride]), - std::imag(from[2*stride]), std::real(from[2*stride]), - std::imag(from[1*stride]), std::real(from[1*stride]), - std::imag(from[0*stride]), std::real(from[0*stride]))); + std::imag(from[2*stride]), std::real(from[2*stride]), + std::imag(from[1*stride]), std::real(from[1*stride]), + std::imag(from[0*stride]), std::real(from[0*stride]))); } template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet4cf>(std::complex* to, const Packet4cf& from, int stride) { __m128 low = _mm256_extractf128_ps(from.v, 0); to[stride*0] = std::complex(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 0)), - _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1))); + _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1))); to[stride*1] = std::complex(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 2)), - _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3))); + _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3))); __m128 high = _mm256_extractf128_ps(from.v, 1); to[stride*2] = std::complex(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 0)), - _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1))); + _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1))); to[stride*3] = std::complex(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 2)), - _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3))); + _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3))); } template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet4cf& a) { - __m128 low = _mm256_extractf128_ps(a.v, 0); - std::complex res; - _mm_storel_pi((__m64*)&res, low); - return res; + return pfirst(Packet2cf(_mm256_castps256_ps128(a.v))); } template<> EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) { diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 132c1abe3..38f52ecc8 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -173,6 +173,7 @@ template<> EIGEN_STRONG_INLINE Packet8i ploadu(const int* from) { EIGE // Loads 4 floats from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3, a3} template<> EIGEN_STRONG_INLINE Packet8f ploaddup(const float* from) { + // FIXME we should only load the first 128bits Packet8f tmp = ploadu(from); Packet8f tmp1 = _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2)); Packet8f tmp2 = _mm256_permute_ps(tmp, _MM_SHUFFLE(1,1,0,0)); @@ -181,6 +182,7 @@ template<> EIGEN_STRONG_INLINE Packet8f ploaddup(const float* from) // Loads 2 doubles from memory a returns the packet {a0, a0 a1, a1} template<> EIGEN_STRONG_INLINE Packet4d ploaddup(const double* from) { + // FIXME we should only load the first 128bits Packet4d tmp = 
ploadu(from); Packet4d tmp1 = _mm256_permute_pd(tmp,0); Packet4d tmp2 = _mm256_permute_pd(tmp,3); @@ -195,11 +197,12 @@ template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet8f& template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); } template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); } -// TODO: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available +// NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available +// NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4); template<> EIGEN_DEVICE_FUNC inline Packet8f pgather(const float* from, int stride) { return _mm256_set_ps(from[7*stride], from[6*stride], from[5*stride], from[4*stride], - from[3*stride], from[2*stride], from[1*stride], from[0*stride]); + from[3*stride], from[2*stride], from[1*stride], from[0*stride]); } template<> EIGEN_DEVICE_FUNC inline Packet4d pgather(const double* from, int stride) { diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 694979e19..e54ebbf90 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -117,18 +117,15 @@ template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, int stride) { return Packet2cf(_mm_set_ps(std::imag(from[1*stride]), std::real(from[1*stride]), - std::imag(from[0*stride]), std::real(from[0*stride]))); + std::imag(from[0*stride]), std::real(from[0*stride]))); } template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, int stride) { - /* for (int i = 0; i < 2; i+=2) { - to[stride*i] = std::complex(from.v[i], from.v[i+1]); - }*/ to[stride*0] = std::complex(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 0)), - _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 1))); + _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 1))); to[stride*1] = std::complex(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 2)), - _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3))); + _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3))); } template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } From 82c81630679b44bd1a3f7842152f12179428c9f7 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 31 Mar 2014 10:41:40 +0200 Subject: [PATCH 050/158] Enable repetition in mixing type unit test --- test/mixingtypes.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/test/mixingtypes.cpp b/test/mixingtypes.cpp index 6c2f74875..ada2f69d3 100644 --- a/test/mixingtypes.cpp +++ b/test/mixingtypes.cpp @@ -126,7 +126,9 @@ template void mixingtypes(int size = SizeAtCompileType) void test_mixingtypes() { - CALL_SUBTEST_1(mixingtypes<3>()); - CALL_SUBTEST_2(mixingtypes<4>()); - CALL_SUBTEST_3(mixingtypes(internal::random(1,EIGEN_TEST_MAX_SIZE))); + for(int i = 0; i < g_repeat; i++) { + CALL_SUBTEST_1(mixingtypes<3>()); + CALL_SUBTEST_2(mixingtypes<4>()); + CALL_SUBTEST_3(mixingtypes(internal::random(1,EIGEN_TEST_MAX_SIZE))); + } } From 8d0441052e7fac530fad12016f53f5b234a68d47 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 31 Mar 2014 10:42:19 +0200 Subject: [PATCH 051/158] Finally, prefetching seems to help getting 
more stable performance --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index a9e42c8aa..d9e659c9a 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -639,7 +639,7 @@ void gebp_kernel for(Index i=0; i for(Index i=peeled_mc; i { Index i = rows-1; const LhsScalar* blA = &blockA[i*strideA+offsetA]; + prefetch(&blA[0]); const RhsScalar* blB = &blockB[j2*strideB+offsetB*8]; EIGEN_ASM_COMMENT("begin_vectorized_multiplication_of_last_rows 8"); @@ -863,6 +865,7 @@ void gebp_kernel for(Index i=peeled_mc; i for(Index i=0; i for(Index i=peeled_mc; i traits.initAcc(C0); const LhsScalar* blA = &blockA[i*strideA+offsetA*mr]; + prefetch(&blA[0]); const RhsScalar* blB = &blockB[j2*strideB+offsetB]; for(Index k=0; k for(Index i=peeled_mc; i Date: Mon, 31 Mar 2014 10:58:30 +0200 Subject: [PATCH 052/158] BTL: fix warnings and extend to 5k matrices, update GotoBlas to OpenBlas, etc. --- bench/bench_gemm.cpp | 16 ++++-- bench/btl/CMakeLists.txt | 1 + bench/btl/actions/action_axpby.hh | 2 +- bench/btl/actions/action_axpy.hh | 2 +- bench/btl/cmake/FindATLAS.cmake | 13 ++--- bench/btl/cmake/FindGOTO.cmake | 15 ------ bench/btl/cmake/FindGOTO2.cmake | 25 ---------- bench/btl/cmake/FindOpenBLAS.cmake | 17 +++++++ bench/btl/data/action_settings.txt | 32 ++++++------ bench/btl/data/perlib_plot_settings.txt | 4 +- bench/btl/generic_bench/bench_parameter.hh | 4 +- bench/btl/generic_bench/btl.hh | 4 +- bench/btl/generic_bench/init/init_function.hh | 8 +-- bench/btl/generic_bench/utils/size_lin_log.hh | 2 +- bench/btl/libs/BLAS/CMakeLists.txt | 29 +++-------- bench/btl/libs/BLAS/blas_interface_impl.hh | 6 +-- bench/btl/libs/BLAS/c_interface_base.h | 6 +-- bench/btl/libs/STL/STL_interface.hh | 4 +- bench/btl/libs/eigen3/eigen3_interface.hh | 50 +++++++++---------- 19 files changed, 104 insertions(+), 136 deletions(-) delete mode 100644 bench/btl/cmake/FindGOTO.cmake delete mode 100644 bench/btl/cmake/FindGOTO2.cmake create mode 100644 bench/btl/cmake/FindOpenBLAS.cmake diff --git a/bench/bench_gemm.cpp b/bench/bench_gemm.cpp index 41ca8b3b6..1ef2e72c2 100644 --- a/bench/bench_gemm.cpp +++ b/bench/bench_gemm.cpp @@ -129,13 +129,25 @@ int main(int argc, char ** argv) int tries = 2; // number of tries, we keep the best int s = 2048; + int m = s; + int n = s; + int p = s; int cache_size = -1; bool need_help = false; for (int i=1; i0) setCpuCacheSizes(cache_size,96*cache_size); - int m = s; - int n = s; - int p = s; + A a(m,p); a.setRandom(); B b(p,n); b.setRandom(); C c(m,n); c.setOnes(); diff --git a/bench/btl/CMakeLists.txt b/bench/btl/CMakeLists.txt index 0ac5c15fc..b299d9899 100644 --- a/bench/btl/CMakeLists.txt +++ b/bench/btl/CMakeLists.txt @@ -104,6 +104,7 @@ add_subdirectory(libs/mtl4) add_subdirectory(libs/blitz) add_subdirectory(libs/tvmet) add_subdirectory(libs/STL) +add_subdirectory(libs/blaze) add_subdirectory(data) diff --git a/bench/btl/actions/action_axpby.hh b/bench/btl/actions/action_axpby.hh index 98511ab6a..dadd0ccf3 100644 --- a/bench/btl/actions/action_axpby.hh +++ b/bench/btl/actions/action_axpby.hh @@ -33,7 +33,7 @@ class Action_axpby { public : // Ctor - Action_axpby( int size ):_size(size),_alpha(0.5),_beta(0.95) + Action_axpby( int size ):_alpha(0.5),_beta(0.95),_size(size) { MESSAGE("Action_axpby Ctor"); diff --git a/bench/btl/actions/action_axpy.hh 
b/bench/btl/actions/action_axpy.hh index e4cb3a5bd..261be4cb8 100644 --- a/bench/btl/actions/action_axpy.hh +++ b/bench/btl/actions/action_axpy.hh @@ -35,7 +35,7 @@ public : // Ctor - Action_axpy( int size ):_size(size),_coef(1.0) + Action_axpy( int size ):_coef(1.0),_size(size) { MESSAGE("Action_axpy Ctor"); diff --git a/bench/btl/cmake/FindATLAS.cmake b/bench/btl/cmake/FindATLAS.cmake index 6b9065206..14b1dee09 100644 --- a/bench/btl/cmake/FindATLAS.cmake +++ b/bench/btl/cmake/FindATLAS.cmake @@ -4,10 +4,7 @@ if (ATLAS_LIBRARIES) endif (ATLAS_LIBRARIES) find_file(ATLAS_LIB libatlas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) -find_library(ATLAS_LIB atlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) - -find_file(ATLAS_CBLAS libcblas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) -find_library(ATLAS_CBLAS cblas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) +find_library(ATLAS_LIB satlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) find_file(ATLAS_LAPACK liblapack_atlas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) find_library(ATLAS_LAPACK lapack_atlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) @@ -22,14 +19,14 @@ find_library(ATLAS_F77BLAS f77blas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) if(ATLAS_LIB AND ATLAS_CBLAS AND ATLAS_LAPACK AND ATLAS_F77BLAS) - set(ATLAS_LIBRARIES ${ATLAS_LAPACK} ${ATLAS_CBLAS} ${ATLAS_F77BLAS} ${ATLAS_LIB}) + set(ATLAS_LIBRARIES ${ATLAS_LAPACK} ${ATLAS_LIB}) # search the default lapack lib link to it find_file(ATLAS_REFERENCE_LAPACK liblapack.so.3 PATHS /usr/lib /usr/lib64) find_library(ATLAS_REFERENCE_LAPACK NAMES lapack) - if(ATLAS_REFERENCE_LAPACK) - set(ATLAS_LIBRARIES ${ATLAS_LIBRARIES} ${ATLAS_REFERENCE_LAPACK}) - endif() +# if(ATLAS_REFERENCE_LAPACK) +# set(ATLAS_LIBRARIES ${ATLAS_LIBRARIES} ${ATLAS_REFERENCE_LAPACK}) +# endif() endif(ATLAS_LIB AND ATLAS_CBLAS AND ATLAS_LAPACK AND ATLAS_F77BLAS) diff --git a/bench/btl/cmake/FindGOTO.cmake b/bench/btl/cmake/FindGOTO.cmake deleted file mode 100644 index 67ea0934a..000000000 --- a/bench/btl/cmake/FindGOTO.cmake +++ /dev/null @@ -1,15 +0,0 @@ - -if (GOTO_LIBRARIES) - set(GOTO_FIND_QUIETLY TRUE) -endif (GOTO_LIBRARIES) - -find_library(GOTO_LIBRARIES goto PATHS $ENV{GOTODIR} ${LIB_INSTALL_DIR}) - -if(GOTO_LIBRARIES AND CMAKE_COMPILER_IS_GNUCXX) - set(GOTO_LIBRARIES ${GOTO_LIBRARIES} "-lpthread -lgfortran") -endif() - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(GOTO DEFAULT_MSG GOTO_LIBRARIES) - -mark_as_advanced(GOTO_LIBRARIES) diff --git a/bench/btl/cmake/FindGOTO2.cmake b/bench/btl/cmake/FindGOTO2.cmake deleted file mode 100644 index baa68d213..000000000 --- a/bench/btl/cmake/FindGOTO2.cmake +++ /dev/null @@ -1,25 +0,0 @@ - -if (GOTO2_LIBRARIES) - set(GOTO2_FIND_QUIETLY TRUE) -endif (GOTO2_LIBRARIES) -# -# find_path(GOTO_INCLUDES -# NAMES -# cblas.h -# PATHS -# $ENV{GOTODIR}/include -# ${INCLUDE_INSTALL_DIR} -# ) - -find_file(GOTO2_LIBRARIES libgoto2.so PATHS /usr/lib $ENV{GOTO2DIR} ${LIB_INSTALL_DIR}) -find_library(GOTO2_LIBRARIES goto2 PATHS $ENV{GOTO2DIR} ${LIB_INSTALL_DIR}) - -if(GOTO2_LIBRARIES AND CMAKE_COMPILER_IS_GNUCXX) - set(GOTO2_LIBRARIES ${GOTO2_LIBRARIES} "-lpthread -lgfortran") -endif() - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(GOTO2 DEFAULT_MSG - GOTO2_LIBRARIES) - -mark_as_advanced(GOTO2_LIBRARIES) diff --git a/bench/btl/cmake/FindOpenBLAS.cmake b/bench/btl/cmake/FindOpenBLAS.cmake new file mode 100644 index 000000000..c76fc251c --- /dev/null +++ b/bench/btl/cmake/FindOpenBLAS.cmake @@ -0,0 +1,17 @@ + +if 
(OPENBLAS_LIBRARIES) + set(OPENBLAS_FIND_QUIETLY TRUE) +endif (OPENBLAS_LIBRARIES) + +find_file(OPENBLAS_LIBRARIES libopenblas.so PATHS /usr/lib $ENV{OPENBLASDIR} ${LIB_INSTALL_DIR}) +find_library(OPENBLAS_LIBRARIES openblas PATHS $ENV{OPENBLASDIR} ${LIB_INSTALL_DIR}) + +if(OPENBLAS_LIBRARIES AND CMAKE_COMPILER_IS_GNUCXX) + set(OPENBLAS_LIBRARIES ${OPENBLAS_LIBRARIES} "-lpthread -lgfortran") +endif() + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(OPENBLAS DEFAULT_MSG + OPENBLAS_LIBRARIES) + +mark_as_advanced(OPENBLAS_LIBRARIES) diff --git a/bench/btl/data/action_settings.txt b/bench/btl/data/action_settings.txt index e32213e22..674a738d9 100644 --- a/bench/btl/data/action_settings.txt +++ b/bench/btl/data/action_settings.txt @@ -1,19 +1,19 @@ -aat ; "{/*1.5 A x A^T}" ; "matrix size" ; 4:3000 -ata ; "{/*1.5 A^T x A}" ; "matrix size" ; 4:3000 -atv ; "{/*1.5 matrix^T x vector}" ; "matrix size" ; 4:3000 +aat ; "{/*1.5 A x A^T}" ; "matrix size" ; 4:5000 +ata ; "{/*1.5 A^T x A}" ; "matrix size" ; 4:5000 +atv ; "{/*1.5 matrix^T x vector}" ; "matrix size" ; 4:5000 axpby ; "{/*1.5 Y = alpha X + beta Y}" ; "vector size" ; 5:1000000 axpy ; "{/*1.5 Y += alpha X}" ; "vector size" ; 5:1000000 -matrix_matrix ; "{/*1.5 matrix matrix product}" ; "matrix size" ; 4:3000 -matrix_vector ; "{/*1.5 matrix vector product}" ; "matrix size" ; 4:3000 -trmm ; "{/*1.5 triangular matrix matrix product}" ; "matrix size" ; 4:3000 -trisolve_vector ; "{/*1.5 triangular solver - vector (X = inv(L) X)}" ; "size" ; 4:3000 -trisolve_matrix ; "{/*1.5 triangular solver - matrix (M = inv(L) M)}" ; "size" ; 4:3000 -cholesky ; "{/*1.5 Cholesky decomposition}" ; "matrix size" ; 4:3000 -complete_lu_decomp ; "{/*1.5 Complete LU decomposition}" ; "matrix size" ; 4:3000 -partial_lu_decomp ; "{/*1.5 Partial LU decomposition}" ; "matrix size" ; 4:3000 -tridiagonalization ; "{/*1.5 Tridiagonalization}" ; "matrix size" ; 4:3000 -hessenberg ; "{/*1.5 Hessenberg decomposition}" ; "matrix size" ; 4:3000 -symv ; "{/*1.5 symmetric matrix vector product}" ; "matrix size" ; 4:3000 -syr2 ; "{/*1.5 symmretric rank-2 update (A += u^T v + u v^T)}" ; "matrix size" ; 4:3000 -ger ; "{/*1.5 general rank-1 update (A += u v^T)}" ; "matrix size" ; 4:3000 +matrix_matrix ; "{/*1.5 matrix matrix product}" ; "matrix size" ; 4:5000 +matrix_vector ; "{/*1.5 matrix vector product}" ; "matrix size" ; 4:5000 +trmm ; "{/*1.5 triangular matrix matrix product}" ; "matrix size" ; 4:5000 +trisolve_vector ; "{/*1.5 triangular solver - vector (X = inv(L) X)}" ; "size" ; 4:5000 +trisolve_matrix ; "{/*1.5 triangular solver - matrix (M = inv(L) M)}" ; "size" ; 4:5000 +cholesky ; "{/*1.5 Cholesky decomposition}" ; "matrix size" ; 4:5000 +complete_lu_decomp ; "{/*1.5 Complete LU decomposition}" ; "matrix size" ; 4:5000 +partial_lu_decomp ; "{/*1.5 Partial LU decomposition}" ; "matrix size" ; 4:5000 +tridiagonalization ; "{/*1.5 Tridiagonalization}" ; "matrix size" ; 4:5000 +hessenberg ; "{/*1.5 Hessenberg decomposition}" ; "matrix size" ; 4:5000 +symv ; "{/*1.5 symmetric matrix vector product}" ; "matrix size" ; 4:5000 +syr2 ; "{/*1.5 symmretric rank-2 update (A += u^T v + u v^T)}" ; "matrix size" ; 4:5000 +ger ; "{/*1.5 general rank-1 update (A += u v^T)}" ; "matrix size" ; 4:5000 rot ; "{/*1.5 apply rotation in the plane}" ; "vector size" ; 4:1000000 \ No newline at end of file diff --git a/bench/btl/data/perlib_plot_settings.txt b/bench/btl/data/perlib_plot_settings.txt index 6844bab28..f023cfe02 100644 --- 
a/bench/btl/data/perlib_plot_settings.txt +++ b/bench/btl/data/perlib_plot_settings.txt @@ -10,7 +10,7 @@ ublas ; with lines lw 3 lt 1 lc rgbcolor "#00b7ff" mtl4 ; with lines lw 3 lt 1 lc rgbcolor "#d18847" blitz ; with lines lw 3 lt 1 lc rgbcolor "#ff00ff" F77 ; with lines lw 3 lt 3 lc rgbcolor "#e6e64c" -GOTO ; with lines lw 3 lt 3 lc rgbcolor "#C05600" -GOTO2 ; with lines lw 3 lt 1 lc rgbcolor "#C05600" +OPENBLAS ; with lines lw 3 lt 1 lc rgbcolor "#C05600" C ; with lines lw 3 lt 3 lc rgbcolor "#e6bd96" ACML ; with lines lw 2 lt 3 lc rgbcolor "#e6e64c" +blaze ; with lines lw 3 lt 1 lc rgbcolor "#ff00ff" diff --git a/bench/btl/generic_bench/bench_parameter.hh b/bench/btl/generic_bench/bench_parameter.hh index 4c355cd6e..5e341c14c 100644 --- a/bench/btl/generic_bench/bench_parameter.hh +++ b/bench/btl/generic_bench/bench_parameter.hh @@ -33,7 +33,7 @@ // min matrix size for matrix vector product bench #define MIN_MV 5 // max matrix size for matrix vector product bench -#define MAX_MV 3000 +#define MAX_MV 5000 // min matrix size for matrix matrix product bench #define MIN_MM 5 // max matrix size for matrix matrix product bench @@ -41,7 +41,7 @@ // min matrix size for LU bench #define MIN_LU 5 // max matrix size for LU bench -#define MAX_LU 3000 +#define MAX_LU 5000 // max size for tiny vector and matrix #define TINY_MV_MAX_SIZE 16 // default nb_sample for x86 timer diff --git a/bench/btl/generic_bench/btl.hh b/bench/btl/generic_bench/btl.hh index 4670f5113..92af1306a 100644 --- a/bench/btl/generic_bench/btl.hh +++ b/bench/btl/generic_bench/btl.hh @@ -176,7 +176,7 @@ public: if (_config!=NULL) { std::vector config = BtlString(_config).split(" \t\n"); - for (int i = 0; i -void size_lin_log(const int nb_point, const int size_min, const int size_max, Vector & X) +void size_lin_log(const int nb_point, const int /*size_min*/, const int size_max, Vector & X) { int ten=10; int nine=9; diff --git a/bench/btl/libs/BLAS/CMakeLists.txt b/bench/btl/libs/BLAS/CMakeLists.txt index de42fe047..22f09527d 100644 --- a/bench/btl/libs/BLAS/CMakeLists.txt +++ b/bench/btl/libs/BLAS/CMakeLists.txt @@ -18,27 +18,14 @@ if (MKL_FOUND) endif (MKL_FOUND) -find_package(GOTO2) -if (GOTO2_FOUND) - btl_add_bench(btl_goto2 main.cpp) - if(BUILD_btl_goto2) - target_link_libraries(btl_goto2 ${GOTO_LIBRARIES} ) - set_target_properties(btl_goto2 PROPERTIES COMPILE_FLAGS "-DCBLASNAME=GOTO2") - endif(BUILD_btl_goto2) -endif (GOTO2_FOUND) - -find_package(GOTO) -if (GOTO_FOUND) - if(GOTO2_FOUND) - btl_add_bench(btl_goto main.cpp OFF) - else() - btl_add_bench(btl_goto main.cpp) - endif() - if(BUILD_btl_goto) - target_link_libraries(btl_goto ${GOTO_LIBRARIES} ) - set_target_properties(btl_goto PROPERTIES COMPILE_FLAGS "-DCBLASNAME=GOTO") - endif(BUILD_btl_goto) -endif (GOTO_FOUND) +find_package(OPENBLAS) +if (OPENBLAS_FOUND) + btl_add_bench(btl_openblas main.cpp) + if(BUILD_btl_openblas) + target_link_libraries(btl_openblas ${GOTO_LIBRARIES} ) + set_target_properties(btl_openblas PROPERTIES COMPILE_FLAGS "-DCBLASNAME=OPENBLAS") + endif(BUILD_btl_openblas) +endif (OPENBLAS_FOUND) find_package(ACML) if (ACML_FOUND) diff --git a/bench/btl/libs/BLAS/blas_interface_impl.hh b/bench/btl/libs/BLAS/blas_interface_impl.hh index 0e84df038..fc4ba2a1f 100644 --- a/bench/btl/libs/BLAS/blas_interface_impl.hh +++ b/bench/btl/libs/BLAS/blas_interface_impl.hh @@ -75,7 +75,6 @@ public : static inline void partial_lu_decomp(const gene_matrix & X, gene_matrix & C, int N){ int N2 = N*N; BLAS_FUNC(copy)(&N2, X, &intone, C, &intone); - char uplo = 'L'; int 
info = 0; int * ipiv = (int*)alloca(sizeof(int)*N); BLAS_FUNC(getrf)(&N, &N, C, &N, ipiv, &info); @@ -92,7 +91,7 @@ public : BLAS_FUNC(trsm)(&right, &lower, ¬rans, &nonunit, &N, &N, &fone, L, &N, X, &N); } - static inline void trmm(gene_matrix & A, gene_matrix & B, gene_matrix & X, int N){ + static inline void trmm(gene_matrix & A, gene_matrix & B, gene_matrix & /*X*/, int N){ BLAS_FUNC(trmm)(&left, &lower, ¬rans,&nonunit, &N,&N,&fone,A,&N,B,&N); } @@ -101,7 +100,6 @@ public : static inline void lu_decomp(const gene_matrix & X, gene_matrix & C, int N){ int N2 = N*N; BLAS_FUNC(copy)(&N2, X, &intone, C, &intone); - char uplo = 'L'; int info = 0; int * ipiv = (int*)alloca(sizeof(int)*N); int * jpiv = (int*)alloca(sizeof(int)*N); @@ -134,8 +132,6 @@ public : } char uplo = 'U'; int info = 0; - int ilo = 1; - int ihi = N; int bsize = 64; int worksize = N*bsize; SCALAR* d = new SCALAR[3*N+worksize]; diff --git a/bench/btl/libs/BLAS/c_interface_base.h b/bench/btl/libs/BLAS/c_interface_base.h index 515d8dcfc..de613803b 100644 --- a/bench/btl/libs/BLAS/c_interface_base.h +++ b/bench/btl/libs/BLAS/c_interface_base.h @@ -17,12 +17,12 @@ public: typedef real* gene_matrix; typedef real* gene_vector; - static void free_matrix(gene_matrix & A, int N){ - delete A; + static void free_matrix(gene_matrix & A, int /*N*/){ + delete[] A; } static void free_vector(gene_vector & B){ - delete B; + delete[] B; } static inline void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){ diff --git a/bench/btl/libs/STL/STL_interface.hh b/bench/btl/libs/STL/STL_interface.hh index 93e76bd55..ef4cc9233 100644 --- a/bench/btl/libs/STL/STL_interface.hh +++ b/bench/btl/libs/STL/STL_interface.hh @@ -44,9 +44,9 @@ public : return "STL"; } - static void free_matrix(gene_matrix & A, int N){} + static void free_matrix(gene_matrix & /*A*/, int /*N*/){} - static void free_vector(gene_vector & B){} + static void free_vector(gene_vector & /*B*/){} static inline void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){ A = A_stl; diff --git a/bench/btl/libs/eigen3/eigen3_interface.hh b/bench/btl/libs/eigen3/eigen3_interface.hh index 0c8aa74da..b821fd721 100644 --- a/bench/btl/libs/eigen3/eigen3_interface.hh +++ b/bench/btl/libs/eigen3/eigen3_interface.hh @@ -45,9 +45,9 @@ public : return EIGEN_MAKESTRING(BTL_PREFIX); } - static void free_matrix(gene_matrix & A, int N) {} + static void free_matrix(gene_matrix & /*A*/, int /*N*/) {} - static void free_vector(gene_vector & B) {} + static void free_vector(gene_vector & /*B*/) {} static BTL_DONT_INLINE void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){ A.resize(A_stl[0].size(), A_stl.size()); @@ -74,7 +74,7 @@ public : } static BTL_DONT_INLINE void matrix_to_stl(gene_matrix & A, stl_matrix & A_stl){ - int N=A_stl.size(); + int N=A_stl.size(); for (int j=0;j().setZero(); X.template selfadjointView().rankUpdate(A); } - static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N){ + static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int /*N*/){ X.noalias() = A*B; } - static inline void symv(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N){ + static inline void symv(const gene_matrix & A, const gene_vector & B, gene_vector & X, int /*N*/){ X.noalias() = (A.template selfadjointView() * B); // internal::product_selfadjoint_vector(N,A.data(),N, B.data(), 1, X.data(), 1); } @@ -155,54 +155,54 @@ public : } } - static EIGEN_DONT_INLINE void syr2(gene_matrix & A, 
gene_vector & X, gene_vector & Y, int N){ + static EIGEN_DONT_INLINE void syr2(gene_matrix & A, gene_vector & X, gene_vector & Y, int N){ // internal::product_selfadjoint_rank2_update(N,A.data(),N, X.data(), 1, Y.data(), 1, -1); for(int j=0; j(c,s)); } - static inline void atv_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){ + static inline void atv_product(gene_matrix & A, gene_vector & B, gene_vector & X, int /*N*/){ X.noalias() = (A.transpose()*B); } - static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int N){ + static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int /*N*/){ Y += coef * X; } - static inline void axpby(real a, const gene_vector & X, real b, gene_vector & Y, int N){ + static inline void axpby(real a, const gene_vector & X, real b, gene_vector & Y, int /*N*/){ Y = a*X + b*Y; } - static EIGEN_DONT_INLINE void copy_matrix(const gene_matrix & source, gene_matrix & cible, int N){ + static EIGEN_DONT_INLINE void copy_matrix(const gene_matrix & source, gene_matrix & cible, int /*N*/){ cible = source; } - static EIGEN_DONT_INLINE void copy_vector(const gene_vector & source, gene_vector & cible, int N){ + static EIGEN_DONT_INLINE void copy_vector(const gene_vector & source, gene_vector & cible, int /*N*/){ cible = source; } - static inline void trisolve_lower(const gene_matrix & L, const gene_vector& B, gene_vector& X, int N){ + static inline void trisolve_lower(const gene_matrix & L, const gene_vector& B, gene_vector& X, int /*N*/){ X = L.template triangularView().solve(B); } - static inline void trisolve_lower_matrix(const gene_matrix & L, const gene_matrix& B, gene_matrix& X, int N){ + static inline void trisolve_lower_matrix(const gene_matrix & L, const gene_matrix& B, gene_matrix& X, int /*N*/){ X = L.template triangularView().solve(B); } - static inline void trmm(const gene_matrix & L, const gene_matrix& B, gene_matrix& X, int N){ + static inline void trmm(const gene_matrix & L, const gene_matrix& B, gene_matrix& X, int /*N*/){ X.noalias() = L.template triangularView() * B; } - static inline void cholesky(const gene_matrix & X, gene_matrix & C, int N){ + static inline void cholesky(const gene_matrix & X, gene_matrix & C, int /*N*/){ C = X; internal::llt_inplace::blocked(C); //C = X.llt().matrixL(); @@ -211,11 +211,11 @@ public : // Cholesky::computeInPlaceBlock(C); } - static inline void lu_decomp(const gene_matrix & X, gene_matrix & C, int N){ + static inline void lu_decomp(const gene_matrix & X, gene_matrix & C, int /*N*/){ C = X.fullPivLu().matrixLU(); } - static inline void partial_lu_decomp(const gene_matrix & X, gene_matrix & C, int N){ + static inline void partial_lu_decomp(const gene_matrix & X, gene_matrix & C, int N){ Matrix piv(N); DenseIndex nb; C = X; @@ -223,13 +223,13 @@ public : // C = X.partialPivLu().matrixLU(); } - static inline void tridiagonalization(const gene_matrix & X, gene_matrix & C, int N){ + static inline void tridiagonalization(const gene_matrix & X, gene_matrix & C, int N){ typename Tridiagonalization::CoeffVectorType aux(N-1); C = X; internal::tridiagonalization_inplace(C, aux); } - static inline void hessenberg(const gene_matrix & X, gene_matrix & C, int N){ + static inline void hessenberg(const gene_matrix & X, gene_matrix & C, int /*N*/){ C = HessenbergDecomposition(X).packedMatrix(); } From 93870d95b7b8a7716ab825d559e7af5f7f84308a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 31 Mar 2014 10:59:55 +0200 Subject: [PATCH 053/158] BTL: add blaze --- 
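Note for anyone wiring up a further backend: the generic BTL actions only need
the small static interface sketched below. This skeleton is inferred from the
blaze interface added here and from the existing eigen3/STL interfaces; the
typedef and member names are the vocabulary the actions expect, while the
pointer-based container typedefs are just one possible choice (the C BLAS
interface uses plain real* the same way):

    #include <string>
    #include <vector>

    template <class real>
    class some_new_interface {
    public:
      typedef real                    real_type;
      typedef std::vector<real>       stl_vector;
      typedef std::vector<stl_vector> stl_matrix;
      typedef real*                   gene_matrix;  // substitute the library's matrix type
      typedef real*                   gene_vector;  // substitute the library's vector type

      static std::string name();                    // label used by the plot scripts

      // allocation and STL round-trips used by every action:
      static void matrix_from_stl(gene_matrix&, stl_matrix&);
      static void vector_from_stl(gene_vector&, stl_vector&);
      static void matrix_to_stl(gene_matrix&, stl_matrix&);
      static void vector_to_stl(gene_vector&, stl_vector&);
      static void free_matrix(gene_matrix&, int N);
      static void free_vector(gene_vector&);

      // plus one static member per benchmarked action, e.g.:
      static void axpy(real coef, const gene_vector& X, gene_vector& Y, int N);
      static void matrix_matrix_product(const gene_matrix& A, const gene_matrix& B,
                                        gene_matrix& X, int N);
    };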
bench/btl/cmake/FindBLAZE.cmake | 31 ++++++ bench/btl/libs/blaze/CMakeLists.txt | 8 ++ bench/btl/libs/blaze/blaze_interface.hh | 140 ++++++++++++++++++++++++ bench/btl/libs/blaze/main.cpp | 46 ++++++++ 4 files changed, 225 insertions(+) create mode 100644 bench/btl/cmake/FindBLAZE.cmake create mode 100644 bench/btl/libs/blaze/CMakeLists.txt create mode 100644 bench/btl/libs/blaze/blaze_interface.hh create mode 100644 bench/btl/libs/blaze/main.cpp diff --git a/bench/btl/cmake/FindBLAZE.cmake b/bench/btl/cmake/FindBLAZE.cmake new file mode 100644 index 000000000..dba4c89f2 --- /dev/null +++ b/bench/btl/cmake/FindBLAZE.cmake @@ -0,0 +1,31 @@ +# - Try to find eigen2 headers +# Once done this will define +# +# BLAZE_FOUND - system has blaze lib +# BLAZE_INCLUDE_DIR - the blaze include directory +# +# Copyright (C) 2008 Gael Guennebaud +# Adapted from FindEigen.cmake: +# Copyright (c) 2006, 2007 Montel Laurent, +# Redistribution and use is allowed according to the terms of the BSD license. +# For details see the accompanying COPYING-CMAKE-SCRIPTS file. + +if (BLAZE_INCLUDE_DIR) + + # in cache already + set(BLAZE_FOUND TRUE) + +else (BLAZE_INCLUDE_DIR) + +find_path(BLAZE_INCLUDE_DIR NAMES blaze/Blaze.h + PATHS + ${INCLUDE_INSTALL_DIR} + ) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(BLAZE DEFAULT_MSG BLAZE_INCLUDE_DIR) + +mark_as_advanced(BLAZE_INCLUDE_DIR) + +endif(BLAZE_INCLUDE_DIR) + diff --git a/bench/btl/libs/blaze/CMakeLists.txt b/bench/btl/libs/blaze/CMakeLists.txt new file mode 100644 index 000000000..54ab929d8 --- /dev/null +++ b/bench/btl/libs/blaze/CMakeLists.txt @@ -0,0 +1,8 @@ + +find_package(BLAZE) +find_package(Boost) +if (BLAZE_FOUND AND Boost_FOUND) + include_directories(${BLAZE_INCLUDE_DIR} ${Boost_INCLUDE_DIRS}) + btl_add_bench(btl_blaze main.cpp) + target_link_libraries(btl_blaze ${Boost_LIBRARIES} ${Boost_system_LIBRARY} /opt/local/lib/libboost_system-mt.a ) +endif (BLAZE_FOUND) diff --git a/bench/btl/libs/blaze/blaze_interface.hh b/bench/btl/libs/blaze/blaze_interface.hh new file mode 100644 index 000000000..8020fef27 --- /dev/null +++ b/bench/btl/libs/blaze/blaze_interface.hh @@ -0,0 +1,140 @@ +//===================================================== +// Copyright (C) 2008 Gael Guennebaud +//===================================================== +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation; either version 2 +// of the License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+// +#ifndef BLAZE_INTERFACE_HH +#define BLAZE_INTERFACE_HH + +#include +#include +// using namespace blaze; + +#include + +template +class blaze_interface { + +public : + + typedef real real_type ; + + typedef std::vector stl_vector; + typedef std::vector stl_matrix; + + typedef blaze::DynamicMatrix gene_matrix; + typedef blaze::DynamicVector gene_vector; + + static inline std::string name() { return "blaze"; } + + static void free_matrix(gene_matrix & A, int N){ + return ; + } + + static void free_vector(gene_vector & B){ + return ; + } + + static inline void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){ + A.resize(A_stl[0].size(), A_stl.size()); + + for (int j=0; j ipvt(N); +// lu_factor(R, ipvt); +// } + +// static inline void trisolve_lower(const gene_matrix & L, const gene_vector& B, gene_vector & X, int N){ +// X = lower_trisolve(L, B); +// } + + static inline void copy_matrix(const gene_matrix & source, gene_matrix & cible, int N){ + cible = source; + } + + static inline void copy_vector(const gene_vector & source, gene_vector & cible, int N){ + cible = source; + } + +}; + +#endif diff --git a/bench/btl/libs/blaze/main.cpp b/bench/btl/libs/blaze/main.cpp new file mode 100644 index 000000000..b8508c8f3 --- /dev/null +++ b/bench/btl/libs/blaze/main.cpp @@ -0,0 +1,46 @@ +//===================================================== +// Copyright (C) 2008 Gael Guennebaud +//===================================================== +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation; either version 2 +// of the License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+// +#include "utilities.h" +#include "blaze_interface.hh" +#include "bench.hh" +#include "basic_actions.hh" +// #include "action_cholesky.hh" +// #include "action_lu_decomp.hh" + +BTL_MAIN; + +int main() +{ + + bench > >(MIN_AXPY,MAX_AXPY,NB_POINT); + bench > >(MIN_AXPY,MAX_AXPY,NB_POINT); + + bench > >(MIN_MV,MAX_MV,NB_POINT); +// bench > >(MIN_MV,MAX_MV,NB_POINT); + bench > >(MIN_MM,MAX_MM,NB_POINT); +// bench > >(MIN_MM,MAX_MM,NB_POINT); +// bench > >(MIN_MM,MAX_MM,NB_POINT); + +// bench > >(MIN_MM,MAX_MM,NB_POINT); +// bench > >(MIN_MM,MAX_MM,NB_POINT); +// bench > >(MIN_MM,MAX_MM,NB_POINT); + + return 0; +} + + From 1221dd90aad41fdd610bed108c5f2f7af283dba7 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 1 Apr 2014 11:21:14 +0200 Subject: [PATCH 054/158] Fix no newline at end of file warning --- bench/spbench/spbenchstyle.h | 3 ++- unsupported/Eigen/src/SparseExtra/MarketIO.h | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/bench/spbench/spbenchstyle.h b/bench/spbench/spbenchstyle.h index 17a05ce71..f6a981778 100644 --- a/bench/spbench/spbenchstyle.h +++ b/bench/spbench/spbenchstyle.h @@ -91,4 +91,5 @@ void printBenchStyle(std::ofstream& out) \n\n"; } -#endif \ No newline at end of file + +#endif diff --git a/unsupported/Eigen/src/SparseExtra/MarketIO.h b/unsupported/Eigen/src/SparseExtra/MarketIO.h index 7aafce928..1c40d3f7c 100644 --- a/unsupported/Eigen/src/SparseExtra/MarketIO.h +++ b/unsupported/Eigen/src/SparseExtra/MarketIO.h @@ -238,9 +238,9 @@ bool saveMarket(const SparseMatrixType& mat, const std::string& filename, int sy for(int j=0; j Date: Tue, 1 Apr 2014 11:23:28 +0200 Subject: [PATCH 055/158] Rename the vector() factories defined in blas/common.h into make_vector() to prevent a possible name conflict with std::vector. 
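A compressed, hypothetical illustration of the clash this rename avoids (none
of this exact code is in the tree; the point is only that a function template
named vector and std::vector cannot coexist unqualified):

    #include <vector>
    using namespace std;           // assume some header or client TU does this

    template <typename T>
    T* vector(T* data, int size);  // a helper following the old naming scheme

    // From here on, any unqualified use of vector<int> is ambiguous between
    // the function template above and std::vector brought in by the
    // using-directive, and fails to compile; renaming the helper to
    // make_vector() dissolves the conflict.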
--- blas/common.h | 12 +++---- blas/double.cpp | 11 +++--- blas/level1_cplx_impl.h | 38 ++++++++++---------- blas/level1_impl.h | 39 ++++++++++----------- blas/level1_real_impl.h | 22 ++++++------ blas/level2_cplx_impl.h | 4 +-- blas/level2_impl.h | 77 ++++++++++++++++++++--------------------- blas/level2_real_impl.h | 8 ++--- 8 files changed, 103 insertions(+), 108 deletions(-) diff --git a/blas/common.h b/blas/common.h index 2bf642c6b..c39cc63a8 100644 --- a/blas/common.h +++ b/blas/common.h @@ -106,13 +106,13 @@ matrix(T* data, int rows, int cols, int stride) } template -Map, 0, InnerStride > vector(T* data, int size, int incr) +Map, 0, InnerStride > make_vector(T* data, int size, int incr) { return Map, 0, InnerStride >(data, size, InnerStride(incr)); } template -Map > vector(T* data, int size) +Map > make_vector(T* data, int size) { return Map >(data, size); } @@ -124,8 +124,8 @@ T* get_compact_vector(T* x, int n, int incx) return x; T* ret = new Scalar[n]; - if(incx<0) vector(ret,n) = vector(x,n,-incx).reverse(); - else vector(ret,n) = vector(x,n, incx); + if(incx<0) make_vector(ret,n) = make_vector(x,n,-incx).reverse(); + else make_vector(ret,n) = make_vector(x,n, incx); return ret; } @@ -135,8 +135,8 @@ T* copy_back(T* x_cpy, T* x, int n, int incx) if(x_cpy==x) return 0; - if(incx<0) vector(x,n,-incx).reverse() = vector(x_cpy,n); - else vector(x,n, incx) = vector(x_cpy,n); + if(incx<0) make_vector(x,n,-incx).reverse() = make_vector(x_cpy,n); + else make_vector(x,n, incx) = make_vector(x_cpy,n); return x_cpy; } diff --git a/blas/double.cpp b/blas/double.cpp index 8fd0709ba..295b1d1f2 100644 --- a/blas/double.cpp +++ b/blas/double.cpp @@ -23,11 +23,10 @@ double BLASFUNC(dsdot)(int* n, float* x, int* incx, float* y, int* incy) { if(*n<=0) return 0; - if(*incx==1 && *incy==1) return (vector(x,*n).cast().cwiseProduct(vector(y,*n).cast())).sum(); - else if(*incx>0 && *incy>0) return (vector(x,*n,*incx).cast().cwiseProduct(vector(y,*n,*incy).cast())).sum(); - else if(*incx<0 && *incy>0) return (vector(x,*n,-*incx).reverse().cast().cwiseProduct(vector(y,*n,*incy).cast())).sum(); - else if(*incx>0 && *incy<0) return (vector(x,*n,*incx).cast().cwiseProduct(vector(y,*n,-*incy).reverse().cast())).sum(); - else if(*incx<0 && *incy<0) return (vector(x,*n,-*incx).reverse().cast().cwiseProduct(vector(y,*n,-*incy).reverse().cast())).sum(); + if(*incx==1 && *incy==1) return (make_vector(x,*n).cast().cwiseProduct(make_vector(y,*n).cast())).sum(); + else if(*incx>0 && *incy>0) return (make_vector(x,*n,*incx).cast().cwiseProduct(make_vector(y,*n,*incy).cast())).sum(); + else if(*incx<0 && *incy>0) return (make_vector(x,*n,-*incx).reverse().cast().cwiseProduct(make_vector(y,*n,*incy).cast())).sum(); + else if(*incx>0 && *incy<0) return (make_vector(x,*n,*incx).cast().cwiseProduct(make_vector(y,*n,-*incy).reverse().cast())).sum(); + else if(*incx<0 && *incy<0) return (make_vector(x,*n,-*incx).reverse().cast().cwiseProduct(make_vector(y,*n,-*incy).reverse().cast())).sum(); else return 0; } - diff --git a/blas/level1_cplx_impl.h b/blas/level1_cplx_impl.h index ffe192481..719f5bac9 100644 --- a/blas/level1_cplx_impl.h +++ b/blas/level1_cplx_impl.h @@ -32,13 +32,14 @@ RealScalar EIGEN_CAT(EIGEN_CAT(REAL_SCALAR_SUFFIX,SCALAR_SUFFIX),asum_)(int *n, if(*n<=0) return 0; - if(*incx==1) return vector(x,*n).unaryExpr().sum(); - else return vector(x,*n,std::abs(*incx)).unaryExpr().sum(); + if(*incx==1) return make_vector(x,*n).unaryExpr().sum(); + else return make_vector(x,*n,std::abs(*incx)).unaryExpr().sum(); } // 
computes a dot product of a conjugated vector with another vector. int EIGEN_BLAS_FUNC(dotcw)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar* pres) { +// std::cerr << "_dotc " << *n << " " << *incx << " " << *incy << "\n"; Scalar* res = reinterpret_cast(pres); if(*n<=0) @@ -50,11 +51,11 @@ int EIGEN_BLAS_FUNC(dotcw)(int *n, RealScalar *px, int *incx, RealScalar *py, in Scalar* x = reinterpret_cast(px); Scalar* y = reinterpret_cast(py); - if(*incx==1 && *incy==1) *res = (vector(x,*n).dot(vector(y,*n))); - else if(*incx>0 && *incy>0) *res = (vector(x,*n,*incx).dot(vector(y,*n,*incy))); - else if(*incx<0 && *incy>0) *res = (vector(x,*n,-*incx).reverse().dot(vector(y,*n,*incy))); - else if(*incx>0 && *incy<0) *res = (vector(x,*n,*incx).dot(vector(y,*n,-*incy).reverse())); - else if(*incx<0 && *incy<0) *res = (vector(x,*n,-*incx).reverse().dot(vector(y,*n,-*incy).reverse())); + if(*incx==1 && *incy==1) *res = (make_vector(x,*n).dot(make_vector(y,*n))); + else if(*incx>0 && *incy>0) *res = (make_vector(x,*n,*incx).dot(make_vector(y,*n,*incy))); + else if(*incx<0 && *incy>0) *res = (make_vector(x,*n,-*incx).reverse().dot(make_vector(y,*n,*incy))); + else if(*incx>0 && *incy<0) *res = (make_vector(x,*n,*incx).dot(make_vector(y,*n,-*incy).reverse())); + else if(*incx<0 && *incy<0) *res = (make_vector(x,*n,-*incx).reverse().dot(make_vector(y,*n,-*incy).reverse())); return 0; } @@ -72,11 +73,11 @@ int EIGEN_BLAS_FUNC(dotuw)(int *n, RealScalar *px, int *incx, RealScalar *py, in Scalar* x = reinterpret_cast(px); Scalar* y = reinterpret_cast(py); - if(*incx==1 && *incy==1) *res = (vector(x,*n).cwiseProduct(vector(y,*n))).sum(); - else if(*incx>0 && *incy>0) *res = (vector(x,*n,*incx).cwiseProduct(vector(y,*n,*incy))).sum(); - else if(*incx<0 && *incy>0) *res = (vector(x,*n,-*incx).reverse().cwiseProduct(vector(y,*n,*incy))).sum(); - else if(*incx>0 && *incy<0) *res = (vector(x,*n,*incx).cwiseProduct(vector(y,*n,-*incy).reverse())).sum(); - else if(*incx<0 && *incy<0) *res = (vector(x,*n,-*incx).reverse().cwiseProduct(vector(y,*n,-*incy).reverse())).sum(); + if(*incx==1 && *incy==1) *res = (make_vector(x,*n).cwiseProduct(make_vector(y,*n))).sum(); + else if(*incx>0 && *incy>0) *res = (make_vector(x,*n,*incx).cwiseProduct(make_vector(y,*n,*incy))).sum(); + else if(*incx<0 && *incy>0) *res = (make_vector(x,*n,-*incx).reverse().cwiseProduct(make_vector(y,*n,*incy))).sum(); + else if(*incx>0 && *incy<0) *res = (make_vector(x,*n,*incx).cwiseProduct(make_vector(y,*n,-*incy).reverse())).sum(); + else if(*incx<0 && *incy<0) *res = (make_vector(x,*n,-*incx).reverse().cwiseProduct(make_vector(y,*n,-*incy).reverse())).sum(); return 0; } @@ -88,9 +89,9 @@ RealScalar EIGEN_CAT(EIGEN_CAT(REAL_SCALAR_SUFFIX,SCALAR_SUFFIX),nrm2_)(int *n, Scalar* x = reinterpret_cast(px); if(*incx==1) - return vector(x,*n).stableNorm(); + return make_vector(x,*n).stableNorm(); - return vector(x,*n,*incx).stableNorm(); + return make_vector(x,*n,*incx).stableNorm(); } int EIGEN_CAT(EIGEN_CAT(SCALAR_SUFFIX,REAL_SCALAR_SUFFIX),rot_)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pc, RealScalar *ps) @@ -102,8 +103,8 @@ int EIGEN_CAT(EIGEN_CAT(SCALAR_SUFFIX,REAL_SCALAR_SUFFIX),rot_)(int *n, RealScal RealScalar c = *pc; RealScalar s = *ps; - StridedVectorType vx(vector(x,*n,std::abs(*incx))); - StridedVectorType vy(vector(y,*n,std::abs(*incy))); + StridedVectorType vx(make_vector(x,*n,std::abs(*incx))); + StridedVectorType vy(make_vector(y,*n,std::abs(*incy))); Reverse rvx(vx); Reverse 
rvy(vy); @@ -125,9 +126,8 @@ int EIGEN_CAT(EIGEN_CAT(SCALAR_SUFFIX,REAL_SCALAR_SUFFIX),scal_)(int *n, RealSca // std::cerr << "__scal " << *n << " " << alpha << " " << *incx << "\n"; - if(*incx==1) vector(x,*n) *= alpha; - else vector(x,*n,std::abs(*incx)) *= alpha; + if(*incx==1) make_vector(x,*n) *= alpha; + else make_vector(x,*n,std::abs(*incx)) *= alpha; return 0; } - diff --git a/blas/level1_impl.h b/blas/level1_impl.h index b08c2f6be..98e4c0963 100644 --- a/blas/level1_impl.h +++ b/blas/level1_impl.h @@ -17,11 +17,11 @@ int EIGEN_BLAS_FUNC(axpy)(int *n, RealScalar *palpha, RealScalar *px, int *incx, if(*n<=0) return 0; - if(*incx==1 && *incy==1) vector(y,*n) += alpha * vector(x,*n); - else if(*incx>0 && *incy>0) vector(y,*n,*incy) += alpha * vector(x,*n,*incx); - else if(*incx>0 && *incy<0) vector(y,*n,-*incy).reverse() += alpha * vector(x,*n,*incx); - else if(*incx<0 && *incy>0) vector(y,*n,*incy) += alpha * vector(x,*n,-*incx).reverse(); - else if(*incx<0 && *incy<0) vector(y,*n,-*incy).reverse() += alpha * vector(x,*n,-*incx).reverse(); + if(*incx==1 && *incy==1) make_vector(y,*n) += alpha * make_vector(x,*n); + else if(*incx>0 && *incy>0) make_vector(y,*n,*incy) += alpha * make_vector(x,*n,*incx); + else if(*incx>0 && *incy<0) make_vector(y,*n,-*incy).reverse() += alpha * make_vector(x,*n,*incx); + else if(*incx<0 && *incy>0) make_vector(y,*n,*incy) += alpha * make_vector(x,*n,-*incx).reverse(); + else if(*incx<0 && *incy<0) make_vector(y,*n,-*incy).reverse() += alpha * make_vector(x,*n,-*incx).reverse(); return 0; } @@ -35,7 +35,7 @@ int EIGEN_BLAS_FUNC(copy)(int *n, RealScalar *px, int *incx, RealScalar *py, int // be carefull, *incx==0 is allowed !! if(*incx==1 && *incy==1) - vector(y,*n) = vector(x,*n); + make_vector(y,*n) = make_vector(x,*n); else { if(*incx<0) x = x - (*n-1)*(*incx); @@ -57,8 +57,8 @@ int EIGEN_CAT(EIGEN_CAT(i,SCALAR_SUFFIX),amax_)(int *n, RealScalar *px, int *inc Scalar* x = reinterpret_cast(px); DenseIndex ret; - if(*incx==1) vector(x,*n).cwiseAbs().maxCoeff(&ret); - else vector(x,*n,std::abs(*incx)).cwiseAbs().maxCoeff(&ret); + if(*incx==1) make_vector(x,*n).cwiseAbs().maxCoeff(&ret); + else make_vector(x,*n,std::abs(*incx)).cwiseAbs().maxCoeff(&ret); return ret+1; } @@ -66,10 +66,10 @@ int EIGEN_CAT(EIGEN_CAT(i,SCALAR_SUFFIX),amin_)(int *n, RealScalar *px, int *inc { if(*n<=0) return 0; Scalar* x = reinterpret_cast(px); - + DenseIndex ret; - if(*incx==1) vector(x,*n).cwiseAbs().minCoeff(&ret); - else vector(x,*n,std::abs(*incx)).cwiseAbs().minCoeff(&ret); + if(*incx==1) make_vector(x,*n).cwiseAbs().minCoeff(&ret); + else make_vector(x,*n,std::abs(*incx)).cwiseAbs().minCoeff(&ret); return ret+1; } @@ -77,7 +77,7 @@ int EIGEN_BLAS_FUNC(rotg)(RealScalar *pa, RealScalar *pb, RealScalar *pc, RealSc { using std::sqrt; using std::abs; - + Scalar& a = *reinterpret_cast(pa); Scalar& b = *reinterpret_cast(pb); RealScalar* c = pc; @@ -143,8 +143,8 @@ int EIGEN_BLAS_FUNC(scal)(int *n, RealScalar *palpha, RealScalar *px, int *incx) Scalar* x = reinterpret_cast(px); Scalar alpha = *reinterpret_cast(palpha); - if(*incx==1) vector(x,*n) *= alpha; - else vector(x,*n,std::abs(*incx)) *= alpha; + if(*incx==1) make_vector(x,*n) *= alpha; + else make_vector(x,*n,std::abs(*incx)) *= alpha; return 0; } @@ -156,12 +156,11 @@ int EIGEN_BLAS_FUNC(swap)(int *n, RealScalar *px, int *incx, RealScalar *py, int Scalar* x = reinterpret_cast(px); Scalar* y = reinterpret_cast(py); - if(*incx==1 && *incy==1) vector(y,*n).swap(vector(x,*n)); - else if(*incx>0 && *incy>0) 
vector(y,*n,*incy).swap(vector(x,*n,*incx)); - else if(*incx>0 && *incy<0) vector(y,*n,-*incy).reverse().swap(vector(x,*n,*incx)); - else if(*incx<0 && *incy>0) vector(y,*n,*incy).swap(vector(x,*n,-*incx).reverse()); - else if(*incx<0 && *incy<0) vector(y,*n,-*incy).reverse().swap(vector(x,*n,-*incx).reverse()); + if(*incx==1 && *incy==1) make_vector(y,*n).swap(make_vector(x,*n)); + else if(*incx>0 && *incy>0) make_vector(y,*n,*incy).swap(make_vector(x,*n,*incx)); + else if(*incx>0 && *incy<0) make_vector(y,*n,-*incy).reverse().swap(make_vector(x,*n,*incx)); + else if(*incx<0 && *incy>0) make_vector(y,*n,*incy).swap(make_vector(x,*n,-*incx).reverse()); + else if(*incx<0 && *incy<0) make_vector(y,*n,-*incy).reverse().swap(make_vector(x,*n,-*incx).reverse()); return 1; } - diff --git a/blas/level1_real_impl.h b/blas/level1_real_impl.h index 8acecdfc6..02586d519 100644 --- a/blas/level1_real_impl.h +++ b/blas/level1_real_impl.h @@ -19,8 +19,8 @@ RealScalar EIGEN_BLAS_FUNC(asum)(int *n, RealScalar *px, int *incx) if(*n<=0) return 0; - if(*incx==1) return vector(x,*n).cwiseAbs().sum(); - else return vector(x,*n,std::abs(*incx)).cwiseAbs().sum(); + if(*incx==1) return make_vector(x,*n).cwiseAbs().sum(); + else return make_vector(x,*n,std::abs(*incx)).cwiseAbs().sum(); } // computes a vector-vector dot product. @@ -33,11 +33,11 @@ Scalar EIGEN_BLAS_FUNC(dot)(int *n, RealScalar *px, int *incx, RealScalar *py, i Scalar* x = reinterpret_cast(px); Scalar* y = reinterpret_cast(py); - if(*incx==1 && *incy==1) return (vector(x,*n).cwiseProduct(vector(y,*n))).sum(); - else if(*incx>0 && *incy>0) return (vector(x,*n,*incx).cwiseProduct(vector(y,*n,*incy))).sum(); - else if(*incx<0 && *incy>0) return (vector(x,*n,-*incx).reverse().cwiseProduct(vector(y,*n,*incy))).sum(); - else if(*incx>0 && *incy<0) return (vector(x,*n,*incx).cwiseProduct(vector(y,*n,-*incy).reverse())).sum(); - else if(*incx<0 && *incy<0) return (vector(x,*n,-*incx).reverse().cwiseProduct(vector(y,*n,-*incy).reverse())).sum(); + if(*incx==1 && *incy==1) return (make_vector(x,*n).cwiseProduct(make_vector(y,*n))).sum(); + else if(*incx>0 && *incy>0) return (make_vector(x,*n,*incx).cwiseProduct(make_vector(y,*n,*incy))).sum(); + else if(*incx<0 && *incy>0) return (make_vector(x,*n,-*incx).reverse().cwiseProduct(make_vector(y,*n,*incy))).sum(); + else if(*incx>0 && *incy<0) return (make_vector(x,*n,*incx).cwiseProduct(make_vector(y,*n,-*incy).reverse())).sum(); + else if(*incx<0 && *incy<0) return (make_vector(x,*n,-*incx).reverse().cwiseProduct(make_vector(y,*n,-*incy).reverse())).sum(); else return 0; } @@ -50,8 +50,8 @@ Scalar EIGEN_BLAS_FUNC(nrm2)(int *n, RealScalar *px, int *incx) Scalar* x = reinterpret_cast(px); - if(*incx==1) return vector(x,*n).stableNorm(); - else return vector(x,*n,std::abs(*incx)).stableNorm(); + if(*incx==1) return make_vector(x,*n).stableNorm(); + else return make_vector(x,*n,std::abs(*incx)).stableNorm(); } int EIGEN_BLAS_FUNC(rot)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pc, RealScalar *ps) @@ -64,8 +64,8 @@ int EIGEN_BLAS_FUNC(rot)(int *n, RealScalar *px, int *incx, RealScalar *py, int Scalar c = *reinterpret_cast(pc); Scalar s = *reinterpret_cast(ps); - StridedVectorType vx(vector(x,*n,std::abs(*incx))); - StridedVectorType vy(vector(y,*n,std::abs(*incy))); + StridedVectorType vx(make_vector(x,*n,std::abs(*incx))); + StridedVectorType vy(make_vector(y,*n,std::abs(*incy))); Reverse rvx(vx); Reverse rvy(vy); diff --git a/blas/level2_cplx_impl.h b/blas/level2_cplx_impl.h index 
b850b6cd1..afa9a7493 100644 --- a/blas/level2_cplx_impl.h +++ b/blas/level2_cplx_impl.h @@ -57,8 +57,8 @@ int EIGEN_BLAS_FUNC(hemv)(char *uplo, int *n, RealScalar *palpha, RealScalar *pa if(beta!=Scalar(1)) { - if(beta==Scalar(0)) vector(actual_y, *n).setZero(); - else vector(actual_y, *n) *= beta; + if(beta==Scalar(0)) make_vector(actual_y, *n).setZero(); + else make_vector(actual_y, *n) *= beta; } if(alpha!=Scalar(0)) diff --git a/blas/level2_impl.h b/blas/level2_impl.h index 5f3941975..233c7b753 100644 --- a/blas/level2_impl.h +++ b/blas/level2_impl.h @@ -58,8 +58,8 @@ int EIGEN_BLAS_FUNC(gemv)(char *opa, int *m, int *n, RealScalar *palpha, RealSca if(beta!=Scalar(1)) { - if(beta==Scalar(0)) vector(actual_c, actual_m).setZero(); - else vector(actual_c, actual_m) *= beta; + if(beta==Scalar(0)) make_vector(actual_c, actual_m).setZero(); + else make_vector(actual_c, actual_m) *= beta; } if(code>=4 || func[code]==0) @@ -206,7 +206,7 @@ int EIGEN_BLAS_FUNC(gbmv)(char *trans, int *m, int *n, int *kl, int *ku, RealSca Scalar alpha = *reinterpret_cast(palpha); Scalar beta = *reinterpret_cast(pbeta); int coeff_rows = *kl+*ku+1; - + int info = 0; if(OP(*trans)==INVALID) info = 1; else if(*m<0) info = 2; @@ -218,26 +218,26 @@ int EIGEN_BLAS_FUNC(gbmv)(char *trans, int *m, int *n, int *kl, int *ku, RealSca else if(*incy==0) info = 13; if(info) return xerbla_(SCALAR_SUFFIX_UP"GBMV ",&info,6); - + if(*m==0 || *n==0 || (alpha==Scalar(0) && beta==Scalar(1))) return 0; - + int actual_m = *m; int actual_n = *n; if(OP(*trans)!=NOTR) std::swap(actual_m,actual_n); - + Scalar* actual_x = get_compact_vector(x,actual_n,*incx); Scalar* actual_y = get_compact_vector(y,actual_m,*incy); - + if(beta!=Scalar(1)) { - if(beta==Scalar(0)) vector(actual_y, actual_m).setZero(); - else vector(actual_y, actual_m) *= beta; + if(beta==Scalar(0)) make_vector(actual_y, actual_m).setZero(); + else make_vector(actual_y, actual_m) *= beta; } - + MatrixType mat_coeffs(a,coeff_rows,*n,*lda); - + int nb = std::min(*n,(*m)+(*ku)); for(int j=0; j(pa); Scalar* x = reinterpret_cast(px); int coeff_rows = *k + 1; - + int info = 0; if(UPLO(*uplo)==INVALID) info = 1; else if(OP(*opa)==INVALID) info = 2; @@ -283,37 +283,37 @@ int EIGEN_BLAS_FUNC(tbmv)(char *uplo, char *opa, char *diag, int *n, int *k, Rea else if(*incx==0) info = 9; if(info) return xerbla_(SCALAR_SUFFIX_UP"TBMV ",&info,6); - + if(*n==0) return 0; - + int actual_n = *n; - + Scalar* actual_x = get_compact_vector(x,actual_n,*incx); - + MatrixType mat_coeffs(a,coeff_rows,*n,*lda); - + int ku = UPLO(*uplo)==UPPER ? *k : 0; int kl = UPLO(*uplo)==LOWER ? 
*k : 0; - + for(int j=0; j<*n; ++j) { int start = std::max(0,j - ku); int end = std::min((*m)-1,j + kl); int len = end - start + 1; int offset = (ku) - j + start; - + if(OP(*trans)==NOTR) - vector(actual_y+start,len) += (alpha*actual_x[j]) * mat_coeffs.col(j).segment(offset,len); + make_vector(actual_y+start,len) += (alpha*actual_x[j]) * mat_coeffs.col(j).segment(offset,len); else if(OP(*trans)==TR) - actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).transpose() * vector(actual_x+start,len) ).value(); + actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).transpose() * make_vector(actual_x+start,len) ).value(); else - actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).adjoint() * vector(actual_x+start,len) ).value(); - } - + actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).adjoint() * make_vector(actual_x+start,len) ).value(); + } + if(actual_x!=x) delete[] actual_x; if(actual_y!=y) delete[] copy_back(actual_y,y,actual_m,*incy); - + return 0; } #endif @@ -362,7 +362,7 @@ int EIGEN_BLAS_FUNC(tbsv)(char *uplo, char *op, char *diag, int *n, int *k, Real Scalar* a = reinterpret_cast(pa); Scalar* x = reinterpret_cast(px); int coeff_rows = *k+1; - + int info = 0; if(UPLO(*uplo)==INVALID) info = 1; else if(OP(*op)==INVALID) info = 2; @@ -373,22 +373,22 @@ int EIGEN_BLAS_FUNC(tbsv)(char *uplo, char *op, char *diag, int *n, int *k, Real else if(*incx==0) info = 9; if(info) return xerbla_(SCALAR_SUFFIX_UP"TBSV ",&info,6); - + if(*n==0 || (*k==0 && DIAG(*diag)==UNIT)) return 0; - + int actual_n = *n; - + Scalar* actual_x = get_compact_vector(x,actual_n,*incx); - + int code = OP(*op) | (UPLO(*uplo) << 2) | (DIAG(*diag) << 3); if(code>=16 || func[code]==0) return 0; func[code](*n, *k, a, *lda, actual_x); - + if(actual_x!=x) delete[] copy_back(actual_x,x,actual_n,*incx); - + return 0; } @@ -521,4 +521,3 @@ int EIGEN_BLAS_FUNC(tpsv)(char *uplo, char *opa, char *diag, int *n, RealScalar return 1; } - diff --git a/blas/level2_real_impl.h b/blas/level2_real_impl.h index 8d56eaaa1..9722a4674 100644 --- a/blas/level2_real_impl.h +++ b/blas/level2_real_impl.h @@ -51,8 +51,8 @@ int EIGEN_BLAS_FUNC(symv) (char *uplo, int *n, RealScalar *palpha, RealScalar *p if(beta!=Scalar(1)) { - if(beta==Scalar(0)) vector(actual_y, *n).setZero(); - else vector(actual_y, *n) *= beta; + if(beta==Scalar(0)) make_vector(actual_y, *n).setZero(); + else make_vector(actual_y, *n) *= beta; } int code = UPLO(*uplo); @@ -179,7 +179,7 @@ int EIGEN_BLAS_FUNC(syr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px Scalar* x_cpy = get_compact_vector(x,*n,*incx); Scalar* y_cpy = get_compact_vector(y,*n,*incy); - + int code = UPLO(*uplo); if(code>=2 || func[code]==0) return 0; @@ -366,5 +366,3 @@ int EIGEN_BLAS_FUNC(ger)(int *m, int *n, Scalar *palpha, Scalar *px, int *incx, return 1; } - - From d992634fbce54e2ad367e1f57834503b5a3b5eaf Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 1 Apr 2014 11:31:21 +0200 Subject: [PATCH 056/158] Fix bug #776: it seems that mingw does not support weak linking --- blas/xerbla.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blas/xerbla.cpp b/blas/xerbla.cpp index 0d57710fe..0422f79b7 100644 --- a/blas/xerbla.cpp +++ b/blas/xerbla.cpp @@ -1,7 +1,7 @@ #include -#if (defined __GNUC__) +#if (defined __GNUC__) && (!defined __MINGW32__) #define EIGEN_WEAK_LINKING __attribute__ ((weak)) #else #define EIGEN_WEAK_LINKING From ec65e6648cf2483bd5e10a3010395ffb42a82a77 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 1 Apr 
2014 11:45:43 +0200
Subject: [PATCH 057/158] bug #775: propagate generator when working around cmake bug #9220

---
 cmake/language_support.cmake | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cmake/language_support.cmake b/cmake/language_support.cmake
index d687b71f6..93f8a8fd8 100644
--- a/cmake/language_support.cmake
+++ b/cmake/language_support.cmake
@@ -33,7 +33,7 @@ function(workaround_9220 language language_works)
   file(WRITE ${CMAKE_BINARY_DIR}/language_tests/${language}/CMakeLists.txt ${text})
   execute_process(
-    COMMAND ${CMAKE_COMMAND} .
+    COMMAND ${CMAKE_COMMAND} . -G "${CMAKE_GENERATOR}"
     WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/language_tests/${language}
     RESULT_VARIABLE return_code
     OUTPUT_QUIET
@@ -64,3 +64,4 @@ endfunction(workaround_9220)
 #message("CXX_language_works = ${CXX_language_works}")
 #workaround_9220(CXXp CXXp_language_works)
 #message("CXXp_language_works = ${CXXp_language_works}")
+

From ceae5b4145ba2a70793c58cff05a43e95e721ff5 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Tue, 1 Apr 2014 11:52:23 +0200
Subject: [PATCH 058/158] Fix lapack build

---
 lapack/eigenvalues.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lapack/eigenvalues.cpp b/lapack/eigenvalues.cpp
index a1526ebcd..6141032ab 100644
--- a/lapack/eigenvalues.cpp
+++ b/lapack/eigenvalues.cpp
@@ -64,14 +64,14 @@ EIGEN_LAPACK_FUNC(syev,(char *jobz, char *uplo, int* n, Scalar* a, int *lda, Sca
   if(eig.info()==NoConvergence)
   {
-    vector(w,*n).setZero();
+    make_vector(w,*n).setZero();
     if(computeVectors)
       matrix(a,*n,*n,*lda).setIdentity();
     //*info = 1;
     return 0;
   }

-  vector(w,*n) = eig.eigenvalues();
+  make_vector(w,*n) = eig.eigenvalues();
   if(computeVectors)
     matrix(a,*n,*n,*lda) = eig.eigenvectors();

From 56c4851323376876977a9d408cb5aec2a6159c54 Mon Sep 17 00:00:00 2001
From: Florian George
Date: Tue, 1 Apr 2014 15:52:25 +0200
Subject: [PATCH 059/158] Fixed typo: symmretric -> symmetric

---
 bench/btl/data/action_settings.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bench/btl/data/action_settings.txt b/bench/btl/data/action_settings.txt
index e32213e22..0244fa59f 100644
--- a/bench/btl/data/action_settings.txt
+++ b/bench/btl/data/action_settings.txt
@@ -14,6 +14,6 @@ partial_lu_decomp ; "{/*1.5 Partial LU decomposition}" ; "matrix size" ; 4:3000
 tridiagonalization ; "{/*1.5 Tridiagonalization}" ; "matrix size" ; 4:3000
 hessenberg ; "{/*1.5 Hessenberg decomposition}" ; "matrix size" ; 4:3000
 symv ; "{/*1.5 symmetric matrix vector product}" ; "matrix size" ; 4:3000
-syr2 ; "{/*1.5 symmretric rank-2 update (A += u^T v + u v^T)}" ; "matrix size" ; 4:3000
+syr2 ; "{/*1.5 symmetric rank-2 update (A += u^T v + u v^T)}" ; "matrix size" ; 4:3000
 ger ; "{/*1.5 general rank-1 update (A += u v^T)}" ; "matrix size" ; 4:3000
 rot ; "{/*1.5 apply rotation in the plane}" ; "vector size" ; 4:1000000
\ No newline at end of file

From 1cb8de12501b3db3d12774774cbbe12983243cee Mon Sep 17 00:00:00 2001
From: Christoph Hertzberg
Date: Tue, 1 Apr 2014 17:44:48 +0200
Subject: [PATCH 060/158] Make some actual verifications inside the autodiff unit test

---
 unsupported/test/autodiff.cpp | 45 ++++++++++++++++++-----------------
 1 file changed, 23 insertions(+), 22 deletions(-)

diff --git a/unsupported/test/autodiff.cpp b/unsupported/test/autodiff.cpp
index 6eb417e8d..087e7c542 100644
--- a/unsupported/test/autodiff.cpp
+++ b/unsupported/test/autodiff.cpp
@@ -127,46 +127,47 @@ template<typename Func> void forward_jacobian(const Func& f)
   VERIFY_IS_APPROX(j, jref);
 }

+
+// TODO also check
actual derivatives! void test_autodiff_scalar() { - std::cerr << foo(1,2) << "\n"; + Vector2f p = Vector2f::Random(); typedef AutoDiffScalar AD; - AD ax(1,Vector2f::UnitX()); - AD ay(2,Vector2f::UnitY()); + AD ax(p.x(),Vector2f::UnitX()); + AD ay(p.y(),Vector2f::UnitY()); AD res = foo(ax,ay); - std::cerr << res.value() << " <> " - << res.derivatives().transpose() << "\n\n"; + VERIFY_IS_APPROX(res.value(), foo(p.x(),p.y())); } +// TODO also check actual derivatives! void test_autodiff_vector() { - std::cerr << foo(Vector2f(1,2)) << "\n"; + Vector2f p = Vector2f::Random(); typedef AutoDiffScalar AD; typedef Matrix VectorAD; - VectorAD p(AD(1),AD(-1)); - p.x().derivatives() = Vector2f::UnitX(); - p.y().derivatives() = Vector2f::UnitY(); + VectorAD ap = p.cast(); + ap.x().derivatives() = Vector2f::UnitX(); + ap.y().derivatives() = Vector2f::UnitY(); - AD res = foo(p); - std::cerr << res.value() << " <> " - << res.derivatives().transpose() << "\n\n"; + AD res = foo(ap); + VERIFY_IS_APPROX(res.value(), foo(p)); } void test_autodiff_jacobian() { - for(int i = 0; i < g_repeat; i++) { - CALL_SUBTEST(( forward_jacobian(TestFunc1()) )); - CALL_SUBTEST(( forward_jacobian(TestFunc1()) )); - CALL_SUBTEST(( forward_jacobian(TestFunc1()) )); - CALL_SUBTEST(( forward_jacobian(TestFunc1()) )); - CALL_SUBTEST(( forward_jacobian(TestFunc1(3,3)) )); - } + CALL_SUBTEST(( forward_jacobian(TestFunc1()) )); + CALL_SUBTEST(( forward_jacobian(TestFunc1()) )); + CALL_SUBTEST(( forward_jacobian(TestFunc1()) )); + CALL_SUBTEST(( forward_jacobian(TestFunc1()) )); + CALL_SUBTEST(( forward_jacobian(TestFunc1(3,3)) )); } void test_autodiff() { - test_autodiff_scalar(); - test_autodiff_vector(); -// test_autodiff_jacobian(); + for(int i = 0; i < g_repeat; i++) { + CALL_SUBTEST_1( test_autodiff_scalar() ); + CALL_SUBTEST_2( test_autodiff_vector() ); + CALL_SUBTEST_3( test_autodiff_jacobian() ); + } } From 8044b00a7fa30af20cc184fa2991bd0acf0f9aa3 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 3 Apr 2014 23:41:47 +0200 Subject: [PATCH 061/158] bug #782: Workaround for gcc <= 4.4 compilation error on the NEON PacketMath code. --- Eigen/src/Core/arch/NEON/PacketMath.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 05e891df2..c2f28e25c 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -95,6 +95,7 @@ template<> struct packet_traits : default_packet_traits // workaround gcc 4.2, 4.3 and 4.4 compilatin issue EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); } EIGEN_STRONG_INLINE float32x2_t vld1_f32 (const float* x) { return ::vld1_f32 ((const float32_t*)x); } +EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32 (const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); } EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); } EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); } #endif From 096af597992609be62ec1104fdee476e3065f2e4 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Fri, 4 Apr 2014 17:48:37 +0200 Subject: [PATCH 062/158] Fix bug #784: Assert if assigning a product to a triangularView does not match the size. 
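For the record, a minimal sketch of the misuse the new assertion catches
(sizes invented for the example; the second assignment now aborts via
eigen_assert in debug builds instead of silently writing out of bounds):

    #include <Eigen/Dense>
    using namespace Eigen;

    int main()
    {
      MatrixXd B = MatrixXd::Random(4,3);
      MatrixXd C(3,3), D(4,4);
      C.triangularView<Lower>() = B.transpose() * B;  // 3x3 into 3x3: fine
      D.triangularView<Lower>() = B.transpose() * B;  // 3x3 product into a
                                                      // 4x4 view: asserts
      return 0;
    }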
--- Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index ffa871cae..225b994d1 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -265,6 +265,8 @@ template template TriangularView& TriangularView::assignProduct(const ProductBase& prod, const Scalar& alpha) { + eigen_assert(m_matrix.rows() == prod.rows() && m_matrix.cols() == prod.cols()); + general_product_to_triangular_selector::run(m_matrix.const_cast_derived(), prod.derived(), alpha); return *this; From 5afcb4965c9ccdce05e1852dd22b552559696f61 Mon Sep 17 00:00:00 2001 From: Jitse Niesen Date: Fri, 4 Apr 2014 16:48:13 +0100 Subject: [PATCH 063/158] Remove out-dated comment in cholesky test. --- test/cholesky.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/test/cholesky.cpp b/test/cholesky.cpp index d4d90e467..569318f83 100644 --- a/test/cholesky.cpp +++ b/test/cholesky.cpp @@ -82,10 +82,6 @@ template void cholesky(const MatrixType& m) symm += a1 * a1.adjoint(); } - // to test if really Cholesky only uses the upper triangular part, uncomment the following - // FIXME: currently that fails !! - //symm.template part().setZero(); - { SquareMatrixType symmUp = symm.template triangularView(); SquareMatrixType symmLo = symm.template triangularView(); From b446ff037e9a8573da55c180b2e72e07dd3de4e6 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 4 Apr 2014 14:12:24 -0700 Subject: [PATCH 064/158] Deleted some dead code. --- Eigen/src/Core/products/GeneralMatrixMatrix.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index dd9d79657..b35625a11 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -241,7 +241,6 @@ class level3_blocking protected: LhsScalar* m_blockA; RhsScalar* m_blockB; - RhsScalar* m_blockW; DenseIndex m_mc; DenseIndex m_nc; @@ -250,7 +249,7 @@ class level3_blocking public: level3_blocking() - : m_blockA(0), m_blockB(0), m_blockW(0), m_mc(0), m_nc(0), m_kc(0) + : m_blockA(0), m_blockB(0), m_mc(0), m_nc(0), m_kc(0) {} inline DenseIndex mc() const { return m_mc; } @@ -259,7 +258,6 @@ class level3_blocking inline LhsScalar* blockA() { return m_blockA; } inline RhsScalar* blockB() { return m_blockB; } - inline RhsScalar* blockW() { return m_blockW; } }; template From 3b2321e3ab157b314286227e08d738d74bfbf5d1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 4 Apr 2014 17:08:47 -0700 Subject: [PATCH 065/158] Updated the geo_parametrizedline_2 test for AVX. 
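Background sketch (illustrative, mirroring the test's pattern): EIGEN_ALIGN_DEFAULT
requests alignment on the widest packet boundary, which grows from 16 to 32 bytes once
AVX is enabled, and the arrays are presumably doubled to keep headroom for the
placement-new'd objects, including the deliberately misaligned one at buffer+1:

  #include <Eigen/Geometry>
  #include <new>
  using namespace Eigen;

  int main()
  {
    // A ParametrizedLine<float,4> stores origin + direction, i.e. 8 floats.
    EIGEN_ALIGN_DEFAULT float aligned_buf[16];
    float unaligned_buf[16+1];
    ParametrizedLine<float,4> *p1 =
        ::new(reinterpret_cast<void*>(aligned_buf)) ParametrizedLine<float,4>();
    // The unaligned copy must opt out of alignment requirements.
    ParametrizedLine<float,4,DontAlign> *p2 =
        ::new(reinterpret_cast<void*>(unaligned_buf+1)) ParametrizedLine<float,4,DontAlign>();
    p1->~ParametrizedLine();
    p2->~ParametrizedLine();
  }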
--- test/geo_parametrizedline.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/geo_parametrizedline.cpp b/test/geo_parametrizedline.cpp index 427327cd7..58d8eb7f5 100644 --- a/test/geo_parametrizedline.cpp +++ b/test/geo_parametrizedline.cpp @@ -66,9 +66,9 @@ template void parametrizedline_alignment() typedef ParametrizedLine Line4a; typedef ParametrizedLine Line4u; - EIGEN_ALIGN_DEFAULT Scalar array1[8]; - EIGEN_ALIGN_DEFAULT Scalar array2[8]; - EIGEN_ALIGN_DEFAULT Scalar array3[8+1]; + EIGEN_ALIGN_DEFAULT Scalar array1[16]; + EIGEN_ALIGN_DEFAULT Scalar array2[16]; + EIGEN_ALIGN_DEFAULT Scalar array3[16+1]; Scalar* array3u = array3+1; Line4a *p1 = ::new(reinterpret_cast(array1)) Line4a; From a91a7a1964305311133858de96b845da49389922 Mon Sep 17 00:00:00 2001 From: Jitse Niesen Date: Mon, 7 Apr 2014 14:14:48 +0100 Subject: [PATCH 066/158] doc: Add references to Cholesky methods in SelfAdjointView. --- Eigen/Cholesky | 6 ++++-- Eigen/src/Cholesky/LDLT.h | 6 ++++-- Eigen/src/Cholesky/LLT.h | 6 ++++-- doc/Doxyfile.in | 3 ++- 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/Eigen/Cholesky b/Eigen/Cholesky index f727f5d89..7314d326c 100644 --- a/Eigen/Cholesky +++ b/Eigen/Cholesky @@ -10,9 +10,11 @@ * * * This module provides two variants of the Cholesky decomposition for selfadjoint (hermitian) matrices. - * Those decompositions are accessible via the following MatrixBase methods: - * - MatrixBase::llt(), + * Those decompositions are also accessible via the following methods: + * - MatrixBase::llt() * - MatrixBase::ldlt() + * - SelfAdjointView::llt() + * - SelfAdjointView::ldlt() * * \code * #include diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h index b43e85e7f..efac7fe40 100644 --- a/Eigen/src/Cholesky/LDLT.h +++ b/Eigen/src/Cholesky/LDLT.h @@ -43,7 +43,7 @@ namespace internal { * Remember that Cholesky decompositions are not rank-revealing. Also, do not use a Cholesky * decomposition to determine whether a system of equations has a solution. * - * \sa MatrixBase::ldlt(), class LLT + * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt(), class LLT */ template class LDLT { @@ -179,7 +179,7 @@ template class LDLT * least-square solution of \f$ D y_3 = y_2 \f$ is computed. This does not mean that this function * computes the least-square solution of \f$ A x = b \f$ is \f$ A \f$ is singular. 
* - * \sa MatrixBase::ldlt() + * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt() */ template inline const internal::solve_retval @@ -582,6 +582,7 @@ MatrixType LDLT::reconstructedMatrix() const #ifndef __CUDACC__ /** \cholesky_module * \returns the Cholesky decomposition with full pivoting without square root of \c *this + * \sa MatrixBase::ldlt() */ template inline const LDLT::PlainObject, UpLo> @@ -592,6 +593,7 @@ SelfAdjointView::ldlt() const /** \cholesky_module * \returns the Cholesky decomposition with full pivoting without square root of \c *this + * \sa SelfAdjointView::ldlt() */ template inline const LDLT::PlainObject> diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index 2201c641e..45ed8438f 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -41,7 +41,7 @@ template struct LLT_Traits; * Example: \include LLT_example.cpp * Output: \verbinclude LLT_example.out * - * \sa MatrixBase::llt(), class LDLT + * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT */ /* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH) * Note that during the decomposition, only the upper triangular part of A is considered. Therefore, @@ -115,7 +115,7 @@ template class LLT * Example: \include LLT_solve.cpp * Output: \verbinclude LLT_solve.out * - * \sa solveInPlace(), MatrixBase::llt() + * \sa solveInPlace(), MatrixBase::llt(), SelfAdjointView::llt() */ template inline const internal::solve_retval @@ -468,6 +468,7 @@ MatrixType LLT::reconstructedMatrix() const #ifndef __CUDACC__ /** \cholesky_module * \returns the LLT decomposition of \c *this + * \sa SelfAdjointView::llt() */ template inline const LLT::PlainObject> @@ -478,6 +479,7 @@ MatrixBase::llt() const /** \cholesky_module * \returns the LLT decomposition of \c *this + * \sa SelfAdjointView::llt() */ template inline const LLT::PlainObject, UpLo> diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in index 85af9f1d4..7bbf693a0 100644 --- a/doc/Doxyfile.in +++ b/doc/Doxyfile.in @@ -223,7 +223,7 @@ ALIASES = "only_for_vectors=This is only for vectors (either row- "note_about_using_kernel_to_study_multiple_solutions=If you need a complete analysis of the space of solutions, take the one solution obtained by this method and add to it elements of the kernel, as determined by kernel()." \ "note_about_checking_solutions=This method just tries to find as good a solution as possible. If you want to check whether a solution exists or if it is accurate, just call this function to get a result and then compute the error of this result, or use MatrixBase::isApprox() directly, for instance like this: \code bool a_solution_exists = (A*result).isApprox(b, precision); \endcode This method avoids dividing by zero, so that the non-existence of a solution doesn't by itself mean that you'll get \c inf or \c nan values." \ "note_try_to_help_rvo=This function returns the result by value. In order to make that efficient, it is implemented as just a return statement using a special constructor, hopefully allowing the compiler to perform a RVO (return value optimization)." \ - "nonstableyet=\warning This is not considered to be part of the stable public API yet. Changes may happen in future releases. See \ref Experimental \"Experimental parts of Eigen\" + "nonstableyet=\warning This is not considered to be part of the stable public API yet. Changes may happen in future releases. 
See \ref Experimental \"Experimental parts of Eigen\"" ALIASES += "eigenAutoToc= " @@ -1583,6 +1583,7 @@ PREDEFINED = EIGEN_EMPTY_STRUCT \ EIGEN_VECTORIZE \ EIGEN_QT_SUPPORT \ EIGEN_STRONG_INLINE=inline \ + EIGEN_DEVICE_FUNC= \ "EIGEN2_SUPPORT_STAGE=99" \ "EIGEN_MAKE_CWISE_BINARY_OP(METHOD,FUNCTOR)=template const CwiseBinaryOp, const Derived, const OtherDerived> METHOD(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const;" \ "EIGEN_CWISE_PRODUCT_RETURN_TYPE(LHS,RHS)=CwiseBinaryOp, const LHS, const RHS>" From a1fcf599faa023d1752b37aef215d80286e76839 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 10 Apr 2014 11:19:37 -0700 Subject: [PATCH 067/158] Silenced a compilation warning produced by nvcc. --- Eigen/src/Core/util/Memory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 0f8ab065a..1a1d3e98d 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -89,7 +89,7 @@ inline void throw_std_bad_alloc() #ifdef EIGEN_EXCEPTIONS throw std::bad_alloc(); #else - std::size_t huge = -1; + std::size_t huge = -1ULL; new int[huge]; #endif } From 1b333c89c9762795fda69f29a1239541a712f4f1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 10 Apr 2014 17:43:13 -0700 Subject: [PATCH 068/158] Updated my previous fix to avoid introducing a compilation warning on ARM platforms. --- Eigen/src/Core/util/Memory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 1a1d3e98d..4988be5d9 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -89,7 +89,7 @@ inline void throw_std_bad_alloc() #ifdef EIGEN_EXCEPTIONS throw std::bad_alloc(); #else - std::size_t huge = -1ULL; + std::size_t huge = static_cast(-1); new int[huge]; #endif } From 91288e9bf9d6a9c3d63a9152e863b4390d0a40c7 Mon Sep 17 00:00:00 2001 From: Freddie Witherden Date: Sat, 12 Apr 2014 12:53:09 +0100 Subject: [PATCH 069/158] Add include LevenbergMarquardt in CMakeLists.txt. This fixes bug #768. 
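For reference, a sketch of what the corrected install layout enables (the install
prefix is hypothetical):

  // Headers previously installed under <prefix>/include/eigen3/Eigen/src/...,
  // which did not match the module's own relative includes. With the fix they
  // land under .../unsupported/Eigen/src/..., so an installed tree resolves
  // the documented entry point:
  #include <unsupported/Eigen/LevenbergMarquardt>

  int main() { return 0; }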
--- unsupported/Eigen/src/CMakeLists.txt | 1 + unsupported/Eigen/src/LevenbergMarquardt/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/src/CMakeLists.txt b/unsupported/Eigen/src/CMakeLists.txt index f3180b52b..8eb2808e3 100644 --- a/unsupported/Eigen/src/CMakeLists.txt +++ b/unsupported/Eigen/src/CMakeLists.txt @@ -2,6 +2,7 @@ ADD_SUBDIRECTORY(AutoDiff) ADD_SUBDIRECTORY(BVH) ADD_SUBDIRECTORY(FFT) ADD_SUBDIRECTORY(IterativeSolvers) +ADD_SUBDIRECTORY(LevenbergMarquardt) ADD_SUBDIRECTORY(MatrixFunctions) ADD_SUBDIRECTORY(MoreVectorization) ADD_SUBDIRECTORY(NonLinearOptimization) diff --git a/unsupported/Eigen/src/LevenbergMarquardt/CMakeLists.txt b/unsupported/Eigen/src/LevenbergMarquardt/CMakeLists.txt index 8513803ce..d9690854d 100644 --- a/unsupported/Eigen/src/LevenbergMarquardt/CMakeLists.txt +++ b/unsupported/Eigen/src/LevenbergMarquardt/CMakeLists.txt @@ -2,5 +2,5 @@ FILE(GLOB Eigen_LevenbergMarquardt_SRCS "*.h") INSTALL(FILES ${Eigen_LevenbergMarquardt_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/LevenbergMarquardt COMPONENT Devel + DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/LevenbergMarquardt COMPONENT Devel ) From a803ff18a963fc23ddb3dccf33ed7058af415f39 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 12 Apr 2014 20:24:05 -0700 Subject: [PATCH 070/158] Fixed a typo in cuda_basic.cu --- test/cuda_basic.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/cuda_basic.cu b/test/cuda_basic.cu index aa7f7a599..4c7e96c10 100644 --- a/test/cuda_basic.cu +++ b/test/cuda_basic.cu @@ -129,7 +129,7 @@ void test_cuda_basic() CALL_SUBTEST( run_and_compare_to_cuda(prod(), nthreads, in, out) ); CALL_SUBTEST( run_and_compare_to_cuda(diagonal(), nthreads, in, out) ); - CALL_SUBTEST( run_and_compare_to_c(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_cuda(diagonal(), nthreads, in, out) ); CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues(), nthreads, in, out) ); CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues(), nthreads, in, out) ); From 7903d3f27b275040702ce30eac8d329d6f571205 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 12 Apr 2014 23:39:37 -0700 Subject: [PATCH 071/158] Updated the compiler flags to enable nvcc to work with clang. 
--- test/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 62cbedae7..c2d827051 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -315,6 +315,9 @@ find_package(CUDA) if(CUDA_FOUND) set(CUDA_PROPAGATE_HOST_FLAGS OFF) + if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE) + endif() cuda_include_directories(${CMAKE_CURRENT_BINARY_DIR}) set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") From 0587db8bf5ca5d5eb6fb8df1c02abddd5b5718ba Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 14 Apr 2014 11:43:08 +0200 Subject: [PATCH 072/158] bug #793: fix overflow in EigenSolver and add respective regression unit test --- Eigen/src/Eigenvalues/EigenSolver.h | 15 ++++++++++++++- test/eigensolver_generic.cpp | 13 +++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Eigenvalues/EigenSolver.h b/Eigen/src/Eigenvalues/EigenSolver.h index bf20e03ef..739466949 100644 --- a/Eigen/src/Eigenvalues/EigenSolver.h +++ b/Eigen/src/Eigenvalues/EigenSolver.h @@ -366,6 +366,7 @@ EigenSolver::compute(const MatrixType& matrix, bool computeEigenvect { using std::sqrt; using std::abs; + using std::max; eigen_assert(matrix.cols() == matrix.rows()); // Reduce to real Schur form. @@ -390,7 +391,19 @@ EigenSolver::compute(const MatrixType& matrix, bool computeEigenvect else { Scalar p = Scalar(0.5) * (m_matT.coeff(i, i) - m_matT.coeff(i+1, i+1)); - Scalar z = sqrt(abs(p * p + m_matT.coeff(i+1, i) * m_matT.coeff(i, i+1))); + Scalar z; + // Compute z = sqrt(abs(p * p + m_matT.coeff(i+1, i) * m_matT.coeff(i, i+1))); + // without overflow + { + Scalar t0 = m_matT.coeff(i+1, i); + Scalar t1 = m_matT.coeff(i, i+1); + Scalar maxval = (max)(abs(p),(max)(abs(t0),abs(t1))); + t0 /= maxval; + t1 /= maxval; + Scalar p0 = p/maxval; + z = maxval * sqrt(abs(p0 * p0 + t0 * t1)); + } + m_eivalues.coeffRef(i) = ComplexScalar(m_matT.coeff(i+1, i+1) + p, z); m_eivalues.coeffRef(i+1) = ComplexScalar(m_matT.coeff(i+1, i+1) + p, -z); i += 2; diff --git a/test/eigensolver_generic.cpp b/test/eigensolver_generic.cpp index 005af81eb..91383b5cf 100644 --- a/test/eigensolver_generic.cpp +++ b/test/eigensolver_generic.cpp @@ -121,5 +121,18 @@ void test_eigensolver_generic() } ); + // regression test for bug 793 +#ifdef EIGEN_TEST_PART_2 + { + MatrixXd a(3,3); + a << 0, 0, 1, + 1, 1, 1, + 1, 1e+200, 1; + Eigen::EigenSolver eig(a); + VERIFY_IS_APPROX(a * eig.pseudoEigenvectors(), eig.pseudoEigenvectors() * eig.pseudoEigenvalueMatrix()); + VERIFY_IS_APPROX(a * eig.eigenvectors(), eig.eigenvectors() * eig.eigenvalues().asDiagonal()); + } +#endif + TEST_SET_BUT_UNUSED_VARIABLE(s) } From 148acf8e4fb71294703d4d1deafaf52829535ab7 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 14 Apr 2014 13:52:16 +0200 Subject: [PATCH 073/158] bug #790: fix overflow in real_2x2_jacobi_svd --- Eigen/src/SVD/JacobiSVD.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h index eee31ca97..439eb5d29 100644 --- a/Eigen/src/SVD/JacobiSVD.h +++ b/Eigen/src/SVD/JacobiSVD.h @@ -415,6 +415,7 @@ void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q, JacobiRotation *j_right) { using std::sqrt; + using std::abs; Matrix m; m << numext::real(matrix.coeff(p,p)), numext::real(matrix.coeff(p,q)), numext::real(matrix.coeff(q,p)), numext::real(matrix.coeff(q,q)); @@ -428,9 +429,11 @@ void real_2x2_jacobi_svd(const 
MatrixType& matrix, Index p, Index q, } else { - RealScalar u = d / t; - rot1.c() = RealScalar(1) / sqrt(RealScalar(1) + numext::abs2(u)); - rot1.s() = rot1.c() * u; + RealScalar t2d2 = numext::hypot(t,d); + rot1.c() = abs(t)/t2d2; + rot1.s() = d/t2d2; + if(tmakeJacobi(m,0,1); From feaf7c7e6d01a4804cee5949a01ece1f8a46866f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 14 Apr 2014 10:44:17 -0700 Subject: [PATCH 074/158] Optimized SSE unaligned loads and stores when compiling a 64bit target with a recent version of gcc (ie gcc 4.8). --- Eigen/src/Core/arch/SSE/PacketMath.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index ea05a3415..bc17726b4 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -271,14 +271,17 @@ template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { E // TODO: do the same for MSVC (ICC is compatible) // NOTE: with the code below, MSVC's compiler crashes! -#if defined(__GNUC__) && defined(__i386__) +#if defined(__GNUC__) && (defined(__i386__) || (defined(__x86_64) && EIGEN_GNUC_AT_LEAST(4, 8))) // bug 195: gcc/i386 emits weird x87 fldl/fstpl instructions for _mm_load_sd #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1 + #define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 1 #elif defined(__clang__) // bug 201: Segfaults in __mm_loadh_pd with clang 2.8 #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1 + #define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 0 #else #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 0 + #define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 0 #endif template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) @@ -338,8 +341,12 @@ template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& f template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE +#if EIGEN_AVOID_CUSTOM_UNALIGNED_STORES + _mm_storeu_pd(to, from); +#else _mm_storel_pd((to), from); _mm_storeh_pd((to+1), from); +#endif } template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast(to), _mm_castps_pd(from)); } template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast(to), _mm_castsi128_pd(from)); } From 7098e6d976ee8d5b25776e749d3ef6e66a302829 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 14 Apr 2014 21:57:49 +0200 Subject: [PATCH 075/158] Add isfinite overload for complexes. --- Eigen/src/Core/MathFunctions.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 63fb92b75..20fc2be74 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -669,6 +669,15 @@ bool (isfinite)(const T& x) return x::highest() && x>NumTraits::lowest(); } +template +EIGEN_DEVICE_FUNC +bool (isfinite)(const std::complex& x) +{ + using std::real; + using std::imag; + return isfinite(real(x)) && isfinite(imag(x)); +} + } // end namespace numext namespace internal { From 3c66bb136bf2adcb9d73d3d66850a8b907bc9264 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 14 Apr 2014 22:00:27 +0200 Subject: [PATCH 076/158] bug #793: detect NaN and INF in EigenSolver instead of aborting with an assert. 
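A minimal usage sketch of the behavior this enables; the input values are illustrative:

  #include <Eigen/Eigenvalues>
  #include <iostream>
  #include <limits>

  int main()
  {
    Eigen::MatrixXd a(2,2);
    a << 1.0, std::numeric_limits<double>::quiet_NaN(),
         0.0, 1.0;
    Eigen::EigenSolver<Eigen::MatrixXd> eig(a);
    // Instead of stopping in an internal assertion, the decomposition
    // now reports failure through info().
    if (eig.info() != Eigen::Success)
      std::cout << "eigendecomposition failed: NaN/INF input or overflow\n";
  }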
--- Eigen/src/Eigenvalues/EigenSolver.h | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Eigenvalues/EigenSolver.h b/Eigen/src/Eigenvalues/EigenSolver.h index 739466949..d2563d470 100644 --- a/Eigen/src/Eigenvalues/EigenSolver.h +++ b/Eigen/src/Eigenvalues/EigenSolver.h @@ -275,10 +275,11 @@ template class EigenSolver */ EigenSolver& compute(const MatrixType& matrix, bool computeEigenvectors = true); + /** \returns NumericalIssue if the input contains INF or NaN values or overflow occured. Returns Success otherwise. */ ComputationInfo info() const { eigen_assert(m_isInitialized && "EigenSolver is not initialized."); - return m_realSchur.info(); + return m_info; } /** \brief Sets the maximum number of iterations allowed. */ @@ -302,6 +303,7 @@ template class EigenSolver EigenvalueType m_eivalues; bool m_isInitialized; bool m_eigenvectorsOk; + ComputationInfo m_info; RealSchur m_realSchur; MatrixType m_matT; @@ -367,12 +369,15 @@ EigenSolver::compute(const MatrixType& matrix, bool computeEigenvect using std::sqrt; using std::abs; using std::max; + using numext::isfinite; eigen_assert(matrix.cols() == matrix.rows()); // Reduce to real Schur form. m_realSchur.compute(matrix, computeEigenvectors); + + m_info = m_realSchur.info(); - if (m_realSchur.info() == Success) + if (m_info == Success) { m_matT = m_realSchur.matrixT(); if (computeEigenvectors) @@ -386,6 +391,13 @@ EigenSolver::compute(const MatrixType& matrix, bool computeEigenvect if (i == matrix.cols() - 1 || m_matT.coeff(i+1, i) == Scalar(0)) { m_eivalues.coeffRef(i) = m_matT.coeff(i, i); + if(!isfinite(m_eivalues.coeffRef(i))) + { + m_isInitialized = true; + m_eigenvectorsOk = false; + m_info = NumericalIssue; + return *this; + } ++i; } else @@ -406,6 +418,13 @@ EigenSolver::compute(const MatrixType& matrix, bool computeEigenvect m_eivalues.coeffRef(i) = ComplexScalar(m_matT.coeff(i+1, i+1) + p, z); m_eivalues.coeffRef(i+1) = ComplexScalar(m_matT.coeff(i+1, i+1) + p, -z); + if(!(isfinite(m_eivalues.coeffRef(i)) && isfinite(m_eivalues.coeffRef(i+1)))) + { + m_isInitialized = true; + m_eigenvectorsOk = false; + m_info = NumericalIssue; + return *this; + } i += 2; } } @@ -594,7 +613,7 @@ void EigenSolver::doComputeEigenvectors() } else { - eigen_assert(0 && "Internal bug in EigenSolver"); // this should not happen + eigen_assert(0 && "Internal bug in EigenSolver (INF or NaN has not been detected)"); // this should not happen } } From 1afd50e0f31d56f7ee228e915e15998422c3ea11 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 14 Apr 2014 14:26:30 -0700 Subject: [PATCH 077/158] Fixed a typo in CXX11Meta.h --- unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h index d6b5d75d9..618e2eb7b 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h @@ -285,7 +285,7 @@ struct equal_op { template constexpr static inli struct not_equal_op { template constexpr static inline auto run(A a, B b) -> decltype(a != b) { return a != b; } }; struct lesser_op { template constexpr static inline auto run(A a, B b) -> decltype(a < b) { return a < b; } }; struct lesser_equal_op { template constexpr static inline auto run(A a, B b) -> decltype(a <= b) { return a <= b; } }; -struct greater_op { template constexpr static inline auto run(A a, B b) -> decltype(a < b) { return a < b; } 
}; +struct greater_op { template constexpr static inline auto run(A a, B b) -> decltype(a > b) { return a > b; } }; struct greater_equal_op { template constexpr static inline auto run(A a, B b) -> decltype(a >= b) { return a >= b; } }; /* generic unary operations */ From e0dbb68c2f17f3c8c6accc7dc0b2b8d544e2eebc Mon Sep 17 00:00:00 2001 From: Mark Borgerding Date: Tue, 15 Apr 2014 13:57:03 -0400 Subject: [PATCH 078/158] Check IMKL version for compatibility with Eigen --- Eigen/src/Core/util/MKL_support.h | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/MKL_support.h b/Eigen/src/Core/util/MKL_support.h index 1e6e355d6..8acca9c8c 100644 --- a/Eigen/src/Core/util/MKL_support.h +++ b/Eigen/src/Core/util/MKL_support.h @@ -54,8 +54,25 @@ #endif #if defined EIGEN_USE_MKL +# include +/*Check IMKL version for compatibility: < 10.3 is not usable with Eigen*/ +# ifndef INTEL_MKL_VERSION +# undef EIGEN_USE_MKL /* INTEL_MKL_VERSION is not even defined on older versions */ +# elif INTEL_MKL_VERSION < 100305 /* the intel-mkl-103-release-notes say this was when the lapacke.h interface was added*/ +# undef EIGEN_USE_MKL +# endif +# ifndef EIGEN_USE_MKL + /*If the MKL version is too old, undef everything*/ +# undef EIGEN_USE_MKL_ALL +# undef EIGEN_USE_BLAS +# undef EIGEN_USE_LAPACKE +# undef EIGEN_USE_MKL_VML +# undef EIGEN_USE_LAPACKE_STRICT +# undef EIGEN_USE_LAPACKE +# endif +#endif -#include +#if defined EIGEN_USE_MKL #include #define EIGEN_MKL_VML_THRESHOLD 128 From d5a795f67366db20a132cc70e4f0217f42372357 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 16 Apr 2014 17:05:11 +0200 Subject: [PATCH 079/158] New gebp kernel handling up to 3 packets x 4 register-level blocks. Huge speeup on Haswell. This changeset also introduce new vector functions: ploadquad and predux4. --- Eigen/src/Core/GenericPacketMath.h | 35 +- Eigen/src/Core/arch/AVX/Complex.h | 4 +- Eigen/src/Core/arch/AVX/PacketMath.h | 29 +- Eigen/src/Core/arch/AltiVec/PacketMath.h | 4 +- Eigen/src/Core/arch/NEON/Complex.h | 2 +- Eigen/src/Core/arch/NEON/PacketMath.h | 4 +- Eigen/src/Core/arch/SSE/Complex.h | 4 +- Eigen/src/Core/arch/SSE/PacketMath.h | 6 +- .../Core/products/GeneralBlockPanelKernel.h | 1430 +++++++++++------ Eigen/src/Core/products/GeneralMatrixMatrix.h | 121 +- Eigen/src/Core/products/Parallelizer.h | 25 +- .../Core/products/SelfadjointMatrixMatrix.h | 33 +- 12 files changed, 1064 insertions(+), 633 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 3e5db1a88..147298009 100755 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -161,14 +161,6 @@ pload(const typename unpacket_traits::type* from) { return *from; } template EIGEN_DEVICE_FUNC inline Packet ploadu(const typename unpacket_traits::type* from) { return *from; } -/** \internal \returns a packet with elements of \a *from duplicated. - * For instance, for a packet of 8 elements, 4 scalar will be read from \a *from and - * duplicated to form: {from[0],from[0],from[1],from[1],,from[2],from[2],,from[3],from[3]} - * Currently, this function is only used for scalar * complex products. 
- */ -template EIGEN_DEVICE_FUNC inline Packet -ploaddup(const typename unpacket_traits::type* from) { return *from; } - /** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ template EIGEN_DEVICE_FUNC inline Packet pset1(const typename unpacket_traits::type& a) { return a; } @@ -177,6 +169,24 @@ pset1(const typename unpacket_traits::type& a) { return a; } template EIGEN_DEVICE_FUNC inline Packet pload1(const typename unpacket_traits::type *a) { return pset1(*a); } +/** \internal \returns a packet with elements of \a *from duplicated. + * For instance, for a packet of 8 elements, 4 scalars will be read from \a *from and + * duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]} + * Currently, this function is only used for scalar * complex products. + */ +template EIGEN_DEVICE_FUNC inline Packet +ploaddup(const typename unpacket_traits::type* from) { return *from; } + +/** \internal \returns a packet with elements of \a *from quadrupled. + * For instance, for a packet of 8 elements, 2 scalars will be read from \a *from and + * replicated to form: {from[0],from[0],from[0],from[0],from[1],from[1],from[1],from[1]} + * Currently, this function is only used in matrix products. + * For packet-size smaller or equal to 4, this function is equivalent to pload1 + */ +template EIGEN_DEVICE_FUNC inline Packet +ploadquad(const typename unpacket_traits::type* from) +{ return pload1(from); } + /** \internal equivalent to * \code * a0 = pload1(a+0); @@ -249,6 +259,15 @@ preduxp(const Packet* vecs) { return vecs[0]; } template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux(const Packet& a) { return a; } +/** \internal \returns the sum of the elements of \a a by block of 4 elements. + * For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7} + * For packet-size smaller or equal to 4, this boils down to a noop. 
+ */ +template EIGEN_DEVICE_FUNC inline +typename conditional<(unpacket_traits::size%8)==0,typename unpacket_traits::half,Packet>::type +predux4(const Packet& a) +{ return a; } + /** \internal \returns the product of the elements of \a a*/ template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_mul(const Packet& a) { return a; } diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index cb16180c5..8f95a7be7 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -45,7 +45,7 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=4}; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=4}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet4cf padd(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf psub(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); } @@ -271,7 +271,7 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=2}; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=2}; typedef Packet1cd half; }; template<> EIGEN_STRONG_INLINE Packet2cd padd(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd psub(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); } diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 38f52ecc8..47e10f6da 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -83,9 +83,9 @@ template<> struct packet_traits : default_packet_traits }; */ -template<> struct unpacket_traits { typedef float type; enum {size=8}; }; -template<> struct unpacket_traits { typedef double type; enum {size=4}; }; -template<> struct unpacket_traits { typedef int type; enum {size=8}; }; +template<> struct unpacket_traits { typedef float type; typedef Packet4f half; enum {size=8}; }; +template<> struct unpacket_traits { typedef double type; typedef Packet2d half; enum {size=4}; }; +template<> struct unpacket_traits { typedef int type; typedef Packet4i half; enum {size=8}; }; template<> EIGEN_STRONG_INLINE Packet8f pset1(const float& from) { return _mm256_set1_ps(from); } template<> EIGEN_STRONG_INLINE Packet4d pset1(const double& from) { return _mm256_set1_pd(from); } @@ -141,7 +141,16 @@ template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& return _mm256_fmadd_ps(a,b,c); #endif } -template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) { return _mm256_fmadd_pd(a,b,c); } +template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) { +#if defined(__clang__) || defined(__GNUC__) + // see above + Packet4d res = c; + asm("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b)); + return res; +#else + return _mm256_fmadd_pd(a,b,c); +#endif +} #endif template<> EIGEN_STRONG_INLINE Packet8f pmin(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); } @@ -189,6 +198,13 @@ template<> EIGEN_STRONG_INLINE Packet4d ploaddup(const double* from) return _mm256_blend_pd(tmp1,_mm256_permute2f128_pd(tmp2,tmp2,1),12); } +// Loads 2 floats from memory a returns 
the packet {a0, a0 a0, a0, a1, a1, a1, a1} +template<> EIGEN_STRONG_INLINE Packet8f ploadquad(const float* from) +{ + Packet8f tmp = _mm256_castps128_ps256(_mm_broadcast_ss(from)); + return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from+1), 1); +} + template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet8f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); } template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); } template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); } @@ -345,6 +361,11 @@ template<> EIGEN_STRONG_INLINE double predux(const Packet4d& a) return pfirst(_mm256_hadd_pd(tmp0,tmp0)); } +template<> EIGEN_STRONG_INLINE Packet4f predux4(const Packet8f& a) +{ + return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1)); +} + template<> EIGEN_STRONG_INLINE float predux_mul(const Packet8f& a) { Packet8f tmp; diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 5d7a16f5c..16948264f 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -99,8 +99,8 @@ template<> struct packet_traits : default_packet_traits }; }; -template<> struct unpacket_traits { typedef float type; enum {size=4}; }; -template<> struct unpacket_traits { typedef int type; enum {size=4}; }; +template<> struct unpacket_traits { typedef float type; enum {size=4}; typedef Packet4f half; }; +template<> struct unpacket_traits { typedef int type; enum {size=4}; typedef Packet4i half; }; /* inline std::ostream & operator <<(std::ostream & s, const Packet4f & v) { diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index e49c1a873..7ca76714f 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -47,7 +47,7 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=2}; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=2}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) { diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index fae7b55fc..83150507a 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -101,8 +101,8 @@ EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); } #endif -template<> struct unpacket_traits { typedef float type; enum {size=4}; }; -template<> struct unpacket_traits { typedef int type; enum {size=4}; }; +template<> struct unpacket_traits { typedef float type; enum {size=4}; typedef Packet4f half; }; +template<> struct unpacket_traits { typedef int type; enum {size=4}; typedef Packet4i half; }; template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return vdupq_n_f32(from); } template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return vdupq_n_s32(from); } diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index e54ebbf90..715e5a13c 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -49,7 +49,7 @@ template<> struct packet_traits > : default_packet_traits }; #endif -template<> 
struct unpacket_traits { typedef std::complex type; enum {size=2}; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=2}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); } @@ -296,7 +296,7 @@ template<> struct packet_traits > : default_packet_traits }; #endif -template<> struct unpacket_traits { typedef std::complex type; enum {size=1}; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=1}; typedef Packet1cd half; }; template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); } diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index bc17726b4..89dfa6975 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -107,9 +107,9 @@ template<> struct packet_traits : default_packet_traits }; }; -template<> struct unpacket_traits { typedef float type; enum {size=4}; }; -template<> struct unpacket_traits { typedef double type; enum {size=2}; }; -template<> struct unpacket_traits { typedef int type; enum {size=4}; }; +template<> struct unpacket_traits { typedef float type; enum {size=4}; typedef Packet4f half; }; +template<> struct unpacket_traits { typedef double type; enum {size=2}; typedef Packet2d half; }; +template<> struct unpacket_traits { typedef int type; enum {size=4}; typedef Packet4i half; }; #if defined(_MSC_VER) && (_MSC_VER==1500) // Workaround MSVC 9 internal compiler error. diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index d9e659c9a..df30fdd3e 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -10,6 +10,12 @@ #ifndef EIGEN_GENERAL_BLOCK_PANEL_H #define EIGEN_GENERAL_BLOCK_PANEL_H +#ifdef USE_IACA +#include "iacaMarks.h" +#else +#define IACA_START +#define IACA_END +#endif namespace Eigen { @@ -92,12 +98,22 @@ void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) }; manage_caching_sizes(GetAction, &l1, &l2); - k = std::min(k, l1/kdiv); - SizeType _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0; - if(_m(k, l1/kdiv); +// SizeType _m = k>0 ? 
l2/(4 * sizeof(LhsScalar) * k) : 0; +// if(_m(k,240); + n = std::min(n,3840/sizeof(RhsScalar)); + m = std::min(m,3840/sizeof(RhsScalar)); +#else + k = std::min(k,24); + n = std::min(n,384/sizeof(RhsScalar)); + m = std::min(m,384/sizeof(RhsScalar)); +#endif } template @@ -164,11 +180,15 @@ public: NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, - // register block size along the N direction (must be either 4 or 8) - nr = NumberOfRegisters/2, + // register block size along the N direction (must be either 2 or 4) + nr = 4,//NumberOfRegisters/4, // register block size along the M direction (currently, this one cannot be modified) - mr = LhsPacketSize, +#ifdef __FMA__ + mr = 3*LhsPacketSize, +#else + mr = 2*LhsPacketSize, +#endif LhsProgress = LhsPacketSize, RhsProgress = 1 @@ -198,23 +218,32 @@ public: { pbroadcast2(b, b0, b1); } - - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + + template + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { - dest = pset1(*b); + dest = pset1(*b); + } + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const + { + dest = ploadquad(b); } - EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const + template + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const { - dest = pload(a); + dest = pload(a); } - EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + template + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { - dest = ploadu(a); + dest = ploadu(a); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, AccPacket& tmp) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const { // It would be a lot cleaner to call pmadd all the time. 
Unfortunately if we // let gcc allocate the register in which to store the result of the pmul @@ -232,6 +261,12 @@ public: { r = pmadd(c,alpha,r); } + + template + EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const + { + r = pmadd(c,alpha,r); + } protected: // conj_helper cj; @@ -281,6 +316,11 @@ public: { dest = pset1(*b); } + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const + { + dest = pset1(*b); + } EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { @@ -346,7 +386,23 @@ DoublePacket padd(const DoublePacket &a, const DoublePacket +const DoublePacket& predux4(const DoublePacket &a) +{ + return a; +} + +template struct unpacket_traits > { typedef DoublePacket half; }; +// template +// DoublePacket pmadd(const DoublePacket &a, const DoublePacket &b) +// { +// DoublePacket res; +// res.first = padd(a.first, b.first); +// res.second = padd(a.second,b.second); +// return res; +// } + template class gebp_traits, std::complex, _ConjLhs, _ConjRhs > { @@ -404,6 +460,16 @@ public: dest.second = pset1(imag(*b)); } + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const + { + loadRhs(b,dest); + } + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const + { + eigen_internal_assert(unpacket_traits::size<=4); + loadRhs(b,dest); + } + // linking error if instantiated without being optimized out: void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3); @@ -619,26 +685,246 @@ void gebp_kernel if(strideA==-1) strideA = depth; if(strideB==-1) strideB = depth; conj_helper cj; - Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0; - // Here we assume that mr==LhsProgress - const Index peeled_mc = (rows/mr)*mr; + const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0; + const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0; + const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0; enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell) const Index peeled_kc = depth & ~(pk-1); - const Index depth2 = depth & ~1; - - // loops on each micro vertical panel of rhs (depth x nr) - // First pass using depth x 8 panels - if(nr>=8) +// const Index depth2 = depth & ~1; + +// std::cout << mr << " " << peeled_mc3 << " " << peeled_mc2 << " " << peeled_mc1 << "\n"; + + + //---------- Process 3 * LhsProgress rows at once ---------- + // This corresponds to 3*LhsProgress x nr register blocks. + // Usually, make sense only with FMA + if(mr>=3*Traits::LhsProgress) { - for(Index j2=0; j2 we select a mr x nr micro block of res which is entirely - // stored into mr/packet_size x nr registers. 
- for(Index i=0; i(alpha); + + R0 = ploadu(r0+0*Traits::ResPacketSize); + R1 = ploadu(r0+1*Traits::ResPacketSize); + R2 = ploadu(r0+2*Traits::ResPacketSize); + traits.acc(C0, alphav, R0); + traits.acc(C4, alphav, R1); + traits.acc(C8, alphav, R2); + pstoreu(r0+0*Traits::ResPacketSize, R0); + pstoreu(r0+1*Traits::ResPacketSize, R1); + pstoreu(r0+2*Traits::ResPacketSize, R2); + + R0 = ploadu(r1+0*Traits::ResPacketSize); + R1 = ploadu(r1+1*Traits::ResPacketSize); + R2 = ploadu(r1+2*Traits::ResPacketSize); + traits.acc(C1, alphav, R0); + traits.acc(C5, alphav, R1); + traits.acc(C9, alphav, R2); + pstoreu(r1+0*Traits::ResPacketSize, R0); + pstoreu(r1+1*Traits::ResPacketSize, R1); + pstoreu(r1+2*Traits::ResPacketSize, R2); + + R0 = ploadu(r2+0*Traits::ResPacketSize); + R1 = ploadu(r2+1*Traits::ResPacketSize); + R2 = ploadu(r2+2*Traits::ResPacketSize); + traits.acc(C2, alphav, R0); + traits.acc(C6, alphav, R1); + traits.acc(C10, alphav, R2); + pstoreu(r2+0*Traits::ResPacketSize, R0); + pstoreu(r2+1*Traits::ResPacketSize, R1); + pstoreu(r2+2*Traits::ResPacketSize, R2); + + R0 = ploadu(r3+0*Traits::ResPacketSize); + R1 = ploadu(r3+1*Traits::ResPacketSize); + R2 = ploadu(r3+2*Traits::ResPacketSize); + traits.acc(C3, alphav, R0); + traits.acc(C7, alphav, R1); + traits.acc(C11, alphav, R2); + pstoreu(r3+0*Traits::ResPacketSize, R0); + pstoreu(r3+1*Traits::ResPacketSize, R1); + pstoreu(r3+2*Traits::ResPacketSize, R2); + } + + // Deal with remaining columns of the rhs + for(Index j2=packet_cols4; j2(alpha); + + R0 = ploadu(r0+0*Traits::ResPacketSize); + R1 = ploadu(r0+1*Traits::ResPacketSize); + R2 = ploadu(r0+2*Traits::ResPacketSize); + traits.acc(C0, alphav, R0); + traits.acc(C4, alphav, R1); + traits.acc(C8 , alphav, R2); + pstoreu(r0+0*Traits::ResPacketSize, R0); + pstoreu(r0+1*Traits::ResPacketSize, R1); + pstoreu(r0+2*Traits::ResPacketSize, R2); + } + } + } + + //---------- Process 2 * LhsProgress rows at once ---------- + if(mr>=2*Traits::LhsProgress) + { + // loops on each largest micro horizontal panel of lhs (2*LhsProgress x depth) + for(Index i=peeled_mc3; i traits.initAcc(C7); ResScalar* r0 = &res[(j2+0)*resStride + i]; + ResScalar* r1 = &res[(j2+1)*resStride + i]; + ResScalar* r2 = &res[(j2+2)*resStride + i]; + ResScalar* r3 = &res[(j2+3)*resStride + i]; + + internal::prefetch(r0); + internal::prefetch(r1); + internal::prefetch(r2); + internal::prefetch(r3); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; - LhsPacket A0; - // uncomment for register prefetching - // LhsPacket A1; - // traits.loadLhs(blA, A0); + prefetch(&blB[0]); + LhsPacket A0, A1; + for(Index k=0; k(alpha); - - R0 = ploadu(r0+0*resStride); - R1 = ploadu(r0+1*resStride); - R2 = ploadu(r0+2*resStride); - R3 = ploadu(r0+3*resStride); - R4 = ploadu(r0+4*resStride); - R5 = ploadu(r0+5*resStride); - R6 = ploadu(r0+6*resStride); - traits.acc(C0, alphav, R0); - pstoreu(r0+0*resStride, R0); - R0 = ploadu(r0+7*resStride); - - traits.acc(C1, alphav, R1); - traits.acc(C2, alphav, R2); - traits.acc(C3, alphav, R3); - traits.acc(C4, alphav, R4); - traits.acc(C5, alphav, R5); - traits.acc(C6, alphav, R6); - traits.acc(C7, alphav, R0); - pstoreu(r0+1*resStride, R1); - pstoreu(r0+2*resStride, R2); - pstoreu(r0+3*resStride, R3); - pstoreu(r0+4*resStride, R4); - pstoreu(r0+5*resStride, R5); - pstoreu(r0+6*resStride, R6); - pstoreu(r0+7*resStride, R0); + R0 = ploadu(r0+0*Traits::ResPacketSize); + R1 = ploadu(r0+1*Traits::ResPacketSize); + R2 = ploadu(r1+0*Traits::ResPacketSize); + R3 = 
ploadu(r1+1*Traits::ResPacketSize); + traits.acc(C0, alphav, R0); + traits.acc(C4, alphav, R1); + traits.acc(C1, alphav, R2); + traits.acc(C5, alphav, R3); + pstoreu(r0+0*Traits::ResPacketSize, R0); + pstoreu(r0+1*Traits::ResPacketSize, R1); + pstoreu(r1+0*Traits::ResPacketSize, R2); + pstoreu(r1+1*Traits::ResPacketSize, R3); + + R0 = ploadu(r2+0*Traits::ResPacketSize); + R1 = ploadu(r2+1*Traits::ResPacketSize); + R2 = ploadu(r3+0*Traits::ResPacketSize); + R3 = ploadu(r3+1*Traits::ResPacketSize); + traits.acc(C2, alphav, R0); + traits.acc(C6, alphav, R1); + traits.acc(C3, alphav, R2); + traits.acc(C7, alphav, R3); + pstoreu(r2+0*Traits::ResPacketSize, R0); + pstoreu(r2+1*Traits::ResPacketSize, R1); + pstoreu(r3+0*Traits::ResPacketSize, R2); + pstoreu(r3+1*Traits::ResPacketSize, R3); } - // Deal with remaining rows of the lhs - // TODO we should vectorize if <= 8, and not strictly == - if(SwappedTraits::LhsProgress == 8) + // Deal with remaining columns of the rhs + for(Index j2=packet_cols4; j2 SwappedTraits; - typedef typename SwappedTraits::ResScalar SResScalar; - typedef typename SwappedTraits::LhsPacket SLhsPacket; - typedef typename SwappedTraits::RhsPacket SRhsPacket; - typedef typename SwappedTraits::ResPacket SResPacket; - typedef typename SwappedTraits::AccPacket SAccPacket; - SwappedTraits straits; - - Index rows2 = (rows & ~1); - for(Index i=peeled_mc; i(&res[j2*resStride + i], resStride); - SResPacket alphav = pset1(alpha); - straits.acc(padd(C0,C1), alphav, R); - pscatter(&res[j2*resStride + i], R, resStride); - - R = pgather(&res[j2*resStride + i + 1], resStride); - straits.acc(padd(C2,C3), alphav, R); - pscatter(&res[j2*resStride + i + 1], R, resStride); - - EIGEN_ASM_COMMENT("end_vectorized_multiplication_of_last_rows 8"); - } - if(rows2!=rows) - { - Index i = rows-1; - const LhsScalar* blA = &blockA[i*strideA+offsetA]; - prefetch(&blA[0]); - const RhsScalar* blB = &blockB[j2*strideB+offsetB*8]; - - EIGEN_ASM_COMMENT("begin_vectorized_multiplication_of_last_rows 8"); - - SAccPacket C0,C1; - straits.initAcc(C0); // even - straits.initAcc(C1); // odd - - for(Index k=0; k(&res[j2*resStride + i], resStride); - SResPacket alphav = pset1(alpha); - straits.acc(padd(C0,C1), alphav, R); - pscatter(&res[j2*resStride + i], R, resStride); - } - } - else - { - // Pure scalar path - for(Index i=peeled_mc; i(alpha); + + R0 = ploadu(r0+0*Traits::ResPacketSize); + R1 = ploadu(r0+1*Traits::ResPacketSize); + traits.acc(C0, alphav, R0); + traits.acc(C4, alphav, R1); + pstoreu(r0+0*Traits::ResPacketSize, R0); + pstoreu(r0+1*Traits::ResPacketSize, R1); } } } - - // Second pass using depth x 4 panels - // If nr==8, then we have at most one such panel - // TODO: with 16 registers, we coud optimize this part to leverage more pipelinining, - // for instance, by using a 2 packet * 4 kernel. Useful when the rhs is thin - if(nr>=4) + //---------- Process 1 * LhsProgress rows at once ---------- + if(mr>=1*Traits::LhsProgress) { - for(Index j2=packet_cols8; j2 we select a mr x 4 micro block of res which is entirely - // stored into mr/packet_size x 4 registers. 
- for(Index i=0; i traits.initAcc(C3); ResScalar* r0 = &res[(j2+0)*resStride + i]; + ResScalar* r1 = &res[(j2+1)*resStride + i]; + ResScalar* r2 = &res[(j2+2)*resStride + i]; + ResScalar* r3 = &res[(j2+3)*resStride + i]; + + internal::prefetch(r0); + internal::prefetch(r1); + internal::prefetch(r2); + internal::prefetch(r3); // performs "inner" products - const RhsScalar* blB = &blockB[j2*strideB+offsetB*4]; + const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; + prefetch(&blB[0]); LhsPacket A0; + for(Index k=0; k(alpha); - - R0 = ploadu(r0+0*resStride); - R1 = ploadu(r0+1*resStride); - R2 = ploadu(r0+2*resStride); - traits.acc(C0, alphav, R0); - pstoreu(r0+0*resStride, R0); - R0 = ploadu(r0+3*resStride); - - traits.acc(C1, alphav, R1); - traits.acc(C2, alphav, R2); - traits.acc(C3, alphav, R0); - pstoreu(r0+1*resStride, R1); - pstoreu(r0+2*resStride, R2); - pstoreu(r0+3*resStride, R0); + R0 = ploadu(r0+0*Traits::ResPacketSize); + R1 = ploadu(r1+0*Traits::ResPacketSize); + traits.acc(C0, alphav, R0); + traits.acc(C1, alphav, R1); + pstoreu(r0+0*Traits::ResPacketSize, R0); + pstoreu(r1+0*Traits::ResPacketSize, R1); + + R0 = ploadu(r2+0*Traits::ResPacketSize); + R1 = ploadu(r3+0*Traits::ResPacketSize); + traits.acc(C2, alphav, R0); + traits.acc(C3, alphav, R1); + pstoreu(r2+0*Traits::ResPacketSize, R0); + pstoreu(r3+0*Traits::ResPacketSize, R1); } - for(Index i=peeled_mc; i(alpha); + R0 = ploadu(r0+0*Traits::ResPacketSize); + traits.acc(C0, alphav, R0); + pstoreu(r0+0*Traits::ResPacketSize, R0); + } + } + } + //---------- Process remaining rows, 1 at once ---------- + { + // loop on each row of the lhs (1*LhsProgress x depth) + for(Index i=peeled_mc1; i::half,SResPacket>::type SResPacketHalf; + typedef typename conditional::half,SLhsPacket>::type SLhsPacketHalf; + typedef typename conditional::half,SRhsPacket>::type SRhsPacketHalf; + typedef typename conditional::half,SAccPacket>::type SAccPacketHalf; + + SResPacketHalf R = pgather(&res[j2*resStride + i], resStride); + SResPacketHalf alphav = pset1(alpha); + + if(depth-endk>0) + { + // We have to handle the last row of the rhs which corresponds to a half-packet + SLhsPacketHalf a0; + SRhsPacketHalf b0; + straits.loadLhsUnaligned(blB, a0); + straits.loadRhs(blA, b0); + SAccPacketHalf c0 = predux4(C0); + straits.madd(a0,b0,c0,b0); + straits.acc(c0, alphav, R); + } + else + { + straits.acc(predux4(C0), alphav, R); + } + pscatter(&res[j2*resStride + i], R, resStride); + } + else + { + SResPacket R = pgather(&res[j2*resStride + i], resStride); + SResPacket alphav = pset1(alpha); + straits.acc(C0, alphav, R); + pscatter(&res[j2*resStride + i], R, resStride); } - SResPacket R = pgather(&res[j2*resStride + i], resStride); - SResPacket alphav = pset1(alpha); - straits.acc(C0, alphav, R); - pscatter(&res[j2*resStride + i], R, resStride); - - EIGEN_ASM_COMMENT("end_vectorized_multiplication_of_last_rows 1x4"); } - else + else // scalar path { - // Pure scalar path - // gets a 1 x 4 res block as registers + // get a 1 x 4 res block as registers ResScalar C0(0), C1(0), C2(0), C3(0); for(Index k=0; k B_1 = blB[3]; MADD(cj,A0,B_0,C2, B_0); MADD(cj,A0,B_1,C3, B_1); - + blB += 4; } res[(j2+0)*resStride + i] += alpha*C0; @@ -1058,56 +1377,22 @@ void gebp_kernel res[(j2+3)*resStride + i] += alpha*C3; } } - } - } - - // process remaining rhs/res columns one at a time - for(Index j2=packet_cols4; j2(alpha); - ResScalar* r0 = &res[(j2+0)*resStride + i]; - R0 = ploadu(r0); - traits.acc(C0, alphav, R0); - pstoreu(r0, R0); - } - // pure scalar path - for(Index 
i=peeled_mc; i // // 32 33 34 35 ... // 36 36 38 39 ... -template -struct gemm_pack_lhs +template +struct gemm_pack_lhs { EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0); }; -template -EIGEN_DONT_INLINE void gemm_pack_lhs +template +EIGEN_DONT_INLINE void gemm_pack_lhs ::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset) { typedef typename packet_traits::type Packet; @@ -1146,87 +1431,174 @@ EIGEN_DONT_INLINE void gemm_pack_lhs=depth && offset<=stride)); - eigen_assert( (StorageOrder==RowMajor) || ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) ); + eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) ); conj_if::IsComplex && Conjugate> cj; - const_blas_data_mapper lhs(_lhs,lhsStride); + const_blas_data_mapper lhs(_lhs,lhsStride); Index count = 0; - Index peeled_mc = (rows/Pack1)*Pack1; - for(Index i=0; i=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; + const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; + const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; + + // Pack 3 packets + if(Pack1>=3*PacketSize) { - if(PanelMode) count += Pack1 * offset; - - if(StorageOrder==ColMajor) + for(Index i=0; i=1*PacketSize) A = ploadu(&lhs(i+0*PacketSize, k)); - if(Pack1>=2*PacketSize) B = ploadu(&lhs(i+1*PacketSize, k)); - if(Pack1>=3*PacketSize) C = ploadu(&lhs(i+2*PacketSize, k)); - if(Pack1>=4*PacketSize) D = ploadu(&lhs(i+3*PacketSize, k)); - if(Pack1>=1*PacketSize) { pstore(blockA+count, cj.pconj(A)); count+=PacketSize; } - if(Pack1>=2*PacketSize) { pstore(blockA+count, cj.pconj(B)); count+=PacketSize; } - if(Pack1>=3*PacketSize) { pstore(blockA+count, cj.pconj(C)); count+=PacketSize; } - if(Pack1>=4*PacketSize) { pstore(blockA+count, cj.pconj(D)); count+=PacketSize; } - } - else - { - if(Pack1>=1) blockA[count++] = cj(lhs(i+0, k)); - if(Pack1>=2) blockA[count++] = cj(lhs(i+1, k)); - if(Pack1>=3) blockA[count++] = cj(lhs(i+2, k)); - if(Pack1>=4) blockA[count++] = cj(lhs(i+3, k)); - } + Packet A, B, C; + A = ploadu(&lhs(i+0*PacketSize, k)); + B = ploadu(&lhs(i+1*PacketSize, k)); + C = ploadu(&lhs(i+2*PacketSize, k)); + pstore(blockA+count, cj.pconj(A)); count+=PacketSize; + pstore(blockA+count, cj.pconj(B)); count+=PacketSize; + pstore(blockA+count, cj.pconj(C)); count+=PacketSize; } + if(PanelMode) count += (3*PacketSize) * (stride-offset-depth); } - else + } + // Pack 2 packets + if(Pack1>=2*PacketSize) + { + for(Index i=peeled_mc3; i(&lhs(i+0*PacketSize, k)); + B = ploadu(&lhs(i+1*PacketSize, k)); + pstore(blockA+count, cj.pconj(A)); count+=PacketSize; + pstore(blockA+count, cj.pconj(B)); count+=PacketSize; + } + if(PanelMode) count += (2*PacketSize) * (stride-offset-depth); + } + } + // Pack 1 packets + if(Pack1>=1*PacketSize) + { + for(Index i=peeled_mc2; i(&lhs(i+0*PacketSize, k)); + pstore(blockA+count, cj.pconj(A)); + count+=PacketSize; + } + if(PanelMode) count += (1*PacketSize) * (stride-offset-depth); + } + } + // Pack scalars +// if(rows-peeled_mc>=Pack2) +// { +// if(PanelMode) count += Pack2*offset; +// for(Index k=0; k +struct gemm_pack_lhs +{ + EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0); +}; + +template +EIGEN_DONT_INLINE void gemm_pack_lhs 
+ ::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset) +{ + typedef typename packet_traits::type Packet; + enum { PacketSize = packet_traits::size }; + + EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); + EIGEN_UNUSED_VARIABLE(stride); + EIGEN_UNUSED_VARIABLE(offset); + eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); + conj_if::IsComplex && Conjugate> cj; + const_blas_data_mapper lhs(_lhs,lhsStride); + Index count = 0; + +// const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; +// const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; +// const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; + + int pack_packets = Pack1/PacketSize; + Index i = 0; + while(pack_packets>0) + { + + Index remaining_rows = rows-i; + Index peeled_mc = i+(remaining_rows/(pack_packets*PacketSize))*(pack_packets*PacketSize); +// std::cout << "pack_packets = " << pack_packets << " from " << i << " to " << peeled_mc << "\n"; + for(; i kernel; - for (int p = 0; p < PacketSize; ++p) { - kernel.packet[p] = ploadu(&lhs(i+p+m, k)); - } + for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = ploadu(&lhs(i+p+m, k)); ptranspose(kernel); - for (int p = 0; p < PacketSize; ++p) { - pstore(blockA+count+m+Pack1*p, cj.pconj(kernel.packet[p])); - } + for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack_packets*PacketSize)*p, cj.pconj(kernel.packet[p])); } - count += PacketSize*Pack1; + count += PacketSize*(pack_packets*PacketSize); } - for(; k=Pack2) - { - if(PanelMode) count += Pack2*offset; - for(Index k=0; k=Pack2) +// { +// if(PanelMode) count += Pack2*offset; +// for(Index k=0; k=4 ? (cols/4) * 4 : 0; Index count = 0; const Index peeled_k = (depth/PacketSize)*PacketSize; - if(nr>=8) - { - for(Index j2=0; j2 kernel; - for (int p = 0; p < PacketSize; ++p) { - kernel.packet[p] = ploadu(&rhs[(j2+p)*rhsStride+k]); - } - ptranspose(kernel); - for (int p = 0; p < PacketSize; ++p) { - pstoreu(blockB+count, cj.pconj(kernel.packet[p])); - count+=PacketSize; - } - } - } - for(; k=8) +// { +// for(Index j2=0; j2 kernel; +// for (int p = 0; p < PacketSize; ++p) { +// kernel.packet[p] = ploadu(&rhs[(j2+p)*rhsStride+k]); +// } +// ptranspose(kernel); +// for (int p = 0; p < PacketSize; ++p) { +// pstoreu(blockB+count, cj.pconj(kernel.packet[p])); +// count+=PacketSize; +// } +// } +// } +// for(; k=4) { @@ -1383,39 +1755,39 @@ EIGEN_DONT_INLINE void gemm_pack_rhs=4 ? 
(cols/4) * 4 : 0; Index count = 0; - if(nr>=8) - { - for(Index j2=0; j2(&rhs[k*rhsStride + j2]); - pstoreu(blockB+count, cj.pconj(A)); - } else if (PacketSize==4) { - Packet A = ploadu(&rhs[k*rhsStride + j2]); - Packet B = ploadu(&rhs[k*rhsStride + j2 + PacketSize]); - pstoreu(blockB+count, cj.pconj(A)); - pstoreu(blockB+count+PacketSize, cj.pconj(B)); - } else { - const Scalar* b0 = &rhs[k*rhsStride + j2]; - blockB[count+0] = cj(b0[0]); - blockB[count+1] = cj(b0[1]); - blockB[count+2] = cj(b0[2]); - blockB[count+3] = cj(b0[3]); - blockB[count+4] = cj(b0[4]); - blockB[count+5] = cj(b0[5]); - blockB[count+6] = cj(b0[6]); - blockB[count+7] = cj(b0[7]); - } - count += 8; - } - // skip what we have after - if(PanelMode) count += 8 * (stride-offset-depth); - } - } +// if(nr>=8) +// { +// for(Index j2=0; j2(&rhs[k*rhsStride + j2]); +// pstoreu(blockB+count, cj.pconj(A)); +// } else if (PacketSize==4) { +// Packet A = ploadu(&rhs[k*rhsStride + j2]); +// Packet B = ploadu(&rhs[k*rhsStride + j2 + PacketSize]); +// pstoreu(blockB+count, cj.pconj(A)); +// pstoreu(blockB+count+PacketSize, cj.pconj(B)); +// } else { +// const Scalar* b0 = &rhs[k*rhsStride + j2]; +// blockB[count+0] = cj(b0[0]); +// blockB[count+1] = cj(b0[1]); +// blockB[count+2] = cj(b0[2]); +// blockB[count+3] = cj(b0[3]); +// blockB[count+4] = cj(b0[4]); +// blockB[count+5] = cj(b0[5]); +// blockB[count+6] = cj(b0[6]); +// blockB[count+7] = cj(b0[7]); +// } +// count += 8; +// } +// // skip what we have after +// if(PanelMode) count += 8 * (stride-offset-depth); +// } +// } if(nr>=4) { for(Index j2=packet_cols8; j2 struct general_matrix_matrix_product { + typedef gebp_traits Traits; + typedef typename scalar_product_traits::ReturnType ResScalar; static EIGEN_STRONG_INLINE void run( Index rows, Index cols, Index depth, @@ -51,6 +53,8 @@ template< struct general_matrix_matrix_product { +typedef gebp_traits Traits; + typedef typename scalar_product_traits::ReturnType ResScalar; static void run(Index rows, Index cols, Index depth, const LhsScalar* _lhs, Index lhsStride, @@ -63,11 +67,9 @@ static void run(Index rows, Index cols, Index depth, const_blas_data_mapper lhs(_lhs,lhsStride); const_blas_data_mapper rhs(_rhs,rhsStride); - typedef gebp_traits Traits; - Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction - //Index nc = blocking.nc(); // cache block size along the N direction + Index nc = (std::min)(cols,blocking.nc()); // cache block size along the N direction gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; @@ -80,66 +82,68 @@ static void run(Index rows, Index cols, Index depth, Index tid = omp_get_thread_num(); Index threads = omp_get_num_threads(); - std::size_t sizeA = kc*mc; - ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, 0); + LhsScalar* blockA = blocking.blockA(); + eigen_internal_assert(blockA!=0); - RhsScalar* blockB = blocking.blockB(); - eigen_internal_assert(blockB!=0); - + std::size_t sizeB = kc*nc; + ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, 0); + // For each horizontal panel of the rhs, and corresponding vertical panel of the lhs... for(Index k=0; k rows of B', and cols of the A' // In order to reduce the chance that a thread has to wait for the other, - // let's start by packing A'. - pack_lhs(blockA, &lhs(0,k), lhsStride, actual_kc, mc); + // let's start by packing B'. 
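// ---------------------------------------------------------------------
// [Editor's illustration -- not part of the patch] A minimal, self-contained
// sketch of the producer/consumer protocol described in the comments above,
// assuming one packed slice per OpenMP thread. `SliceInfo` is a hypothetical
// stand-in for Eigen's GemmParallelInfo; the volatile spin counters mirror
// the style of the patched code (production code would use real atomics).
#include <omp.h>
#include <vector>
#include <cstdio>

struct SliceInfo {
  int volatile sync;   // last epoch for which this thread's slice is ready
  int volatile users;  // readers still holding the previous slice
  SliceInfo() : sync(-1), users(0) {}
};

int main() {
  const int threads = omp_get_max_threads();
  std::vector<SliceInfo> info(threads);
  std::vector<double> slice(threads, 0.0);   // stand-in for the packed A'_i
  double sum = 0;
  #pragma omp parallel num_threads(threads) reduction(+:sum)
  {
    const int tid = omp_get_thread_num();
    for (int epoch = 0; epoch < 4; ++epoch)  // one epoch per k-panel
    {
      while (info[tid].users != 0) {}        // wait: old slice still in use
      info[tid].users += threads;            // every thread will read it once
      slice[tid] = tid + 10.0 * epoch;       // "pack" this thread's slice A'_i
      info[tid].sync = epoch;                // publish: A'_i ready for this epoch
      // Each thread consumes every slice (own slice first) to exercise the
      // protocol; in the real kernel each thread updates its own C block.
      for (int s = 0; s < threads; ++s)
      {
        const int i = (tid + s) % threads;
        if (i != tid)
          while (info[i].sync != epoch) {}   // spin until slice i is published
        sum += slice[i];                     // "gebp" using slice i
        #pragma omp atomic
        info[i].users -= 1;                  // release slice i
      }
    }
  }
  std::printf("checksum: %g\n", sum);
  return 0;
}
// ---------------------------------------------------------------------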
+ pack_rhs(blockB, &rhs(k,0), rhsStride, actual_kc, nc); - // Pack B_k to B' in a parallel fashion: - // each thread packs the sub block B_k,j to B'_j where j is the thread id. + // Pack A_k to A' in a parallel fashion: + // each thread packs the sub block A_k,i to A'_i where i is the thread id. - // However, before copying to B'_j, we have to make sure that no other thread is still using it, + // However, before copying to A'_i, we have to make sure that no other thread is still using it, // i.e., we test that info[tid].users equals 0. // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it. while(info[tid].users!=0) {} info[tid].users += threads; + + pack_lhs(blockA+info[tid].lhs_start*actual_kc, &lhs(info[tid].lhs_start,k), lhsStride, actual_kc, info[tid].lhs_length); - pack_rhs(blockB+info[tid].rhs_start*actual_kc, &rhs(k,info[tid].rhs_start), rhsStride, actual_kc, info[tid].rhs_length); - - // Notify the other threads that the part B'_j is ready to go. + // Notify the other threads that the part A'_i is ready to go. info[tid].sync = k; - - // Computes C_i += A' * B' per B'_j + + // Computes C_i += A' * B' per A'_i for(Index shift=0; shift0) - while(info[j].sync!=k) {} - - gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*actual_kc, mc, actual_kc, info[j].rhs_length, alpha, -1,-1,0,0); + while(info[i].sync!=k) {} + gebp(res+info[i].lhs_start, resStride, blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha); } - // Then keep going as usual with the remaining A' - for(Index i=mc; i Pack rhs's panel into a sequential chunk of memory (L2 caching) - // Note that this panel will be read as many times as the number of blocks in the lhs's - // vertical panel which is, in practice, a very low number. - pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols); + // => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching) + // Note that this panel will be read as many times as the number of blocks in the rhs's + // horizontal panel which is, in practice, a very low number. + pack_lhs(blockA, &lhs(0,k2), lhsStride, actual_kc, rows); - // For each mc x kc block of the lhs's vertical panel... - // (==GEPP_VAR1) - for(Index i2=0; i2 > template struct gemm_functor { - gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha, - BlockingType& blocking) + gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha, BlockingType& blocking) : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha), m_blocking(blocking) {} void initParallelSession() const { - m_blocking.allocateB(); + m_blocking.allocateA(); } void operator() (Index row, Index rows, Index col=0, Index cols=-1, GemmParallelInfo* info=0) const @@ -220,6 +221,8 @@ struct gemm_functor (Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(), m_actualAlpha, m_blocking, info); } + + typedef typename Gemm::Traits Traits; protected: const Lhs& m_lhs; @@ -316,13 +319,23 @@ class gemm_blocking_spacem_mc = Transpose ? cols : rows; this->m_nc = Transpose ? 
rows : cols; this->m_kc = depth; - computeProductBlockingSizes(this->m_kc, this->m_mc, this->m_nc); + if(full_rows) + { + DenseIndex m = this->m_mc; + computeProductBlockingSizes(this->m_kc, m, this->m_nc); + } + else // full columns + { + DenseIndex n = this->m_nc; + computeProductBlockingSizes(this->m_kc, this->m_mc, n); + } + m_sizeA = this->m_mc * this->m_kc; m_sizeB = this->m_kc * this->m_nc; } @@ -396,7 +409,7 @@ class GeneralProduct (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>, _ActualLhsType, _ActualRhsType, Dest, BlockingType> GemmFunctor; - BlockingType blocking(dst.rows(), dst.cols(), lhs.cols()); + BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), true); internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), this->rows(), this->cols(), Dest::Flags&RowMajorBit); } diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h index 5c3e9b7ac..4079063eb 100644 --- a/Eigen/src/Core/products/Parallelizer.h +++ b/Eigen/src/Core/products/Parallelizer.h @@ -73,13 +73,13 @@ namespace internal { template struct GemmParallelInfo { - GemmParallelInfo() : sync(-1), users(0), rhs_start(0), rhs_length(0) {} + GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {} int volatile sync; int volatile users; - Index rhs_start; - Index rhs_length; + Index lhs_start; + Index lhs_length; }; template @@ -107,7 +107,7 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos if((!Condition) || (omp_get_num_threads()>1)) return func(0,rows, 0,cols); - Index size = transpose ? cols : rows; + Index size = transpose ? rows : cols; // 2- compute the maximal number of threads from the size of the product: // FIXME this has to be fine tuned @@ -126,26 +126,25 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos std::swap(rows,cols); Index blockCols = (cols / threads) & ~Index(0x3); - Index blockRows = (rows / threads) & ~Index(0x7); + Index blockRows = (rows / threads); + blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr; GemmParallelInfo* info = new GemmParallelInfo[threads]; - #pragma omp parallel for schedule(static,1) num_threads(threads) - for(Index i=0; i +template struct symm_pack_lhs { template inline @@ -45,22 +45,29 @@ struct symm_pack_lhs } void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows) { + enum { PacketSize = packet_traits::size }; const_blas_data_mapper lhs(_lhs,lhsStride); Index count = 0; - Index peeled_mc = (rows/Pack1)*Pack1; - for(Index i=0; i(blockA, lhs, cols, i, count); - } - - if(rows-peeled_mc>=Pack2) - { - pack(blockA, lhs, cols, peeled_mc, count); - peeled_mc += Pack2; - } + //Index peeled_mc3 = (rows/Pack1)*Pack1; + + const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; + const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; + const Index peeled_mc1 = Pack1>=1*PacketSize ? 
(rows/(1*PacketSize))*(1*PacketSize) : 0; + + if(Pack1>=3*PacketSize) + for(Index i=0; i(blockA, lhs, cols, i, count); + + if(Pack1>=2*PacketSize) + for(Index i=peeled_mc3; i(blockA, lhs, cols, i, count); + + if(Pack1>=1*PacketSize) + for(Index i=peeled_mc2; i(blockA, lhs, cols, i, count); // do the same with mr==1 - for(Index i=peeled_mc; i Date: Wed, 16 Apr 2014 23:14:58 +0200 Subject: [PATCH 080/158] Enable alloca on MAC OSX --- Eigen/src/Core/util/Memory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index a4d7e454d..34b387ded 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -552,7 +552,7 @@ template struct smart_memmove_helper { // you can overwrite Eigen's default behavior regarding alloca by defining EIGEN_ALLOCA // to the appropriate stack allocation function #ifndef EIGEN_ALLOCA - #if (defined __linux__) + #if (defined __linux__) || (defined __APPLE__) #define EIGEN_ALLOCA alloca #elif defined(_MSC_VER) #define EIGEN_ALLOCA _alloca From d936ddc3d129162f096d8eb39084ea1a47ff6f29 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 16 Apr 2014 23:15:42 +0200 Subject: [PATCH 081/158] Fallback to lazy products for very small ones. --- Eigen/src/Core/products/GeneralMatrixMatrix.h | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index d06e0f808..38ca7b242 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -386,7 +386,37 @@ class GeneralProduct typedef internal::scalar_product_op BinOp; EIGEN_CHECK_BINARY_COMPATIBILIY(BinOp,LhsScalar,RhsScalar); } + + template + inline void evalTo(Dest& dst) const + { + if((m_rhs.rows()+dst.rows()+dst.cols())<20) + dst.noalias() = m_lhs .lazyProduct( m_rhs ); + else + { + dst.setZero(); + scaleAndAddTo(dst,Scalar(1)); + } + } + template + inline void addTo(Dest& dst) const + { + if((m_rhs.rows()+dst.rows()+dst.cols())<20) + dst.noalias() += m_lhs .lazyProduct( m_rhs ); + else + scaleAndAddTo(dst,Scalar(1)); + } + + template + inline void subTo(Dest& dst) const + { + if((m_rhs.rows()+dst.rows()+dst.cols())<20) + dst.noalias() -= m_lhs .lazyProduct( m_rhs ); + else + scaleAndAddTo(dst,Scalar(-1)); + } + template void scaleAndAddTo(Dest& dst, const Scalar& alpha) const { eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols()); From 0fa8290366038f6b71494499aba163eb60698426 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 17 Apr 2014 16:02:27 +0200 Subject: [PATCH 082/158] Optimize ploaddup for AVX --- Eigen/src/Core/arch/AVX/PacketMath.h | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 47e10f6da..8d2e88061 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -182,20 +182,19 @@ template<> EIGEN_STRONG_INLINE Packet8i ploadu(const int* from) { EIGE // Loads 4 floats from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3, a3} template<> EIGEN_STRONG_INLINE Packet8f ploaddup(const float* from) { - // FIXME we should only load the first 128bits - Packet8f tmp = ploadu(from); - Packet8f tmp1 = _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2)); - Packet8f tmp2 = _mm256_permute_ps(tmp, _MM_SHUFFLE(1,1,0,0)); - return _mm256_blend_ps(_mm256_permute2f128_ps(tmp1,tmp1,1),tmp2,15); + // TODO try to 
find a way to avoid the need of a temporary register + Packet8f tmp = _mm256_castps128_ps256(_mm_loadu_ps(from)); + tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1); + return _mm256_unpacklo_ps(tmp,tmp); } // Loads 2 doubles from memory a returns the packet {a0, a0 a1, a1} template<> EIGEN_STRONG_INLINE Packet4d ploaddup(const double* from) { - // FIXME we should only load the first 128bits - Packet4d tmp = ploadu(from); - Packet4d tmp1 = _mm256_permute_pd(tmp,0); - Packet4d tmp2 = _mm256_permute_pd(tmp,3); - return _mm256_blend_pd(tmp1,_mm256_permute2f128_pd(tmp2,tmp2,1),12); + // TODO try to find a way to avoid the need of a temporary register + Packet2d tmp0 = _mm_loadu_pd(from); + Packet2d tmp1 = _mm_permute_pd(tmp0,3); + tmp0 = _mm_permute_pd(tmp0,0); + return _mm256_insertf128_pd(_mm256_castpd128_pd256(tmp0), tmp1, 1); } // Loads 2 floats from memory a returns the packet {a0, a0 a0, a0, a1, a1, a1, a1} From 11fbdcbc385917f44b7b01671e158d07a695eb00 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 17 Apr 2014 16:04:30 +0200 Subject: [PATCH 083/158] Fix and optimize mixed products --- .../Core/products/GeneralBlockPanelKernel.h | 166 ++++++++++-------- 1 file changed, 92 insertions(+), 74 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index df30fdd3e..dcc0b4a0d 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -180,14 +180,15 @@ public: NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, - // register block size along the N direction (must be either 2 or 4) - nr = 4,//NumberOfRegisters/4, + // register block size along the N direction must be 1 or 4 + nr = 4, // register block size along the M direction (currently, this one cannot be modified) #ifdef __FMA__ + // we assume 16 registers mr = 3*LhsPacketSize, #else - mr = 2*LhsPacketSize, + mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize, #endif LhsProgress = LhsPacketSize, @@ -209,15 +210,15 @@ public: p = pset1(ResScalar(0)); } - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) - { - pbroadcast4(b, b0, b1, b2, b3); - } - - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) - { - pbroadcast2(b, b0, b1); - } +// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) +// { +// pbroadcast4(b, b0, b1, b2, b3); +// } +// +// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) +// { +// pbroadcast2(b, b0, b1); +// } template EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const @@ -290,8 +291,13 @@ public: ResPacketSize = Vectorizable ? 
packet_traits::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, - nr = NumberOfRegisters/2, - mr = LhsPacketSize, + nr = 4, +#ifdef __FMA__ + // we assume 16 registers + mr = 3*LhsPacketSize, +#else + mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize, +#endif LhsProgress = LhsPacketSize, RhsProgress = 1 @@ -332,15 +338,15 @@ public: dest = ploadu(a); } - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) - { - pbroadcast4(b, b0, b1, b2, b3); - } - - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) - { - pbroadcast2(b, b0, b1); - } +// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) +// { +// pbroadcast4(b, b0, b1, b2, b3); +// } +// +// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) +// { +// pbroadcast2(b, b0, b1); +// } EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const { @@ -566,7 +572,7 @@ public: NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, // FIXME: should depend on NumberOfRegisters nr = 4, - mr = ResPacketSize, + mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*ResPacketSize, LhsProgress = ResPacketSize, RhsProgress = 1 @@ -593,19 +599,25 @@ public: } // linking error if instantiated without being optimized out: - void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3); - - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) - { - // FIXME not sure that's the best way to implement it! - b0 = pload1(b+0); - b1 = pload1(b+1); - } +// void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3); +// +// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) +// { +// // FIXME not sure that's the best way to implement it! +// b0 = pload1(b+0); +// b1 = pload1(b+1); +// } EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { dest = ploaddup(a); } + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const + { + eigen_internal_assert(unpacket_traits::size<=4); + loadRhs(b,dest); + } EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const { @@ -619,7 +631,13 @@ public: EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const { +#ifdef EIGEN_VECTORIZE_FMA + EIGEN_UNUSED_VARIABLE(tmp); + c.v = pmadd(a,b.v,c.v); +#else tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp); +#endif + } EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const @@ -956,7 +974,7 @@ void gebp_kernel for(Index k=0; k for(Index k=0; k EIGEN_GEBGP_ONESTEP(7); blB += pk*4*RhsProgress; - blA += pk*(1*Traits::LhsProgress); + blA += pk*1*LhsProgress; IACA_END } // process remaining peeled loop @@ -1169,7 +1187,7 @@ void gebp_kernel RhsPacket B_0, B1; EIGEN_GEBGP_ONESTEP(0); blB += 4*RhsProgress; - blA += 1*Traits::LhsProgress; + blA += 1*LhsProgress; } #undef EIGEN_GEBGP_ONESTEP @@ -1439,6 +1457,8 @@ EIGEN_DONT_INLINE void gemm_pack_lhs=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; const Index peeled_mc2 = Pack1>=2*PacketSize ? 
peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; + const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1 + : Pack2>1 ? (rows/Pack2)*Pack2 : 0; // Pack 3 packets if(Pack1>=3*PacketSize) @@ -1496,16 +1516,20 @@ EIGEN_DONT_INLINE void gemm_pack_lhs=Pack2) -// { -// if(PanelMode) count += Pack2*offset; -// for(Index k=0; k1) + { + for(Index i=peeled_mc1; i=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; // const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; - int pack_packets = Pack1/PacketSize; + int pack = Pack1; Index i = 0; - while(pack_packets>0) + while(pack>0) { - Index remaining_rows = rows-i; - Index peeled_mc = i+(remaining_rows/(pack_packets*PacketSize))*(pack_packets*PacketSize); -// std::cout << "pack_packets = " << pack_packets << " from " << i << " to " << peeled_mc << "\n"; - for(; i=PacketSize) { - for (Index m = 0; m < (pack_packets*PacketSize); m += PacketSize) + for(; k kernel; - for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = ploadu(&lhs(i+p+m, k)); - ptranspose(kernel); - for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack_packets*PacketSize)*p, cj.pconj(kernel.packet[p])); + for (Index m = 0; m < pack; m += PacketSize) + { + Kernel kernel; + for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = ploadu(&lhs(i+p+m, k)); + ptranspose(kernel); + for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); + } + count += PacketSize*pack; } - count += PacketSize*(pack_packets*PacketSize); } for(; k=Pack2) -// { -// if(PanelMode) count += Pack2*offset; -// for(Index k=0; k Date: Thu, 17 Apr 2014 16:26:35 +0200 Subject: [PATCH 084/158] Extend mixingtype unit test to check transposed cases. 
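[Editor's illustration -- not part of the patch] A minimal usage sketch of the
mixed real/complex products these new checks exercise; the reference value is
obtained, exactly as in the test, by casting the real operand up to complex:

#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::Matrix3d  md  = Eigen::Matrix3d::Random();   // real operand
  Eigen::Matrix3cd mcd = Eigen::Matrix3cd::Random();  // complex operand

  // A transposed mixed-type product, as covered by the added checks...
  Eigen::Matrix3cd p   = md.adjoint() * mcd;
  // ...must agree with the explicitly casted all-complex reference:
  Eigen::Matrix3cd ref = md.cast<std::complex<double> >().adjoint() * mcd;

  std::cout << "error: " << (p - ref).norm() << "\n";  // expected ~0
  return 0;
}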
--- test/mixingtypes.cpp | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/test/mixingtypes.cpp b/test/mixingtypes.cpp index ada2f69d3..1e0e2d4c1 100644 --- a/test/mixingtypes.cpp +++ b/test/mixingtypes.cpp @@ -97,11 +97,25 @@ template void mixingtypes(int size = SizeAtCompileType) VERIFY_IS_APPROX(sd*mcd*md, sd*mcd*md.template cast()); VERIFY_IS_APPROX(scd*md*mcd, scd*md.template cast().eval()*mcd); VERIFY_IS_APPROX(scd*mcd*md, scd*mcd*md.template cast()); - + VERIFY_IS_APPROX(sf*mf*mcf, sf*mf.template cast()*mcf); VERIFY_IS_APPROX(sf*mcf*mf, sf*mcf*mf.template cast()); VERIFY_IS_APPROX(scf*mf*mcf, scf*mf.template cast()*mcf); VERIFY_IS_APPROX(scf*mcf*mf, scf*mcf*mf.template cast()); + + VERIFY_IS_APPROX(sd*md.adjoint()*mcd, (sd*md).template cast().eval().adjoint()*mcd); + VERIFY_IS_APPROX(sd*mcd.adjoint()*md, sd*mcd.adjoint()*md.template cast()); + VERIFY_IS_APPROX(sd*md.adjoint()*mcd.adjoint(), (sd*md).template cast().eval().adjoint()*mcd.adjoint()); + VERIFY_IS_APPROX(sd*mcd.adjoint()*md.adjoint(), sd*mcd.adjoint()*md.template cast().adjoint()); + VERIFY_IS_APPROX(sd*md*mcd.adjoint(), (sd*md).template cast().eval()*mcd.adjoint()); + VERIFY_IS_APPROX(sd*mcd*md.adjoint(), sd*mcd*md.template cast().adjoint()); + + VERIFY_IS_APPROX(sf*mf.adjoint()*mcf, (sf*mf).template cast().eval().adjoint()*mcf); + VERIFY_IS_APPROX(sf*mcf.adjoint()*mf, sf*mcf.adjoint()*mf.template cast()); + VERIFY_IS_APPROX(sf*mf.adjoint()*mcf.adjoint(), (sf*mf).template cast().eval().adjoint()*mcf.adjoint()); + VERIFY_IS_APPROX(sf*mcf.adjoint()*mf.adjoint(), sf*mcf.adjoint()*mf.template cast().adjoint()); + VERIFY_IS_APPROX(sf*mf*mcf.adjoint(), (sf*mf).template cast().eval()*mcf.adjoint()); + VERIFY_IS_APPROX(sf*mcf*mf.adjoint(), sf*mcf*mf.template cast().adjoint()); VERIFY_IS_APPROX(sf*mf*vcf, (sf*mf).template cast().eval()*vcf); VERIFY_IS_APPROX(scf*mf*vcf,(scf*mf.template cast()).eval()*vcf); From 45a4aad572605f7db3348b1089801e0b8ea7cfd8 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 17 Apr 2014 16:27:22 +0200 Subject: [PATCH 085/158] add unit tests for ploadquad and predux4, and split packetmath unit test wrt real/complex --- test/packetmath.cpp | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 08566faf8..9ee2cad21 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -183,11 +183,30 @@ template void packetmath() VERIFY(areApprox(ref, data2, PacketSize) && "ploaddup"); } } + if(PacketSize>2) + { + for(int offset=0;offset<4;++offset) + { + for(int i=0;i(data1+offset)); + VERIFY(areApprox(ref, data2, PacketSize) && "ploadquad"); + } + } ref[0] = 0; for (int i=0; i(data1)), refvalue) && "internal::predux"); + + { + for (int i=0; i<4; ++i) + ref[i] = 0; + for (int i=0; i(data1))); + VERIFY(areApprox(ref, data2, PacketSize>4?PacketSize/2:PacketSize) && "internal::predux4"); + } ref[0] = 1; for (int i=0; i() ); CALL_SUBTEST_2( packetmath() ); CALL_SUBTEST_3( packetmath() ); - CALL_SUBTEST_1( packetmath >() ); - CALL_SUBTEST_2( packetmath >() ); + CALL_SUBTEST_4( packetmath >() ); + CALL_SUBTEST_5( packetmath >() ); CALL_SUBTEST_1( packetmath_notcomplex() ); CALL_SUBTEST_2( packetmath_notcomplex() ); @@ -406,13 +425,13 @@ void test_packetmath() CALL_SUBTEST_1( packetmath_real() ); CALL_SUBTEST_2( packetmath_real() ); - CALL_SUBTEST_1( packetmath_complex >() ); - CALL_SUBTEST_2( packetmath_complex >() ); + CALL_SUBTEST_4( packetmath_complex >() ); + CALL_SUBTEST_5( 
packetmath_complex >() ); CALL_SUBTEST_1( packetmath_scatter_gather() ); CALL_SUBTEST_2( packetmath_scatter_gather() ); CALL_SUBTEST_3( packetmath_scatter_gather() ); - CALL_SUBTEST_3( packetmath_scatter_gather >() ); - CALL_SUBTEST_3( packetmath_scatter_gather >() ); + CALL_SUBTEST_4( packetmath_scatter_gather >() ); + CALL_SUBTEST_5( packetmath_scatter_gather >() ); } } From 1dd015fea64048219aa4c2d616fb56e0c37bad47 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 17 Apr 2014 16:27:58 +0200 Subject: [PATCH 086/158] Reduce block sizes in unit tests. --- test/main.h | 1 + 1 file changed, 1 insertion(+) diff --git a/test/main.h b/test/main.h index 9dd8bc535..fcac0e3ab 100644 --- a/test/main.h +++ b/test/main.h @@ -31,6 +31,7 @@ // B0 is defined in POSIX header termios.h #define B0 FORBIDDEN_IDENTIFIER +#define EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS // shuts down ICC's remark #593: variable "XXX" was set but never used #define TEST_SET_BUT_UNUSED_VARIABLE(X) X = X + 0; From e5d0cb54a5f2a2200a4656d993c82a80f159a7c4 Mon Sep 17 00:00:00 2001 From: Benjamin Chretien Date: Thu, 17 Apr 2014 18:49:23 +0200 Subject: [PATCH 087/158] Fix typo in Reductions tutorial. --- doc/TutorialReductionsVisitorsBroadcasting.dox | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/TutorialReductionsVisitorsBroadcasting.dox b/doc/TutorialReductionsVisitorsBroadcasting.dox index 992cf6f34..eb6787dbc 100644 --- a/doc/TutorialReductionsVisitorsBroadcasting.dox +++ b/doc/TutorialReductionsVisitorsBroadcasting.dox @@ -32,7 +32,7 @@ Eigen also provides the \link MatrixBase::norm() norm() \endlink method, which r These operations can also operate on matrices; in that case, a n-by-p matrix is seen as a vector of size (n*p), so for example the \link MatrixBase::norm() norm() \endlink method returns the "Frobenius" or "Hilbert-Schmidt" norm. We refrain from speaking of the \f$\ell^2\f$ norm of a matrix because that can mean different things. -If you want other \f$\ell^p\f$ norms, use the \link MatrixBase::lpNorm() lpNnorm
() \endlink method. The template parameter \a p can take the special value \a Infinity if you want the \f$\ell^\infty\f$ norm, which is the maximum of the absolute values of the coefficients. +If you want other \f$\ell^p\f$ norms, use the \link MatrixBase::lpNorm() lpNorm
() \endlink method. The template parameter \a p can take the special value \a Infinity if you want the \f$\ell^\infty\f$ norm, which is the maximum of the absolute values of the coefficients. The following example demonstrates these methods. From 9746396d1b8d039d3d0d6537ad477135e5e9d3f5 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 17 Apr 2014 20:51:04 +0200 Subject: [PATCH 088/158] Optimize AVX pset1 for complexes and ploaddup --- Eigen/src/Core/arch/AVX/Complex.h | 14 ++++---------- Eigen/src/Core/arch/AVX/PacketMath.h | 20 ++++++++++++-------- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 8f95a7be7..d0646e77d 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -78,11 +78,7 @@ template<> EIGEN_STRONG_INLINE Packet4cf ploadu(const std::complex EIGEN_STRONG_INLINE Packet4cf pset1(const std::complex& from) { - const float r = std::real(from); - const float i = std::imag(from); - // Beware, _mm256_set_ps expects the scalar values in reverse order (i.e. 7 to 0) - const __m256 result = _mm256_set_ps(i, r, i, r, i, r, i, r); - return Packet4cf(result); + return Packet4cf(_mm256_castps_pd(_mm256_broadcast_sd((const double*)(const void*)&from))); } template<> EIGEN_STRONG_INLINE Packet4cf ploaddup(const std::complex* from) @@ -304,11 +300,9 @@ template<> EIGEN_STRONG_INLINE Packet2cd ploadu(const std::complex EIGEN_STRONG_INLINE Packet2cd pset1(const std::complex& from) { - const double r = std::real(from); - const double i = std::imag(from); - // Beware, _mm256_set_pd expects the scalar values in reverse order (i.e. 3 to 0) - const __m256d result = _mm256_set_pd(i, r, i, r); - return Packet2cd(result); + // in case casting to a __m128d* is really not safe, then we can still fallback to this version: (much slower though) +// return Packet2cd(_mm256_loadu2_m128d((const double*)&from,(const double*)&from)); + return Packet2cd(_mm256_broadcast_pd((const __m128d*)(const void*)&from)); } template<> EIGEN_STRONG_INLINE Packet2cd ploaddup(const std::complex* from) { return pset1(*from); } diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 8d2e88061..a8b94e191 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -183,18 +183,22 @@ template<> EIGEN_STRONG_INLINE Packet8i ploadu(const int* from) { EIGE template<> EIGEN_STRONG_INLINE Packet8f ploaddup(const float* from) { // TODO try to find a way to avoid the need of a temporary register - Packet8f tmp = _mm256_castps128_ps256(_mm_loadu_ps(from)); - tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1); - return _mm256_unpacklo_ps(tmp,tmp); +// Packet8f tmp = _mm256_castps128_ps256(_mm_loadu_ps(from)); +// tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1); +// return _mm256_unpacklo_ps(tmp,tmp); + + // _mm256_insertf128_ps is very slow on Haswell, thus: + Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from); + // mimic an "inplace" permutation of the lower 128bits using a blend + tmp = _mm256_blend_ps(tmp,_mm256_castps128_ps256(_mm_permute_ps( _mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15); + // then we can perform a consistent permutation on the global register to get everything in shape: + return _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2)); } // Loads 2 doubles from memory a returns the packet 
{a0, a0 a1, a1} template<> EIGEN_STRONG_INLINE Packet4d ploaddup(const double* from) { - // TODO try to find a way to avoid the need of a temporary register - Packet2d tmp0 = _mm_loadu_pd(from); - Packet2d tmp1 = _mm_permute_pd(tmp0,3); - tmp0 = _mm_permute_pd(tmp0,0); - return _mm256_insertf128_pd(_mm256_castpd128_pd256(tmp0), tmp1, 1); + Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from); + return _mm256_permute_pd(tmp, 3<<2); } // Loads 2 floats from memory a returns the packet {a0, a0 a0, a0, a1, a1, a1, a1} From 9777a5ca60f0a82bb789f55912fd046ab7f3d15d Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 17 Apr 2014 21:01:45 +0200 Subject: [PATCH 089/158] Various minor fixes in BTL --- ...{FindOpenBLAS.cmake => FindOPENBLAS.cmake} | 0 bench/btl/generic_bench/bench_parameter.hh | 2 +- bench/btl/libs/BLAS/CMakeLists.txt | 2 +- bench/btl/libs/BLAS/main.cpp | 10 ++++---- bench/btl/libs/blaze/CMakeLists.txt | 6 +++-- bench/btl/libs/blaze/blaze_interface.hh | 24 +++++++++---------- bench/btl/libs/blaze/main.cpp | 8 +------ bench/btl/libs/eigen3/main_adv.cpp | 14 +++++------ 8 files changed, 31 insertions(+), 35 deletions(-) rename bench/btl/cmake/{FindOpenBLAS.cmake => FindOPENBLAS.cmake} (100%) diff --git a/bench/btl/cmake/FindOpenBLAS.cmake b/bench/btl/cmake/FindOPENBLAS.cmake similarity index 100% rename from bench/btl/cmake/FindOpenBLAS.cmake rename to bench/btl/cmake/FindOPENBLAS.cmake diff --git a/bench/btl/generic_bench/bench_parameter.hh b/bench/btl/generic_bench/bench_parameter.hh index 5e341c14c..0f62bd421 100644 --- a/bench/btl/generic_bench/bench_parameter.hh +++ b/bench/btl/generic_bench/bench_parameter.hh @@ -41,7 +41,7 @@ // min matrix size for LU bench #define MIN_LU 5 // max matrix size for LU bench -#define MAX_LU 5000 +#define MAX_LU 3000 // max size for tiny vector and matrix #define TINY_MV_MAX_SIZE 16 // default nb_sample for x86 timer diff --git a/bench/btl/libs/BLAS/CMakeLists.txt b/bench/btl/libs/BLAS/CMakeLists.txt index 22f09527d..0272ccad0 100644 --- a/bench/btl/libs/BLAS/CMakeLists.txt +++ b/bench/btl/libs/BLAS/CMakeLists.txt @@ -22,7 +22,7 @@ find_package(OPENBLAS) if (OPENBLAS_FOUND) btl_add_bench(btl_openblas main.cpp) if(BUILD_btl_openblas) - target_link_libraries(btl_openblas ${GOTO_LIBRARIES} ) + target_link_libraries(btl_openblas ${OPENBLAS_LIBRARIES} ) set_target_properties(btl_openblas PROPERTIES COMPILE_FLAGS "-DCBLASNAME=OPENBLAS") endif(BUILD_btl_openblas) endif (OPENBLAS_FOUND) diff --git a/bench/btl/libs/BLAS/main.cpp b/bench/btl/libs/BLAS/main.cpp index 8347c9f0b..564d55ef2 100644 --- a/bench/btl/libs/BLAS/main.cpp +++ b/bench/btl/libs/BLAS/main.cpp @@ -56,13 +56,13 @@ int main() bench > >(MIN_MM,MAX_MM,NB_POINT); - bench > >(MIN_MM,MAX_MM,NB_POINT); - bench > >(MIN_MM,MAX_MM,NB_POINT); + bench > >(MIN_LU,MAX_LU,NB_POINT); + bench > >(MIN_LU,MAX_LU,NB_POINT); #ifdef HAS_LAPACK - bench > >(MIN_MM,MAX_MM,NB_POINT); - bench > >(MIN_MM,MAX_MM,NB_POINT); - bench > >(MIN_MM,MAX_MM,NB_POINT); +// bench > >(MIN_LU,MAX_LU,NB_POINT); + bench > >(MIN_LU,MAX_LU,NB_POINT); + bench > >(MIN_LU,MAX_LU,NB_POINT); #endif //bench > >(MIN_LU,MAX_LU,NB_POINT); diff --git a/bench/btl/libs/blaze/CMakeLists.txt b/bench/btl/libs/blaze/CMakeLists.txt index 54ab929d8..f8b1b2ec3 100644 --- a/bench/btl/libs/blaze/CMakeLists.txt +++ b/bench/btl/libs/blaze/CMakeLists.txt @@ -4,5 +4,7 @@ find_package(Boost) if (BLAZE_FOUND AND Boost_FOUND) include_directories(${BLAZE_INCLUDE_DIR} ${Boost_INCLUDE_DIRS}) btl_add_bench(btl_blaze main.cpp) - 
target_link_libraries(btl_blaze ${Boost_LIBRARIES} ${Boost_system_LIBRARY} /opt/local/lib/libboost_system-mt.a ) -endif (BLAZE_FOUND) + if(BUILD_btl_blaze) + target_link_libraries(btl_blaze ${Boost_LIBRARIES} ${Boost_system_LIBRARY} /opt/local/lib/libboost_system-mt.a ) + endif() +endif () diff --git a/bench/btl/libs/blaze/blaze_interface.hh b/bench/btl/libs/blaze/blaze_interface.hh index 8020fef27..ed43ecdd4 100644 --- a/bench/btl/libs/blaze/blaze_interface.hh +++ b/bench/btl/libs/blaze/blaze_interface.hh @@ -84,25 +84,25 @@ public : X = (A*B); } -// static inline void transposed_matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N){ -// X = (trans(A)*trans(B)); -// } + static inline void transposed_matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N){ + X = A.transpose()*B.transpose(); + } -// static inline void ata_product(const gene_matrix & A, gene_matrix & X, int N){ -// X = (trans(A)*A); -// } + static inline void ata_product(const gene_matrix & A, gene_matrix & X, int N){ + X = (A.transpose()*A); + } -// static inline void aat_product(const gene_matrix & A, gene_matrix & X, int N){ -// X = (A*trans(A)); -// } + static inline void aat_product(const gene_matrix & A, gene_matrix & X, int N){ + X = (A*A.transpose()); + } static inline void matrix_vector_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){ X = (A*B); } -// static inline void atv_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){ -// X = (trans(A)*B); -// } + static inline void atv_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){ + X = (A.transpose()*B); + } static inline void axpy(const real coef, const gene_vector & X, gene_vector & Y, int N){ Y += coef * X; diff --git a/bench/btl/libs/blaze/main.cpp b/bench/btl/libs/blaze/main.cpp index b8508c8f3..582a2956b 100644 --- a/bench/btl/libs/blaze/main.cpp +++ b/bench/btl/libs/blaze/main.cpp @@ -19,8 +19,6 @@ #include "blaze_interface.hh" #include "bench.hh" #include "basic_actions.hh" -// #include "action_cholesky.hh" -// #include "action_lu_decomp.hh" BTL_MAIN; @@ -32,14 +30,10 @@ int main() bench > >(MIN_MV,MAX_MV,NB_POINT); // bench > >(MIN_MV,MAX_MV,NB_POINT); - bench > >(MIN_MM,MAX_MM,NB_POINT); +// bench > >(MIN_MM,MAX_MM,NB_POINT); // bench > >(MIN_MM,MAX_MM,NB_POINT); // bench > >(MIN_MM,MAX_MM,NB_POINT); -// bench > >(MIN_MM,MAX_MM,NB_POINT); -// bench > >(MIN_MM,MAX_MM,NB_POINT); -// bench > >(MIN_MM,MAX_MM,NB_POINT); - return 0; } diff --git a/bench/btl/libs/eigen3/main_adv.cpp b/bench/btl/libs/eigen3/main_adv.cpp index efe5857e4..95865357e 100644 --- a/bench/btl/libs/eigen3/main_adv.cpp +++ b/bench/btl/libs/eigen3/main_adv.cpp @@ -29,14 +29,14 @@ BTL_MAIN; int main() { - bench > >(MIN_MM,MAX_MM,NB_POINT); - bench > >(MIN_MM,MAX_MM,NB_POINT); - bench > >(MIN_MM,MAX_MM,NB_POINT); - bench > >(MIN_MM,MAX_MM,NB_POINT); - bench > >(MIN_MM,MAX_MM,NB_POINT); + bench > >(MIN_LU,MAX_LU,NB_POINT); + bench > >(MIN_LU,MAX_LU,NB_POINT); + bench > >(MIN_LU,MAX_LU,NB_POINT); +// bench > >(MIN_LU,MAX_LU,NB_POINT); + bench > >(MIN_LU,MAX_LU,NB_POINT); - bench > >(MIN_MM,MAX_MM,NB_POINT); - bench > >(MIN_MM,MAX_MM,NB_POINT); +// bench > >(MIN_LU,MAX_LU,NB_POINT); + bench > >(MIN_LU,MAX_LU,NB_POINT); return 0; } From c354bd47f7001bd6b3c43fc4a4b5d27f764aa5c3 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 17 Apr 2014 21:03:26 +0200 Subject: [PATCH 090/158] Make our gemm bench a little more powerful. 
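[Editor's illustration -- not part of the patch] Stripped of BTL and
BenchTimer, the measurement logic of this benchmark is "best of `tries`
trials, `rep` products per trial, 2*m*n*p flops per product". A stand-alone
sketch with arbitrary sizes (not the benchmark's defaults):

#include <Eigen/Dense>
#include <chrono>
#include <cstdio>

int main() {
  const int m = 256, n = 256, p = 256;  // rows, cols, depth
  const int tries = 3, rep = 10;        // best-of-tries, rep products per try
  Eigen::MatrixXf a = Eigen::MatrixXf::Random(m, p);
  Eigen::MatrixXf b = Eigen::MatrixXf::Random(p, n);
  Eigen::MatrixXf c = Eigen::MatrixXf::Zero(m, n);

  double best = 1e30;
  for (int t = 0; t < tries; ++t) {
    const std::chrono::steady_clock::time_point t0 = std::chrono::steady_clock::now();
    for (int r = 0; r < rep; ++r) c.noalias() += a * b;  // the timed kernel
    const std::chrono::duration<double> dt = std::chrono::steady_clock::now() - t0;
    if (dt.count() < best) best = dt.count();
  }
  // Same formula as the benchmark: two flops per multiply-add.
  std::printf("%8.3f GFLOPS\n", 2.0 * m * n * p * rep / best * 1e-9);
  return 0;
}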
--- bench/bench_gemm.cpp | 100 ++++++++++++++++++++++++++++++++----------- 1 file changed, 76 insertions(+), 24 deletions(-) diff --git a/bench/bench_gemm.cpp b/bench/bench_gemm.cpp index 1ef2e72c2..8222271fb 100644 --- a/bench/bench_gemm.cpp +++ b/bench/bench_gemm.cpp @@ -2,6 +2,14 @@ // g++-4.4 bench_gemm.cpp -I .. -O2 -DNDEBUG -lrt -fopenmp && OMP_NUM_THREADS=2 ./a.out // icpc bench_gemm.cpp -I .. -O3 -DNDEBUG -lrt -openmp && OMP_NUM_THREADS=2 ./a.out +// Compilation options: +// +// -DSCALAR=std::complex +// -DSCALARA=double or -DSCALARB=double +// -DHAVE_BLAS +// -DDECOUPLED +// + #include #include #include @@ -14,10 +22,18 @@ using namespace Eigen; #define SCALAR float #endif +#ifndef SCALARA +#define SCALARA SCALAR +#endif + +#ifndef SCALARB +#define SCALARB SCALAR +#endif + typedef SCALAR Scalar; typedef NumTraits::Real RealScalar; -typedef Matrix A; -typedef Matrix B; +typedef Matrix A; +typedef Matrix B; typedef Matrix C; typedef Matrix M; @@ -135,32 +151,49 @@ int main(int argc, char ** argv) int cache_size = -1; bool need_help = false; - for (int i=1; i c t p\n"; + std::cout << argv[0] << " -s -c -t -p \n"; + std::cout << " : size\n"; + std::cout << " : rows columns depth\n"; return 1; } @@ -182,6 +215,7 @@ int main(int argc, char ** argv) // check the parallel product is correct #if defined EIGEN_HAS_OPENMP + Eigen::initParallel(); int procs = omp_get_max_threads(); if(procs>1) { @@ -198,11 +232,20 @@ int main(int argc, char ** argv) #elif defined HAVE_BLAS blas_gemm(a,b,r); c.noalias() += a * b; - if(!r.isApprox(c)) std::cerr << "Warning, your product is crap!\n\n"; + if(!r.isApprox(c)) { + std::cout << r - c << "\n"; + std::cerr << "Warning, your product is crap!\n\n"; + } #else - gemm(a,b,c); - r.noalias() += a.cast() * b.cast(); - if(!r.isApprox(c)) std::cerr << "Warning, your product is crap!\n\n"; + if(1.*m*n*p<2000.*2000*2000) + { + gemm(a,b,c); + r.noalias() += a.cast() .lazyProduct( b.cast() ); + if(!r.isApprox(c)) { + std::cout << r - c << "\n"; + std::cerr << "Warning, your product is crap!\n\n"; + } + } #endif #ifdef HAVE_BLAS @@ -224,7 +267,7 @@ int main(int argc, char ** argv) { BenchTimer tmono; omp_set_num_threads(1); - Eigen::internal::setNbThreads(1); + Eigen::setNbThreads(1); c = rc; BENCH(tmono, tries, rep, gemm(a,b,c)); std::cout << "eigen mono cpu " << tmono.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmono.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << tmono.total(CPU_TIMER) << "s)\n"; @@ -233,6 +276,15 @@ int main(int argc, char ** argv) } #endif + if(1.*m*n*p<30*30*30) + { + BenchTimer tmt; + c = rc; + BENCH(tmt, tries, rep, c.noalias()+=a.lazyProduct(b)); + std::cout << "lazy cpu " << tmt.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << tmt.total(CPU_TIMER) << "s)\n"; + std::cout << "lazy real " << tmt.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << tmt.total(REAL_TIMER) << "s)\n"; + } + #ifdef DECOUPLED if((NumTraits::IsComplex) && (NumTraits::IsComplex)) { From 6d6df90c9a813287d18fe045b78a7f4e3996ee5f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 17 Apr 2014 12:28:01 -0700 Subject: [PATCH 091/158] Implemented the pgather/pscatter packet primitives for the arm/NEON architecture --- Eigen/src/Core/arch/NEON/Complex.h | 16 +++++++++++++ Eigen/src/Core/arch/NEON/PacketMath.h | 34 +++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index 
7ca76714f..7b94733ab 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -111,6 +111,22 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex< template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); } +template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, int stride) +{ + Packet4f res; + res = vsetq_lane_f32(std::real(from[0*stride]), res, 0); + res = vsetq_lane_f32(std::imag(from[0*stride]), res, 1); + res = vsetq_lane_f32(std::real(from[1*stride]), res, 2); + res = vsetq_lane_f32(std::imag(from[1*stride]), res, 3); + return Packet2cf(res); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, int stride) +{ + to[stride*0] = std::complex(vgetq_lane_f32(from.v, 0), vgetq_lane_f32(from.v, 1)); + to[stride*1] = std::complex(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3)); +} + template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ARM_PREFETCH((float *)addr); } template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 83150507a..7f3301f51 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -221,6 +221,40 @@ template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& f template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); } template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); } +template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, int stride) +{ + Packet4f res; + res = vsetq_lane_f32(from[0*stride], res, 0); + res = vsetq_lane_f32(from[1*stride], res, 1); + res = vsetq_lane_f32(from[2*stride], res, 2); + res = vsetq_lane_f32(from[3*stride], res, 3); + return res; +} +template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, int stride) +{ + Packet4i res; + res = vsetq_lane_s32(from[0*stride], res, 0); + res = vsetq_lane_s32(from[1*stride], res, 1); + res = vsetq_lane_s32(from[2*stride], res, 2); + res = vsetq_lane_s32(from[3*stride], res, 3); + return res; +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, int stride) +{ + to[stride*0] = vgetq_lane_f32(from, 0); + to[stride*1] = vgetq_lane_f32(from, 1); + to[stride*2] = vgetq_lane_f32(from, 2); + to[stride*3] = vgetq_lane_f32(from, 3); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet4i& from, int stride) +{ + to[stride*0] = vgetq_lane_s32(from, 0); + to[stride*1] = vgetq_lane_s32(from, 1); + to[stride*2] = vgetq_lane_s32(from, 2); + to[stride*3] = vgetq_lane_s32(from, 3); +} + template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_ARM_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_ARM_PREFETCH(addr); } From 6d665d446be19f233a2d94e05db206cf29a12191 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 17 Apr 2014 23:26:34 +0200 Subject: [PATCH 092/158] Fixes for fixed sizes and non vectorizable types --- Eigen/src/Core/products/GeneralMatrixMatrix.h | 2 +- 
Eigen/src/Core/util/XprHelper.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 38ca7b242..b45797f09 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -287,7 +287,7 @@ class gemm_blocking_spacem_mc = ActualRows; this->m_nc = ActualCols; diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index a08538aff..1b3e122e1 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -101,6 +101,7 @@ template struct packet_traits; template struct unpacket_traits { typedef T type; + typedef T half; enum {size=1}; }; From 1388f4f9fd3f8804128a5f86ead7e478e59a4749 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 18 Apr 2014 11:43:13 +0200 Subject: [PATCH 093/158] Fix typo (was working with clang\!) --- Eigen/src/Core/arch/AVX/Complex.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index d0646e77d..18be2f225 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -78,7 +78,7 @@ template<> EIGEN_STRONG_INLINE Packet4cf ploadu(const std::complex EIGEN_STRONG_INLINE Packet4cf pset1(const std::complex& from) { - return Packet4cf(_mm256_castps_pd(_mm256_broadcast_sd((const double*)(const void*)&from))); + return Packet4cf(_mm256_castpd_ps(_mm256_broadcast_sd((const double*)(const void*)&from))); } template<> EIGEN_STRONG_INLINE Packet4cf ploaddup(const std::complex* from) From 94684721bd2d10c0b67ef518ed599981603440bc Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 18 Apr 2014 15:35:34 +0200 Subject: [PATCH 094/158] Smarter block size computation --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index dcc0b4a0d..b1ed9264a 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -106,7 +106,11 @@ void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) // In unit tests we do not want to use extra large matrices, // so we reduce the block size to check the blocking strategy is not flawed #ifndef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS - k = std::min(k,240); +// k = std::min(k,240); +// n = std::min(n,3840/sizeof(RhsScalar)); +// m = std::min(m,3840/sizeof(RhsScalar)); + + k = std::min(k,sizeof(LhsScalar)<=4 ? 
360 : 240); n = std::min(n,3840/sizeof(RhsScalar)); m = std::min(m,3840/sizeof(RhsScalar)); #else From 3454b4e5f1eb1a68e15415eeca827a30cb3bb58e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 18 Apr 2014 17:06:03 +0200 Subject: [PATCH 095/158] Fix calls to lazy products (lazy product does not like matrices with 0 length) --- Eigen/src/Core/products/GeneralMatrixMatrix.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index b45797f09..6ad07eccb 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -390,7 +390,7 @@ class GeneralProduct template inline void evalTo(Dest& dst) const { - if((m_rhs.rows()+dst.rows()+dst.cols())<20) + if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0) dst.noalias() = m_lhs .lazyProduct( m_rhs ); else { @@ -402,7 +402,7 @@ class GeneralProduct template inline void addTo(Dest& dst) const { - if((m_rhs.rows()+dst.rows()+dst.cols())<20) + if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0) dst.noalias() += m_lhs .lazyProduct( m_rhs ); else scaleAndAddTo(dst,Scalar(1)); @@ -411,7 +411,7 @@ class GeneralProduct template inline void subTo(Dest& dst) const { - if((m_rhs.rows()+dst.rows()+dst.cols())<20) + if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0) dst.noalias() -= m_lhs .lazyProduct( m_rhs ); else scaleAndAddTo(dst,Scalar(-1)); From a7d20038df1dc160e282e824052ff5bcf5ba2c9a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 18 Apr 2014 17:06:31 +0200 Subject: [PATCH 096/158] Fix alignment assertion. --- Eigen/src/Core/DenseStorage.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h index 7264b44c7..79033a1b5 100644 --- a/Eigen/src/Core/DenseStorage.h +++ b/Eigen/src/Core/DenseStorage.h @@ -88,7 +88,7 @@ struct plain_array EIGEN_DEVICE_FUNC plain_array() { - EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(0xf); + EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(EIGEN_ALIGN_BYTES-1); check_static_allocation_size(); } From 2606abed535744fcaa41b923c71338a06b8ed3fa Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 18 Apr 2014 21:14:40 +0200 Subject: [PATCH 097/158] Fix 128bit packet size assumptions in unit tests. 
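[Editor's illustration -- not part of the patch] The point of the test changes
below: with AVX the natural packet is 256 bits, so assertions hard-coded
around 16-byte boundaries become wrong. A small sketch of the property being
asserted, assuming an Eigen revision (like the one patched here) where
EIGEN_ALIGN_BYTES is defined -- 16 for SSE builds, 32 for AVX builds:

#include <Eigen/Dense>
#include <cstdio>

typedef Eigen::Matrix<float, 8, 1> Vector8f;  // fills one full AVX packet

int main() {
  Vector8f v = Vector8f::Zero();  // statically aligned fixed-size object
  std::printf("EIGEN_ALIGN_BYTES = %d\n", (int)EIGEN_ALIGN_BYTES);
  std::printf("address modulo alignment = %d (expected 0)\n",
              (int)(reinterpret_cast<std::size_t>(&v) % EIGEN_ALIGN_BYTES));
  return 0;
}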
--- Eigen/src/Core/Assign.h | 2 ++ Eigen/src/Core/GeneralProduct.h | 3 +-- test/dynalloc.cpp | 21 ++++++++++++------- test/unalignedassert.cpp | 37 ++++++++++++++++++++------------- test/vectorization_logic.cpp | 23 ++++++++++---------- 5 files changed, 52 insertions(+), 34 deletions(-) diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h index cefa6f3cc..07da2fe31 100644 --- a/Eigen/src/Core/Assign.h +++ b/Eigen/src/Core/Assign.h @@ -105,6 +105,8 @@ public: EIGEN_DEBUG_VAR(DstIsAligned) EIGEN_DEBUG_VAR(SrcIsAligned) EIGEN_DEBUG_VAR(JointAlignment) + EIGEN_DEBUG_VAR(Derived::SizeAtCompileTime) + EIGEN_DEBUG_VAR(OtherDerived::CoeffReadCost) EIGEN_DEBUG_VAR(InnerSize) EIGEN_DEBUG_VAR(InnerMaxSize) EIGEN_DEBUG_VAR(PacketSize) diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index adda6f784..229d12c3f 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -66,8 +66,7 @@ template struct product_type MaxDepth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::MaxColsAtCompileTime, _Rhs::MaxRowsAtCompileTime), Depth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::ColsAtCompileTime, - _Rhs::RowsAtCompileTime), - LargeThreshold = EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD + _Rhs::RowsAtCompileTime) }; // the splitting into different lines of code here, introducing the _select enums and the typedef below, diff --git a/test/dynalloc.cpp b/test/dynalloc.cpp index 8bbda1c94..c98cc80f0 100644 --- a/test/dynalloc.cpp +++ b/test/dynalloc.cpp @@ -10,11 +10,13 @@ #include "main.h" #if EIGEN_ALIGN -#define ALIGNMENT 16 +#define ALIGNMENT EIGEN_ALIGN_BYTES #else #define ALIGNMENT 1 #endif +typedef Matrix Vector8f; + void check_handmade_aligned_malloc() { for(int i = 1; i < 1000; i++) @@ -68,7 +70,7 @@ struct MyStruct { EIGEN_MAKE_ALIGNED_OPERATOR_NEW char dummychar; - Vector4f avec; + Vector8f avec; }; class MyClassA @@ -76,15 +78,19 @@ class MyClassA public: EIGEN_MAKE_ALIGNED_OPERATOR_NEW char dummychar; - Vector4f avec; + Vector8f avec; }; template void check_dynaligned() { - T* obj = new T; - VERIFY(T::NeedsToAlign==1); - VERIFY(size_t(obj)%ALIGNMENT==0); - delete obj; + // TODO have to be updated once we support multiple alignment values + if(T::SizeAtCompileTime % ALIGNMENT == 0) + { + T* obj = new T; + VERIFY(T::NeedsToAlign==1); + VERIFY(size_t(obj)%ALIGNMENT==0); + delete obj; + } } void test_dynalloc() @@ -102,6 +108,7 @@ void test_dynalloc() CALL_SUBTEST(check_dynaligned() ); CALL_SUBTEST(check_dynaligned() ); CALL_SUBTEST(check_dynaligned() ); + CALL_SUBTEST(check_dynaligned() ); } // check static allocation, who knows ? diff --git a/test/unalignedassert.cpp b/test/unalignedassert.cpp index 601dbf214..d8815263a 100644 --- a/test/unalignedassert.cpp +++ b/test/unalignedassert.cpp @@ -9,6 +9,8 @@ #include "main.h" +typedef Matrix Vector8f; + struct TestNew1 { MatrixXd m; // good: m will allocate its own array, taking care of alignment. 
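// [Editor's illustration -- not part of the patch] The pattern these test
// fixtures exercise: a class with a fixed-size vectorizable Eigen member
// needs the aligned operator new so heap instances respect EIGEN_ALIGN_BYTES.
// Minimal sketch (class and member names are hypothetical):
#include <Eigen/Dense>

class Body {
public:
  EIGEN_MAKE_ALIGNED_OPERATOR_NEW  // aligned new/delete overloads
  Eigen::Vector4f velocity;        // fixed-size vectorizable member
};

int main() {
  Body* b = new Body;  // aligned thanks to the macro; without it, plain new
  delete b;            // may return under-aligned memory and trip the assert
  return 0;
}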
@@ -69,7 +71,7 @@ void construct_at_boundary(int boundary) { char buf[sizeof(T)+256]; size_t _buf = reinterpret_cast(buf); - _buf += (16 - (_buf % 16)); // make 16-byte aligned + _buf += (EIGEN_ALIGN_BYTES - (_buf % EIGEN_ALIGN_BYTES)); // make 16/32-byte aligned _buf += boundary; // make exact boundary-aligned T *x = ::new(reinterpret_cast(_buf)) T; x[0].setZero(); // just in order to silence warnings @@ -85,18 +87,18 @@ void unalignedassert() construct_at_boundary(16); construct_at_boundary(16); construct_at_boundary(4); - construct_at_boundary(16); + construct_at_boundary(EIGEN_ALIGN_BYTES); construct_at_boundary(16); construct_at_boundary(4); - construct_at_boundary(16); - construct_at_boundary(16); + construct_at_boundary(EIGEN_ALIGN_BYTES); + construct_at_boundary(EIGEN_ALIGN_BYTES); construct_at_boundary(4); - construct_at_boundary(16); + construct_at_boundary(EIGEN_ALIGN_BYTES); construct_at_boundary(16); construct_at_boundary(4); - construct_at_boundary(16); + construct_at_boundary(EIGEN_ALIGN_BYTES); construct_at_boundary(16); #endif @@ -110,14 +112,21 @@ void unalignedassert() check_unalignedassert_good >(); #if EIGEN_ALIGN_STATICALLY - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); + if(EIGEN_ALIGN_BYTES==16) + { + VERIFY_RAISES_ASSERT(construct_at_boundary(8)); + VERIFY_RAISES_ASSERT(construct_at_boundary(8)); + VERIFY_RAISES_ASSERT(construct_at_boundary(8)); + } + for(int b=8; b(b)); + VERIFY_RAISES_ASSERT(construct_at_boundary(b)); + VERIFY_RAISES_ASSERT(construct_at_boundary(b)); + VERIFY_RAISES_ASSERT(construct_at_boundary(b)); + VERIFY_RAISES_ASSERT(construct_at_boundary(b)); + VERIFY_RAISES_ASSERT(construct_at_boundary(b)); + } #endif } diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp index aee68a87f..09b46660b 100644 --- a/test/vectorization_logic.cpp +++ b/test/vectorization_logic.cpp @@ -90,8 +90,8 @@ template::Vectori typedef Matrix Matrix22; typedef Matrix Matrix44; typedef Matrix Matrix44u; - typedef Matrix Matrix44c; - typedef Matrix Matrix44r; + typedef Matrix Matrix44c; + typedef Matrix Matrix44r; typedef Matrix::Vectori LinearTraversal,CompleteUnrolling)); VERIFY(test_assign(Matrix33c().col(0),Matrix33c().col(1)+Matrix33c().col(1), LinearTraversal,CompleteUnrolling)); - + VERIFY(test_assign(Matrix3(),Matrix3().cwiseQuotient(Matrix3()), LinearVectorizedTraversal,CompleteUnrolling)); - + VERIFY(test_assign(Matrix(),Matrix()+Matrix(), LinearTraversal,NoUnrolling)); - - VERIFY(test_assign(Matrix11(),Matrix().template block(2,3)+Matrix().template block(10,4), - DefaultTraversal,CompleteUnrolling)); + + VERIFY(test_assign(Matrix11(),Matrix().template block(2,3)+Matrix().template block(8,4), + DefaultTraversal,PacketSize>4?InnerUnrolling:CompleteUnrolling)); } VERIFY(test_redux(Matrix3(), @@ -174,18 +174,19 @@ template::Vectori VERIFY(test_redux(Matrix44r().template block<1,2*PacketSize>(2,1), LinearVectorizedTraversal,CompleteUnrolling)); - + VERIFY((test_assign< Map >, Matrix22 >(InnerVectorizedTraversal,CompleteUnrolling))); VERIFY((test_assign< - Map >, - Matrix22 + Map, Aligned, InnerStride<3*PacketSize> >, + Matrix >(DefaultTraversal,CompleteUnrolling))); - VERIFY((test_assign(Matrix11(), 
Matrix11()*Matrix11(), InnerVectorizedTraversal, CompleteUnrolling))); + VERIFY((test_assign(Matrix11(), Matrix()*Matrix(), + PacketSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD?DefaultTraversal:InnerVectorizedTraversal, CompleteUnrolling))); #endif VERIFY(test_assign(MatrixXX(10,10),MatrixXX(20,20).block(10,10,2,3), From 5c5231ab71ba51432a24fdfd7d36a835b44805ea Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 22 Apr 2014 16:03:19 +0200 Subject: [PATCH 098/158] Workaround gcc's default ABI not being able to distinguish between vector types of different sizes. --- CMakeLists.txt | 3 -- Eigen/src/Core/arch/SSE/Complex.h | 22 ++++---- Eigen/src/Core/arch/SSE/MathFunctions.h | 10 ++-- Eigen/src/Core/arch/SSE/PacketMath.h | 68 +++++++++++++++++-------- 4 files changed, 63 insertions(+), 40 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 838a41b79..fb13769f4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -199,9 +199,6 @@ if(NOT MSVC) option(EIGEN_TEST_AVX "Enable/Disable AVX in tests/examples" OFF) if(EIGEN_TEST_AVX) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx") - if(CMAKE_COMPILER_IS_GNUCXX) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fabi-version=6") - endif() message(STATUS "Enabling AVX in tests/examples") endif() diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 715e5a13c..d0c080c4f 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -110,8 +110,8 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); } -template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); } -template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); } +template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), Packet4f(from.v)); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), Packet4f(from.v)); } template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, int stride) @@ -145,7 +145,7 @@ template<> EIGEN_STRONG_INLINE std::complex pfirst(const Pack #endif } -template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(_mm_castps_pd(a.v)))); } +template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(Packet2d(_mm_castps_pd(a.v))))); } template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) { @@ -235,7 +235,7 @@ template<> struct conj_helper { return padd(c, pmul(x,y)); } EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const - { return Packet2cf(Eigen::internal::pmul(x, y.v)); } + { return Packet2cf(Eigen::internal::pmul(x, y.v)); } }; template<> struct conj_helper @@ -244,7 +244,7 @@ template<> struct conj_helper { return padd(c, pmul(x,y)); } EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const - { return Packet2cf(Eigen::internal::pmul(x.v, y)); } + { return Packet2cf(Eigen::internal::pmul(x.v, y)); } }; template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) @@ -300,7 +300,7 @@ template<> struct
unpacket_traits { typedef std::complex type template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(a.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); } template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); @@ -338,8 +338,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) { return pset1(*from); } // FIXME force unaligned store, this is a temporary fix -template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } -template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); } template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } @@ -437,7 +437,7 @@ template<> struct conj_helper { return padd(c, pmul(x,y)); } EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const - { return Packet1cd(Eigen::internal::pmul(x, y.v)); } + { return Packet1cd(Eigen::internal::pmul(x, y.v)); } }; template<> struct conj_helper @@ -446,7 +446,7 @@ template<> struct conj_helper { return padd(c, pmul(x,y)); } EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const - { return Packet1cd(Eigen::internal::pmul(x.v, y)); } + { return Packet1cd(Eigen::internal::pmul(x.v, y)); } }; template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) @@ -459,7 +459,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, con EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) { - return Packet1cd(preverse(x.v)); + return Packet1cd(preverse(Packet2d(x.v))); } template<> EIGEN_DEVICE_FUNC inline void diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index 09f74c651..8f78b3a6c 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -63,7 +63,7 @@ Packet4f plog(const Packet4f& _x) x = _mm_or_ps(x, p4f_half); emm0 = _mm_sub_epi32(emm0, p4i_0x7f); - Packet4f e = padd(_mm_cvtepi32_ps(emm0), p4f_1); + Packet4f e = padd(Packet4f(_mm_cvtepi32_ps(emm0)), p4f_1); /* part2: if( x < SQRTHF ) { @@ -72,9 +72,9 @@ Packet4f plog(const Packet4f& _x) } else { x = x - 1.0; } */ Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF); - Packet4f tmp = _mm_and_ps(x, mask); + Packet4f tmp = pand(x, mask); x = psub(x, p4f_1); - e = psub(e, _mm_and_ps(p4f_1, mask)); + e = psub(e, pand(p4f_1, mask)); x = padd(x, tmp); Packet4f x2 = pmul(x,x); @@ -167,7 +167,7 @@ Packet4f pexp(const Packet4f& _x) emm0 = _mm_cvttps_epi32(fx); emm0 = _mm_add_epi32(emm0, p4i_0x7f); emm0 = _mm_slli_epi32(emm0, 
23); - return pmul(y, _mm_castsi128_ps(emm0)); + return pmul(y, Packet4f(_mm_castsi128_ps(emm0))); } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d pexp(const Packet2d& _x) @@ -241,7 +241,7 @@ Packet2d pexp(const Packet2d& _x) emm0 = _mm_add_epi32(emm0, p4i_1023_0); emm0 = _mm_slli_epi32(emm0, 20); emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3)); - return pmul(x, _mm_castsi128_pd(emm0)); + return pmul(x, Packet2d(_mm_castsi128_pd(emm0))); } /* evaluation of 4 sines at onces, using SSE2 intrinsics. diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 89dfa6975..e7e035f4e 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -22,9 +22,35 @@ namespace internal { #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) #endif +#if defined EIGEN_VECTORIZE_AVX && defined __GNUC__ && !(defined __clang__ || defined __INTEL_COMPILER) +// With GCC's default ABI version, a __m128 or __m256 are the same types and therefore we cannot +// have overloads for both types without linking error. +// One solution is to increase ABI version using -fabi-version=4 (or greater). +// To workaround this inconvenince, we rather wrap 128bit types into the following helper +// structure: +// TODO disable this wrapper if abi-versio>=4, but to detect that without asking the user to define a macro? +template +struct eigen_packet_wrapper +{ + EIGEN_ALWAYS_INLINE operator T&() { return m_val; } + EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; } + EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {} + EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {} + EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) { + m_val = v; + return *this; + } + + T m_val; +}; +typedef eigen_packet_wrapper<__m128> Packet4f; +typedef eigen_packet_wrapper<__m128i> Packet4i; +typedef eigen_packet_wrapper<__m128d> Packet2d; +#else typedef __m128 Packet4f; typedef __m128i Packet4i; typedef __m128d Packet2d; +#endif template<> struct is_arithmetic<__m128> { enum { value = true }; }; template<> struct is_arithmetic<__m128i> { enum { value = true }; }; @@ -161,7 +187,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) } template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { - return psub(_mm_setr_epi32(0,0,0,0), a); + return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a); } template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } @@ -244,7 +270,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, con template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); } template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); } -template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast(from)); } +template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast(from)); } #if defined(_MSC_VER) template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { @@ -262,7 +288,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { E #endif } template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); } - template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return 
_mm_loadu_si128(reinterpret_cast(from)); } + template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast(from)); } #else // Fast unaligned loads. Note that here we cannot directly use intrinsics: this would // require pointer casting to incompatible pointer types and leads to invalid code @@ -312,7 +338,7 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD #if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS - return _mm_loadu_si128(reinterpret_cast(from)); + return _mm_loadu_si128(reinterpret_cast(from)); #else __m128d res; res = _mm_load_sd((const double*)(from)) ; @@ -331,13 +357,13 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) { Packet4i tmp; - tmp = _mm_loadl_epi64(reinterpret_cast(from)); + tmp = _mm_loadl_epi64(reinterpret_cast(from)); return vec4i_swizzle1(tmp, 0, 0, 1, 1); } template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); } template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); } -template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast(to), from); } +template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); } template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE @@ -348,8 +374,8 @@ template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& _mm_storeh_pd((to+1), from); #endif } -template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast(to), _mm_castps_pd(from)); } -template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast(to), _mm_castsi128_pd(from)); } +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast(to), Packet2d(_mm_castps_pd(from))); } +template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast(to), Packet2d(_mm_castsi128_pd(from))); } template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, int stride) { @@ -388,13 +414,13 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const template<> EIGEN_STRONG_INLINE void pstore1(float* to, const float& a) { Packet4f pa = _mm_set_ss(a); - pstore(to, vec4f_swizzle1(pa,0,0,0,0)); + pstore(to, Packet4f(vec4f_swizzle1(pa,0,0,0,0))); } // some compilers might be tempted to perform multiple moves instead of using a vector path. 
template<> EIGEN_STRONG_INLINE void pstore1(double* to, const double& a) { Packet2d pa = _mm_set_sd(a); - pstore(to, vec2d_swizzle1(pa,0,0)); + pstore(to, Packet2d(vec2d_swizzle1(pa,0,0))); } #ifndef EIGEN_VECTORIZE_AVX @@ -507,10 +533,10 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) { Packet4f tmp0 = _mm_hadd_ps(a,a); - return pfirst(_mm_hadd_ps(tmp0, tmp0)); + return pfirst(_mm_hadd_ps(tmp0, tmp0)); } -template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { return pfirst(_mm_hadd_pd(a, a)); } +template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { return pfirst(_mm_hadd_pd(a, a)); } // SSSE3 version: // EIGEN_STRONG_INLINE float predux(const Packet4i& a) @@ -553,7 +579,7 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) { Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a)); - return pfirst(tmp) + pfirst(_mm_shuffle_epi32(tmp, 1)); + return pfirst(tmp) + pfirst(_mm_shuffle_epi32(tmp, 1)); } template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) @@ -576,11 +602,11 @@ template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) { Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a)); - return pfirst(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); + return pfirst(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); } template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { - return pfirst(_mm_mul_sd(a, _mm_unpackhi_pd(a,a))); + return pfirst(_mm_mul_sd(a, _mm_unpackhi_pd(a,a))); } template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) { @@ -596,17 +622,17 @@ template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) { Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a)); - return pfirst(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); + return pfirst(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); } template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) { - return pfirst(_mm_min_sd(a, _mm_unpackhi_pd(a,a))); + return pfirst(_mm_min_sd(a, _mm_unpackhi_pd(a,a))); } template<> EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) { #ifdef EIGEN_VECTORIZE_SSE4_1 Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2))); - return pfirst(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1))); + return pfirst(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1))); #else // after some experiments, it is seems this is the fastest way to implement it // for GCC (eg., it does not like using std::min after the pstore !!) 
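// For illustration only (hypothetical helper, not part of the patch): the
// explicit <Packet4f>/<Packet2d> template arguments added above become
// necessary with the wrapper, because template argument deduction never
// considers the implicit __m128 -> eigen_packet_wrapper<__m128> conversion:
//
//   template<typename Packet>
//   typename unpacket_traits<Packet>::type pfirst_sketch(const Packet& a);
//
//   __m128 raw = _mm_set1_ps(1.f);
//   pfirst_sketch(raw);            // would deduce Packet = __m128
//   pfirst_sketch<Packet4f>(raw);  // OK: raw converts to the wrapper type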
@@ -622,17 +648,17 @@ template<> EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) { Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a)); - return pfirst(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); + return pfirst(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); } template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) { - return pfirst(_mm_max_sd(a, _mm_unpackhi_pd(a,a))); + return pfirst(_mm_max_sd(a, _mm_unpackhi_pd(a,a))); } template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) { #ifdef EIGEN_VECTORIZE_SSE4_1 Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2))); - return pfirst(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1))); + return pfirst(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1))); #else // after some experiments, it is seems this is the fastest way to implement it // for GCC (eg., it does not like using std::min after the pstore !!) From ecbd67a15a55372f189d3718ff8661791bfa1c9f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 22 Apr 2014 17:03:57 +0200 Subject: [PATCH 099/158] Fix EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT macro --- Eigen/src/Core/DenseStorage.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h index 79033a1b5..94f796783 100644 --- a/Eigen/src/Core/DenseStorage.h +++ b/Eigen/src/Core/DenseStorage.h @@ -68,13 +68,13 @@ struct plain_array template EIGEN_ALWAYS_INLINE PtrType eigen_unaligned_array_assert_workaround_gcc47(PtrType array) { return array; } #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \ - eigen_assert((reinterpret_cast(eigen_unaligned_array_assert_workaround_gcc47(array)) & sizemask) == 0 \ + eigen_assert((reinterpret_cast(eigen_unaligned_array_assert_workaround_gcc47(array)) & (sizemask)) == 0 \ && "this assertion is explained here: " \ "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \ " **** READ THIS WEB PAGE !!! ****"); #else #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \ - eigen_assert((reinterpret_cast(array) & sizemask) == 0 \ + eigen_assert((reinterpret_cast(array) & (sizemask)) == 0 \ && "this assertion is explained here: " \ "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \ " **** READ THIS WEB PAGE !!! 
****"); From 82b09fcb911c5a0734ccf441504269f33034194a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 23 Apr 2014 13:09:26 +0200 Subject: [PATCH 100/158] Add Altivec implementation of pgather/pscatter (not tested) --- Eigen/src/Core/arch/AltiVec/Complex.h | 16 ++++++++++ Eigen/src/Core/arch/AltiVec/PacketMath.h | 37 ++++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h index db52074f4..ec11cfaa0 100644 --- a/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/Eigen/src/Core/arch/AltiVec/Complex.h @@ -66,6 +66,22 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, int stride) +{ + std::complex EIGEN_ALIGN16 af[2]; + af[0] = from[0*stride]; + af[1] = from[1*stride]; + return Packet2cf(vec_ld(0, af)); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, int stride) +{ + std::complex EIGEN_ALIGN16 af[2]; + vec_st(from.v, 0, af); + to[0*stride] = af[0]; + to[1*stride] = af[1]; +} + + template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_add(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_sub(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); } diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 16948264f..80a99a004 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -163,6 +163,43 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return vc; } +template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, int stride) +{ + float EIGEN_ALIGN16 af[4]; + af[0] = from[0*stride]; + af[1] = from[1*stride]; + af[2] = from[2*stride]; + af[3] = from[3*stride]; + return vec_ld(0, af); +} +template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, int stride) +{ + int EIGEN_ALIGN16 ai[4]; + ai[0] = from[0*stride]; + ai[1] = from[1*stride]; + ai[2] = from[2*stride]; + ai[3] = from[3*stride]; + return vec_ld(0, ai); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, int stride) +{ + float EIGEN_ALIGN16 af[4]; + vec_st(from, 0, af); + to[0*stride] = af[0]; + to[1*stride] = af[1]; + to[2*stride] = af[2]; + to[3*stride] = af[3]; +} +template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet4i& from, int stride) +{ + int EIGEN_ALIGN16 ai[4]; + vec_st(from, 0, ai); + to[0*stride] = ai[0]; + to[1*stride] = ai[1]; + to[2*stride] = ai[2]; + to[3*stride] = ai[3]; +} + template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return vec_add(pset1(a), p4f_COUNTDOWN); } template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return vec_add(pset1(a), p4i_COUNTDOWN); } From ccb4dec719c1dffbd3565497cd897abb1b9f730f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 23 Apr 2014 18:22:10 -0700 Subject: [PATCH 101/158] Created a NEON version of the ptranspose packet primitives --- Eigen/src/Core/arch/NEON/Complex.h | 8 ++++++++ Eigen/src/Core/arch/NEON/PacketMath.h | 23 ++++++++++++++++++++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index 7b94733ab..a668382e2 100644 --- 
a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -263,6 +263,14 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, con return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s))); } +template<> EIGEN_DEVICE_FUNC inline void +ptranspose(Kernel& kernel) { + float32x4_t tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v)); + kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v), vget_low_f32(kernel.packet[1].v)); + kernel.packet[1].v = tmp; +} + + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index c9b9e5e9b..6a6876e69 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -447,9 +447,30 @@ PALIGN_NEON(0,Packet4i,vextq_s32) PALIGN_NEON(1,Packet4i,vextq_s32) PALIGN_NEON(2,Packet4i,vextq_s32) PALIGN_NEON(3,Packet4i,vextq_s32) - + #undef PALIGN_NEON +template<> EIGEN_DEVICE_FUNC inline void +ptranspose(Kernel& kernel) { + float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]); + float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]); + + kernel.packet[0] = vcombine_f32(vget_low_f32(tmp1.val[0]), vget_low_f32(tmp2.val[0])); + kernel.packet[1] = vcombine_f32(vget_high_f32(tmp1.val[0]), vget_high_f32(tmp2.val[0])); + kernel.packet[2] = vcombine_f32(vget_low_f32(tmp1.val[1]), vget_low_f32(tmp2.val[1])); + kernel.packet[3] = vcombine_f32(vget_high_f32(tmp1.val[1]), vget_high_f32(tmp2.val[1])); +} + +template<> EIGEN_DEVICE_FUNC inline void +ptranspose(Kernel& kernel) { + int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]); + int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]); + kernel.packet[0] = vcombine_s32(vget_low_s32(tmp1.val[0]), vget_low_s32(tmp2.val[0])); + kernel.packet[1] = vcombine_s32(vget_high_s32(tmp1.val[0]), vget_high_s32(tmp2.val[0])); + kernel.packet[2] = vcombine_s32(vget_low_s32(tmp1.val[1]), vget_low_s32(tmp2.val[1])); + kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1])); +} + } // end namespace internal } // end namespace Eigen From 4eb92e564710cd55323302d31e47aff74d770426 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 23 Apr 2014 18:23:07 -0700 Subject: [PATCH 102/158] Fixed the NEON implementation of predux_max. 
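A single vpmax_s32 only reduces adjacent pairs, so after one round lane 0 holds max(a0,a1) rather than the maximum of all four lanes; the added second round finishes the reduction. As a scalar sketch of the intended semantics (hypothetical reference helper, for illustration only):

  // reference semantics of the fixed 4-lane max reduction
  int predux_max_ref(const int a[4]) {
    int m01 = a[0] > a[1] ? a[0] : a[1];  // first vpmax: pairs (a0,a1)...
    int m23 = a[2] > a[3] ? a[2] : a[3];  // ...and (a2,a3)
    return m01 > m23 ? m01 : m23;         // second vpmax combines the pairs
  }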
--- Eigen/src/Core/arch/NEON/PacketMath.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 6a6876e69..6426623cf 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -422,6 +422,7 @@ template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) a_lo = vget_low_s32(a); a_hi = vget_high_s32(a); max = vpmax_s32(a_lo, a_hi); + max = vpmax_s32(max, max); return vget_lane_s32(max, 0); } From 8d85ce88e129d794d0700dd2c8eec2713449e54d Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 24 Apr 2014 05:47:53 -0700 Subject: [PATCH 103/158] Implement ptranspose on altivec and fix pgather/pscatter --- Eigen/src/Core/arch/AltiVec/Complex.h | 15 ++++++++++--- Eigen/src/Core/arch/AltiVec/PacketMath.h | 27 ++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h index ec11cfaa0..ee1f008b1 100644 --- a/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/Eigen/src/Core/arch/AltiVec/Complex.h @@ -21,6 +21,8 @@ static Packet16uc p16uc_COMPLEX_REV = vec_sld(p16uc_REVERSE, p16uc_REVERSE, 8); static Packet16uc p16uc_COMPLEX_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8);//{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; static Packet16uc p16uc_PSET_HI = (Packet16uc) vec_mergeh((Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 1));//{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 }; static Packet16uc p16uc_PSET_LO = (Packet16uc) vec_mergeh((Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 2), (Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 3));//{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 }; +static Packet16uc p16uc_COMPLEX_TRANSPOSE_0 = { 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23}; +static Packet16uc p16uc_COMPLEX_TRANSPOSE_1 = { 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31}; //---------- float ---------- struct Packet2cf @@ -52,7 +54,7 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=2}; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=2}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) { @@ -71,12 +73,12 @@ template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packe std::complex EIGEN_ALIGN16 af[2]; af[0] = from[0*stride]; af[1] = from[1*stride]; - return Packet2cf(vec_ld(0, af)); + return Packet2cf(vec_ld(0, (const float*)af)); } template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, int stride) { std::complex EIGEN_ALIGN16 af[2]; - vec_st(from.v, 0, af); + vec_st(from.v, 0, (float*)af); to[0*stride] = af[0]; to[1*stride] = af[1]; } @@ -227,6 +229,13 @@ template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip(const Packet2cf& x return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX_REV)); } +template<> EIGEN_STRONG_INLINE void ptranspose(Kernel& kernel) +{ + Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_COMPLEX_TRANSPOSE_0); + kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_COMPLEX_TRANSPOSE_1); + kernel.packet[0].v = tmp; +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 80a99a004..618d95d85 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ 
b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -146,6 +146,7 @@ inline std::ostream & operator <<(std::ostream & s, const Packetbi & v) return s; } */ + template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html float EIGEN_ALIGN16 af[4]; @@ -533,6 +534,32 @@ struct palign_impl } }; +template<> EIGEN_DEVICE_FUNC inline void +ptranspose(Kernel& kernel) { + Packet4f t0, t1, t2, t3; + t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); + t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); + t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); + t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); + kernel.packet[0] = vec_mergeh(t0, t2); + kernel.packet[1] = vec_mergel(t0, t2); + kernel.packet[2] = vec_mergeh(t1, t3); + kernel.packet[3] = vec_mergel(t1, t3); +} + +template<> EIGEN_DEVICE_FUNC inline void +ptranspose(Kernel& kernel) { + Packet4i t0, t1, t2, t3; + t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); + t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); + t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); + t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); + kernel.packet[0] = vec_mergeh(t0, t2); + kernel.packet[1] = vec_mergel(t0, t2); + kernel.packet[2] = vec_mergeh(t1, t3); + kernel.packet[3] = vec_mergel(t1, t3); +} + } // end namespace internal } // end namespace Eigen From b0e19db1cf462a07e25429d4f04f7d8e858f670f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 24 Apr 2014 23:17:18 +0200 Subject: [PATCH 104/158] Enable fused madd for Altivec --- Eigen/src/Core/arch/AVX/PacketMath.h | 6 ++++++ Eigen/src/Core/arch/AltiVec/PacketMath.h | 4 ++++ Eigen/src/Core/arch/SSE/PacketMath.h | 6 ++++++ Eigen/src/Core/products/GeneralBlockPanelKernel.h | 10 +++++----- 4 files changed, 21 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index a8b94e191..e6f540430 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -22,6 +22,12 @@ namespace internal { #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) #endif +#ifdef EIGEN_VECTORIZE_FMA +#ifndef EIGEN_HAS_FUSED_MADD +#define EIGEN_HAS_FUSED_MADD 1 +#endif +#endif + typedef __m256 Packet8f; typedef __m256i Packet8i; typedef __m256d Packet4d; diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 618d95d85..e26d88382 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -18,6 +18,10 @@ namespace internal { #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4 #endif +#ifndef EIGEN_HAS_FUSED_MADD +#define EIGEN_HAS_FUSED_MADD 1 +#endif + #ifndef EIGEN_HAS_FUSE_CJMADD #define EIGEN_HAS_FUSE_CJMADD 1 #endif diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index e7e035f4e..4f9d8c4fd 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -22,6 +22,12 @@ namespace internal { #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) #endif +#ifdef EIGEN_VECTORIZE_FMA +#ifndef EIGEN_HAS_FUSED_MADD +#define EIGEN_HAS_FUSED_MADD 1 +#endif +#endif + #if defined EIGEN_VECTORIZE_AVX && defined __GNUC__ && !(defined __clang__ || defined __INTEL_COMPILER) // With GCC's default ABI version, a __m128 or __m256 are the same types and therefore we cannot // have overloads for both types without linking error. 
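For background, a standalone sketch of the linking problem the wrapper comment above refers to (hypothetical function f, assuming GCC's default -fabi-version): overloads taken directly on the raw __m128/__m256 types can mangle to the same symbol, whereas wrapping each intrinsic type in its own struct guarantees distinct symbols:

  #include <immintrin.h>

  struct Wrapped128 { __m128 v; };   // distinct class types always
  struct Wrapped256 { __m256 v; };   // mangle differently

  // void f(__m128 a);               // with the default ABI these two can
  // void f(__m256 a);               // collapse to one symbol -> link error
  void f(Wrapped128 a) {}            // unambiguous
  void f(Wrapped256 a) {}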
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index b1ed9264a..60251f624 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -188,7 +188,7 @@ public: nr = 4, // register block size along the M direction (currently, this one cannot be modified) -#ifdef __FMA__ +#ifdef EIGEN_HAS_FUSED_MADD // we assume 16 registers mr = 3*LhsPacketSize, #else @@ -254,7 +254,7 @@ public: // let gcc allocate the register in which to store the result of the pmul // (in the case where there is no FMA) gcc fails to figure out how to avoid // spilling register. -#ifdef EIGEN_VECTORIZE_FMA +#ifdef EIGEN_HAS_FUSED_MADD EIGEN_UNUSED_VARIABLE(tmp); c = pmadd(a,b,c); #else @@ -296,7 +296,7 @@ public: NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, nr = 4, -#ifdef __FMA__ +#ifdef EIGEN_HAS_FUSED_MADD // we assume 16 registers mr = 3*LhsPacketSize, #else @@ -359,7 +359,7 @@ public: EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const { -#ifdef EIGEN_VECTORIZE_FMA +#ifdef EIGEN_HAS_FUSED_MADD EIGEN_UNUSED_VARIABLE(tmp); c.v = pmadd(a.v,b,c.v); #else @@ -635,7 +635,7 @@ public: EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const { -#ifdef EIGEN_VECTORIZE_FMA +#ifdef EIGEN_HAS_FUSED_MADD EIGEN_UNUSED_VARIABLE(tmp); c.v = pmadd(a,b.v,c.v); #else From 3d8d0f6269478a06f4fcbd4b838c8e9b9d7e9d62 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 25 Apr 2014 10:56:18 +0200 Subject: [PATCH 105/158] Enable vectorization of pack_rhs with a column-major RHS. Rename and generalize Kernel<*> to PacketBlock<*,N>. --- Eigen/src/Core/GenericPacketMath.h | 10 +++---- Eigen/src/Core/arch/AVX/Complex.h | 8 +++--- Eigen/src/Core/arch/AVX/PacketMath.h | 26 ++++++++++++++++--- Eigen/src/Core/arch/AltiVec/Complex.h | 2 +- Eigen/src/Core/arch/AltiVec/PacketMath.h | 4 +-- Eigen/src/Core/arch/NEON/Complex.h | 2 +- Eigen/src/Core/arch/NEON/PacketMath.h | 4 +-- Eigen/src/Core/arch/SSE/Complex.h | 4 +-- Eigen/src/Core/arch/SSE/PacketMath.h | 12 ++++----- .../Core/products/GeneralBlockPanelKernel.h | 25 ++++++++++-------- test/packetmath.cpp | 10 +++---- 11 files changed, 64 insertions(+), 43 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 147298009..98313c68f 100755 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -417,15 +417,15 @@ template<> inline std::complex pmul(const std::complex& a, const /*************************************************************************** - * Kernel, that is a collection of N packets where N is the number of words - * in the packet. + * PacketBlock, that is a collection of N packets where the number of words + * in the packet is a multiple of N. ***************************************************************************/ -template struct Kernel { - Packet packet[unpacket_traits::size]; +template ::size> struct PacketBlock { + Packet packet[N]; }; template EIGEN_DEVICE_FUNC inline void -ptranspose(Kernel& /*kernel*/) { +ptranspose(PacketBlock& /*kernel*/) { // Nothing to do in the scalar case, i.e. a 1x1 matrix. 
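  // (A hypothetical usage sketch, for illustration only: in the vectorized
  //  case a PacketBlock<Packet,N> groups N packets, and ptranspose transposes
  //  the square tile they hold in registers, e.g. for a 4x4 float tile:
  //
  //    PacketBlock<Packet4f,4> block;
  //    for (int r = 0; r < 4; ++r)
  //      block.packet[r] = pload<Packet4f>(mat + 4*r);  // row r of the tile
  //    ptranspose(block);  // block.packet[c] now holds column c
  // )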
} diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 18be2f225..9ced85132 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -431,8 +431,8 @@ template<> EIGEN_STRONG_INLINE Packet2cd pcplxflip(const Packet2cd& x return Packet2cd(_mm256_shuffle_pd(x.v, x.v, 0x5)); } -template<> EIGEN_DEVICE_FUNC inline void -ptranspose(Kernel& kernel) { +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { __m256d P0 = _mm256_castps_pd(kernel.packet[0].v); __m256d P1 = _mm256_castps_pd(kernel.packet[1].v); __m256d P2 = _mm256_castps_pd(kernel.packet[2].v); @@ -449,8 +449,8 @@ ptranspose(Kernel& kernel) { kernel.packet[2].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 49)); } -template<> EIGEN_DEVICE_FUNC inline void -ptranspose(Kernel& kernel) { +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { __m256d tmp = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 0+(2<<4)); kernel.packet[1].v = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 1+(3<<4)); kernel.packet[0].v = tmp; diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index e6f540430..8b8307d75 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -498,8 +498,8 @@ struct palign_impl } }; -template<> EIGEN_DEVICE_FUNC inline void -ptranspose(Kernel& kernel) { +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]); __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]); __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]); @@ -526,8 +526,26 @@ ptranspose(Kernel& kernel) { kernel.packet[7] = _mm256_permute2f128_ps(S3, S7, 0x31); } -template<> EIGEN_DEVICE_FUNC inline void -ptranspose(Kernel& kernel) { +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]); + __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]); + __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]); + __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]); + + __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0)); + __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2)); + __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0)); + __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2)); + + kernel.packet[0] = _mm256_permute2f128_ps(S0, S1, 0x20); + kernel.packet[1] = _mm256_permute2f128_ps(S2, S3, 0x20); + kernel.packet[2] = _mm256_permute2f128_ps(S0, S1, 0x31); + kernel.packet[3] = _mm256_permute2f128_ps(S2, S3, 0x31); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { __m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15); __m256d T1 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 0); __m256d T2 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 15); diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h index ee1f008b1..8fdffad5e 100644 --- a/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/Eigen/src/Core/arch/AltiVec/Complex.h @@ -229,7 +229,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip(const Packet2cf& x return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX_REV)); } -template<> EIGEN_STRONG_INLINE void ptranspose(Kernel& kernel) +template<> EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { Packet4f tmp = vec_perm(kernel.packet[0].v, 
kernel.packet[1].v, p16uc_COMPLEX_TRANSPOSE_0); kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_COMPLEX_TRANSPOSE_1); diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index e26d88382..8a67354e4 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -539,7 +539,7 @@ struct palign_impl }; template<> EIGEN_DEVICE_FUNC inline void -ptranspose(Kernel& kernel) { +ptranspose(PacketBlock& kernel) { Packet4f t0, t1, t2, t3; t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); @@ -552,7 +552,7 @@ ptranspose(Kernel& kernel) { } template<> EIGEN_DEVICE_FUNC inline void -ptranspose(Kernel& kernel) { +ptranspose(PacketBlock& kernel) { Packet4i t0, t1, t2, t3; t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index a668382e2..3c60735a9 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -264,7 +264,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, con } template<> EIGEN_DEVICE_FUNC inline void -ptranspose(Kernel& kernel) { +ptranspose(PacketBlock& kernel) { float32x4_t tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v)); kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v), vget_low_f32(kernel.packet[1].v)); kernel.packet[1].v = tmp; diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 6426623cf..37fb840d4 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -452,7 +452,7 @@ PALIGN_NEON(3,Packet4i,vextq_s32) #undef PALIGN_NEON template<> EIGEN_DEVICE_FUNC inline void -ptranspose(Kernel& kernel) { +ptranspose(PacketBlock& kernel) { float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]); float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]); @@ -463,7 +463,7 @@ ptranspose(Kernel& kernel) { } template<> EIGEN_DEVICE_FUNC inline void -ptranspose(Kernel& kernel) { +ptranspose(PacketBlock& kernel) { int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]); int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]); kernel.packet[0] = vcombine_s32(vget_low_s32(tmp1.val[0]), vget_low_s32(tmp2.val[0])); diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index d0c080c4f..758183c18 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -462,8 +462,8 @@ EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) return Packet1cd(preverse(Packet2d(x.v))); } -template<> EIGEN_DEVICE_FUNC inline void -ptranspose(Kernel& kernel) { +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { __m128d w1 = _mm_castps_pd(kernel.packet[0].v); __m128d w2 = _mm_castps_pd(kernel.packet[1].v); diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 4f9d8c4fd..ad935d5f1 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -784,20 +784,20 @@ struct palign_impl }; #endif -template<> EIGEN_DEVICE_FUNC inline void -ptranspose(Kernel& kernel) { +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]); } -template<> EIGEN_DEVICE_FUNC 
inline void -ptranspose(Kernel& kernel) { +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]); kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]); kernel.packet[1] = tmp; } -template<> EIGEN_DEVICE_FUNC inline void -ptranspose(Kernel& kernel) { +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]); __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]); __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]); diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 60251f624..0a94f25e4 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1585,7 +1585,7 @@ EIGEN_DONT_INLINE void gemm_pack_lhs kernel; + PacketBlock kernel; for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = ploadu(&lhs(i+p+m, k)); ptranspose(kernel); for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); @@ -1675,7 +1675,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs kernel; +// PacketBlock kernel; // for (int p = 0; p < PacketSize; ++p) { // kernel.packet[p] = ploadu(&rhs[(j2+p)*rhsStride+k]); // } @@ -1713,19 +1713,22 @@ EIGEN_DONT_INLINE void gemm_pack_rhs kernel; - for (int p = 0; p < PacketSize; ++p) { - kernel.packet[p] = ploadu(&rhs[(j2+p)*rhsStride+k]); - } + PacketBlock kernel; + kernel.packet[0] = ploadu(&b0[k]); + kernel.packet[1] = ploadu(&b1[k]); + kernel.packet[2] = ploadu(&b2[k]); + kernel.packet[3] = ploadu(&b3[k]); ptranspose(kernel); - for (int p = 0; p < PacketSize; ++p) { - pstoreu(blockB+count, cj.pconj(kernel.packet[p])); - count+=PacketSize; - } + pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0])); + pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1])); + pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2])); + pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3])); + count+=4*PacketSize; } } for(; k void packetmath() internal::pstore(data2, internal::preverse(internal::pload(data1))); VERIFY(areApprox(ref, data2, PacketSize) && "internal::preverse"); - internal::Kernel kernel; + internal::PacketBlock kernel; for (int i=0; i(data1+i*PacketSize); } @@ -236,7 +236,7 @@ template void packetmath() for (int i=0; i void packetmath_scatter_gather() { for (int i = 0; i < PacketSize*11; ++i) { if ((i%11) == 0) { - VERIFY(isApproxAbs(buffer[i], data1[i/11], refvalue)); + VERIFY(isApproxAbs(buffer[i], data1[i/11], refvalue) && "pscatter"); } else { - VERIFY(isApproxAbs(buffer[i], Scalar(0), refvalue)); + VERIFY(isApproxAbs(buffer[i], Scalar(0), refvalue) && "pscatter"); } } @@ -405,7 +405,7 @@ template void packetmath_scatter_gather() { packet = internal::pgather(buffer, 7); internal::pstore(data1, packet); for (int i = 0; i < PacketSize; ++i) { - VERIFY(isApproxAbs(data1[i], buffer[i*7], refvalue)); + VERIFY(isApproxAbs(data1[i], buffer[i*7], refvalue) && "pgather"); } } From cf7eaed38db25c5205f6aabfd4e96e774fd98af5 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 25 Apr 2014 11:04:02 +0200 Subject: [PATCH 106/158] Avoid blocking-size mismatch in unit tests calling Eigen's blas interface. 
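The mismatch arises because test/main.h shrinks the product blocking sizes via EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS, while an Eigen blas library compiled separately uses the defaults; the new macro lets the affected tests opt out. In a test translation unit the pattern is simply (sketch):

  // opt out of the debug blocking sizes before pulling in the test harness
  #define EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS
  #include "main.h"   // main.h then skips EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS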
--- test/cholmod_support.cpp | 1 + test/main.h | 4 ++++ test/pastix_support.cpp | 2 ++ test/spqr_support.cpp | 2 ++ test/superlu_support.cpp | 1 + test/umfpack_support.cpp | 1 + 6 files changed, 11 insertions(+) diff --git a/test/cholmod_support.cpp b/test/cholmod_support.cpp index 8f8be3c0e..87f119b1e 100644 --- a/test/cholmod_support.cpp +++ b/test/cholmod_support.cpp @@ -7,6 +7,7 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#define EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS #include "sparse_solver.h" #include diff --git a/test/main.h b/test/main.h index fcac0e3ab..3ccc2ae88 100644 --- a/test/main.h +++ b/test/main.h @@ -31,7 +31,11 @@ // B0 is defined in POSIX header termios.h #define B0 FORBIDDEN_IDENTIFIER +// Unit tests calling Eigen's blas library must preserve the default blocking size +// to avoid troubles. +#ifndef EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS #define EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS +#endif // shuts down ICC's remark #593: variable "XXX" was set but never used #define TEST_SET_BUT_UNUSED_VARIABLE(X) X = X + 0; diff --git a/test/pastix_support.cpp b/test/pastix_support.cpp index 14da0944b..49239e3a5 100644 --- a/test/pastix_support.cpp +++ b/test/pastix_support.cpp @@ -7,6 +7,8 @@ // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS #include "sparse_solver.h" #include #include diff --git a/test/spqr_support.cpp b/test/spqr_support.cpp index b8980e081..901c42c40 100644 --- a/test/spqr_support.cpp +++ b/test/spqr_support.cpp @@ -5,6 +5,8 @@ // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed + +#define EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS #include "sparse.h" #include diff --git a/test/superlu_support.cpp b/test/superlu_support.cpp index 3b16135bc..98a7bc5c8 100644 --- a/test/superlu_support.cpp +++ b/test/superlu_support.cpp @@ -7,6 +7,7 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#define EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS #include "sparse_solver.h" #include diff --git a/test/umfpack_support.cpp b/test/umfpack_support.cpp index 9eb84c14b..37ab11f0b 100644 --- a/test/umfpack_support.cpp +++ b/test/umfpack_support.cpp @@ -7,6 +7,7 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+#define EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS #include "sparse_solver.h" #include From c79bd4b64bcc7c66d2b53a9a668a3310a3ae2998 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 25 Apr 2014 11:06:03 +0200 Subject: [PATCH 107/158] Minor optimizations in product kernel: - use pbroadcast4 (helpful when AVX is not available) - process all remaining rows at once (significant speedup for small matrices) --- .../Core/products/GeneralBlockPanelKernel.h | 166 ++++++++---------- 1 file changed, 75 insertions(+), 91 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 0a94f25e4..e76c12c39 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -214,11 +214,11 @@ public: p = pset1(ResScalar(0)); } -// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) -// { -// pbroadcast4(b, b0, b1, b2, b3); -// } -// + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + { + pbroadcast4(b, b0, b1, b2, b3); + } + // EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) // { // pbroadcast2(b, b0, b1); @@ -342,11 +342,11 @@ public: dest = ploadu(a); } -// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) -// { -// pbroadcast4(b, b0, b1, b2, b3); -// } -// + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + { + pbroadcast4(b, b0, b1, b2, b3); + } + // EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) // { // pbroadcast2(b, b0, b1); @@ -713,11 +713,9 @@ void gebp_kernel const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0; enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell) const Index peeled_kc = depth & ~(pk-1); + const Index prefetch_res_offset = 32/sizeof(ResScalar); // const Index depth2 = depth & ~1; -// std::cout << mr << " " << peeled_mc3 << " " << peeled_mc2 << " " << peeled_mc1 << "\n"; - - //---------- Process 3 * LhsProgress rows at once ---------- // This corresponds to 3*LhsProgress x nr register blocks. 
// Usually, make sense only with FMA @@ -736,19 +734,12 @@ void gebp_kernel prefetch(&blA[0]); // gets res block as register - AccPacket C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11; - traits.initAcc(C0); - traits.initAcc(C1); - traits.initAcc(C2); - traits.initAcc(C3); - traits.initAcc(C4); - traits.initAcc(C5); - traits.initAcc(C6); - traits.initAcc(C7); - traits.initAcc(C8); - traits.initAcc(C9); - traits.initAcc(C10); - traits.initAcc(C11); + AccPacket C0, C1, C2, C3, + C4, C5, C6, C7, + C8, C9, C10, C11; + traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3); + traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7); + traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11); ResScalar* r0 = &res[(j2+0)*resStride + i]; ResScalar* r1 = &res[(j2+1)*resStride + i]; @@ -767,7 +758,6 @@ void gebp_kernel for(Index k=0; k prefetch(&blA[0]); // gets res block as register - AccPacket C0, C1, C2, C3, C4, C5, C6, C7; - traits.initAcc(C0); - traits.initAcc(C1); - traits.initAcc(C2); - traits.initAcc(C3); - traits.initAcc(C4); - traits.initAcc(C5); - traits.initAcc(C6); - traits.initAcc(C7); + AccPacket C0, C1, C2, C3, + C4, C5, C6, C7; + traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3); + traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7); ResScalar* r0 = &res[(j2+0)*resStride + i]; ResScalar* r1 = &res[(j2+1)*resStride + i]; ResScalar* r2 = &res[(j2+2)*resStride + i]; ResScalar* r3 = &res[(j2+3)*resStride + i]; - internal::prefetch(r0); - internal::prefetch(r1); - internal::prefetch(r2); - internal::prefetch(r3); + internal::prefetch(r0+prefetch_res_offset); + internal::prefetch(r1+prefetch_res_offset); + internal::prefetch(r2+prefetch_res_offset); + internal::prefetch(r3+prefetch_res_offset); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; @@ -977,26 +962,22 @@ void gebp_kernel for(Index k=0; k blB += pk*4*RhsProgress; blA += pk*(2*Traits::LhsProgress); - IACA_END } // process remaining peeled loop for(Index k=peeled_kc; k traits.initAcc(C4); ResScalar* r0 = &res[(j2+0)*resStride + i]; + internal::prefetch(r0+prefetch_res_offset); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB]; @@ -1079,7 +1060,7 @@ void gebp_kernel traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \ traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \ traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \ - traits.madd(A0, B_0, C0, B1); \ + traits.madd(A0, B_0, C0, B1); \ traits.madd(A1, B_0, C4, B_0) EIGEN_GEBGP_ONESTEP(0); @@ -1143,10 +1124,10 @@ void gebp_kernel ResScalar* r2 = &res[(j2+2)*resStride + i]; ResScalar* r3 = &res[(j2+3)*resStride + i]; - internal::prefetch(r0); - internal::prefetch(r1); - internal::prefetch(r2); - internal::prefetch(r3); + internal::prefetch(r0+prefetch_res_offset); + internal::prefetch(r1+prefetch_res_offset); + internal::prefetch(r2+prefetch_res_offset); + internal::prefetch(r3+prefetch_res_offset); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; @@ -1155,19 +1136,18 @@ void gebp_kernel for(Index k=0; k blB += pk*4*RhsProgress; blA += pk*1*LhsProgress; - IACA_END } // process remaining peeled loop for(Index k=peeled_kc; k } //---------- Process remaining rows, 1 at once ---------- { - // loop on each row of the lhs (1*LhsProgress x depth) - for(Index i=peeled_mc1; i Index k=0; for(; k res[(j2+3)*resStride + i] += alpha*C3; } } - // remaining columns - for(Index 
j2=packet_cols4; j2 Date: Fri, 25 Apr 2014 11:15:13 +0200 Subject: [PATCH 108/158] Fix ptranspose overload prototypes for NEON --- Eigen/src/Core/arch/NEON/Complex.h | 2 +- Eigen/src/Core/arch/NEON/PacketMath.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index 3c60735a9..259f2e7b8 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -263,7 +263,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, con return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s))); } -template<> EIGEN_DEVICE_FUNC inline void +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { float32x4_t tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v)); kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v), vget_low_f32(kernel.packet[1].v)); diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 37fb840d4..e5eb06f36 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -451,7 +451,7 @@ PALIGN_NEON(3,Packet4i,vextq_s32) #undef PALIGN_NEON -template<> EIGEN_DEVICE_FUNC inline void +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]); float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]); @@ -462,7 +462,7 @@ ptranspose(PacketBlock& kernel) { kernel.packet[3] = vcombine_f32(vget_high_f32(tmp1.val[1]), vget_high_f32(tmp2.val[1])); } -template<> EIGEN_DEVICE_FUNC inline void +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]); int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]); From ae4d9434e23e62d9403e570c20aeb3b8b44a2dd3 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 25 Apr 2014 11:21:18 +0200 Subject: [PATCH 109/158] Add unit test for pbroadcast4/2 --- test/packetmath.cpp | 24 ++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/test/packetmath.cpp b/test/packetmath.cpp index b4815629e..eb2cf7ebe 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -171,6 +171,30 @@ template void packetmath() VERIFY(areApprox(ref, data2, PacketSize) && "internal::pset1"); } + for(int offset=0;offset<3;++offset) + { + for (int i=0; i(&data1[offset], A0, A1, A2, A3); + internal::pstore(data2+0*PacketSize, A0); + internal::pstore(data2+1*PacketSize, A1); + internal::pstore(data2+2*PacketSize, A2); + internal::pstore(data2+3*PacketSize, A3); + VERIFY(areApprox(ref, data2, 4*PacketSize) && "internal::pbroadcast4"); + } + + for(int offset=0;offset<3;++offset) + { + for (int i=0; i(&data1[offset], A0, A1); + internal::pstore(data2+0*PacketSize, A0); + internal::pstore(data2+1*PacketSize, A1); + VERIFY(areApprox(ref, data2, 2*PacketSize) && "internal::pbroadcast2"); + } + VERIFY(internal::isApprox(data1[0], internal::pfirst(internal::pload(data1))) && "internal::pfirst"); if(PacketSize>1) From c9788d55b910c4bd8fb6cdc7978695f6b5d97e4c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 25 Apr 2014 11:48:22 +0200 Subject: [PATCH 110/158] Disable 3pX4 kernel on Altivec: although this platform has 32 registers, this version seems significantly slower.
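For orientation, "3pX4" denotes the register tile of three lhs packets by four rhs columns, i.e. the twelve accumulators C0..C11 kept live across the depth loop in the kernel above; the 2pX4 path keeps only eight (C0..C7). A simplified scalar sketch of that blocking (packets replaced by plain floats, illustration only):

  // 3x4 register-tile micro-kernel, scalar stand-in for the packet version
  void tile3x4(const float* A, const float* B, float C[3][4], int depth) {
    float acc[3][4] = {};                    // 12 accumulators in registers
    for (int k = 0; k < depth; ++k)
      for (int i = 0; i < 3; ++i)            // 3 lhs "packets"
        for (int j = 0; j < 4; ++j)          // 4 rhs columns
          acc[i][j] += A[3*k+i] * B[4*k+j];  // one (fused) madd each
    for (int i = 0; i < 3; ++i)
      for (int j = 0; j < 4; ++j)
        C[i][j] = acc[i][j];
  }

On Altivec the smaller 2pX4 tile apparently runs faster despite the larger register file, hence the EIGEN_VECTORIZE_ALTIVEC exclusion below.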
--- .../Core/products/GeneralBlockPanelKernel.h | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index e76c12c39..41c46c67a 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -188,7 +188,7 @@ public: nr = 4, // register block size along the M direction (currently, this one cannot be modified) -#ifdef EIGEN_HAS_FUSED_MADD +#if defined(EIGEN_HAS_FUSED_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) // we assume 16 registers mr = 3*LhsPacketSize, #else @@ -296,7 +296,7 @@ public: NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, nr = 4, -#ifdef EIGEN_HAS_FUSED_MADD +#if defined(EIGEN_HAS_FUSED_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) // we assume 16 registers mr = 3*LhsPacketSize, #else @@ -759,29 +759,29 @@ void gebp_kernel for(Index k=0; k // process remaining peeled loop for(Index k=peeled_kc; k Date: Fri, 25 Apr 2014 02:46:22 -0700 Subject: [PATCH 111/158] pbroadcast4/2 assume aligned memory --- test/packetmath.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/test/packetmath.cpp b/test/packetmath.cpp index eb2cf7ebe..9dab07522 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -171,12 +171,11 @@ template void packetmath() VERIFY(areApprox(ref, data2, PacketSize) && "internal::pset1"); } - for(int offset=0;offset<3;++offset) { for (int i=0; i(&data1[offset], A0, A1, A2, A3); + internal::pbroadcast4(data1, A0, A1, A2, A3); internal::pstore(data2+0*PacketSize, A0); internal::pstore(data2+1*PacketSize, A1); internal::pstore(data2+2*PacketSize, A2); @@ -184,12 +183,11 @@ template void packetmath() VERIFY(areApprox(ref, data2, 4*PacketSize) && "internal::pbroadcast4"); } - for(int offset=0;offset<3;++offset) { for (int i=0; i(&data1[offset], A0, A1); + internal::pbroadcast2(data1, A0, A1); internal::pstore(data2+0*PacketSize, A0); internal::pstore(data2+1*PacketSize, A1); VERIFY(areApprox(ref, data2, 2*PacketSize) && "internal::pbroadcast2"); From 2dbfd83424cd0d30dac3b42b27b970b44a4e4541 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 25 Apr 2014 02:46:57 -0700 Subject: [PATCH 112/158] Implement pbroadcast4 on altivec --- Eigen/src/Core/arch/AltiVec/Complex.h | 2 +- Eigen/src/Core/arch/AltiVec/PacketMath.h | 26 ++++++++++++++++++++++-- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h index 8fdffad5e..5409ddedd 100644 --- a/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/Eigen/src/Core/arch/AltiVec/Complex.h @@ -229,7 +229,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip(const Packet2cf& x return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX_REV)); } -template<> EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_COMPLEX_TRANSPOSE_0); kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_COMPLEX_TRANSPOSE_1); diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 8a67354e4..0e9adf450 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -168,6 +168,28 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return vc; } + +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const 
float *a, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) +{ + a3 = vec_ld(0,a); + a0 = vec_splat(a3, 0); + a1 = vec_splat(a3, 1); + a2 = vec_splat(a3, 2); + a3 = vec_splat(a3, 3); +} +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const int *a, + Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) +{ + a3 = vec_ld(0,a); + a0 = vec_splat(a3, 0); + a1 = vec_splat(a3, 1); + a2 = vec_splat(a3, 2); + a3 = vec_splat(a3, 3); +} + template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, int stride) { float EIGEN_ALIGN16 af[4]; @@ -538,7 +560,7 @@ struct palign_impl } }; -template<> EIGEN_DEVICE_FUNC inline void +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { Packet4f t0, t1, t2, t3; t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); @@ -551,7 +573,7 @@ ptranspose(PacketBlock& kernel) { kernel.packet[3] = vec_mergel(t1, t3); } -template<> EIGEN_DEVICE_FUNC inline void +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { Packet4i t0, t1, t2, t3; t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); From c20e3641de5b6d56f5496fef2619a1f53f8a1835 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 25 Apr 2014 13:22:34 +0200 Subject: [PATCH 113/158] Fix for mixed products --- .../Core/products/GeneralBlockPanelKernel.h | 53 +++++++++++-------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 41c46c67a..ebf438d57 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -480,8 +480,14 @@ public: loadRhs(b,dest); } - // linking error if instantiated without being optimized out: - void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3); + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + { + // FIXME not sure that's the best way to implement it! + loadRhs(b+0, b0); + loadRhs(b+1, b1); + loadRhs(b+2, b2); + loadRhs(b+3, b3); + } // Vectorized path EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1) @@ -602,9 +608,11 @@ public: dest = pset1(*b); } - // linking error if instantiated without being optimized out: -// void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3); -// + void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + { + pbroadcast4(b, b0, b1, b2, b3); + } + // EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) // { // // FIXME not sure that's the best way to implement it! @@ -1137,19 +1145,16 @@ void gebp_kernel for(Index k=0; k // process remaining peeled loop for(Index k=peeled_kc; k=1*PacketSize ? peeled_mc1 : Pack2>1 ? 
(rows/Pack2)*Pack2 : 0;
+    Index i=0;
+
+    // Pack 3 packets
     if(Pack1>=3*PacketSize)
     {
-      for(Index i=0; i=2*PacketSize)
     {
-      for(Index i=peeled_mc3; i=1*PacketSize)
     {
-      for(Index i=peeled_mc2; i1)
     {
-      for(Index i=peeled_mc1; i Date: Fri, 25 Apr 2014 14:05:54 +0200
Subject: [PATCH 114/158] Fix sizeof unit test

---
 Eigen/src/Core/DenseStorage.h | 4 ++--
 test/sizeof.cpp               | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h
index 94f796783..59f515495 100644
--- a/Eigen/src/Core/DenseStorage.h
+++ b/Eigen/src/Core/DenseStorage.h
@@ -83,7 +83,7 @@ struct plain_array
 template
 struct plain_array
 {
-  EIGEN_USER_ALIGN32 T array[Size];
+  EIGEN_USER_ALIGN_DEFAULT T array[Size];
 
   EIGEN_DEVICE_FUNC
   plain_array()
@@ -102,7 +102,7 @@ struct plain_array
 template
 struct plain_array
 {
-  EIGEN_USER_ALIGN32 T array[1];
+  EIGEN_USER_ALIGN_DEFAULT T array[1];
   EIGEN_DEVICE_FUNC plain_array() {}
   EIGEN_DEVICE_FUNC plain_array(constructor_without_unaligned_array_assert) {}
 };
diff --git a/test/sizeof.cpp b/test/sizeof.cpp
index 7044d2062..7763e51bd 100644
--- a/test/sizeof.cpp
+++ b/test/sizeof.cpp
@@ -13,9 +13,9 @@ template void verifySizeOf(const MatrixType&)
 {
   typedef typename MatrixType::Scalar Scalar;
   if (MatrixType::RowsAtCompileTime!=Dynamic && MatrixType::ColsAtCompileTime!=Dynamic)
-    VERIFY(std::ptrdiff_t(sizeof(MatrixType))==std::ptrdiff_t(sizeof(Scalar))*std::ptrdiff_t(MatrixType::SizeAtCompileTime));
+    VERIFY_IS_EQUAL(std::ptrdiff_t(sizeof(MatrixType)),std::ptrdiff_t(sizeof(Scalar))*std::ptrdiff_t(MatrixType::SizeAtCompileTime));
   else
-    VERIFY(sizeof(MatrixType)==sizeof(Scalar*) + 2 * sizeof(typename MatrixType::Index));
+    VERIFY_IS_EQUAL(sizeof(MatrixType),sizeof(Scalar*) + 2 * sizeof(typename MatrixType::Index));
 }
 
 void test_sizeof()

From f9d2f3903eea91b79a1f0a4ff96d5df544402dcc Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Fri, 25 Apr 2014 16:54:30 +0200
Subject: [PATCH 115/158] Product kernel: skip loop on columns if there are no
 remaining rows

---
 Eigen/src/Core/products/GeneralBlockPanelKernel.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index ebf438d57..cc4a9c485 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -1255,6 +1255,7 @@ void gebp_kernel
       }
     }
     //---------- Process remaining rows, 1 at once ----------
+    if(peeled_mc1 straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
             straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);
-
             straits.loadRhsQuad(blA+0*spk, B_0);
             straits.loadRhsQuad(blA+1*spk, B_1);
             straits.madd(A0,B_0,C0,B_0);
             straits.madd(A1,B_1,C1,B_1);
-
             straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
             straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
             straits.loadRhsQuad(blA+2*spk, B_0);

From 450d0c3de044c9f32fa2f37fee821f6e390df382 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Fri, 25 Apr 2014 22:25:48 +0200
Subject: [PATCH 116/158] Make sure that calls to broadcast4 are 16 bytes
 aligned

---
 Eigen/src/Core/arch/SSE/PacketMath.h             | 6 +++---
 Eigen/src/Core/products/TriangularMatrixMatrix.h | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index ad935d5f1..6912f3bc3 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -486,7 
+486,7 @@ template<> EIGEN_STRONG_INLINE void pbroadcast4(const float *a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) { - a3 = ploadu(a); + a3 = pload(a); a0 = vec4f_swizzle1(a3, 0,0,0,0); a1 = vec4f_swizzle1(a3, 1,1,1,1); a2 = vec4f_swizzle1(a3, 2,2,2,2); @@ -502,10 +502,10 @@ pbroadcast4(const double *a, a2 = _mm_loaddup_pd(a+2); a3 = _mm_loaddup_pd(a+3); #else - a1 = ploadu(a); + a1 = pload(a); a0 = vec2d_swizzle1(a1, 0,0); a1 = vec2d_swizzle1(a1, 1,1); - a3 = ploadu(a+2); + a3 = pload(a+2); a2 = vec2d_swizzle1(a3, 0,0); a3 = vec2d_swizzle1(a3, 1,1); #endif diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index 62575aff4..8088aa691 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -300,6 +300,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix=cols) ? 0 : actual_kc; Scalar* geb = blockB+ts*ts; + geb = geb + internal::first_aligned(geb,EIGEN_ALIGN_BYTES/sizeof(Scalar)); pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, actual_kc, rs); From e7ef26fa44999b054cbf36fb909f9737a111c4fb Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 25 Apr 2014 23:36:22 +0200 Subject: [PATCH 117/158] TRMM: Make sure we have enough memory in rhs block to enforce alignment. --- Eigen/src/Core/products/TriangularMatrixMatrix.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index 8088aa691..db7b27f8e 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -263,7 +263,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix Date: Mon, 28 Apr 2014 14:10:22 +0100 Subject: [PATCH 118/158] Make gdb pretty printer Python3-compatible (bug #800). --- debug/gdb/printers.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/debug/gdb/printers.py b/debug/gdb/printers.py index 86996a4f9..0d67a5f99 100644 --- a/debug/gdb/printers.py +++ b/debug/gdb/printers.py @@ -49,7 +49,7 @@ class EigenMatrixPrinter: regex = re.compile('\<.*\>') m = regex.findall(tag)[0][1:-1] template_params = m.split(',') - template_params = map(lambda x:x.replace(" ", ""), template_params) + template_params = [x.replace(" ", "") for x in template_params] if template_params[1] == '-0x00000000000000001' or template_params[1] == '-0x000000001' or template_params[1] == '-1': self.rows = val['m_storage']['m_rows'] @@ -88,8 +88,11 @@ class EigenMatrixPrinter: def __iter__ (self): return self - + def next(self): + return self.__next__() # Python 2.x compatibility + + def __next__(self): row = self.currentRow col = self.currentCol @@ -151,8 +154,11 @@ class EigenQuaternionPrinter: def __iter__ (self): return self - + def next(self): + return self.__next__() # Python 2.x compatibility + + def __next__(self): element = self.currentElement if self.currentElement >= 4: #there are 4 elements in a quanternion From 2fb64578aa31d35e26c18f8c44e041c5daac9982 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 28 Apr 2014 16:16:29 +0200 Subject: [PATCH 119/158] Add a small benchmark to compare dense solvers for small to large problems. 
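
The benchmark reports the LLT time in milliseconds and every other
decomposition as a percentage of the LLT time, so the columns stay comparable
across the four problem sizes. It can be built and run from the source root
with something along these lines (the exact include paths are an assumption
and may need adjusting):

  g++ -O2 -DNDEBUG -I . bench/dense_solvers.cpp -o dense_solvers
  ./dense_solvers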
--- bench/dense_solvers.cpp | 76 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 bench/dense_solvers.cpp diff --git a/bench/dense_solvers.cpp b/bench/dense_solvers.cpp new file mode 100644 index 000000000..f37a8bb5f --- /dev/null +++ b/bench/dense_solvers.cpp @@ -0,0 +1,76 @@ +#include +#include "BenchTimer.h" +#include +#include +#include +using namespace Eigen; + +std::map > results; + +template +void bench(int id, int size = Size) +{ + typedef Matrix Mat; + Mat A(size,size); + A.setRandom(); + A = A*A.adjoint(); + BenchTimer t_llt, t_ldlt, t_lu, t_fplu, t_qr, t_cpqr, t_fpqr, t_jsvd; + + int tries = 3; + int rep = 1000/size; + if(rep==0) rep = 1; + rep = rep*rep; + + LLT llt(A); + LDLT ldlt(A); + PartialPivLU lu(A); + FullPivLU fplu(A); + HouseholderQR qr(A); + ColPivHouseholderQR cpqr(A); + FullPivHouseholderQR fpqr(A); + JacobiSVD jsvd(A.rows(),A.cols()); + + BENCH(t_llt, tries, rep, llt.compute(A)); + BENCH(t_ldlt, tries, rep, ldlt.compute(A)); + BENCH(t_lu, tries, rep, lu.compute(A)); + BENCH(t_fplu, tries, rep, fplu.compute(A)); + BENCH(t_qr, tries, rep, qr.compute(A)); + BENCH(t_cpqr, tries, rep, cpqr.compute(A)); + BENCH(t_fpqr, tries, rep, fpqr.compute(A)); + if(size<500) // JacobiSVD is really too slow for too large matrices + BENCH(t_jsvd, tries, rep, jsvd.compute(A,ComputeFullU|ComputeFullV)); + + results["LLT"][id] = t_llt.best(); + results["LDLT"][id] = t_ldlt.best(); + results["PartialPivLU"][id] = t_lu.best(); + results["FullPivLU"][id] = t_fplu.best(); + results["HouseholderQR"][id] = t_qr.best(); + results["ColPivHouseholderQR"][id] = t_cpqr.best(); + results["FullPivHouseholderQR"][id] = t_fpqr.best(); + results["JacobiSVD"][id] = size<500 ? t_jsvd.best() : 0; +} + +int main() +{ + const int small = 8; + const int medium = 100; + const int large = 1000; + const int xl = 4000; + + bench(0); + bench(1,medium); + bench(2,large); + bench(3,xl); + + IOFormat fmt(3, 0, " \t", "\n", "", ""); + + std::cout << "solver/size " << small << "\t" << medium << "\t" << large << "\t" << xl << "\n"; + std::cout << "LLT (ms) " << (results["LLT"]/1000.).format(fmt) << "\n"; + std::cout << "LDLT (%) " << (results["LDLT"]/results["LLT"]).format(fmt) << "\n"; + std::cout << "PartialPivLU (%) " << (results["PartialPivLU"]/results["LLT"]).format(fmt) << "\n"; + std::cout << "FullPivLU (%) " << (results["FullPivLU"]/results["LLT"]).format(fmt) << "\n"; + std::cout << "HouseholderQR (%) " << (results["HouseholderQR"]/results["LLT"]).format(fmt) << "\n"; + std::cout << "ColPivHouseholderQR (%) " << (results["ColPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n"; + std::cout << "FullPivHouseholderQR (%) " << (results["FullPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n"; + std::cout << "JacobiSVD (%) " << (results["JacobiSVD"]/results["LLT"]).format(fmt) << "\n"; +} From 07986189b76001c0c29b1583592b00340b3343ba Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 1 May 2014 23:03:54 +0200 Subject: [PATCH 120/158] Fix bug #803: avoid char* to int* conversion --- Eigen/src/Core/util/Memory.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index e1a12aef1..390b60c74 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -767,9 +767,9 @@ namespace internal { #ifdef EIGEN_CPUID -inline bool cpuid_is_vendor(int abcd[4], const char* vendor) +inline bool cpuid_is_vendor(int abcd[4], const int vendor[3]) { - return 
abcd[1]==(reinterpret_cast(vendor))[0] && abcd[3]==(reinterpret_cast(vendor))[1] && abcd[2]==(reinterpret_cast(vendor))[2]; + return abcd[1]==vendor[0] && abcd[3]==vendor[1] && abcd[2]==vendor[2]; } inline void queryCacheSizes_intel_direct(int& l1, int& l2, int& l3) @@ -911,13 +911,16 @@ inline void queryCacheSizes(int& l1, int& l2, int& l3) { #ifdef EIGEN_CPUID int abcd[4]; + const int GenuineIntel[] = {0x756e6547, 0x49656e69, 0x6c65746e}; + const int AuthenticAMD[] = {0x68747541, 0x69746e65, 0x444d4163}; + const int AMDisbetter_[] = {0x69444d41, 0x74656273, 0x21726574}; // "AMDisbetter!" // identify the CPU vendor EIGEN_CPUID(abcd,0x0,0); int max_std_funcs = abcd[1]; - if(cpuid_is_vendor(abcd,"GenuineIntel")) + if(cpuid_is_vendor(abcd,GenuineIntel)) queryCacheSizes_intel(l1,l2,l3,max_std_funcs); - else if(cpuid_is_vendor(abcd,"AuthenticAMD") || cpuid_is_vendor(abcd,"AMDisbetter!")) + else if(cpuid_is_vendor(abcd,AuthenticAMD) || cpuid_is_vendor(abcd,AMDisbetter_)) queryCacheSizes_amd(l1,l2,l3); else // by default let's use Intel's API From d67aa1549b5675bb6089e9df0c299dab7d0bb80d Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 3 May 2014 10:46:11 +0200 Subject: [PATCH 121/158] Add missing add_subdirectory directive --- Eigen/src/Core/arch/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Eigen/src/Core/arch/CMakeLists.txt b/Eigen/src/Core/arch/CMakeLists.txt index 8456dec15..0db8c558d 100644 --- a/Eigen/src/Core/arch/CMakeLists.txt +++ b/Eigen/src/Core/arch/CMakeLists.txt @@ -1,4 +1,5 @@ ADD_SUBDIRECTORY(SSE) ADD_SUBDIRECTORY(AltiVec) ADD_SUBDIRECTORY(NEON) +ADD_SUBDIRECTORY(AVX) ADD_SUBDIRECTORY(Default) From 0b7f95a03f9678b4cbdbb0bb19227e54b6d05718 Mon Sep 17 00:00:00 2001 From: Benjamin Chretien Date: Sat, 3 May 2014 12:41:37 +0200 Subject: [PATCH 122/158] Fix typo in SparseMatrix assert. 
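
For context, this assert guards insert(), which may only be used for
coefficients that do not exist yet; a small illustration (hypothetical 3x3
matrix, not part of the patch):

  Eigen::SparseMatrix<double> m(3,3);
  m.insert(0,0) = 1.0;    // ok: the element did not exist yet
  //m.insert(0,0) = 2.0;  // would trigger this assert
  m.coeffRef(0,0) = 2.0;  // ok: updates (or creates) the coefficient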
--- Eigen/src/SparseCore/SparseMatrix.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index 9ac18bcf6..e0b7494c1 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h @@ -1139,7 +1139,7 @@ EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& Sparse m_data.value(p) = m_data.value(p-1); --p; } - eigen_assert((p<=startId || m_data.index(p-1)!=inner) && "you cannot insert an element that already exist, you must call coeffRef to this end"); + eigen_assert((p<=startId || m_data.index(p-1)!=inner) && "you cannot insert an element that already exists, you must call coeffRef to this end"); m_innerNonZeros[outer]++; From b5e3d76aa50dd4adc63ebb1e20e6693e261aa7dc Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Mon, 5 May 2014 14:22:27 +0200 Subject: [PATCH 123/158] Fixed bug #806: Missing scalar type cast in Quaternion::setFromTwoVectors() --- Eigen/src/Geometry/Quaternion.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Geometry/Quaternion.h b/Eigen/src/Geometry/Quaternion.h index a712692e5..11e5398d4 100644 --- a/Eigen/src/Geometry/Quaternion.h +++ b/Eigen/src/Geometry/Quaternion.h @@ -587,7 +587,7 @@ inline Derived& QuaternionBase::setFromTwoVectors(const MatrixBase::dummy_precision()) { - c = max(c,-1); + c = (max)(c,Scalar(-1)); Matrix m; m << v0.transpose(), v1.transpose(); JacobiSVD > svd(m, ComputeFullV); Vector3 axis = svd.matrixV().col(2); From b4beba72a2d31934d3a2a49401c792f9a8cd49e0 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Mon, 5 May 2014 14:23:52 +0200 Subject: [PATCH 124/158] Fix bug #807: Missing scalar type cast in umeyama() --- Eigen/src/Geometry/Umeyama.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Geometry/Umeyama.h b/Eigen/src/Geometry/Umeyama.h index 345b47e0c..5e20662f8 100644 --- a/Eigen/src/Geometry/Umeyama.h +++ b/Eigen/src/Geometry/Umeyama.h @@ -113,7 +113,7 @@ umeyama(const MatrixBase& src, const MatrixBase& dst, boo const Index n = src.cols(); // number of measurements // required for demeaning ... - const RealScalar one_over_n = 1 / static_cast(n); + const RealScalar one_over_n = RealScalar(1) / static_cast(n); // computation of mean const VectorType src_mean = src.rowwise().sum() * one_over_n; @@ -136,16 +136,16 @@ umeyama(const MatrixBase& src, const MatrixBase& dst, boo // Eq. (39) VectorType S = VectorType::Ones(m); - if (sigma.determinant()<0) S(m-1) = -1; + if (sigma.determinant() 0 ) { + if ( svd.matrixU().determinant() * svd.matrixV().determinant() > Scalar(0) ) { Rt.block(0,0,m,m).noalias() = svd.matrixU()*svd.matrixV().transpose(); } else { - const Scalar s = S(m-1); S(m-1) = -1; + const Scalar s = S(m-1); S(m-1) = Scalar(-1); Rt.block(0,0,m,m).noalias() = svd.matrixU() * S.asDiagonal() * svd.matrixV().transpose(); S(m-1) = s; } @@ -156,7 +156,7 @@ umeyama(const MatrixBase& src, const MatrixBase& dst, boo if (with_scaling) { // Eq. (42) - const Scalar c = 1/src_var * svd.singularValues().dot(S); + const Scalar c = Scalar(1)/src_var * svd.singularValues().dot(S); // Eq. 
(41) Rt.col(m).head(m) = dst_mean; From 56de8d38161f1f190688d9550bff7afcf4e9dc3f Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Mon, 5 May 2014 15:03:29 +0200 Subject: [PATCH 125/158] Fixed unused variable warnings --- test/packetmath.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 9dab07522..a51d31dbd 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -186,7 +186,7 @@ template void packetmath() { for (int i=0; i(data1, A0, A1); internal::pstore(data2+0*PacketSize, A0); internal::pstore(data2+1*PacketSize, A1); From 84cb1d72b82589f99dd41a15485a01c3c70858d9 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Mon, 5 May 2014 15:06:37 +0200 Subject: [PATCH 126/158] Removed IACA-defines This caused redefinition warnings if IACA headers were included from elsewhere. For a clean solution we should define our own EIGEN_IACA_* macros --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index cc4a9c485..de97f2b65 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -10,12 +10,6 @@ #ifndef EIGEN_GENERAL_BLOCK_PANEL_H #define EIGEN_GENERAL_BLOCK_PANEL_H -#ifdef USE_IACA -#include "iacaMarks.h" -#else -#define IACA_START -#define IACA_END -#endif namespace Eigen { From 9217de8bf2308b60a9bd410a09338301dea30134 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Mon, 5 May 2014 15:10:18 +0200 Subject: [PATCH 127/158] Missed to remove IACA_END in previous commit --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 1 - 1 file changed, 1 deletion(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index de97f2b65..7da52c2e8 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -799,7 +799,6 @@ void gebp_kernel blB += pk*4*RhsProgress; blA += pk*3*Traits::LhsProgress; - IACA_END } // process remaining peeled loop for(Index k=peeled_kc; k Date: Tue, 6 May 2014 12:53:18 +0200 Subject: [PATCH 128/158] Disabled unused warnings in Eigen2-tests --- test/eigen2/CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/eigen2/CMakeLists.txt b/test/eigen2/CMakeLists.txt index 84931e037..41a02f4ad 100644 --- a/test/eigen2/CMakeLists.txt +++ b/test/eigen2/CMakeLists.txt @@ -5,6 +5,11 @@ add_dependencies(buildtests eigen2_buildtests) add_definitions("-DEIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API") +# Disable unused warnings for this module +# As EIGEN2 support is deprecated, it is not really worth fixing them +ei_add_cxx_compiler_flag("-Wno-unused-local-typedefs") +ei_add_cxx_compiler_flag("-Wno-unused-but-set-variable") + ei_add_test(eigen2_meta) ei_add_test(eigen2_sizeof) ei_add_test(eigen2_dynalloc) From 881aab14b47edd02ec0a99167fd19652202ec548 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 7 May 2014 13:34:46 -0700 Subject: [PATCH 129/158] Made it possible to call the assignment operator on an Eigen::Block from a CUDA kernel. 
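
A sketch of the kind of device code this enables (hypothetical kernel, for
illustration only; Block inherits its operator= through
EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR, which now carries EIGEN_DEVICE_FUNC):

  __global__ void copy_corner(double* dst, const double* src)
  {
    Eigen::Map<Eigen::Matrix4d> d(dst);
    Eigen::Map<const Eigen::Matrix4d> s(src);
    d.block<2,2>(0,0) = s.block<2,2>(2,2); // needs a device-side operator=
  }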
--- Eigen/src/Core/util/Macros.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index f69fc5ec4..32790fddb 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -326,13 +326,13 @@ namespace Eigen { #elif defined(__clang__) // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653) #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ using Base::operator =; \ - EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) { Base::operator=(other); return *this; } \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) { Base::operator=(other); return *this; } \ template \ - EIGEN_STRONG_INLINE Derived& operator=(const DenseBase& other) { Base::operator=(other.derived()); return *this; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const DenseBase& other) { Base::operator=(other.derived()); return *this; } #else #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ using Base::operator =; \ - EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) \ { \ Base::operator=(other); \ return *this; \ From edebb152758447c34b9e816154a5236478dc5d3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Benjamin=20Chr=C3=A9tien?= Date: Mon, 19 May 2014 18:21:29 +0200 Subject: [PATCH 130/158] PolynomialSolver: add a test to reveal a bug. --- unsupported/test/polynomialsolver.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/unsupported/test/polynomialsolver.cpp b/unsupported/test/polynomialsolver.cpp index 680ff6d31..f62c98de2 100644 --- a/unsupported/test/polynomialsolver.cpp +++ b/unsupported/test/polynomialsolver.cpp @@ -210,5 +210,6 @@ void test_polynomialsolver() CALL_SUBTEST_10((polynomialsolver( internal::random(9,13) )) ); + CALL_SUBTEST_11((polynomialsolver(1)) ); } } From 0f946079471dc5d6d694d892a843dc1c61d6a859 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Benjamin=20Chr=C3=A9tien?= Date: Mon, 19 May 2014 18:34:10 +0200 Subject: [PATCH 131/158] PolynomialSolver: test template constructor in test suite. --- unsupported/test/polynomialsolver.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/unsupported/test/polynomialsolver.cpp b/unsupported/test/polynomialsolver.cpp index f62c98de2..0c87478dd 100644 --- a/unsupported/test/polynomialsolver.cpp +++ b/unsupported/test/polynomialsolver.cpp @@ -38,6 +38,9 @@ bool aux_evalSolver( const POLYNOMIAL& pols, SOLVER& psolve ) const Index deg = pols.size()-1; + // Test template constructor from coefficient vector + SOLVER solve_constr (pols); + psolve.compute( pols ); const RootsType& roots( psolve.roots() ); EvalRootsType evr( deg ); From df92649379d9dded0e807fde758e7394d8c5ca4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Benjamin=20Chr=C3=A9tien?= Date: Mon, 19 May 2014 18:40:29 +0200 Subject: [PATCH 132/158] PolynomialSolver: add missing constructors. 
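
These mirror the constructors of the generic PolynomialSolver, so the
degree-1 specialization can also be built directly from a coefficient vector,
as the new unit test does. A usage sketch with made-up values (assuming the
unsupported Polynomials module is included):

  Eigen::Vector2d coeffs(2.0, 1.0);                 // p(x) = 2 + x
  Eigen::PolynomialSolver<double,1> psolve(coeffs); // uses the added constructor
  // psolve.roots()[0] is -2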
--- unsupported/Eigen/src/Polynomials/PolynomialSolver.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/unsupported/Eigen/src/Polynomials/PolynomialSolver.h b/unsupported/Eigen/src/Polynomials/PolynomialSolver.h index cd5c04bbf..5d00fbeac 100644 --- a/unsupported/Eigen/src/Polynomials/PolynomialSolver.h +++ b/unsupported/Eigen/src/Polynomials/PolynomialSolver.h @@ -380,6 +380,13 @@ class PolynomialSolver<_Scalar,1> : public PolynomialSolverBase<_Scalar,1> m_roots[0] = -poly[0]/poly[poly.size()-1]; } + public: + template< typename OtherPolynomial > + inline PolynomialSolver( const OtherPolynomial& poly ){ + compute( poly ); } + + inline PolynomialSolver(){} + protected: using PS_Base::m_roots; }; From eda79321becc9886e922654210d0e3ae28f64647 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Benjamin=20Chr=C3=A9tien?= Date: Mon, 19 May 2014 19:08:51 +0200 Subject: [PATCH 133/158] PolynomialSolver: fix bugs related to linear polynomials. --- .../Eigen/src/Polynomials/PolynomialSolver.h | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/unsupported/Eigen/src/Polynomials/PolynomialSolver.h b/unsupported/Eigen/src/Polynomials/PolynomialSolver.h index 5d00fbeac..66a91d1a2 100644 --- a/unsupported/Eigen/src/Polynomials/PolynomialSolver.h +++ b/unsupported/Eigen/src/Polynomials/PolynomialSolver.h @@ -41,7 +41,7 @@ class PolynomialSolverBase protected: template< typename OtherPolynomial > inline void setPolynomial( const OtherPolynomial& poly ){ - m_roots.resize(poly.size()); } + m_roots.resize(poly.size()-1); } public: template< typename OtherPolynomial > @@ -345,10 +345,19 @@ class PolynomialSolver : public PolynomialSolverBase<_Scalar,_Deg> void compute( const OtherPolynomial& poly ) { eigen_assert( Scalar(0) != poly[poly.size()-1] ); - internal::companion companion( poly ); - companion.balance(); - m_eigenSolver.compute( companion.denseMatrix() ); - m_roots = m_eigenSolver.eigenvalues(); + eigen_assert( poly.size() > 1 ); + if(poly.size() > 2 ) + { + internal::companion companion( poly ); + companion.balance(); + m_eigenSolver.compute( companion.denseMatrix() ); + m_roots = m_eigenSolver.eigenvalues(); + } + else if(poly.size () == 2) + { + m_roots.resize(1); + m_roots[0] = -poly[0]/poly[1]; + } } public: @@ -376,8 +385,9 @@ class PolynomialSolver<_Scalar,1> : public PolynomialSolverBase<_Scalar,1> template< typename OtherPolynomial > void compute( const OtherPolynomial& poly ) { - eigen_assert( Scalar(0) != poly[poly.size()-1] ); - m_roots[0] = -poly[0]/poly[poly.size()-1]; + eigen_assert( poly.size() == 2 ); + eigen_assert( Scalar(0) != poly[1] ); + m_roots[0] = -poly[0]/poly[1]; } public: From c55c5763fed4c67ede074cae13ce872f2412b2e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Benjamin=20Chr=C3=A9tien?= Date: Mon, 19 May 2014 19:24:02 +0200 Subject: [PATCH 134/158] PolynomialSolver: fix typo. --- unsupported/Eigen/src/Polynomials/PolynomialSolver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/src/Polynomials/PolynomialSolver.h b/unsupported/Eigen/src/Polynomials/PolynomialSolver.h index 66a91d1a2..03198ec8e 100644 --- a/unsupported/Eigen/src/Polynomials/PolynomialSolver.h +++ b/unsupported/Eigen/src/Polynomials/PolynomialSolver.h @@ -316,7 +316,7 @@ class PolynomialSolverBase * - real roots with greatest, smallest absolute real value. * - greatest, smallest real roots. * - * WARNING: this polynomial solver is experimental, part of the unsuported Eigen modules. 
+ * WARNING: this polynomial solver is experimental, part of the unsupported Eigen modules. * * * Currently a QR algorithm is used to compute the eigenvalues of the companion matrix of From aa524604b7ba36c164df3a47a6b6ef12952686b9 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Wed, 21 May 2014 14:08:04 +0000 Subject: [PATCH 135/158] README.md edited online with Bitbucket --- README.md | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 000000000..04fa63fce --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +**Eigen is a C++ template library for linear algebra: matrices, vectors, numerical solvers, and related algorithms.** + +For more information go to http://eigen.tuxfamily.org/. From c794099e69353d82cfb0601c680d15e5848c7bc0 Mon Sep 17 00:00:00 2001 From: Mark Borgerding Date: Thu, 8 May 2014 15:14:12 -0400 Subject: [PATCH 136/158] fixed AsciiQuickReference typo: LinSpace -> LinSpaced (transplanted from e66781905586e3c438031597fae07306d47fea60 ) --- doc/AsciiQuickReference.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/AsciiQuickReference.txt b/doc/AsciiQuickReference.txt index 4c8fe2f47..f62c112cd 100644 --- a/doc/AsciiQuickReference.txt +++ b/doc/AsciiQuickReference.txt @@ -41,8 +41,8 @@ MatrixXd::Ones(rows,cols) // ones(rows,cols) C.setOnes(rows,cols) // C = ones(rows,cols) MatrixXd::Random(rows,cols) // rand(rows,cols)*2-1 // MatrixXd::Random returns uniform random numbers in (-1, 1). C.setRandom(rows,cols) // C = rand(rows,cols)*2-1 -VectorXd::LinSpace(size,low,high) // linspace(low,high,size)' -v.setLinSpace(size,low,high) // v = linspace(low,high,size)' +VectorXd::LinSpaced(size,low,high) // linspace(low,high,size)' +v.setLinSpaced(size,low,high) // v = linspace(low,high,size)' // Matrix slicing and blocks. All expressions listed here are read/write. From e3ab46b8c931b4435fe2283d591023de9e257b1a Mon Sep 17 00:00:00 2001 From: Mark Borgerding Date: Fri, 16 May 2014 13:45:35 -0400 Subject: [PATCH 137/158] AsciiQuickReference: added .real(), .imag() (transplanted from 11462c1a291bdb9c0ac27db25fef364e51632484 ) --- doc/AsciiQuickReference.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/AsciiQuickReference.txt b/doc/AsciiQuickReference.txt index f62c112cd..c4d021624 100644 --- a/doc/AsciiQuickReference.txt +++ b/doc/AsciiQuickReference.txt @@ -168,6 +168,8 @@ x.cross(y) // cross(x, y) Requires #include A.cast(); // double(A) A.cast(); // single(A) A.cast(); // int32(A) +A.real(); // real(A) +A.imag(); // imag(A) // if the original type equals destination type, no work is done // Note that for most operations Eigen requires all operands to have the same type: From eb56461ac2c51374aa7c21a7c62e3c695f4da76b Mon Sep 17 00:00:00 2001 From: Jitse Niesen Date: Sat, 31 May 2014 23:05:18 +0100 Subject: [PATCH 138/158] Fix doc'n of FullPivLU re permutation matrices (bug #815). 
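
In other words, the pivoting computes P A Q = L U for permutation matrices P
and Q, hence A = P^{-1} L U Q^{-1}. A sketch of how the factors can be put
back together through the public accessors (square case, assuming the usual
matrixLU()/permutationP()/permutationQ() API):

  Eigen::MatrixXd A = Eigen::MatrixXd::Random(5,5);
  Eigen::FullPivLU<Eigen::MatrixXd> lu(A);
  Eigen::MatrixXd L = Eigen::MatrixXd::Identity(5,5);
  L.triangularView<Eigen::StrictlyLower>() = lu.matrixLU(); // unit-lower factor
  Eigen::MatrixXd U = lu.matrixLU().triangularView<Eigen::Upper>();
  Eigen::MatrixXd A2 = lu.permutationP().inverse() * L * U
                     * lu.permutationQ().inverse();         // A2 ~ A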
---
 Eigen/src/LU/FullPivLU.h | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h
index 44699b763..971b9da1d 100644
--- a/Eigen/src/LU/FullPivLU.h
+++ b/Eigen/src/LU/FullPivLU.h
@@ -20,10 +20,11 @@ namespace Eigen {
  *
  * \param MatrixType the type of the matrix of which we are computing the LU decomposition
  *
- * This class represents a LU decomposition of any matrix, with complete pivoting: the matrix A
- * is decomposed as A = PLUQ where L is unit-lower-triangular, U is upper-triangular, and P and Q
- * are permutation matrices. This is a rank-revealing LU decomposition. The eigenvalues (diagonal
- * coefficients) of U are sorted in such a way that any zeros are at the end.
+ * This class represents a LU decomposition of any matrix, with complete pivoting: the matrix A is
+ * decomposed as \f$ A = P^{-1} L U Q^{-1} \f$ where L is unit-lower-triangular, U is
+ * upper-triangular, and P and Q are permutation matrices. This is a rank-revealing LU
+ * decomposition. The eigenvalues (diagonal coefficients) of U are sorted in such a way that any
+ * zeros are at the end.
  *
  * This decomposition provides the generic approach to solving systems of linear equations, computing
  * the rank, invertibility, inverse, kernel, and determinant.
@@ -511,8 +512,8 @@ typename internal::traits::Scalar FullPivLU::determinant
 }
 
 /** \returns the matrix represented by the decomposition,
- * i.e., it returns the product: P^{-1} L U Q^{-1}.
- * This function is provided for debug purpose. */
+ * i.e., it returns the product: \f$ P^{-1} L U Q^{-1} \f$.
+ * This function is provided for debug purposes. */
 template
 MatrixType FullPivLU::reconstructedMatrix() const
 {

From 789674809fb039c7137f52bea36c8ac71bcc9b81 Mon Sep 17 00:00:00 2001
From: Jitse Niesen
Date: Mon, 2 Jun 2014 11:42:42 +0100
Subject: [PATCH 139/158] Fix test: EigenSolver on 1x1 matrix with NaN sets
 info to NumericalIssue. This was changed in
 3c66bb136bf2adcb9d73d3d66850a8b907bc9264.

---
 test/eigensolver_generic.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/test/eigensolver_generic.cpp b/test/eigensolver_generic.cpp
index 91383b5cf..92d33f66a 100644
--- a/test/eigensolver_generic.cpp
+++ b/test/eigensolver_generic.cpp
@@ -114,10 +114,9 @@ void test_eigensolver_generic()
   CALL_SUBTEST_2(
     {
       MatrixXd A(1,1);
-      A(0,0) = std::sqrt(-1.);
+      A(0,0) = std::sqrt(-1.); // is Not-a-Number
       Eigen::EigenSolver solver(A);
-      MatrixXd V(1, 1);
-      V(0,0) = solver.eigenvectors()(0,0).real();
+      VERIFY_IS_EQUAL(solver.info(), NumericalIssue);
     }
   );

From 0f1e321dd4a1dec90f25aa248f77dca5e353f394 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Wed, 4 Jun 2014 11:58:01 +0200
Subject: [PATCH 140/158] Fix bug #819: include path of details.h

---
 Eigen/src/StlSupport/StdDeque.h  | 2 +-
 Eigen/src/StlSupport/StdList.h   | 2 +-
 Eigen/src/StlSupport/StdVector.h | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/Eigen/src/StlSupport/StdDeque.h b/Eigen/src/StlSupport/StdDeque.h
index 4ee8e5c10..aaf66330b 100644
--- a/Eigen/src/StlSupport/StdDeque.h
+++ b/Eigen/src/StlSupport/StdDeque.h
@@ -11,7 +11,7 @@
 #ifndef EIGEN_STDDEQUE_H
 #define EIGEN_STDDEQUE_H
 
-#include "Eigen/src/StlSupport/details.h"
+#include "details.h"
 
 // Define the explicit instantiation (e.g. 
necessary for the Intel compiler) #if defined(__INTEL_COMPILER) || defined(__GNUC__) diff --git a/Eigen/src/StlSupport/StdList.h b/Eigen/src/StlSupport/StdList.h index 627381ece..3c742430c 100644 --- a/Eigen/src/StlSupport/StdList.h +++ b/Eigen/src/StlSupport/StdList.h @@ -10,7 +10,7 @@ #ifndef EIGEN_STDLIST_H #define EIGEN_STDLIST_H -#include "Eigen/src/StlSupport/details.h" +#include "details.h" // Define the explicit instantiation (e.g. necessary for the Intel compiler) #if defined(__INTEL_COMPILER) || defined(__GNUC__) diff --git a/Eigen/src/StlSupport/StdVector.h b/Eigen/src/StlSupport/StdVector.h index 40a9abefa..611664a2e 100644 --- a/Eigen/src/StlSupport/StdVector.h +++ b/Eigen/src/StlSupport/StdVector.h @@ -11,7 +11,7 @@ #ifndef EIGEN_STDVECTOR_H #define EIGEN_STDVECTOR_H -#include "Eigen/src/StlSupport/details.h" +#include "details.h" /** * This section contains a convenience MACRO which allows an easy specialization of From 45515779d3e6a56aeddebc07b3a18ed006cefaf1 Mon Sep 17 00:00:00 2001 From: Christian Seiler Date: Wed, 4 Jun 2014 18:31:02 +0200 Subject: [PATCH 141/158] Fix compilation for CXX11/Tensor module if unsupported is not in include path --- unsupported/Eigen/CXX11/Tensor | 2 +- unsupported/Eigen/CXX11/TensorSymmetry | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index f2c5129b3..049ce5596 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -10,7 +10,7 @@ #ifndef EIGEN_CXX11_TENSOR_MODULE #define EIGEN_CXX11_TENSOR_MODULE -#include +#include #include diff --git a/unsupported/Eigen/CXX11/TensorSymmetry b/unsupported/Eigen/CXX11/TensorSymmetry index 027c6087f..f1dc25fea 100644 --- a/unsupported/Eigen/CXX11/TensorSymmetry +++ b/unsupported/Eigen/CXX11/TensorSymmetry @@ -10,7 +10,7 @@ #ifndef EIGEN_CXX11_TENSORSYMMETRY_MODULE #define EIGEN_CXX11_TENSORSYMMETRY_MODULE -#include +#include #include From 58cfac9a1277f2b2198ed1e5c3190543d1ed95a6 Mon Sep 17 00:00:00 2001 From: Christian Seiler Date: Wed, 4 Jun 2014 18:47:42 +0200 Subject: [PATCH 142/158] unsupported/ C++11 workarounds: don't use hack for libc++ if not required libc++ from 3.4 onwards supports constexpr std::get, but only if compiled with -std=c++1y. Change the detection so that libc++'s internals are only used if either -std=c++1y is not specified or the library is too old, making the whole hack a bit more future-proof. 
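
The resulting decision logic, summarized (matching the #if cascade in the
patch below):

  - libstdc++ older than the 20120322 snapshot: use a._M_instance[I] directly;
  - libc++ without -std=c++1y (_LIBCPP_STD_VER undefined or <= 11): use
    a.__elems_[I] directly;
  - everything else: fall back to std::get<I>(a), which is portable but may
    not be constexpr.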
--- unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h index 356ae10cf..d71a67590 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h @@ -48,13 +48,15 @@ namespace internal { * - libstdc++ from version 4.7 onwards has it nevertheless, * so use that * - libstdc++ older versions: use _M_instance directly - * - libc++ all versions so far: use __elems_ directly + * - libc++ from version 3.4 onwards has it IF compiled with + * -std=c++1y + * - libc++ older versions or -std=c++11: use __elems_ directly * - all other libs: use std::get to be portable, but * this may not be constexpr */ #if defined(__GLIBCXX__) && __GLIBCXX__ < 20120322 #define STD_GET_ARR_HACK a._M_instance[I] -#elif defined(_LIBCPP_VERSION) +#elif defined(_LIBCPP_VERSION) && (!defined(_LIBCPP_STD_VER) || _LIBCPP_STD_VER <= 11) #define STD_GET_ARR_HACK a.__elems_[I] #else #define STD_GET_ARR_HACK std::template get(a) From cee62018fc38f5408e0afe497c37fade64ca15d0 Mon Sep 17 00:00:00 2001 From: Christian Seiler Date: Wed, 4 Jun 2014 19:54:22 +0200 Subject: [PATCH 143/158] unsupported/CXX11/Core: allow gen_numeric_list to have a starting point Add a template parameter to gen_numeric_list that acts as a starting point for the list, i.e. gen_numeric_list will generate a numeric_list. --- .../Eigen/CXX11/src/Core/util/CXX11Meta.h | 12 ++++++------ unsupported/test/cxx11_meta.cpp | 18 ++++++++++++++++++ 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h index 618e2eb7b..0e274b801 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h @@ -42,14 +42,14 @@ struct numeric_list { constexpr static std::size_t count = sizeof.. 
* typename gen_numeric_list_repeated::type numeric_list */ -template struct gen_numeric_list : gen_numeric_list {}; -template struct gen_numeric_list { typedef numeric_list type; }; +template struct gen_numeric_list : gen_numeric_list {}; +template struct gen_numeric_list { typedef numeric_list type; }; -template struct gen_numeric_list_reversed : gen_numeric_list_reversed {}; -template struct gen_numeric_list_reversed { typedef numeric_list type; }; +template struct gen_numeric_list_reversed : gen_numeric_list_reversed {}; +template struct gen_numeric_list_reversed { typedef numeric_list type; }; -template struct gen_numeric_list_swapped_pair : gen_numeric_list_swapped_pair {}; -template struct gen_numeric_list_swapped_pair { typedef numeric_list type; }; +template struct gen_numeric_list_swapped_pair : gen_numeric_list_swapped_pair {}; +template struct gen_numeric_list_swapped_pair { typedef numeric_list type; }; template struct gen_numeric_list_repeated : gen_numeric_list_repeated {}; template struct gen_numeric_list_repeated { typedef numeric_list type; }; diff --git a/unsupported/test/cxx11_meta.cpp b/unsupported/test/cxx11_meta.cpp index a9843e9a9..af5cadbf9 100644 --- a/unsupported/test/cxx11_meta.cpp +++ b/unsupported/test/cxx11_meta.cpp @@ -91,18 +91,36 @@ static void test_gen_numeric_list() VERIFY((is_same::type, numeric_list>::value)); VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); VERIFY((is_same::type, numeric_list>::value)); VERIFY((is_same::type, numeric_list>::value)); VERIFY((is_same::type, numeric_list>::value)); VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); VERIFY((is_same::type, numeric_list>::value)); VERIFY((is_same::type, numeric_list>::value)); VERIFY((is_same::type, numeric_list>::value)); VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); VERIFY((is_same::type, numeric_list>::value)); VERIFY((is_same::type, numeric_list>::value)); From ea9943352368b990d27ba22eb8670287cf96302d Mon Sep 17 00:00:00 2001 From: Christian Seiler Date: Wed, 4 Jun 2014 20:27:42 +0200 Subject: [PATCH 144/158] unsupported/TensorSymmetry: make symgroup construction autodetect number of indices When constructing a symmetry group, make the code automatically detect the number of indices required from the indices of the group's generators. Also, allow the symmetry group to be applied to lists of indices that are larger than the number of indices of the symmetry group. 
Before: SGroup<4, Symmetry<0, 1>, Symmetry<2,3>> group; group.apply(std::array{{0, 1, 2, 3}}, 0); After: SGroup, Symmetry<2,3>> group; group.apply(std::array{{0, 1, 2, 3}}, 0); group.apply(std::array{{0, 1, 2, 3, 4}}, 0); This should make the symmetry group easier to use - especially if one wants to reuse the same symmetry group for different tensors of maybe different rank. static/runtime asserts remain for the case where the length of the index list to which a symmetry group is to be applied is too small. --- .../src/TensorSymmetry/DynamicSymmetry.h | 42 +++++++++++----- .../CXX11/src/TensorSymmetry/StaticSymmetry.h | 50 +++++++++++-------- .../Eigen/CXX11/src/TensorSymmetry/Symmetry.h | 47 +++++++++++++---- unsupported/test/cxx11_tensor_symmetry.cpp | 26 +++++----- 4 files changed, 108 insertions(+), 57 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h index b5738b778..0329278a9 100644 --- a/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h +++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h @@ -15,7 +15,7 @@ namespace Eigen { class DynamicSGroup { public: - inline explicit DynamicSGroup(std::size_t numIndices) : m_numIndices(numIndices), m_elements(), m_generators(), m_globalFlags(0) { m_elements.push_back(ge(Generator(0, 0, 0))); } + inline explicit DynamicSGroup() : m_numIndices(1), m_elements(), m_generators(), m_globalFlags(0) { m_elements.push_back(ge(Generator(0, 0, 0))); } inline DynamicSGroup(const DynamicSGroup& o) : m_numIndices(o.m_numIndices), m_elements(o.m_elements), m_generators(o.m_generators), m_globalFlags(o.m_globalFlags) { } inline DynamicSGroup(DynamicSGroup&& o) : m_numIndices(o.m_numIndices), m_elements(), m_generators(o.m_generators), m_globalFlags(o.m_globalFlags) { std::swap(m_elements, o.m_elements); } inline DynamicSGroup& operator=(const DynamicSGroup& o) { m_numIndices = o.m_numIndices; m_elements = o.m_elements; m_generators = o.m_generators; m_globalFlags = o.m_globalFlags; return *this; } @@ -33,7 +33,7 @@ class DynamicSGroup template inline RV apply(const std::array& idx, RV initial, Args&&... args) const { - eigen_assert(N == m_numIndices); + eigen_assert(N >= m_numIndices && "Can only apply symmetry group to objects that have at least the required amount of indices."); for (std::size_t i = 0; i < size(); i++) initial = Op::run(h_permute(i, idx, typename internal::gen_numeric_list::type()), m_elements[i].flags, initial, std::forward(args)...); return initial; @@ -42,7 +42,7 @@ class DynamicSGroup template inline RV apply(const std::vector& idx, RV initial, Args&&... args) const { - eigen_assert(idx.size() == m_numIndices); + eigen_assert(idx.size() >= m_numIndices && "Can only apply symmetry group to objects that have at least the required amount of indices."); for (std::size_t i = 0; i < size(); i++) initial = Op::run(h_permute(i, idx), m_elements[i].flags, initial, std::forward(args)...); return initial; @@ -77,7 +77,7 @@ class DynamicSGroup template inline std::array h_permute(std::size_t which, const std::array& idx, internal::numeric_list) const { - return std::array{{ idx[m_elements[which].representation[n]]... }}; + return std::array{{ idx[n >= m_numIndices ? n : m_elements[which].representation[n]]... 
}}; } template @@ -87,6 +87,8 @@ class DynamicSGroup result.reserve(idx.size()); for (auto k : m_elements[which].representation) result.push_back(idx[k]); + for (std::size_t i = m_numIndices; i < idx.size(); i++) + result.push_back(idx[i]); return result; } @@ -135,18 +137,18 @@ class DynamicSGroup }; // dynamic symmetry group that auto-adds the template parameters in the constructor -template +template class DynamicSGroupFromTemplateArgs : public DynamicSGroup { public: - inline DynamicSGroupFromTemplateArgs() : DynamicSGroup(NumIndices) + inline DynamicSGroupFromTemplateArgs() : DynamicSGroup() { add_all(internal::type_list()); } inline DynamicSGroupFromTemplateArgs(DynamicSGroupFromTemplateArgs const& other) : DynamicSGroup(other) { } inline DynamicSGroupFromTemplateArgs(DynamicSGroupFromTemplateArgs&& other) : DynamicSGroup(other) { } - inline DynamicSGroupFromTemplateArgs& operator=(const DynamicSGroupFromTemplateArgs& o) { DynamicSGroup::operator=(o); return *this; } - inline DynamicSGroupFromTemplateArgs& operator=(DynamicSGroupFromTemplateArgs&& o) { DynamicSGroup::operator=(o); return *this; } + inline DynamicSGroupFromTemplateArgs& operator=(const DynamicSGroupFromTemplateArgs& o) { DynamicSGroup::operator=(o); return *this; } + inline DynamicSGroupFromTemplateArgs& operator=(DynamicSGroupFromTemplateArgs&& o) { DynamicSGroup::operator=(o); return *this; } private: template @@ -168,18 +170,32 @@ inline DynamicSGroup::GroupElement DynamicSGroup::mul(GroupElement g1, GroupElem GroupElement result; result.representation.reserve(m_numIndices); - for (std::size_t i = 0; i < m_numIndices; i++) - result.representation.push_back(g2.representation[g1.representation[i]]); + for (std::size_t i = 0; i < m_numIndices; i++) { + int v = g2.representation[g1.representation[i]]; + eigen_assert(v >= 0); + result.representation.push_back(v); + } result.flags = g1.flags ^ g2.flags; return result; } inline void DynamicSGroup::add(int one, int two, int flags) { - eigen_assert(one >= 0 && (std::size_t)one < m_numIndices); - eigen_assert(two >= 0 && (std::size_t)two < m_numIndices); + eigen_assert(one >= 0); + eigen_assert(two >= 0); eigen_assert(one != two); - Generator g{one, two ,flags}; + + if ((std::size_t)one >= m_numIndices || (std::size_t)two >= m_numIndices) { + std::size_t newNumIndices = (one > two) ? one : two + 1; + for (auto& gelem : m_elements) { + gelem.representation.reserve(newNumIndices); + for (std::size_t i = m_numIndices; i < newNumIndices; i++) + gelem.representation.push_back(i); + } + m_numIndices = newNumIndices; + } + + Generator g{one, two, flags}; GroupElement e = ge(g); /* special case for first generator */ diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h index c5a630105..0eb468fc0 100644 --- a/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h +++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h @@ -114,20 +114,24 @@ struct tensor_static_symgroup_equality template struct tensor_static_symgroup { - typedef StaticSGroup type; + typedef StaticSGroup type; constexpr static std::size_t size = type::static_size; }; -template -constexpr static inline std::array tensor_static_symgroup_index_permute(std::array idx, internal::numeric_list) +template +constexpr static inline std::array tensor_static_symgroup_index_permute(std::array idx, internal::numeric_list, internal::numeric_list) { - return {{ idx[ii]... }}; + return {{ idx[ii]..., idx[jj]... 
}}; } template static inline std::vector tensor_static_symgroup_index_permute(std::vector idx, internal::numeric_list) { - return {{ idx[ii]... }}; + std::vector result{{ idx[ii]... }}; + std::size_t target_size = idx.size(); + for (std::size_t i = result.size(); i < target_size; i++) + result.push_back(idx[i]); + return result; } template struct tensor_static_symgroup_do_apply; @@ -135,32 +139,35 @@ template struct tensor_static_symgroup_do_apply; template struct tensor_static_symgroup_do_apply> { - template - static inline RV run(const std::array& idx, RV initial, Args&&... args) + template + static inline RV run(const std::array& idx, RV initial, Args&&... args) { - initial = Op::run(tensor_static_symgroup_index_permute(idx, typename first::indices()), first::flags, initial, std::forward(args)...); - return tensor_static_symgroup_do_apply>::template run(idx, initial, args...); + static_assert(NumIndices >= SGNumIndices, "Can only apply symmetry group to objects that have at least the required amount of indices."); + typedef typename internal::gen_numeric_list::type remaining_indices; + initial = Op::run(tensor_static_symgroup_index_permute(idx, typename first::indices(), remaining_indices()), first::flags, initial, std::forward(args)...); + return tensor_static_symgroup_do_apply>::template run(idx, initial, args...); } - template + template static inline RV run(const std::vector& idx, RV initial, Args&&... args) { + eigen_assert(idx.size() >= SGNumIndices && "Can only apply symmetry group to objects that have at least the required amount of indices."); initial = Op::run(tensor_static_symgroup_index_permute(idx, typename first::indices()), first::flags, initial, std::forward(args)...); - return tensor_static_symgroup_do_apply>::template run(idx, initial, args...); + return tensor_static_symgroup_do_apply>::template run(idx, initial, args...); } }; template struct tensor_static_symgroup_do_apply> { - template - static inline RV run(const std::array&, RV initial, Args&&...) + template + static inline RV run(const std::array&, RV initial, Args&&...) { // do nothing return initial; } - template + template static inline RV run(const std::vector&, RV initial, Args&&...) { // do nothing @@ -170,9 +177,10 @@ struct tensor_static_symgroup_do_apply +template class StaticSGroup { + constexpr static std::size_t NumIndices = internal::tensor_symmetry_num_indices::value; typedef internal::group_theory::enumerate_group_elements< internal::tensor_static_symgroup_multiply, internal::tensor_static_symgroup_equality, @@ -182,20 +190,20 @@ class StaticSGroup typedef typename group_elements::type ge; public: constexpr inline StaticSGroup() {} - constexpr inline StaticSGroup(const StaticSGroup&) {} - constexpr inline StaticSGroup(StaticSGroup&&) {} + constexpr inline StaticSGroup(const StaticSGroup&) {} + constexpr inline StaticSGroup(StaticSGroup&&) {} - template - static inline RV apply(const std::array& idx, RV initial, Args&&... args) + template + static inline RV apply(const std::array& idx, RV initial, Args&&... args) { - return internal::tensor_static_symgroup_do_apply::template run(idx, initial, args...); + return internal::tensor_static_symgroup_do_apply::template run(idx, initial, args...); } template static inline RV apply(const std::vector& idx, RV initial, Args&&... 
args) { eigen_assert(idx.size() == NumIndices); - return internal::tensor_static_symgroup_do_apply::template run(idx, initial, args...); + return internal::tensor_static_symgroup_do_apply::template run(idx, initial, args...); } constexpr static std::size_t static_size = ge::count; diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h index f0813086a..f1ccc33ef 100644 --- a/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h +++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h @@ -30,6 +30,7 @@ template struct tenso template struct tensor_static_symgroup_if; template struct tensor_symmetry_calculate_flags; template struct tensor_symmetry_assign_value; +template struct tensor_symmetry_num_indices; } // end namespace internal @@ -94,7 +95,7 @@ class DynamicSGroup; * This class is a child class of DynamicSGroup. It uses the template arguments * specified to initialize itself. */ -template +template class DynamicSGroupFromTemplateArgs; /** \class StaticSGroup @@ -116,7 +117,7 @@ class DynamicSGroupFromTemplateArgs; * group becomes too large. (In that case, unrolling may not even be * beneficial.) */ -template +template class StaticSGroup; /** \class SGroup @@ -131,24 +132,50 @@ class StaticSGroup; * \sa StaticSGroup * \sa DynamicSGroup */ -template -class SGroup : public internal::tensor_symmetry_pre_analysis::root_type +template +class SGroup : public internal::tensor_symmetry_pre_analysis::value, Gen...>::root_type { public: + constexpr static std::size_t NumIndices = internal::tensor_symmetry_num_indices::value; typedef typename internal::tensor_symmetry_pre_analysis::root_type Base; // make standard constructors + assignment operators public inline SGroup() : Base() { } - inline SGroup(const SGroup& other) : Base(other) { } - inline SGroup(SGroup&& other) : Base(other) { } - inline SGroup& operator=(const SGroup& other) { Base::operator=(other); return *this; } - inline SGroup& operator=(SGroup&& other) { Base::operator=(other); return *this; } + inline SGroup(const SGroup& other) : Base(other) { } + inline SGroup(SGroup&& other) : Base(other) { } + inline SGroup& operator=(const SGroup& other) { Base::operator=(other); return *this; } + inline SGroup& operator=(SGroup&& other) { Base::operator=(other); return *this; } // all else is defined in the base class }; namespace internal { +template struct tensor_symmetry_num_indices +{ + constexpr static std::size_t value = 1; +}; + +template struct tensor_symmetry_num_indices, Sym...> +{ +private: + constexpr static std::size_t One = static_cast(One_); + constexpr static std::size_t Two = static_cast(Two_); + constexpr static std::size_t Three = tensor_symmetry_num_indices::value; + + // don't use std::max, since it's not constexpr until C++14... + constexpr static std::size_t maxOneTwoPlusOne = ((One > Two) ? One : Two) + 1; +public: + constexpr static std::size_t value = (maxOneTwoPlusOne > Three) ? 
maxOneTwoPlusOne : Three; +}; + +template struct tensor_symmetry_num_indices, Sym...> + : public tensor_symmetry_num_indices, Sym...> {}; +template struct tensor_symmetry_num_indices, Sym...> + : public tensor_symmetry_num_indices, Sym...> {}; +template struct tensor_symmetry_num_indices, Sym...> + : public tensor_symmetry_num_indices, Sym...> {}; + /** \internal * * \class tensor_symmetry_pre_analysis @@ -199,7 +226,7 @@ namespace internal { template struct tensor_symmetry_pre_analysis { - typedef StaticSGroup root_type; + typedef StaticSGroup<> root_type; }; template @@ -212,7 +239,7 @@ struct tensor_symmetry_pre_analysis typedef typename conditional< possible_size == 0 || possible_size >= max_static_elements, - DynamicSGroupFromTemplateArgs, + DynamicSGroupFromTemplateArgs, typename helper::type >::type root_type; }; diff --git a/unsupported/test/cxx11_tensor_symmetry.cpp b/unsupported/test/cxx11_tensor_symmetry.cpp index e8dfffd92..2a1669995 100644 --- a/unsupported/test/cxx11_tensor_symmetry.cpp +++ b/unsupported/test/cxx11_tensor_symmetry.cpp @@ -32,8 +32,8 @@ using Eigen::GlobalImagFlag; // helper function to determine if the compiler intantiated a static // or dynamic symmetry group -template -bool isDynGroup(StaticSGroup const& dummy) +template +bool isDynGroup(StaticSGroup const& dummy) { (void)dummy; return false; @@ -86,7 +86,7 @@ static void test_symgroups_static() std::array identity{{0,1,2,3,4,5,6}}; // Simple static symmetry group - StaticSGroup<7, + StaticSGroup< AntiSymmetry<0,1>, Hermiticity<0,2> > group; @@ -113,7 +113,7 @@ static void test_symgroups_dynamic() identity.push_back(i); // Simple dynamic symmetry group - DynamicSGroup group(7); + DynamicSGroup group; group.add(0,1,NegationFlag); group.add(0,2,ConjugationFlag); @@ -143,7 +143,7 @@ static void test_symgroups_selection() { // Do the same test as in test_symgroups_static but // require selection via SGroup - SGroup<7, + SGroup< AntiSymmetry<0,1>, Hermiticity<0,2> > group; @@ -168,7 +168,7 @@ static void test_symgroups_selection() // simple factorizing group: 5 generators, 2^5 = 32 elements // selection should make this dynamic, although static group // can still be reasonably generated - SGroup<10, + SGroup< Symmetry<0,1>, Symmetry<2,3>, Symmetry<4,5>, @@ -196,7 +196,7 @@ static void test_symgroups_selection() // no verify that we could also generate a static group // with these generators found.clear(); - StaticSGroup<10, + StaticSGroup< Symmetry<0,1>, Symmetry<2,3>, Symmetry<4,5>, @@ -211,7 +211,7 @@ static void test_symgroups_selection() { // try to create a HUGE group - SGroup<7, + SGroup< Symmetry<0,1>, Symmetry<1,2>, Symmetry<2,3>, @@ -657,7 +657,7 @@ static void test_symgroups_selection() static void test_tensor_epsilon() { - SGroup<3, AntiSymmetry<0,1>, AntiSymmetry<1,2>> sym; + SGroup, AntiSymmetry<1,2>> sym; Tensor epsilon(3,3,3); epsilon.setZero(); @@ -674,7 +674,7 @@ static void test_tensor_epsilon() static void test_tensor_sym() { - SGroup<4, Symmetry<0,1>, Symmetry<2,3>> sym; + SGroup, Symmetry<2,3>> sym; Tensor t(10,10,10,10); t.setZero(); @@ -703,7 +703,7 @@ static void test_tensor_sym() static void test_tensor_asym() { - SGroup<4, AntiSymmetry<0,1>, AntiSymmetry<2,3>> sym; + SGroup, AntiSymmetry<2,3>> sym; Tensor t(10,10,10,10); t.setZero(); @@ -740,7 +740,7 @@ static void test_tensor_asym() static void test_tensor_dynsym() { - DynamicSGroup sym(4); + DynamicSGroup sym; sym.addSymmetry(0,1); sym.addSymmetry(2,3); Tensor t(10,10,10,10); @@ -770,7 +770,7 @@ static void test_tensor_dynsym() static 
void test_tensor_randacc() { - SGroup<4, Symmetry<0,1>, Symmetry<2,3>> sym; + SGroup, Symmetry<2,3>> sym; Tensor t(10,10,10,10); t.setZero(); From 96cb58fa3b83448fcb2af2d131434a7ac10b915c Mon Sep 17 00:00:00 2001 From: Christian Seiler Date: Wed, 4 Jun 2014 20:44:22 +0200 Subject: [PATCH 145/158] unsupported/TensorSymmetry: factor out completely from Tensor module Remove the symCoeff() method of the the Tensor module and move the functionality into a new operator() of the symmetry classes. This makes the Tensor module now completely self-contained without symmetry support (even though previously it was only a forward declaration and a otherwise harmless trivial templated method) and also removes the inconsistency with the rest of eigen w.r.t. the method's naming scheme. --- unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 15 --------------- .../CXX11/src/TensorSymmetry/DynamicSymmetry.h | 13 +++++++++++++ .../CXX11/src/TensorSymmetry/StaticSymmetry.h | 13 +++++++++++++ .../Eigen/CXX11/src/TensorSymmetry/Symmetry.h | 2 +- unsupported/test/cxx11_tensor_symmetry.cpp | 10 +++++----- 5 files changed, 32 insertions(+), 21 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index c6216e14c..70ca1433f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -91,9 +91,6 @@ struct tensor_index_linearization_helper return std_array_get(indices); } }; - -/* Forward-declaration required for the symmetry support. */ -template class tensor_symmetry_value_setter; } // end namespace internal template @@ -285,18 +282,6 @@ class Tensor #endif } - template - internal::tensor_symmetry_value_setter symCoeff(const Symmetry_& symmetry, Index firstIndex, IndexTypes... otherIndices) - { - return symCoeff(symmetry, std::array{{firstIndex, otherIndices...}}); - } - - template - internal::tensor_symmetry_value_setter symCoeff(const Symmetry_& symmetry, std::array const& indices) - { - return internal::tensor_symmetry_value_setter(*this, symmetry, indices); - } - protected: bool checkIndexRange(const std::array& indices) const { diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h index 0329278a9..bc4f2025f 100644 --- a/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h +++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h @@ -50,6 +50,19 @@ class DynamicSGroup inline int globalFlags() const { return m_globalFlags; } inline std::size_t size() const { return m_elements.size(); } + + template + inline internal::tensor_symmetry_value_setter operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... 
otherIndices) const + { + static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + return operator()(tensor, std::array{{firstIndex, otherIndices...}}); + } + + template + inline internal::tensor_symmetry_value_setter operator()(Tensor_& tensor, std::array const& indices) const + { + return internal::tensor_symmetry_value_setter(tensor, *this, indices); + } private: struct GroupElement { std::vector representation; diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h index 0eb468fc0..942293bd7 100644 --- a/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h +++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h @@ -212,6 +212,19 @@ class StaticSGroup return ge::count; } constexpr static inline int globalFlags() { return group_elements::global_flags; } + + template + inline internal::tensor_symmetry_value_setter> operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... otherIndices) const + { + static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + return operator()(tensor, std::array{{firstIndex, otherIndices...}}); + } + + template + inline internal::tensor_symmetry_value_setter> operator()(Tensor_& tensor, std::array const& indices) const + { + return internal::tensor_symmetry_value_setter>(tensor, *this, indices); + } }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h index f1ccc33ef..879d6cd77 100644 --- a/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h +++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h @@ -293,7 +293,7 @@ struct tensor_symmetry_calculate_flags } }; -template +template class tensor_symmetry_value_setter { public: diff --git a/unsupported/test/cxx11_tensor_symmetry.cpp b/unsupported/test/cxx11_tensor_symmetry.cpp index 2a1669995..d680e9b3b 100644 --- a/unsupported/test/cxx11_tensor_symmetry.cpp +++ b/unsupported/test/cxx11_tensor_symmetry.cpp @@ -661,7 +661,7 @@ static void test_tensor_epsilon() Tensor epsilon(3,3,3); epsilon.setZero(); - epsilon.symCoeff(sym, 0, 1, 2) = 1; + sym(epsilon, 0, 1, 2) = 1; for (int i = 0; i < 3; i++) { for (int j = 0; j < 3; j++) { @@ -683,7 +683,7 @@ static void test_tensor_sym() for (int k = l; k < 10; k++) { for (int j = 0; j < 10; j++) { for (int i = j; i < 10; i++) { - t.symCoeff(sym, i, j, k, l) = (i + j) * (k + l); + sym(t, i, j, k, l) = (i + j) * (k + l); } } } @@ -712,7 +712,7 @@ static void test_tensor_asym() for (int k = l + 1; k < 10; k++) { for (int j = 0; j < 10; j++) { for (int i = j + 1; i < 10; i++) { - t.symCoeff(sym, i, j, k, l) = ((i * j) + (k * l)); + sym(t, i, j, k, l) = ((i * j) + (k * l)); } } } @@ -751,7 +751,7 @@ static void test_tensor_dynsym() for (int k = l; k < 10; k++) { for (int j = 0; j < 10; j++) { for (int i = j; i < 10; i++) { - t.symCoeff(sym, i, j, k, l) = (i + j) * (k + l); + sym(t, i, j, k, l) = (i + j) * (k + l); } } } @@ -787,7 +787,7 @@ static void test_tensor_randacc() std::swap(i, j); if (k < l) std::swap(k, l); - t.symCoeff(sym, i, j, k, l) = (i + j) * (k + l); + sym(t, i, j, k, l) = (i + j) * (k + l); } for (int l = 0; l < 10; l++) { From ed37c44765b4401629281a1ea7ae223cddf91fde Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 6 Jun 
2014 11:02:20 +0200 Subject: [PATCH 146/158] Enable LinearAccessBit in Block expression for inner-panels --- Eigen/src/Core/Block.h | 2 +- Eigen/src/Core/MapBase.h | 2 ++ Eigen/src/Geometry/Transform.h | 4 ++-- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index e948e14aa..da193d1a2 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h @@ -84,7 +84,7 @@ struct traits > : traits::Flags&LinearAccessBit))) ? LinearAccessBit : 0, FlagsLvalueBit = is_lvalue::value ? LvalueBit : 0, FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0, Flags0 = traits::Flags & ( (HereditaryBits & ~RowMajorBit) | diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h index a45a0b374..e8ecb175b 100644 --- a/Eigen/src/Core/MapBase.h +++ b/Eigen/src/Core/MapBase.h @@ -250,6 +250,8 @@ template class MapBase using Base::Base::operator=; }; +#undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS + } // end namespace Eigen #endif // EIGEN_MAPBASE_H diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h index f40644011..cb93acf6b 100644 --- a/Eigen/src/Geometry/Transform.h +++ b/Eigen/src/Geometry/Transform.h @@ -194,9 +194,9 @@ public: /** type of the matrix used to represent the linear part of the transformation */ typedef Matrix LinearMatrixType; /** type of read/write reference to the linear part of the transformation */ - typedef Block LinearPart; + typedef Block LinearPart; /** type of read reference to the linear part of the transformation */ - typedef const Block ConstLinearPart; + typedef const Block ConstLinearPart; /** type of read/write reference to the affine part of the transformation */ typedef typename internal::conditional Date: Fri, 6 Jun 2014 11:06:44 +0200 Subject: [PATCH 147/158] Fix bug #738: use the "current" version of cmake project directories to ease the inclusion of Eigen within other projects. 
--- CMakeLists.txt | 4 ++-- cmake/EigenConfigureTesting.cmake | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fb13769f4..a719e47fd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -449,12 +449,12 @@ set ( EIGEN_INCLUDE_DIR ${INCLUDE_INSTALL_DIR} ) set ( EIGEN_INCLUDE_DIRS ${EIGEN_INCLUDE_DIR} ) set ( EIGEN_ROOT_DIR ${CMAKE_INSTALL_PREFIX} ) -configure_file ( ${CMAKE_SOURCE_DIR}/cmake/Eigen3Config.cmake.in +configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3Config.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake @ONLY ESCAPE_QUOTES ) -install ( FILES ${CMAKE_SOURCE_DIR}/cmake/UseEigen3.cmake +install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/UseEigen3.cmake ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake DESTINATION ${EIGEN_CONFIG_CMAKE_PATH} ) diff --git a/cmake/EigenConfigureTesting.cmake b/cmake/EigenConfigureTesting.cmake index 7844bf4d3..0b5de95bb 100644 --- a/cmake/EigenConfigureTesting.cmake +++ b/cmake/EigenConfigureTesting.cmake @@ -49,7 +49,7 @@ else() set(EIGEN_MAKECOMMAND_PLACEHOLDER "${EIGEN_BUILD_COMMAND}") endif() -configure_file(${CMAKE_BINARY_DIR}/DartConfiguration.tcl ${CMAKE_BINARY_DIR}/DartConfiguration.tcl) +configure_file(${CMAKE_CURRENT_BINARY_DIR}/DartConfiguration.tcl ${CMAKE_BINARY_DIR}/DartConfiguration.tcl) # restore default CMAKE_MAKE_PROGRAM set(CMAKE_MAKE_PROGRAM ${CMAKE_MAKE_PROGRAM_SAVE}) @@ -58,7 +58,7 @@ set(CMAKE_MAKE_PROGRAM ${CMAKE_MAKE_PROGRAM_SAVE}) unset(CMAKE_MAKE_PROGRAM_SAVE) unset(EIGEN_MAKECOMMAND_PLACEHOLDER) -configure_file(${CMAKE_SOURCE_DIR}/CTestCustom.cmake.in ${CMAKE_BINARY_DIR}/CTestCustom.cmake) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/CTestCustom.cmake.in ${CMAKE_BINARY_DIR}/CTestCustom.cmake) # some documentation of this function would be nice ei_init_testing() From abc1ca0af14872fe44e583faa2b43e496b038f8a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 6 Jun 2014 11:21:19 +0200 Subject: [PATCH 148/158] The BLAS interface is complete. --- blas/README.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/blas/README.txt b/blas/README.txt index 07a8bd92a..63a5203b9 100644 --- a/blas/README.txt +++ b/blas/README.txt @@ -1,9 +1,6 @@ This directory contains a BLAS library built on top of Eigen. -This is currently a work in progress which is far to be ready for use, -but feel free to contribute to it if you wish. - This module is not built by default. In order to compile it, you need to type 'make blas' from within your build dir. 
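As a quick illustration of the now-complete interface, here is an editor's sketch (not part of the patch) of calling the built library through the plain Fortran-style BLAS binding; the dgemm declaration follows the reference netlib convention, and building with 'make blas' plus linking the resulting eigen_blas library is assumed:

\code
// Hypothetical usage sketch: C = A*B through the Fortran BLAS entry point.
extern "C" void dgemm_(const char* transa, const char* transb,
                       const int* m, const int* n, const int* k,
                       const double* alpha, const double* a, const int* lda,
                       const double* b, const int* ldb,
                       const double* beta, double* c, const int* ldc);

int main()
{
  const int n = 2;
  const double one = 1.0, zero = 0.0;
  double A[4] = {1, 3, 2, 4};   // 2x2, column-major
  double B[4] = {5, 7, 6, 8};
  double C[4] = {0, 0, 0, 0};
  dgemm_("N", "N", &n, &n, &n, &one, A, &n, B, &n, &zero, C, &n); // C = A*B
  return 0;
}
\endcode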
From 1ee4e2db15caef88c281cfcecb44577b3e59a357 Mon Sep 17 00:00:00 2001 From: Vladimir Chalupecky Date: Thu, 12 Jun 2014 10:51:02 +0900 Subject: [PATCH 149/158] Change variable names in Eigen3Config.cmake to EIGEN3_* --- cmake/Eigen3Config.cmake.in | 40 ++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/cmake/Eigen3Config.cmake.in b/cmake/Eigen3Config.cmake.in index 257c595ed..e50f6dbe0 100644 --- a/cmake/Eigen3Config.cmake.in +++ b/cmake/Eigen3Config.cmake.in @@ -3,26 +3,26 @@ # Eigen3Config.cmake(.in) # Use the following variables to compile and link against Eigen: -# EIGEN_FOUND - True if Eigen was found on your system -# EIGEN_USE_FILE - The file making Eigen usable -# EIGEN_DEFINITIONS - Definitions needed to build with Eigen -# EIGEN_INCLUDE_DIR - Directory where signature_of_eigen3_matrix_library can be found -# EIGEN_INCLUDE_DIRS - List of directories of Eigen and it's dependencies -# EIGEN_ROOT_DIR - The base directory of Eigen -# EIGEN_VERSION_STRING - A human-readable string containing the version -# EIGEN_VERSION_MAJOR - The major version of Eigen -# EIGEN_VERSION_MINOR - The minor version of Eigen -# EIGEN_VERSION_PATCH - The patch version of Eigen +# EIGEN3_FOUND - True if Eigen was found on your system +# EIGEN3_USE_FILE - The file making Eigen usable +# EIGEN3_DEFINITIONS - Definitions needed to build with Eigen +# EIGEN3_INCLUDE_DIR - Directory where signature_of_eigen3_matrix_library can be found +# EIGEN3_INCLUDE_DIRS - List of directories of Eigen and its dependencies +# EIGEN3_ROOT_DIR - The base directory of Eigen +# EIGEN3_VERSION_STRING - A human-readable string containing the version +# EIGEN3_VERSION_MAJOR - The major version of Eigen +# EIGEN3_VERSION_MINOR - The minor version of Eigen +# EIGEN3_VERSION_PATCH - The patch version of Eigen -set ( EIGEN_FOUND 1 ) -set ( EIGEN_USE_FILE "@EIGEN_USE_FILE@" ) +set ( EIGEN3_FOUND 1 ) +set ( EIGEN3_USE_FILE "@EIGEN_USE_FILE@" ) -set ( EIGEN_DEFINITIONS "@EIGEN_DEFINITIONS@" ) -set ( EIGEN_INCLUDE_DIR "@EIGEN_INCLUDE_DIR@" ) -set ( EIGEN_INCLUDE_DIRS "@EIGEN_INCLUDE_DIRS@" ) -set ( EIGEN_ROOT_DIR "@EIGEN_ROOT_DIR@" ) +set ( EIGEN3_DEFINITIONS "@EIGEN_DEFINITIONS@" ) +set ( EIGEN3_INCLUDE_DIR "@EIGEN_INCLUDE_DIR@" ) +set ( EIGEN3_INCLUDE_DIRS "@EIGEN_INCLUDE_DIRS@" ) +set ( EIGEN3_ROOT_DIR "@EIGEN_ROOT_DIR@" ) -set ( EIGEN_VERSION_STRING "@EIGEN_VERSION_STRING@" ) -set ( EIGEN_VERSION_MAJOR "@EIGEN_VERSION_MAJOR@" ) -set ( EIGEN_VERSION_MINOR "@EIGEN_VERSION_MINOR@" ) -set ( EIGEN_VERSION_PATCH "@EIGEN_VERSION_PATCH@" ) +set ( EIGEN3_VERSION_STRING "@EIGEN_VERSION_STRING@" ) +set ( EIGEN3_VERSION_MAJOR "@EIGEN_VERSION_MAJOR@" ) +set ( EIGEN3_VERSION_MINOR "@EIGEN_VERSION_MINOR@" ) +set ( EIGEN3_VERSION_PATCH "@EIGEN_VERSION_PATCH@" ) From 95ecd582a37c5e3b3df47392f6807280488852f8 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 17 Jun 2014 09:37:07 +0200 Subject: [PATCH 150/158] Update decompositions tables --- doc/TopicLinearAlgebraDecompositions.dox | 4 ++-- doc/TutorialLinearAlgebra.dox | 26 +++++++++++++++++++----- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/doc/TopicLinearAlgebraDecompositions.dox b/doc/TopicLinearAlgebraDecompositions.dox index 77f2c92ab..5bcff2c96 100644 --- a/doc/TopicLinearAlgebraDecompositions.dox +++ b/doc/TopicLinearAlgebraDecompositions.dox @@ -116,7 +116,7 @@ For an introduction on linear solvers and decompositions, check this \link Tutor JacobiSVD (two-sided) - Slow (but fast for small matrices) - Excellent-Proven3 +
Proven3 Yes Singular values/vectors, least squares Yes (and does least squares) @@ -132,7 +132,7 @@ For an introduction on linear solvers and decompositions, check this \link Tutor Yes Eigenvalues/vectors - - Good + Excellent Closed forms for 2x2 and 3x3 diff --git a/doc/TutorialLinearAlgebra.dox b/doc/TutorialLinearAlgebra.dox index e6c41fd70..cb92ceeae 100644 --- a/doc/TutorialLinearAlgebra.dox +++ b/doc/TutorialLinearAlgebra.dox @@ -40,8 +40,9 @@ depending on your matrix and the trade-off you want to make: Decomposition Method - Requirements on the matrix - Speed + Requirements
on the matrix + Speed
(small-to-medium) + Speed
(large) Accuracy @@ -49,6 +50,7 @@ depending on your matrix and the trade-off you want to make: partialPivLu() Invertible ++ + ++ + @@ -56,6 +58,7 @@ depending on your matrix and the trade-off you want to make: fullPivLu() None - + - - +++ @@ -63,20 +66,23 @@ depending on your matrix and the trade-off you want to make: householderQr() None ++ + ++ + ColPivHouseholderQR colPivHouseholderQr() None - + ++ + - + +++ FullPivHouseholderQR fullPivHouseholderQr() None - + - - +++ @@ -84,21 +90,31 @@ depending on your matrix and the trade-off you want to make: llt() Positive definite +++ + +++ + LDLT ldlt() - Positive or negative semidefinite + Positive or negative
semidefinite +++ + + ++ + + JacobiSVD + jacobiSvd() + None + - - + - - - + +++ + All of these decompositions offer a solve() method that works as in the above example. For example, if your matrix is positive definite, the above table says that a very good -choice is then the LDLT decomposition. Here's an example, also demonstrating that using a general +choice is then the LLT or LDLT decomposition. Here's an example, also demonstrating that using a general matrix (not a vector) as right hand side is possible. From c06ec0f464a312dbce24edfde1de75bb1a69c4a6 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 17 Jun 2014 23:47:30 +0200 Subject: [PATCH 151/158] Fix Jacobi preconditioner with zero diagonal entries --- Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h index 73ca9bfde..1f3c060d0 100644 --- a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +++ b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h @@ -65,10 +65,10 @@ class DiagonalPreconditioner { typename MatType::InnerIterator it(mat,j); while(it && it.index()!=j) ++it; - if(it && it.index()==j) + if(it && it.index()==j && it.value()!=Scalar(0)) m_invdiag(j) = Scalar(1)/it.value(); else - m_invdiag(j) = 0; + m_invdiag(j) = Scalar(1); } m_isInitialized = true; return *this; From afb1a8c124c9ee52e027a3625b14ecad6be99fa4 Mon Sep 17 00:00:00 2001 From: Mark Borgerding Date: Tue, 17 Jun 2014 18:25:56 -0400 Subject: [PATCH 152/158] fixed warning: -Wunused-local-typedefs --- Eigen/src/Core/products/TriangularMatrixVector_MKL.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/Eigen/src/Core/products/TriangularMatrixVector_MKL.h b/Eigen/src/Core/products/TriangularMatrixVector_MKL.h index 09f110da7..3672b1240 100644 --- a/Eigen/src/Core/products/TriangularMatrixVector_MKL.h +++ b/Eigen/src/Core/products/TriangularMatrixVector_MKL.h @@ -129,7 +129,6 @@ struct triangular_matrix_vector_product_trmv MatrixLhs; \ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \ x = x_tmp.data(); \ if (size MatrixLhs; \ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \ x = x_tmp.data(); \ if (size Date: Thu, 19 Jun 2014 14:55:14 +0100 Subject: [PATCH 153/158] Add component-wise atan() function (see bug #80). 
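For illustration, here is a minimal editor's sketch (not part of the commit) of the new API in use; it mirrors the Cwise_atan.cpp snippet added two patches below:

\code
#include <iostream>
#include <Eigen/Core>

int main()
{
  Eigen::ArrayXd v = Eigen::ArrayXd::LinSpaced(5, 0.0, 1.0);
  std::cout << v.atan() << std::endl;        // member form from ArrayCwiseUnaryOps.h
  std::cout << Eigen::atan(v) << std::endl;  // free-function form from GlobalFunctions.h
  return 0;
}
\endcode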
--- Eigen/src/Core/Assign_MKL.h | 1 + Eigen/src/Core/GenericPacketMath.h | 4 ++++ Eigen/src/Core/GlobalFunctions.h | 1 + Eigen/src/Core/functors/UnaryFunctors.h | 20 ++++++++++++++++++++ Eigen/src/plugins/ArrayCwiseUnaryOps.h | 9 +++++++++ 5 files changed, 35 insertions(+) diff --git a/Eigen/src/Core/Assign_MKL.h b/Eigen/src/Core/Assign_MKL.h index 7772951b9..97134ffd7 100644 --- a/Eigen/src/Core/Assign_MKL.h +++ b/Eigen/src/Core/Assign_MKL.h @@ -202,6 +202,7 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(asin, Asin) EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(cos, Cos) EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(acos, Acos) EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(tan, Tan) +EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(atan, Atan) //EIGEN_MKL_VML_DECLARE_UNARY_CALLS(abs, Abs) EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(exp, Exp) EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(log, Ln) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 98313c68f..4523d2263 100755 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -318,6 +318,10 @@ Packet pasin(const Packet& a) { using std::asin; return asin(a); } template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pacos(const Packet& a) { using std::acos; return acos(a); } +/** \internal \returns the atan of \a a (coeff-wise) */ +template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet patan(const Packet& a) { using std::atan; return atan(a); } + /** \internal \returns the exp of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp(const Packet& a) { using std::exp; return exp(a); } diff --git a/Eigen/src/Core/GlobalFunctions.h b/Eigen/src/Core/GlobalFunctions.h index 2acf97723..2067a2a6e 100644 --- a/Eigen/src/Core/GlobalFunctions.h +++ b/Eigen/src/Core/GlobalFunctions.h @@ -45,6 +45,7 @@ namespace Eigen EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asin,scalar_asin_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acos,scalar_acos_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tan,scalar_tan_op) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atan,scalar_atan_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs,scalar_abs_op) diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index a0fcea3f9..ec42e6850 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -320,6 +320,26 @@ struct functor_traits > }; }; + +/** \internal + * \brief Template functor to compute the atan of a scalar + * \sa class CwiseUnaryOp, ArrayBase::atan() + */ +template struct scalar_atan_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_atan_op) + inline const Scalar operator() (const Scalar& a) const { using std::atan; return atan(a); } + typedef typename packet_traits::type Packet; + inline Packet packetOp(const Packet& a) const { return internal::patan(a); } +}; +template +struct functor_traits > +{ + enum { + Cost = 5 * NumTraits::MulCost, + PacketAccess = packet_traits::HasATan + }; +}; + /** \internal * \brief Template functor to compute the inverse of a scalar * \sa class CwiseUnaryOp, Cwise::inverse() diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/Eigen/src/plugins/ArrayCwiseUnaryOps.h index aea3375ed..d2a8ea75b 100644 --- a/Eigen/src/plugins/ArrayCwiseUnaryOps.h +++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.h @@ -141,6 +141,15 @@ tan() const return derived(); } +/** \returns an expression of the coefficient-wise arc tan of *this. 
+ * + * \sa cos(), sin(), tan() */ +inline const CwiseUnaryOp<internal::scalar_atan_op<Scalar>, Derived> +atan() const +{ + return derived(); +} /** \returns an expression of the coefficient-wise power of *this to the given exponent. * From 55453c51e8c7d7d69ffa67777ad3355ab9c6f771 Mon Sep 17 00:00:00 2001 From: Jitse Niesen Date: Thu, 19 Jun 2014 15:07:42 +0100 Subject: [PATCH 154/158] Add documentation and very simple test for array atan(). --- Eigen/src/plugins/ArrayCwiseUnaryOps.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/Eigen/src/plugins/ArrayCwiseUnaryOps.h index d2a8ea75b..ce462e951 100644 --- a/Eigen/src/plugins/ArrayCwiseUnaryOps.h +++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.h @@ -142,6 +142,9 @@ tan() const } /** \returns an expression of the coefficient-wise arc tan of *this. + * + * Example: \include Cwise_atan.cpp + * Output: \verbinclude Cwise_atan.out * * \sa cos(), sin(), tan() */ From de150b1e14b5f7fc6f4831b6cd982d3272ca3aca Mon Sep 17 00:00:00 2001 From: Jitse Niesen Date: Thu, 19 Jun 2014 15:12:33 +0100 Subject: [PATCH 155/158] Add documentation and very simple test for array atan(), part 2 (files I forgot in the previous commit). --- doc/snippets/Cwise_atan.cpp | 2 ++ test/array.cpp | 1 + 2 files changed, 3 insertions(+) create mode 100644 doc/snippets/Cwise_atan.cpp diff --git a/doc/snippets/Cwise_atan.cpp b/doc/snippets/Cwise_atan.cpp new file mode 100644 index 000000000..446844726 --- /dev/null +++ b/doc/snippets/Cwise_atan.cpp @@ -0,0 +1,2 @@ +ArrayXd v = ArrayXd::LinSpaced(5,0,1); +cout << v.atan() << endl; diff --git a/test/array.cpp b/test/array.cpp index 5f49fc1ea..010fead2d 100644 --- a/test/array.cpp +++ b/test/array.cpp @@ -178,6 +178,7 @@ template<typename ArrayType> void array_real(const ArrayType& m) VERIFY_IS_APPROX(m1.asin(), asin(m1)); VERIFY_IS_APPROX(m1.acos(), acos(m1)); VERIFY_IS_APPROX(m1.tan(), tan(m1)); + VERIFY_IS_APPROX(m1.atan(), atan(m1)); VERIFY_IS_APPROX(cos(m1+RealScalar(3)*m2), cos((m1+RealScalar(3)*m2).eval())); From 1fdef63d1f935283eca4d7735722832eca179a80 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 20 Jun 2014 13:23:33 +0200 Subject: [PATCH 156/158] Explain how to export sparse linear problems in matrix-market format. --- doc/SparseLinearSystems.dox | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/doc/SparseLinearSystems.dox b/doc/SparseLinearSystems.dox index c00be10d3..f0456ff52 100644 --- a/doc/SparseLinearSystems.dox +++ b/doc/SparseLinearSystems.dox @@ -140,7 +140,16 @@ x2 = solver.solve(b2); For direct methods, the solution are computed at the machine precision. Sometimes, the solution need not be too accurate. In this case, the iterative methods are more suitable and the desired accuracy can be set before the solve step using \b setTolerance(). For all the available functions, please, refer to the documentation of the \link IterativeLinearSolvers_Module Iterative solvers module \endlink. \section BenchmarkRoutine -Most of the time, all you need is to know how much time it will take to qolve your system, and hopefully, what is the most suitable solver. In Eigen, we provide a benchmark routine that can be used for this purpose. It is very easy to use. In the build directory, navigate to bench/spbench and compile the routine by typing \b make \e spbenchsolver. Run it with --help option to get the list of all available options. Basically, the matrices to test should be in MatrixMarket Coordinate format, and the routine returns the statistics from all available solvers in Eigen.
+Most of the time, all you need is to know how much time it will take to solve your system, and hopefully, what is the most suitable solver. In Eigen, we provide a benchmark routine that can be used for this purpose. It is very easy to use. In the build directory, navigate to bench/spbench and compile the routine by typing \b make \e spbenchsolver. Run it with the --help option to get the list of all available options. Basically, the matrices to test should be in MatrixMarket Coordinate format, and the routine returns the statistics from all available solvers in Eigen. + +To export your matrices and right-hand-side vectors in the matrix-market format, you can use the unsupported SparseExtra module: +\code +#include <unsupported/Eigen/SparseExtra> +... +Eigen::saveMarket(A, "filename.mtx"); +Eigen::saveMarket(A, "filename_SPD.mtx", Eigen::Symmetric); // if A is symmetric-positive-definite +Eigen::saveMarketVector(B, "filename_b.mtx"); +\endcode The following table gives an example of XML statistics from several Eigen built-in and external solvers.
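A self-contained editor's sketch of the export described above (the matrix values and file names are illustrative only):

\code
#include <Eigen/Sparse>
#include <unsupported/Eigen/SparseExtra>

int main()
{
  // Build a small symmetric positive-definite sparse matrix.
  Eigen::SparseMatrix<double> A(2, 2);
  A.insert(0, 0) = 4.0;
  A.insert(1, 0) = 1.0;
  A.insert(0, 1) = 1.0;
  A.insert(1, 1) = 3.0;
  A.makeCompressed();

  Eigen::VectorXd b(2);
  b << 1.0, 2.0;

  Eigen::saveMarket(A, "A.mtx");                        // general sparse matrix
  Eigen::saveMarket(A, "A_SPD.mtx", Eigen::Symmetric);  // tag it as symmetric
  Eigen::saveMarketVector(b, "b.mtx");
  return 0;
}
\endcode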
From 98ef44fe55925ba8f144889c0ec42be9bf572cc3 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 20 Jun 2014 14:43:47 +0200 Subject: [PATCH 157/158] Add assertion and warning on the requirements of SparseQR and COLAMDOrdering --- Eigen/src/OrderingMethods/Ordering.h | 8 ++++++-- Eigen/src/SparseQR/SparseQR.h | 17 +++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/Eigen/src/OrderingMethods/Ordering.h b/Eigen/src/OrderingMethods/Ordering.h index b4da6531a..4e0609784 100644 --- a/Eigen/src/OrderingMethods/Ordering.h +++ b/Eigen/src/OrderingMethods/Ordering.h @@ -109,7 +109,7 @@ class NaturalOrdering * \class COLAMDOrdering * * Functor computing the \em column \em approximate \em minimum \em degree ordering - * The matrix should be in column-major format + * The matrix should be in column-major and \b compressed format (see SparseMatrix::makeCompressed()). */ template <typename Index> class COLAMDOrdering { public: typedef PermutationMatrix<Dynamic, Dynamic, Index> PermutationType; typedef Matrix<Index, Dynamic, 1> IndexVector; - /** Compute the permutation vector form a sparse matrix */ + /** Compute the permutation vector \a perm from the sparse matrix \a mat + * \warning The input sparse matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()). + */ template <typename MatrixType> void operator() (const MatrixType& mat, PermutationType& perm) { + eigen_assert(mat.isCompressed() && "COLAMDOrdering requires a sparse matrix in compressed mode. Call .makeCompressed() before passing it to COLAMDOrdering"); + Index m = mat.rows(); Index n = mat.cols(); Index nnz = mat.nonZeros(); diff --git a/Eigen/src/SparseQR/SparseQR.h b/Eigen/src/SparseQR/SparseQR.h index 267c48bc3..5fb5bc203 100644 --- a/Eigen/src/SparseQR/SparseQR.h +++ b/Eigen/src/SparseQR/SparseQR.h @@ -58,6 +58,7 @@ namespace internal { * \tparam _OrderingType The fill-reducing ordering method. See the \link OrderingMethods_Module * OrderingMethods \endlink module for the list of built-in and external ordering methods. * + * \warning The input sparse matrix A must be in compressed mode (see SparseMatrix::makeCompressed()). * */ template<typename _MatrixType, typename _OrderingType> @@ -77,10 +78,23 @@ class SparseQR SparseQR () : m_isInitialized(false), m_analysisIsok(false), m_lastError(""), m_useDefaultThreshold(true),m_isQSorted(false) { } + /** Construct a QR factorization of the matrix \a mat. + * + * \warning The matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()). + * + * \sa compute() + */ SparseQR(const MatrixType& mat) : m_isInitialized(false), m_analysisIsok(false), m_lastError(""), m_useDefaultThreshold(true),m_isQSorted(false) { compute(mat); } + + /** Computes the QR factorization of the sparse matrix \a mat. + * + * \warning The matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()). + * + * \sa analyzePattern(), factorize() + */ void compute(const MatrixType& mat) { analyzePattern(mat); @@ -255,6 +269,8 @@ class SparseQR }; /** \brief Preprocessing step of a QR factorization + * + * \warning The matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()). * * In this step, the fill-reducing permutation is computed and applied to the columns of A * and the column elimination tree is computed as well. Only the sparsity pattern of \a mat is exploited. * @@ -264,6 +280,7 @@ template <typename MatrixType, typename OrderingType> void SparseQR<MatrixType,OrderingType>::analyzePattern(const MatrixType& mat) { + eigen_assert(mat.isCompressed() && "SparseQR requires a sparse matrix in compressed mode.
Call .makeCompressed() before passing it to SparseQR"); // Compute the column fill reducing ordering OrderingType ord; ord(mat, m_perm_c); From 963d338922e9ef1addcd29c1b43e9b66243207c0 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 20 Jun 2014 15:09:42 +0200 Subject: [PATCH 158/158] Fix bug #827: improve accuracy of quaternion to angle-axis conversion --- Eigen/src/Geometry/AngleAxis.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/Eigen/src/Geometry/AngleAxis.h b/Eigen/src/Geometry/AngleAxis.h index b42048c55..636712c2b 100644 --- a/Eigen/src/Geometry/AngleAxis.h +++ b/Eigen/src/Geometry/AngleAxis.h @@ -77,7 +77,9 @@ public: * represents an invalid rotation. */ template<typename Derived> inline AngleAxis(const Scalar& angle, const MatrixBase<Derived>& axis) : m_axis(axis), m_angle(angle) {} - /** Constructs and initialize the angle-axis rotation from a quaternion \a q. */ + /** Constructs and initializes the angle-axis rotation from a quaternion \a q. + * This function implicitly normalizes the quaternion \a q. + */ template<typename QuatDerived> inline explicit AngleAxis(const QuaternionBase<QuatDerived>& q) { *this = q; } /** Constructs and initialize the angle-axis rotation from a 3x3 rotation matrix. */ template<typename Derived> @@ -149,29 +151,27 @@ typedef AngleAxis<float> AngleAxisf; typedef AngleAxis<double> AngleAxisd; /** Set \c *this from a \b unit quaternion. - * The axis is normalized. + * The resulting axis is normalized. * - * \warning As any other method dealing with quaternion, if the input quaternion - * is not normalized then the result is undefined. + * This function implicitly normalizes the quaternion \a q. */ template<typename Scalar> template<typename QuatDerived> AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const QuaternionBase<QuatDerived>& q) { - using std::acos; - EIGEN_USING_STD_MATH(min); - EIGEN_USING_STD_MATH(max); - using std::sqrt; - Scalar n2 = q.vec().squaredNorm(); - if (n2 < NumTraits<Scalar>::dummy_precision()*NumTraits<Scalar>::dummy_precision()) + using std::atan2; + Scalar n = q.vec().norm(); + if(n<NumTraits<Scalar>::epsilon()) + n = q.vec().stableNorm(); + if (n > Scalar(0)) { - m_angle = Scalar(0); - m_axis << Scalar(1), Scalar(0), Scalar(0); + m_angle = Scalar(2)*atan2(n, q.w()); + m_axis = q.vec() / n; } else { - m_angle = Scalar(2)*acos((min)((max)(Scalar(-1),q.w()),Scalar(1))); - m_axis = q.vec() / sqrt(n2); + m_angle = 0; + m_axis << 1, 0, 0; } return *this; }
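To see why this helps: for a rotation angle theta near zero, a unit quaternion has w = cos(theta/2) very close to 1, where acos is ill-conditioned (its derivative diverges), so the old formula lost roughly half of the significant digits; atan2(|v|, w) with |v| = sin(theta/2) stays accurate over the whole range. An editor's sketch (not part of the patch) exercising the conversion on a tiny rotation:

\code
#include <iostream>
#include <Eigen/Geometry>

int main()
{
  using namespace Eigen;
  const double theta = 1e-8;  // tiny rotation about the z axis
  Quaterniond q(AngleAxisd(theta, Vector3d::UnitZ()));
  AngleAxisd aa(q);           // round-trips through the operator= above
  std::cout << "angle: " << aa.angle()
            << "  axis: " << aa.axis().transpose() << std::endl;
  return 0;
}
\endcode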