diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h old mode 100644 new mode 100755 index b0469fa1e..d07541285 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -169,6 +169,44 @@ ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; } template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pset1(const typename unpacket_traits<Packet>::type& a) { return a; } +/** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */ +template<typename Packet> EIGEN_DEVICE_FUNC inline Packet +pload1(const typename unpacket_traits<Packet>::type *a) { return pset1<Packet>(*a); } + +/** \internal equivalent to + * \code + * a0 = pload1<Packet>(a+0); + * a1 = pload1<Packet>(a+1); + * a2 = pload1<Packet>(a+2); + * a3 = pload1<Packet>(a+3); + * \endcode + * \sa pset1, pload1, ploaddup, pbroadcast2 + */ +template<typename Packet> EIGEN_DEVICE_FUNC +inline void pbroadcast4(const typename unpacket_traits<Packet>::type *a, + Packet& a0, Packet& a1, Packet& a2, Packet& a3) +{ + a0 = pload1<Packet>(a+0); + a1 = pload1<Packet>(a+1); + a2 = pload1<Packet>(a+2); + a3 = pload1<Packet>(a+3); +} + +/** \internal equivalent to + * \code + * a0 = pload1<Packet>(a+0); + * a1 = pload1<Packet>(a+1); + * \endcode + * \sa pset1, pload1, ploaddup, pbroadcast4 + */ +template<typename Packet> EIGEN_DEVICE_FUNC +inline void pbroadcast2(const typename unpacket_traits<Packet>::type *a, + Packet& a0, Packet& a1) +{ + a0 = pload1<Packet>(a+0); + a1 = pload1<Packet>(a+1); +} + /** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */ template<typename Scalar> inline typename packet_traits<Scalar>::type plset(const Scalar& a) { return a; } diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h old mode 100644 new mode 100755 index a35f9ce98..9d8faa7d6 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -114,11 +114,22 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { re template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set_pd(from,from); } template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set_epi32(from,from,from,from); } #else -template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set1_ps(from); } +template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set_ps1(from); } template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); } template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set1_epi32(from); } #endif +// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction. +// However, using intrinsics for pset1 makes gcc generate poor code in some cases (see bug 203). +// Using inline assembly is not an option either, because gcc then fails to reorder the instructions properly. +// Therefore, we introduced the pload1 functions to be used in product kernels for which bug 203 does not apply. +// Also note that with AVX we want a vbroadcastss to be generated.
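For intuition, the semantics of pload1 and pbroadcast4 can be pictured with raw SSE intrinsics; the sketch below is a minimal standalone illustration (the helper name is hypothetical and only SSE1 is assumed), not the code Eigen actually generates:

#include <xmmintrin.h>  // SSE

// Sketch of the pbroadcast4 semantics for Packet4f: each output register
// receives one coefficient of a[0..3] duplicated across all four lanes,
// e.g. a0 = (a[0],a[0],a[0],a[0]).
static void pbroadcast4_sketch(const float* a, __m128& a0, __m128& a1, __m128& a2, __m128& a3)
{
  a0 = _mm_set1_ps(a[0]);
  a1 = _mm_set1_ps(a[1]);
  a2 = _mm_set1_ps(a[2]);
  a3 = _mm_set1_ps(a[3]);
}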
+#if (defined __GNUC__) && (!defined __INTEL_COMPILER) && (!defined __clang__) && (!defined __AVX__) +template<> EIGEN_STRONG_INLINE Packet4f pload1(const float *from) { + return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0); +} +#endif + #ifndef EIGEN_VECTORIZE_AVX template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return _mm_add_ps(pset1(a), _mm_set_ps(3,2,1,0)); } template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) { return _mm_add_pd(pset1(a),_mm_set_pd(1,0)); } @@ -392,6 +403,38 @@ template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) #endif } +// with AVX, the default implementations based on pload1 are faster +#ifndef __AVX__ +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const float *a, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) +{ + a3 = ploadu(a); + a0 = vec4f_swizzle1(a3, 0,0,0,0); + a1 = vec4f_swizzle1(a3, 1,1,1,1); + a2 = vec4f_swizzle1(a3, 2,2,2,2); + a3 = vec4f_swizzle1(a3, 3,3,3,3); +} +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const double *a, + Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) +{ +#ifdef EIGEN_VECTORIZE_SSE3 + a0 = _mm_loaddup_pd(a+0); + a1 = _mm_loaddup_pd(a+1); + a2 = _mm_loaddup_pd(a+2); + a3 = _mm_loaddup_pd(a+3); +#else + a1 = ploadu(a); + a0 = vec2d_swizzle1(a1, 0,0); + a1 = vec2d_swizzle1(a1, 1,1); + a3 = ploadu(a+2); + a2 = vec2d_swizzle1(a3, 0,0); + a3 = vec2d_swizzle1(a3, 1,1); +#endif +} +#endif + EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs) { vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55)); diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index bb9e62ae5..569cfea71 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -10,6 +10,7 @@ #ifndef EIGEN_GENERAL_BLOCK_PANEL_H #define EIGEN_GENERAL_BLOCK_PANEL_H + namespace Eigen { namespace internal { @@ -160,16 +161,16 @@ public: NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, - // register block size along the N direction (must be either 2 or 4) - nr = NumberOfRegisters/4, + // register block size along the N direction (must be either 4 or 8) + nr = NumberOfRegisters/2, // register block size along the M direction (currently, this one cannot be modified) - mr = 2 * LhsPacketSize, + mr = LhsPacketSize, WorkSpaceFactor = nr * RhsPacketSize, LhsProgress = LhsPacketSize, - RhsProgress = RhsPacketSize + RhsProgress = 1 }; typedef typename packet_traits::type _LhsPacket; @@ -186,16 +187,20 @@ public: { p = pset1(ResScalar(0)); } - - EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) + + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) { - for(DenseIndex k=0; k(&b[k*RhsPacketSize], rhs[k]); + pbroadcast4(b, b0, b1, b2, b3); + } + + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) + { + pbroadcast2(b, b0, b1); } EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { - dest = pload(b); + dest = pset1(*b); } EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const @@ -243,12 +248,12 @@ public: ResPacketSize = Vectorizable ? 
packet_traits::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, - nr = NumberOfRegisters/4, - mr = 2 * LhsPacketSize, + nr = NumberOfRegisters/2, + mr = LhsPacketSize, WorkSpaceFactor = nr*RhsPacketSize, LhsProgress = LhsPacketSize, - RhsProgress = RhsPacketSize + RhsProgress = 1 }; typedef typename packet_traits::type _LhsPacket; @@ -266,21 +271,25 @@ public: p = pset1(ResScalar(0)); } - EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) - { - for(DenseIndex k=0; k(&b[k*RhsPacketSize], rhs[k]); - } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { - dest = pload(b); + dest = pset1(*b); } EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { dest = pload(a); } + + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + { + pbroadcast4(b, b0, b1, b2, b3); + } + + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) + { + pbroadcast2(b, b0, b1); + } EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const { @@ -327,12 +336,13 @@ public: RealPacketSize = Vectorizable ? packet_traits::size : 1, ResPacketSize = Vectorizable ? packet_traits::size : 1, - nr = 2, - mr = 2 * ResPacketSize, + // FIXME: should depend on NumberOfRegisters + nr = 4, + mr = ResPacketSize, WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr, LhsProgress = ResPacketSize, - RhsProgress = Vectorizable ? 2*ResPacketSize : 1 + RhsProgress = 1 }; typedef typename packet_traits::type RealPacket; @@ -356,30 +366,36 @@ public: p.second = pset1(RealScalar(0)); } - /* Unpack the rhs coeff such that each complex coefficient is spread into - * two packects containing respectively the real and imaginary coefficient - * duplicated as many time as needed: (x+iy) => [x, ..., x] [y, ..., y] - */ - EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const Scalar* rhs, Scalar* b) + // Scalar path + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { - for(DenseIndex k=0; k((RealScalar*)&b[k*ResPacketSize*2+0], real(rhs[k])); - pstore1((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], imag(rhs[k])); - } - else - b[k] = rhs[k]; - } + dest = pset1(*b); } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { dest = *b; } - + // Vectorized path EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const { - dest.first = pload((const RealScalar*)b); - dest.second = pload((const RealScalar*)(b+ResPacketSize)); + dest.first = pset1(real(*b)); + dest.second = pset1(imag(*b)); + } + + // linking error if instantiated without being optimized out: + void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3); + + // Vectorized path + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacket& b0, DoublePacket& b1) + { + // FIXME not sure that's the best way to implement it! + loadRhs(b+0, b0); + loadRhs(b+1, b1); + } + + // Scalar path + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1) + { + // FIXME not sure that's the best way to implement it! + loadRhs(b+0, b0); + loadRhs(b+1, b1); } // nothing special here @@ -452,12 +468,13 @@ public: ResPacketSize = Vectorizable ? 
packet_traits::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, + // FIXME: should depend on NumberOfRegisters nr = 4, - mr = 2*ResPacketSize, + mr = ResPacketSize, WorkSpaceFactor = nr*RhsPacketSize, LhsProgress = ResPacketSize, - RhsProgress = ResPacketSize + RhsProgress = 1 }; typedef typename packet_traits::type _LhsPacket; @@ -475,15 +492,19 @@ public: p = pset1(ResScalar(0)); } - EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) - { - for(DenseIndex k=0; k(&b[k*RhsPacketSize], rhs[k]); - } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { - dest = pload(b); + dest = pset1(*b); + } + + // linking error if instantiated without being optimized out: + void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3); + + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) + { + // FIXME not sure that's the best way to implement it! + b0 = pload1(b+0); + b1 = pload1(b+1); } EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const @@ -545,179 +566,116 @@ struct gebp_kernel EIGEN_DONT_INLINE void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, - Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB=0); + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); }; template EIGEN_DONT_INLINE void gebp_kernel ::operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, - Index strideA, Index strideB, Index offsetA, Index offsetB, RhsScalar* unpackedB) + Index strideA, Index strideB, Index offsetA, Index offsetB) { Traits traits; if(strideA==-1) strideA = depth; if(strideB==-1) strideB = depth; conj_helper cj; -// conj_helper pcj; Index packet_cols = (cols/nr) * nr; + // Here we assume that mr==LhsProgress const Index peeled_mc = (rows/mr)*mr; - // FIXME: - const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsProgress ? LhsProgress : 0); const Index peeled_kc = (depth/4)*4; - if(unpackedB==0) - unpackedB = const_cast(blockB - strideB * nr * RhsProgress); - // loops on each micro vertical panel of rhs (depth x nr) for(Index j2=0; j2 we select a mr x nr micro block of res which is entirely // stored into mr/packet_size x nr registers. 
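As a reading aid for the register-blocked loop that follows, here is a scalar reference model of one mr x nr micro-block update; it assumes the packed layouts used by the kernel (blockA holds mr lhs coefficients per depth step, blockB holds nr rhs coefficients per depth step), and all names are illustrative only, not Eigen's vectorized code:

#include <cstddef>

// Scalar model of the gebp micro-kernel (for intuition only):
// acc[][] plays the role of the mr/packet_size x nr accumulator registers.
template<typename Scalar, int mr, int nr>
static void gebp_micro_sketch(Scalar* res, std::ptrdiff_t resStride,
                              const Scalar* blockA, const Scalar* blockB,
                              std::ptrdiff_t depth, Scalar alpha)
{
  Scalar acc[mr][nr] = {};
  for(std::ptrdiff_t k=0; k<depth; ++k)
    for(int j=0; j<nr; ++j)        // one broadcast rhs coefficient per column
      for(int i=0; i<mr; ++i)      // one packet-worth of lhs coefficients per row
        acc[i][j] += blockA[k*mr+i] * blockB[k*nr+j];
  for(int j=0; j<nr; ++j)          // scale by alpha and accumulate into res
    for(int i=0; i<mr; ++i)
      res[j*resStride+i] += alpha * acc[i][j];
}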
for(Index i=0; i(alpha); - R0 = ploadu(r0); - R1 = ploadu(r1); - R2 = ploadu(r2); - R3 = ploadu(r3); - R4 = ploadu(r0 + ResPacketSize); - R5 = ploadu(r1 + ResPacketSize); - R6 = ploadu(r2 + ResPacketSize); + R0 = ploadu(r0+0*resStride); + R1 = ploadu(r0+1*resStride); + R2 = ploadu(r0+2*resStride); + R3 = ploadu(r0+3*resStride); + R4 = ploadu(r0+4*resStride); + R5 = ploadu(r0+5*resStride); + R6 = ploadu(r0+6*resStride); traits.acc(C0, alphav, R0); - pstoreu(r0, R0); - R0 = ploadu(r3 + ResPacketSize); + pstoreu(r0+0*resStride, R0); + R0 = ploadu(r0+7*resStride); traits.acc(C1, alphav, R1); traits.acc(C2, alphav, R2); @@ -792,239 +727,111 @@ EIGEN_ASM_COMMENT("mybegin4"); traits.acc(C6, alphav, R6); traits.acc(C7, alphav, R0); - pstoreu(r1, R1); - pstoreu(r2, R2); - pstoreu(r3, R3); - pstoreu(r0 + ResPacketSize, R4); - pstoreu(r1 + ResPacketSize, R5); - pstoreu(r2 + ResPacketSize, R6); - pstoreu(r3 + ResPacketSize, R0); + pstoreu(r0+1*resStride, R1); + pstoreu(r0+2*resStride, R2); + pstoreu(r0+3*resStride, R3); + pstoreu(r0+4*resStride, R4); + pstoreu(r0+5*resStride, R5); + pstoreu(r0+6*resStride, R6); + pstoreu(r0+7*resStride, R0); } - else + else // nr==4 { - ResPacket R0, R1, R4; + ResPacket R0, R1, R2; ResPacket alphav = pset1(alpha); - R0 = ploadu(r0); - R1 = ploadu(r1); - R4 = ploadu(r0 + ResPacketSize); + R0 = ploadu(r0+0*resStride); + R1 = ploadu(r0+1*resStride); + R2 = ploadu(r0+2*resStride); traits.acc(C0, alphav, R0); - pstoreu(r0, R0); - R0 = ploadu(r1 + ResPacketSize); + pstoreu(r0+0*resStride, R0); + R0 = ploadu(r0+3*resStride); + traits.acc(C1, alphav, R1); - traits.acc(C4, alphav, R4); - traits.acc(C5, alphav, R0); - pstoreu(r1, R1); - pstoreu(r0 + ResPacketSize, R4); - pstoreu(r1 + ResPacketSize, R0); + traits.acc(C2, alphav, R2); + traits.acc(C3, alphav, R0); + + pstoreu(r0+1*resStride, R1); + pstoreu(r0+2*resStride, R2); + pstoreu(r0+3*resStride, R0); } } - if(rows-peeled_mc>=LhsProgress) - { - Index i = peeled_mc; - const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress]; - prefetch(&blA[0]); - - // gets res block as register - AccPacket C0, C1, C2, C3; - traits.initAcc(C0); - traits.initAcc(C1); - if(nr==4) traits.initAcc(C2); - if(nr==4) traits.initAcc(C3); - - // performs "inner" product - const RhsScalar* blB = unpackedB; - for(Index k=0; k(alpha); - - ResScalar* r0 = &res[(j2+0)*resStride + i]; - ResScalar* r1 = r0 + resStride; - ResScalar* r2 = r1 + resStride; - ResScalar* r3 = r2 + resStride; - - R0 = ploadu(r0); - R1 = ploadu(r1); - if(nr==4) R2 = ploadu(r2); - if(nr==4) R3 = ploadu(r3); - - traits.acc(C0, alphav, R0); - traits.acc(C1, alphav, R1); - if(nr==4) traits.acc(C2, alphav, R2); - if(nr==4) traits.acc(C3, alphav, R3); - - pstoreu(r0, R0); - pstoreu(r1, R1); - if(nr==4) pstoreu(r2, R2); - if(nr==4) pstoreu(r3, R3); - } - for(Index i=peeled_mc2; i do the same but with nr==1 for(Index j2=packet_cols; j2(alpha); - - ResScalar* r0 = &res[(j2+0)*resStride + i]; - - R0 = ploadu(r0); - R4 = ploadu(r0+ResPacketSize); - - traits.acc(C0, alphav, R0); - traits.acc(C4, alphav, R4); - - pstoreu(r0, R0); - pstoreu(r0+ResPacketSize, R4); - } - if(rows-peeled_mc>=LhsProgress) - { - Index i = peeled_mc; - const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress]; - prefetch(&blA[0]); - AccPacket C0; traits.initAcc(C0); - const RhsScalar* blB = unpackedB; + const RhsScalar* blB = &blockB[j2*strideB+offsetB]; for(Index k=0; k(alpha); - ResPacket R0 = ploadu(&res[(j2+0)*resStride + i]); + ResScalar* r0 = &res[(j2+0)*resStride + i]; + R0 = ploadu(r0); traits.acc(C0, 
alphav, R0); - pstoreu(&res[(j2+0)*resStride + i], R0); + pstoreu(r0, R0); } - for(Index i=peeled_mc2; i=depth && offset<=stride)); - eigen_assert( (StorageOrder==RowMajor) || ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) ); + eigen_assert( (StorageOrder==RowMajor) || ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) ); conj_if::IsComplex && Conjugate> cj; const_blas_data_mapper lhs(_lhs,lhsStride); Index count = 0; @@ -1160,15 +931,25 @@ EIGEN_DONT_INLINE void gemm_pack_lhs=1*PacketSize) A = ploadu(&lhs(i+0*PacketSize, k)); - if(Pack1>=2*PacketSize) B = ploadu(&lhs(i+1*PacketSize, k)); - if(Pack1>=3*PacketSize) C = ploadu(&lhs(i+2*PacketSize, k)); - if(Pack1>=4*PacketSize) D = ploadu(&lhs(i+3*PacketSize, k)); - if(Pack1>=1*PacketSize) { pstore(blockA+count, cj.pconj(A)); count+=PacketSize; } - if(Pack1>=2*PacketSize) { pstore(blockA+count, cj.pconj(B)); count+=PacketSize; } - if(Pack1>=3*PacketSize) { pstore(blockA+count, cj.pconj(C)); count+=PacketSize; } - if(Pack1>=4*PacketSize) { pstore(blockA+count, cj.pconj(D)); count+=PacketSize; } + if((Pack1%PacketSize)==0) + { + Packet A, B, C, D; + if(Pack1>=1*PacketSize) A = ploadu(&lhs(i+0*PacketSize, k)); + if(Pack1>=2*PacketSize) B = ploadu(&lhs(i+1*PacketSize, k)); + if(Pack1>=3*PacketSize) C = ploadu(&lhs(i+2*PacketSize, k)); + if(Pack1>=4*PacketSize) D = ploadu(&lhs(i+3*PacketSize, k)); + if(Pack1>=1*PacketSize) { pstore(blockA+count, cj.pconj(A)); count+=PacketSize; } + if(Pack1>=2*PacketSize) { pstore(blockA+count, cj.pconj(B)); count+=PacketSize; } + if(Pack1>=3*PacketSize) { pstore(blockA+count, cj.pconj(C)); count+=PacketSize; } + if(Pack1>=4*PacketSize) { pstore(blockA+count, cj.pconj(D)); count+=PacketSize; } + } + else + { + if(Pack1>=1) blockA[count++] = cj(lhs(i+0, k)); + if(Pack1>=2) blockA[count++] = cj(lhs(i+1, k)); + if(Pack1>=3) blockA[count++] = cj(lhs(i+2, k)); + if(Pack1>=4) blockA[count++] = cj(lhs(i+3, k)); + } } } else @@ -1247,12 +1028,20 @@ EIGEN_DONT_INLINE void gemm_pack_rhs=4) blockB[count+2] = cj(b2[k]); + if(nr>=4) blockB[count+3] = cj(b3[k]); + if(nr>=8) blockB[count+4] = cj(b4[k]); + if(nr>=8) blockB[count+5] = cj(b5[k]); + if(nr>=8) blockB[count+6] = cj(b6[k]); + if(nr>=8) blockB[count+7] = cj(b7[k]); count += nr; } // skip what we have after @@ -1307,8 +1096,12 @@ EIGEN_DONT_INLINE void gemm_pack_rhs=4) blockB[count+2] = cj(b0[2]); + if(nr>=4) blockB[count+3] = cj(b0[3]); + if(nr>=8) blockB[count+4] = cj(b0[4]); + if(nr>=8) blockB[count+5] = cj(b0[5]); + if(nr>=8) blockB[count+6] = cj(b0[6]); + if(nr>=8) blockB[count+7] = cj(b0[7]); count += nr; } } diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 3dfd239c1..db1b353e7 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -81,9 +81,7 @@ static void run(Index rows, Index cols, Index depth, Index threads = omp_get_num_threads(); std::size_t sizeA = kc*mc; - std::size_t sizeW = kc*Traits::WorkSpaceFactor; ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, 0); - ei_declare_aligned_stack_constructed_variable(RhsScalar, w, sizeW, 0); RhsScalar* blockB = blocking.blockB(); eigen_internal_assert(blockB!=0); @@ -122,7 +120,7 @@ static void run(Index rows, Index cols, Index depth, if(shift>0) while(info[j].sync!=k) {} - gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*actual_kc, mc, actual_kc, info[j].rhs_length, alpha, -1,-1,0,0, w); + gebp(res+info[j].rhs_start*resStride, 
resStride, blockA, blockB+info[j].rhs_start*actual_kc, mc, actual_kc, info[j].rhs_length, alpha, -1,-1,0,0); } // Then keep going as usual with the remaining A' @@ -134,7 +132,7 @@ static void run(Index rows, Index cols, Index depth, pack_lhs(blockA, &lhs(i,k), lhsStride, actual_kc, actual_mc); // C_i += A' * B' - gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1,-1,0,0, w); + gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1,-1,0,0); } // Release all the sub blocks B'_j of B' for the current thread, @@ -152,11 +150,9 @@ static void run(Index rows, Index cols, Index depth, // this is the sequential version! std::size_t sizeA = kc*mc; std::size_t sizeB = kc*cols; - std::size_t sizeW = kc*Traits::WorkSpaceFactor; ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA()); ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB()); - ei_declare_aligned_stack_constructed_variable(RhsScalar, blockW, sizeW, blocking.blockW()); // For each horizontal panel of the rhs, and corresponding panel of the lhs... // (==GEMM_VAR1) @@ -182,7 +178,7 @@ static void run(Index rows, Index cols, Index depth, pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc); // Everything is packed, we can now call the block * panel kernel: - gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW); + gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0); } } } diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index 5c3763909..ffa871cae 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -73,11 +73,8 @@ struct general_matrix_matrix_triangular_product Traits::nr) mc = (mc/Traits::nr)*Traits::nr; - std::size_t sizeW = kc*Traits::WorkSpaceFactor; - std::size_t sizeB = sizeW + kc*size; ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0); - ei_declare_aligned_stack_constructed_variable(RhsScalar, allocatedBlockB, sizeB, 0); - RhsScalar* blockB = allocatedBlockB + sizeW; + ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, kc*size, 0); gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; @@ -103,15 +100,15 @@ struct general_matrix_matrix_triangular_product processed with gebp or skipped if (UpLo==Lower) gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha, - -1, -1, 0, 0, allocatedBlockB); + -1, -1, 0, 0); - sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha, allocatedBlockB); + sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha); if (UpLo==Upper) { Index j2 = i2+actual_mc; gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha, - -1, -1, 0, 0, allocatedBlockB); + -1, -1, 0, 0); } } } @@ -136,7 +133,7 @@ struct tribb_kernel enum { BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr) }; - void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha, RhsScalar* workspace) + void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) { gebp_kernel gebp_kernel; Matrix buffer; @@ -150,7 +147,7 
@@ struct tribb_kernel if(UpLo==Upper) gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha, - -1, -1, 0, 0, workspace); + -1, -1, 0, 0); // selfadjoint micro block { @@ -158,7 +155,7 @@ struct tribb_kernel buffer.setZero(); // 1 - apply the kernel on the temporary buffer gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, - -1, -1, 0, 0, workspace); + -1, -1, 0, 0); // 2 - triangular accumulation for(Index j1=0; j1=4) { blockB[count+2] = rhs(k,j2+2); blockB[count+3] = rhs(k,j2+3); } + if (nr>=8) + { + blockB[count+4] = rhs(k,j2+4); + blockB[count+5] = rhs(k,j2+5); + blockB[count+6] = rhs(k,j2+6); + blockB[count+7] = rhs(k,j2+7); + } count += nr; } } @@ -109,11 +116,18 @@ struct symm_pack_rhs { blockB[count+0] = numext::conj(rhs(j2+0,k)); blockB[count+1] = numext::conj(rhs(j2+1,k)); - if (nr==4) + if (nr>=4) { blockB[count+2] = numext::conj(rhs(j2+2,k)); blockB[count+3] = numext::conj(rhs(j2+3,k)); } + if (nr>=8) + { + blockB[count+4] = numext::conj(rhs(j2+4,k)); + blockB[count+5] = numext::conj(rhs(j2+5,k)); + blockB[count+6] = numext::conj(rhs(j2+6,k)); + blockB[count+7] = numext::conj(rhs(j2+7,k)); + } count += nr; } // symmetric @@ -137,11 +151,18 @@ struct symm_pack_rhs { blockB[count+0] = rhs(k,j2+0); blockB[count+1] = rhs(k,j2+1); - if (nr==4) + if (nr>=4) { blockB[count+2] = rhs(k,j2+2); blockB[count+3] = rhs(k,j2+3); } + if (nr>=8) + { + blockB[count+4] = rhs(k,j2+4); + blockB[count+5] = rhs(k,j2+5); + blockB[count+6] = rhs(k,j2+6); + blockB[count+7] = rhs(k,j2+7); + } count += nr; } } @@ -153,11 +174,18 @@ struct symm_pack_rhs { blockB[count+0] = numext::conj(rhs(j2+0,k)); blockB[count+1] = numext::conj(rhs(j2+1,k)); - if (nr==4) + if (nr>=4) { blockB[count+2] = numext::conj(rhs(j2+2,k)); blockB[count+3] = numext::conj(rhs(j2+3,k)); } + if (nr>=8) + { + blockB[count+4] = numext::conj(rhs(j2+4,k)); + blockB[count+5] = numext::conj(rhs(j2+5,k)); + blockB[count+6] = numext::conj(rhs(j2+6,k)); + blockB[count+7] = numext::conj(rhs(j2+7,k)); + } count += nr; } } @@ -422,11 +450,11 @@ struct SelfadjointProductMatrix NumTraits::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)), internal::traits::Flags&RowMajorBit ? 
RowMajor : ColMajor> ::run( - lhs.rows(), rhs.cols(), // sizes - &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info - &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info - &dst.coeffRef(0,0), dst.outerStride(), // result info - actualAlpha // alpha + lhs.rows(), rhs.cols(), // sizes + &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info + &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info + &dst.coeffRef(0,0), dst.outerStride(), // result info + actualAlpha // alpha ); } }; diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index 8110507b5..62575aff4 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -125,11 +125,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix triangularBuffer; triangularBuffer.setZero(); @@ -187,7 +185,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix0) @@ -197,7 +195,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix() (blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW); + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0); } } } @@ -266,11 +264,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix triangularBuffer; triangularBuffer.setZero(); @@ -357,14 +353,13 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix conj; gebp_kernel gebp_kernel; @@ -158,7 +156,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix conj; gebp_kernel gebp_kernel; @@ -285,8 +281,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix0) gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb, actual_mc, actual_kc, rs, Scalar(-1), - -1, -1, 0, 0, blockW); + -1, -1, 0, 0); } } } diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 76447c5b6..a4d7e454d 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -700,98 +700,42 @@ template class aligned_stack_memory_handler * \sa \ref TopicStlContainers. 
*/ template<class T> -class aligned_allocator +class aligned_allocator : public std::allocator<T> { public: - typedef size_t size_type; - typedef std::ptrdiff_t difference_type; - typedef T* pointer; - typedef const T* const_pointer; - typedef T& reference; - typedef const T& const_reference; - typedef T value_type; + typedef size_t size_type; + typedef std::ptrdiff_t difference_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef T value_type; - template<class U> - struct rebind - { - typedef aligned_allocator<U> other; - }; + template<class U> + struct rebind + { + typedef aligned_allocator<U> other; + }; - pointer address( reference value ) const - { - return &value; - } + aligned_allocator() : std::allocator<T>() {} - const_pointer address( const_reference value ) const - { - return &value; - } + aligned_allocator(const aligned_allocator& other) : std::allocator<T>(other) {} - aligned_allocator() - { - } + template<class U> + aligned_allocator(const aligned_allocator<U>& other) : std::allocator<T>(other) {} - aligned_allocator( const aligned_allocator& ) - { - } + ~aligned_allocator() {} - template<class U> - aligned_allocator( const aligned_allocator<U>& ) - { - } + pointer allocate(size_type num, const void* /*hint*/ = 0) + { + internal::check_size_for_overflow<T>(num); + return static_cast<pointer>( internal::aligned_malloc(num * sizeof(T)) ); + } - ~aligned_allocator() - { - } - - size_type max_size() const - { - return (std::numeric_limits<size_type>::max)(); - } - - pointer allocate( size_type num, const void* hint = 0 ) - { - EIGEN_UNUSED_VARIABLE(hint); - internal::check_size_for_overflow<T>(num); - return static_cast<pointer>( internal::aligned_malloc( num * sizeof(T) ) ); - } - - void construct( pointer p, const T& value ) - { - ::new( p ) T( value ); - } - -#if (__cplusplus >= 201103L) - template<typename U, typename... Args> - void construct( U* u, Args&&... args) - { - ::new( static_cast<void*>(u) ) U( std::forward<Args>( args )... ); - } -#endif - - void destroy( pointer p ) - { - p->~T(); - } - -#if (__cplusplus >= 201103L) - template<typename U> - void destroy( U* u ) - { - u->~U(); - } -#endif - - void deallocate( pointer p, size_type /*num*/ ) - { - internal::aligned_free( p ); - } - - bool operator!=(const aligned_allocator& ) const - { return false; } - - bool operator==(const aligned_allocator& ) const - { return true; } + void deallocate(pointer p, size_type /*num*/) + { + internal::aligned_free(p); + } }; //---------- Cache sizes ---------- diff --git a/Eigen/src/Geometry/AngleAxis.h b/Eigen/src/Geometry/AngleAxis.h index f424e6d7d..b42048c55 --- a/Eigen/src/Geometry/AngleAxis.h +++ b/Eigen/src/Geometry/AngleAxis.h @@ -165,8 +165,8 @@ AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const QuaternionBase<QuatDerived Scalar n2 = q.vec().squaredNorm(); if (n2 < NumTraits<Scalar>::dummy_precision()*NumTraits<Scalar>::dummy_precision()) { - m_angle = 0; - m_axis << 1, 0, 0; + m_angle = Scalar(0); + m_axis << Scalar(1), Scalar(0), Scalar(0); } else { diff --git a/Eigen/src/Householder/BlockHouseholder.h b/Eigen/src/Householder/BlockHouseholder.h index 1991c6527..60dbea5f5 --- a/Eigen/src/Householder/BlockHouseholder.h +++ b/Eigen/src/Householder/BlockHouseholder.h @@ -48,7 +48,7 @@ void apply_block_householder_on_the_left(MatrixType& mat, const VectorsType& vec typedef typename MatrixType::Index Index; enum { TFactorSize = MatrixType::ColsAtCompileTime }; Index nbVecs = vectors.cols(); - Matrix T(nbVecs,nbVecs); + Matrix T(nbVecs,nbVecs); make_block_householder_triangular_factor(T, vectors, hCoeffs); const TriangularView<const VectorsType, UnitLower>& V(vectors);
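As a usage note for the slimmed-down allocator, the documented STL-container pattern (see TopicStlContainers) still applies; a minimal sketch:

#include <vector>
#include <Eigen/Core>

// A std::vector of fixed-size vectorizable Eigen objects must use
// aligned_allocator so that every element keeps the required 16-byte alignment.
int main()
{
  std::vector<Eigen::Vector4f, Eigen::aligned_allocator<Eigen::Vector4f> > v(8);
  v.push_back(Eigen::Vector4f::Ones());
  return 0;
}

(With pre-C++11 compilers, Eigen's documentation additionally recommends #include <Eigen/StdVector> when using std::vector with fixed-size vectorizable types.)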