diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 9f81a4623..b1f7b7717 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -397,7 +397,7 @@ template<> EIGEN_STRONG_INLINE void pbroadcast4(const float *a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) { - a3 = pload(a); + a3 = ploadu(a); a0 = vec4f_swizzle1(a3, 0,0,0,0); a1 = vec4f_swizzle1(a3, 1,1,1,1); a2 = vec4f_swizzle1(a3, 2,2,2,2); @@ -413,10 +413,10 @@ pbroadcast4(const double *a, a2 = _mm_loaddup_pd(a+2); a3 = _mm_loaddup_pd(a+3); #else - a1 = pload(a); + a1 = ploadu(a); a0 = vec2d_swizzle1(a1, 0,0); a1 = vec2d_swizzle1(a1, 1,1); - a3 = pload(a+2); + a3 = ploadu(a+2); a2 = vec2d_swizzle1(a3, 0,0); a3 = vec2d_swizzle1(a3, 1,1); #endif diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index ba6fad246..8a398d912 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -161,11 +161,11 @@ public: NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, - // register block size along the N direction (must be either 2 or 4) - nr = NumberOfRegisters/4, + // register block size along the N direction (must be either 4 or 8) + nr = NumberOfRegisters/2, // register block size along the M direction (currently, this one cannot be modified) - mr = 2 * LhsPacketSize, + mr = LhsPacketSize, WorkSpaceFactor = nr * RhsPacketSize, @@ -187,6 +187,16 @@ public: { p = pset1(ResScalar(0)); } + + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + { + pbroadcast4(b, b0, b1, b2, b3); + } + + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) + { + pbroadcast2(b, b0, b1); + } EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { @@ -230,8 +240,8 @@ public: ResPacketSize = Vectorizable ? packet_traits::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, - nr = NumberOfRegisters/4, - mr = 2 * LhsPacketSize, + nr = NumberOfRegisters/2, + mr = LhsPacketSize, WorkSpaceFactor = nr*RhsPacketSize, LhsProgress = LhsPacketSize, @@ -262,6 +272,16 @@ public: { dest = pload(a); } + + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + { + pbroadcast4(b, b0, b1, b2, b3); + } + + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) + { + pbroadcast2(b, b0, b1); + } EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const { @@ -304,8 +324,9 @@ public: RealPacketSize = Vectorizable ? packet_traits::size : 1, ResPacketSize = Vectorizable ? packet_traits::size : 1, - nr = 2, - mr = 2 * ResPacketSize, + // FIXME: should depend on NumberOfRegisters + nr = 4, + mr = ResPacketSize, WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr, LhsProgress = ResPacketSize, @@ -333,16 +354,37 @@ public: p.second = pset1(RealScalar(0)); } + // Scalar path EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { dest = pset1(*b); } + // Vectorized path EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const { dest.first = pset1(real(*b)); dest.second = pset1(imag(*b)); } + + // linking error if instantiated without being optimized out: + void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3); + + // Vectorized path + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacket& b0, DoublePacket& b1) + { + // FIXME not sure that's the best way to implement it! + loadRhs(b+0, b0); + loadRhs(b+1, b1); + } + + // Scalar path + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1) + { + // FIXME not sure that's the best way to implement it! + loadRhs(b+0, b0); + loadRhs(b+1, b1); + } // nothing special here EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const @@ -414,8 +456,9 @@ public: ResPacketSize = Vectorizable ? packet_traits::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, + // FIXME: should depend on NumberOfRegisters nr = 4, - mr = 2*ResPacketSize, + mr = ResPacketSize, WorkSpaceFactor = nr*RhsPacketSize, LhsProgress = ResPacketSize, @@ -441,6 +484,16 @@ public: { dest = pset1(*b); } + + // linking error if instantiated without being optimized out: + void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3); + + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) + { + // FIXME not sure that's the best way to implement it! + b0 = pload1(b+0); + b1 = pload1(b+1); + } EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { @@ -511,11 +564,9 @@ void gebp_kernel if(strideA==-1) strideA = depth; if(strideB==-1) strideB = depth; conj_helper cj; -// conj_helper pcj; Index packet_cols = (cols/nr) * nr; + // Here we assume that mr==LhsProgress const Index peeled_mc = (rows/mr)*mr; - // FIXME: - const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsProgress ? LhsProgress : 0); const Index peeled_kc = (depth/4)*4; // loops on each micro vertical panel of rhs (depth x nr) @@ -527,144 +578,88 @@ void gebp_kernel for(Index i=0; i(alpha); - R0 = ploadu(r0); - R1 = ploadu(r1); - R2 = ploadu(r2); - R3 = ploadu(r3); - R4 = ploadu(r0 + ResPacketSize); - R5 = ploadu(r1 + ResPacketSize); - R6 = ploadu(r2 + ResPacketSize); + R0 = ploadu(r0+0*resStride); + R1 = ploadu(r0+1*resStride); + R2 = ploadu(r0+2*resStride); + R3 = ploadu(r0+3*resStride); + R4 = ploadu(r0+4*resStride); + R5 = ploadu(r0+5*resStride); + R6 = ploadu(r0+6*resStride); traits.acc(C0, alphav, R0); - pstoreu(r0, R0); - R0 = ploadu(r3 + ResPacketSize); + pstoreu(r0+0*resStride, R0); + R0 = ploadu(r0+7*resStride); traits.acc(C1, alphav, R1); traits.acc(C2, alphav, R2); @@ -739,232 +711,107 @@ EIGEN_ASM_COMMENT("mybegin4"); traits.acc(C6, alphav, R6); traits.acc(C7, alphav, R0); - pstoreu(r1, R1); - pstoreu(r2, R2); - pstoreu(r3, R3); - pstoreu(r0 + ResPacketSize, R4); - pstoreu(r1 + ResPacketSize, R5); - pstoreu(r2 + ResPacketSize, R6); - pstoreu(r3 + ResPacketSize, R0); + pstoreu(r0+1*resStride, R1); + pstoreu(r0+2*resStride, R2); + pstoreu(r0+3*resStride, R3); + pstoreu(r0+4*resStride, R4); + pstoreu(r0+5*resStride, R5); + pstoreu(r0+6*resStride, R6); + pstoreu(r0+7*resStride, R0); } - else + else // nr==4 { - ResPacket R0, R1, R4; + ResPacket R0, R1, R2; ResPacket alphav = pset1(alpha); - R0 = ploadu(r0); - R1 = ploadu(r1); - R4 = ploadu(r0 + ResPacketSize); + R0 = ploadu(r0+0*resStride); + R1 = ploadu(r0+1*resStride); + R2 = ploadu(r0+2*resStride); traits.acc(C0, alphav, R0); - pstoreu(r0, R0); - R0 = ploadu(r1 + ResPacketSize); + pstoreu(r0+0*resStride, R0); + R0 = ploadu(r0+3*resStride); + traits.acc(C1, alphav, R1); - traits.acc(C4, alphav, R4); - traits.acc(C5, alphav, R0); - pstoreu(r1, R1); - pstoreu(r0 + ResPacketSize, R4); - pstoreu(r1 + ResPacketSize, R0); + traits.acc(C2, alphav, R2); + traits.acc(C3, alphav, R0); + + pstoreu(r0+1*resStride, R1); + pstoreu(r0+2*resStride, R2); + pstoreu(r0+3*resStride, R0); } } - if(rows-peeled_mc>=LhsProgress) - { - Index i = peeled_mc; - const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress]; - prefetch(&blA[0]); - - // gets res block as register - AccPacket C0, C1, C2, C3; - traits.initAcc(C0); - traits.initAcc(C1); - if(nr==4) traits.initAcc(C2); - if(nr==4) traits.initAcc(C3); - - // performs "inner" product - const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; - for(Index k=0; k(alpha); - - ResScalar* r0 = &res[(j2+0)*resStride + i]; - ResScalar* r1 = r0 + resStride; - ResScalar* r2 = r1 + resStride; - ResScalar* r3 = r2 + resStride; - - R0 = ploadu(r0); - R1 = ploadu(r1); - if(nr==4) R2 = ploadu(r2); - if(nr==4) R3 = ploadu(r3); - - traits.acc(C0, alphav, R0); - traits.acc(C1, alphav, R1); - if(nr==4) traits.acc(C2, alphav, R2); - if(nr==4) traits.acc(C3, alphav, R3); - - pstoreu(r0, R0); - pstoreu(r1, R1); - if(nr==4) pstoreu(r2, R2); - if(nr==4) pstoreu(r3, R3); - } - for(Index i=peeled_mc2; i do the same but with nr==1 for(Index j2=packet_cols; j2(alpha); - - ResScalar* r0 = &res[(j2+0)*resStride + i]; - - R0 = ploadu(r0); - R4 = ploadu(r0+ResPacketSize); - - traits.acc(C0, alphav, R0); - traits.acc(C4, alphav, R4); - - pstoreu(r0, R0); - pstoreu(r0+ResPacketSize, R4); - } - if(rows-peeled_mc>=LhsProgress) - { - Index i = peeled_mc; - const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress]; - prefetch(&blA[0]); - AccPacket C0; traits.initAcc(C0); @@ -1025,19 +832,23 @@ EIGEN_ASM_COMMENT("mybegin4"); { LhsPacket A0; RhsPacket B_0; - traits.loadLhs(blA, A0); - traits.loadRhs(blB, B_0); - traits.madd(A0, B_0, C0, B_0); + RhsPacket T0; + + traits.loadLhs(&blA[0*LhsProgress], A0); + traits.loadRhs(&blB[0*RhsProgress], B_0); + traits.madd(A0,B_0,C0,T0); + blB += RhsProgress; blA += LhsProgress; } - + ResPacket R0; ResPacket alphav = pset1(alpha); - ResPacket R0 = ploadu(&res[(j2+0)*resStride + i]); + ResScalar* r0 = &res[(j2+0)*resStride + i]; + R0 = ploadu(r0); traits.acc(C0, alphav, R0); - pstoreu(&res[(j2+0)*resStride + i], R0); + pstoreu(r0, R0); } - for(Index i=peeled_mc2; i=depth && offset<=stride)); - eigen_assert( (StorageOrder==RowMajor) || ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) ); + eigen_assert( (StorageOrder==RowMajor) || ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) ); conj_if::IsComplex && Conjugate> cj; const_blas_data_mapper lhs(_lhs,lhsStride); Index count = 0; @@ -1104,15 +915,25 @@ EIGEN_DONT_INLINE void gemm_pack_lhs=1*PacketSize) A = ploadu(&lhs(i+0*PacketSize, k)); - if(Pack1>=2*PacketSize) B = ploadu(&lhs(i+1*PacketSize, k)); - if(Pack1>=3*PacketSize) C = ploadu(&lhs(i+2*PacketSize, k)); - if(Pack1>=4*PacketSize) D = ploadu(&lhs(i+3*PacketSize, k)); - if(Pack1>=1*PacketSize) { pstore(blockA+count, cj.pconj(A)); count+=PacketSize; } - if(Pack1>=2*PacketSize) { pstore(blockA+count, cj.pconj(B)); count+=PacketSize; } - if(Pack1>=3*PacketSize) { pstore(blockA+count, cj.pconj(C)); count+=PacketSize; } - if(Pack1>=4*PacketSize) { pstore(blockA+count, cj.pconj(D)); count+=PacketSize; } + if((Pack1%PacketSize)==0) + { + Packet A, B, C, D; + if(Pack1>=1*PacketSize) A = ploadu(&lhs(i+0*PacketSize, k)); + if(Pack1>=2*PacketSize) B = ploadu(&lhs(i+1*PacketSize, k)); + if(Pack1>=3*PacketSize) C = ploadu(&lhs(i+2*PacketSize, k)); + if(Pack1>=4*PacketSize) D = ploadu(&lhs(i+3*PacketSize, k)); + if(Pack1>=1*PacketSize) { pstore(blockA+count, cj.pconj(A)); count+=PacketSize; } + if(Pack1>=2*PacketSize) { pstore(blockA+count, cj.pconj(B)); count+=PacketSize; } + if(Pack1>=3*PacketSize) { pstore(blockA+count, cj.pconj(C)); count+=PacketSize; } + if(Pack1>=4*PacketSize) { pstore(blockA+count, cj.pconj(D)); count+=PacketSize; } + } + else + { + if(Pack1>=1) blockA[count++] = cj(lhs(i+0, k)); + if(Pack1>=2) blockA[count++] = cj(lhs(i+1, k)); + if(Pack1>=3) blockA[count++] = cj(lhs(i+2, k)); + if(Pack1>=4) blockA[count++] = cj(lhs(i+3, k)); + } } } else @@ -1191,12 +1012,20 @@ EIGEN_DONT_INLINE void gemm_pack_rhs=4) blockB[count+2] = cj(b2[k]); + if(nr>=4) blockB[count+3] = cj(b3[k]); + if(nr>=8) blockB[count+4] = cj(b4[k]); + if(nr>=8) blockB[count+5] = cj(b5[k]); + if(nr>=8) blockB[count+6] = cj(b6[k]); + if(nr>=8) blockB[count+7] = cj(b7[k]); count += nr; } // skip what we have after @@ -1251,8 +1080,12 @@ EIGEN_DONT_INLINE void gemm_pack_rhs=4) blockB[count+2] = cj(b0[2]); + if(nr>=4) blockB[count+3] = cj(b0[3]); + if(nr>=8) blockB[count+4] = cj(b0[4]); + if(nr>=8) blockB[count+5] = cj(b0[5]); + if(nr>=8) blockB[count+6] = cj(b0[6]); + if(nr>=8) blockB[count+7] = cj(b0[7]); count += nr; } } diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index 99cf9e0ae..d9fd9f556 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -63,7 +63,7 @@ struct symm_pack_lhs for(Index i=peeled_mc; i=4) { blockB[count+2] = rhs(k,j2+2); blockB[count+3] = rhs(k,j2+3); } + if (nr>=8) + { + blockB[count+4] = rhs(k,j2+4); + blockB[count+5] = rhs(k,j2+5); + blockB[count+6] = rhs(k,j2+6); + blockB[count+7] = rhs(k,j2+7); + } count += nr; } } @@ -109,11 +116,18 @@ struct symm_pack_rhs { blockB[count+0] = numext::conj(rhs(j2+0,k)); blockB[count+1] = numext::conj(rhs(j2+1,k)); - if (nr==4) + if (nr>=4) { blockB[count+2] = numext::conj(rhs(j2+2,k)); blockB[count+3] = numext::conj(rhs(j2+3,k)); } + if (nr>=8) + { + blockB[count+4] = numext::conj(rhs(j2+4,k)); + blockB[count+5] = numext::conj(rhs(j2+5,k)); + blockB[count+6] = numext::conj(rhs(j2+6,k)); + blockB[count+7] = numext::conj(rhs(j2+7,k)); + } count += nr; } // symmetric @@ -137,11 +151,18 @@ struct symm_pack_rhs { blockB[count+0] = rhs(k,j2+0); blockB[count+1] = rhs(k,j2+1); - if (nr==4) + if (nr>=4) { blockB[count+2] = rhs(k,j2+2); blockB[count+3] = rhs(k,j2+3); } + if (nr>=8) + { + blockB[count+4] = rhs(k,j2+4); + blockB[count+5] = rhs(k,j2+5); + blockB[count+6] = rhs(k,j2+6); + blockB[count+7] = rhs(k,j2+7); + } count += nr; } } @@ -153,11 +174,18 @@ struct symm_pack_rhs { blockB[count+0] = numext::conj(rhs(j2+0,k)); blockB[count+1] = numext::conj(rhs(j2+1,k)); - if (nr==4) + if (nr>=4) { blockB[count+2] = numext::conj(rhs(j2+2,k)); blockB[count+3] = numext::conj(rhs(j2+3,k)); } + if (nr>=8) + { + blockB[count+4] = numext::conj(rhs(j2+4,k)); + blockB[count+5] = numext::conj(rhs(j2+5,k)); + blockB[count+6] = numext::conj(rhs(j2+6,k)); + blockB[count+7] = numext::conj(rhs(j2+7,k)); + } count += nr; } } @@ -422,11 +450,11 @@ struct SelfadjointProductMatrix NumTraits::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)), internal::traits::Flags&RowMajorBit ? RowMajor : ColMajor> ::run( - lhs.rows(), rhs.cols(), // sizes - &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info - &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info - &dst.coeffRef(0,0), dst.outerStride(), // result info - actualAlpha // alpha + lhs.rows(), rhs.cols(), // sizes + &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info + &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info + &dst.coeffRef(0,0), dst.outerStride(), // result info + actualAlpha // alpha ); } };