From ad59ade116969ca7b18409d690caf00c0b1c34c7 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 28 Mar 2014 12:11:23 -0700 Subject: [PATCH] Vectorized the loop peeling of the inner loop of the block-panel matrix multiplication code. This speeds up the multiplication of matrices whose size is not a multiple of the packet size. --- .../Core/products/GeneralBlockPanelKernel.h | 222 ++++++++++++------ 1 file changed, 156 insertions(+), 66 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 0f47f6de5..3ed1fc5a3 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -206,6 +206,11 @@ public: dest = pload(a); } + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + { + dest = ploadu(a); + } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, AccPacket& tmp) const { // It would be a lot cleaner to call pmadd all the time. Unfortunately if we @@ -278,7 +283,12 @@ public: { dest = pload(a); } - + + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + { + dest = ploadu(a); + } + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) { pbroadcast4(b, b0, b1, b2, b3); @@ -334,7 +344,9 @@ public: && packet_traits::Vectorizable, RealPacketSize = Vectorizable ? packet_traits::size : 1, ResPacketSize = Vectorizable ? packet_traits::size : 1, - + LhsPacketSize = Vectorizable ? packet_traits::size : 1, + RhsPacketSize = Vectorizable ? 
packet_traits::size : 1, + // FIXME: should depend on NumberOfRegisters nr = 4, mr = ResPacketSize, @@ -402,6 +414,11 @@ public: dest = pload((const typename unpacket_traits::type*)(a)); } + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + { + dest = ploadu((const typename unpacket_traits::type*)(a)); + } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacket& c, RhsPacket& /*tmp*/) const { c.first = padd(pmul(a,b.first), c.first); @@ -509,6 +526,11 @@ public: dest = ploaddup(a); } + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + { + dest = ploaddup(a); + } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const { madd_impl(a, b, c, tmp, typename conditional::type()); @@ -706,49 +728,84 @@ void gebp_kernel const LhsScalar* blA = &blockA[i*strideA+offsetA]; prefetch(&blA[0]); - // gets a 1 x 8 res block as registers - ResScalar C0(0), C1(0), C2(0), C3(0), C4(0), C5(0), C6(0), C7(0); // FIXME directly use blockB ??? 
const RhsScalar* blB = &blockB[j2*strideB+offsetB*8]; - // TODO peel this loop - for(Index k=0; k SwappedTraits; + typedef typename SwappedTraits::ResScalar SResScalar; + typedef typename SwappedTraits::LhsPacket SLhsPacket; + typedef typename SwappedTraits::RhsPacket SRhsPacket; + typedef typename SwappedTraits::ResPacket SResPacket; + typedef typename SwappedTraits::AccPacket SAccPacket; + SwappedTraits straits; + + SAccPacket C0; + straits.initAcc(C0); + for(Index k=0; k(&res[j2*resStride + i], resStride); + SResPacket alphav = pset1(alpha); + straits.acc(C0, alphav, R); + pscatter(&res[j2*resStride + i], R, resStride); + + EIGEN_ASM_COMMENT("end_vectorized_multiplication_of_last_rows"); + } + else + { + // gets a 1 x 8 res block as registers + ResScalar C0(0), C1(0), C2(0), C3(0), C4(0), C5(0), C6(0), C7(0); + + for(Index k=0; k const LhsScalar* blA = &blockA[i*strideA+offsetA]; prefetch(&blA[0]); - // gets a 1 x 4 res block as registers - ResScalar C0(0), C1(0), C2(0), C3(0); // FIXME directly use blockB ??? const RhsScalar* blB = &blockB[j2*strideB+offsetB*4]; - // TODO peel this loop - for(Index k=0; k SwappedTraits; + typedef typename SwappedTraits::ResScalar SResScalar; + typedef typename SwappedTraits::LhsPacket SLhsPacket; + typedef typename SwappedTraits::RhsPacket SRhsPacket; + typedef typename SwappedTraits::ResPacket SResPacket; + typedef typename SwappedTraits::AccPacket SAccPacket; + SwappedTraits straits; + + SAccPacket C0; + straits.initAcc(C0); + for(Index k=0; k(&res[j2*resStride + i], resStride); + SResPacket alphav = pset1(alpha); + straits.acc(C0, alphav, R); + pscatter(&res[j2*resStride + i], R, resStride); + + EIGEN_ASM_COMMENT("end_vectorized_multiplication_of_last_rows"); + } else { + // gets a 1 x 4 res block as registers + ResScalar C0(0), C1(0), C2(0), C3(0); + + for(Index k=0; k