From 9bd8a4bab58231a1d3afe0dd43a7c72f217dfec1 Mon Sep 17 00:00:00 2001
From: Benoit Jacob
Date: Wed, 18 Feb 2015 15:03:35 -0500
Subject: [PATCH] bug #955 - Implement a rotating kernel alternative in the
 3px4 gebp path

This is substantially faster on ARM, where it's important to minimize the
number of loads. This is specific to the case where all packet types are of
size 4. I made my best attempt to minimize how dirty this is... opinions
welcome. Eventually one could have a generic rotating kernel, but it would
take some work to get there. Also, on Sandy Bridge, in my experience, it's
not beneficial (it's even about 1% slower).
---
 Eigen/src/Core/GenericPacketMath.h            | 15 ++++
 Eigen/src/Core/arch/NEON/PacketMath.h         | 25 ++++++
 Eigen/src/Core/arch/SSE/PacketMath.h          | 23 +++++
 .../Core/products/GeneralBlockPanelKernel.h   | 86 +++++++++++++++----
 Eigen/src/Core/util/StaticAssert.h            |  3 +-
 5 files changed, 133 insertions(+), 19 deletions(-)

diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 74e1174ae..967a07df5 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -287,6 +287,21 @@ template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Pac
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a)
 { return a; }
 
+template<int offset, typename Packet>
+struct protate_impl
+{
+  static Packet run(const Packet& a) { return a; }
+};
+
+/** \internal \returns a packet with the coefficients rotated to the right in little-endian convention,
+  * by the given offset, e.g. for offset == 1:
+  * (packet[3], packet[2], packet[1], packet[0]) becomes (packet[0], packet[3], packet[2], packet[1])
+  */
+template<int offset, typename Packet> EIGEN_DEVICE_FUNC inline Packet protate(const Packet& a)
+{
+  EIGEN_STATIC_ASSERT(offset < unpacket_traits<Packet>::size, ROTATION_BY_ILLEGAL_OFFSET);
+  return offset ? protate_impl<offset, Packet>::run(a) : a;
+}
 
 /** \internal \returns \a a with real and imaginary part flipped (for complex type only) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a)
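Note (not part of the patch): the semantics of protate<offset> can be pinned
down with a scalar model: lane i of the result is lane (i + offset) % size of
the input, which for offset == 1 turns lanes (0,1,2,3) into (1,2,3,0). A
minimal reference implementation (protate_ref is a made-up name, not an Eigen
function), handy for checking the arch-specific versions below:

    #include <cstdio>

    // Scalar model of protate<offset>: result lane i takes input lane
    // (i + offset) % N.
    template <int offset, int N>
    void protate_ref(const float (&in)[N], float (&out)[N]) {
      for (int i = 0; i < N; ++i)
        out[i] = in[(i + offset) % N];
    }

    int main() {
      float a[4] = {0.f, 1.f, 2.f, 3.f};  // a[0] is lane 0 (little-endian convention)
      float r[4];
      protate_ref<1>(a, r);
      // Prints "1 2 3 0"; in the doc comment's (lane3,lane2,lane1,lane0)
      // notation, (3,2,1,0) becomes (0,3,2,1).
      std::printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]);
      return 0;
    }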
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 8149aed7f..e9af45f22 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -309,6 +309,23 @@ template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
   a_r64 = vrev64q_s32(a);
   a_lo = vget_low_s32(a_r64);
   a_hi = vget_high_s32(a_r64);
   return vcombine_s32(a_hi, a_lo);
 }
+
+template<int offset>
+struct protate_impl<offset, Packet4f>
+{
+  static Packet4f run(const Packet4f& a) {
+    return vextq_f32(a, a, offset);
+  }
+};
+
+template<int offset>
+struct protate_impl<offset, Packet4i>
+{
+  static Packet4i run(const Packet4i& a) {
+    return vextq_s32(a, a, offset);
+  }
+};
+
 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); }
 template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); }
@@ -625,6 +642,14 @@ template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { retu
 
 template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); }
+
+template<int offset>
+struct protate_impl<offset, Packet2d>
+{
+  static Packet2d run(const Packet2d& a) {
+    return vextq_f64(a, a, offset);
+  }
+};
+
 template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); }
 
 #if EIGEN_COMP_CLANG && defined(__apple_build_version__)
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index b5a0ba2bc..3653783fd 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -462,6 +462,29 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
 template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
 { return _mm_shuffle_epi32(a,0x1B); }
 
+template<int offset>
+struct protate_impl<offset, Packet4f>
+{
+  static Packet4f run(const Packet4f& a) {
+    return vec4f_swizzle1(a, offset, (offset + 1) % 4, (offset + 2) % 4, (offset + 3) % 4);
+  }
+};
+
+template<int offset>
+struct protate_impl<offset, Packet4i>
+{
+  static Packet4i run(const Packet4i& a) {
+    return vec4i_swizzle1(a, offset, (offset + 1) % 4, (offset + 2) % 4, (offset + 3) % 4);
+  }
+};
+
+template<int offset>
+struct protate_impl<offset, Packet2d>
+{
+  static Packet2d run(const Packet2d& a) {
+    return vec2d_swizzle1(a, offset, (offset + 1) % 2);
+  }
+};
 
 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
 {
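Note (not part of the patch): a standalone sanity check for the SSE variant,
assuming an x86 machine (raw SSE intrinsics only, no Eigen headers). For
offset == 1, vec4f_swizzle1(a, 1, 2, 3, 0) selects the same lane permutation
as the _mm_shuffle_ps below, so both should print "1 2 3 0":

    #include <xmmintrin.h>
    #include <cstdio>

    int main() {
      alignas(16) const float in[4] = {0.f, 1.f, 2.f, 3.f};
      alignas(16) float out[4];
      __m128 a = _mm_load_ps(in);
      // Rotation by 1: result lane i is input lane (i + 1) % 4.
      __m128 r = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 3, 2, 1));
      _mm_store_ps(out, r);
      std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
      return 0;
    }

On NEON, vextq_f32(a, a, 1) produces the same lanes: it extracts four
consecutive lanes starting at index 1 from the concatenation of a with itself.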
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index c8a1dcced..6a16aa661 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -771,7 +771,19 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjug
-#define EIGEN_GEBP_ONESTEP(K) \
+#define EIGEN_GEBP_ONESTEP_LOADRHS(K,N) \
+          do { \
+            if (UseRotatingKernel) { \
+              if (N == 0) { \
+                B_0 = pload<RhsPacket>(&blB[(0+4*K)*RhsProgress]); \
+              } else { \
+                EIGEN_ASM_COMMENT("Do not reorder code, we're very tight on registers"); \
+                B_0 = protate<1>(B_0); \
+              } \
+            } else { \
+              traits.loadRhs(&blB[(N+4*K)*RhsProgress], B_0); \
+            } \
+          } while (false)
+
+#define EIGEN_GEBP_ONESTEP(K) \
             do { \
               EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
               EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
@@ -827,34 +853,34 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjug
+          if (UseRotatingKernel) {
+            #define EIGEN_GEBP_UNROTATE_RESULT(res0, res1, res2, res3) \
+              do { \
+                PacketBlock<ResPacket> resblock; \
+                resblock.packet[0] = res0; \
+                resblock.packet[1] = res1; \
+                resblock.packet[2] = res2; \
+                resblock.packet[3] = res3; \
+                ptranspose(resblock); \
+                resblock.packet[3] = protate<1>(resblock.packet[3]); \
+                resblock.packet[2] = protate<2>(resblock.packet[2]); \
+                resblock.packet[1] = protate<3>(resblock.packet[1]); \
+                ptranspose(resblock); \
+                res0 = resblock.packet[0]; \
+                res1 = resblock.packet[1]; \
+                res2 = resblock.packet[2]; \
+                res3 = resblock.packet[3]; \
+              } while (false)
+
+            EIGEN_GEBP_UNROTATE_RESULT(C0, C1, C2, C3);
+            EIGEN_GEBP_UNROTATE_RESULT(C4, C5, C6, C7);
+            EIGEN_GEBP_UNROTATE_RESULT(C8, C9, C10, C11);
+          }
+
           ResPacket R0, R1, R2;
           ResPacket alphav = pset1<ResPacket>(alpha);
diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h
index 7538a0633..5e16b775b 100644
--- a/Eigen/src/Core/util/StaticAssert.h
+++ b/Eigen/src/Core/util/StaticAssert.h
@@ -93,7 +93,8 @@
         THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH,
         OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG,
         IMPLICIT_CONVERSION_TO_SCALAR_IS_FOR_INNER_PRODUCT_ONLY,
-        STORAGE_LAYOUT_DOES_NOT_MATCH
+        STORAGE_LAYOUT_DOES_NOT_MATCH,
+        ROTATION_BY_ILLEGAL_OFFSET
       };
     };
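Note (not part of the patch): why the EIGEN_GEBP_UNROTATE_RESULT fixup is
correct. With the rotating loads, step N of the micro-kernel multiplies by the
rhs packet rotated N times, so lane l of accumulator N ends up holding the sum
that belongs to rhs column (l + N) % 4. Viewing the four accumulators as a 4x4
block, transposing, rotating row l right by (4 - l) % 4 (hence protate<3>,
protate<2>, protate<1> on rows 1, 2, 3) and transposing back moves every sum
to the lane the plain kernel would have produced. A scalar simulation of one
4-lane strip (a sketch with made-up array names, not Eigen code):

    #include <cstdio>

    int main() {
      const int depth = 8;
      float A[depth][4], b[depth][4];  // per step k: one lhs packet, four rhs coefficients
      for (int k = 0; k < depth; ++k)
        for (int l = 0; l < 4; ++l) {
          A[k][l] = float(k + l + 1);
          b[k][l] = float(2 * k - l);
        }

      // Plain kernel: accumulator N gets A[k] times the broadcast b[k][N].
      // Rotating kernel: step N multiplies by b[k] rotated N times instead,
      // so lane l of accumulator N picks up b[k][(l + N) % 4].
      float C_ref[4][4] = {}, C_rot[4][4] = {};
      for (int k = 0; k < depth; ++k)
        for (int N = 0; N < 4; ++N)
          for (int l = 0; l < 4; ++l) {
            C_ref[N][l] += A[k][l] * b[k][N];
            C_rot[N][l] += A[k][l] * b[k][(l + N) % 4];
          }

      // EIGEN_GEBP_UNROTATE_RESULT on scalars: transpose, rotate row l
      // right by (4 - l) % 4, transpose back.
      float T[4][4], R[4][4], F[4][4];
      for (int i = 0; i < 4; ++i)
        for (int j = 0; j < 4; ++j)
          T[i][j] = C_rot[j][i];
      for (int l = 0; l < 4; ++l)
        for (int N = 0; N < 4; ++N)
          R[l][N] = T[l][(N + (4 - l) % 4) % 4];
      for (int i = 0; i < 4; ++i)
        for (int j = 0; j < 4; ++j)
          F[i][j] = R[j][i];

      for (int i = 0; i < 4; ++i)
        for (int j = 0; j < 4; ++j)
          if (F[i][j] != C_ref[i][j]) { std::puts("mismatch"); return 1; }
      std::puts("unrotated result matches the plain kernel");
      return 0;
    }

The float comparisons are exact here because both sides accumulate the same
products in the same k order.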