diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 74e1174ae..967a07df5 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -287,6 +287,21 @@ template EIGEN_DEVICE_FUNC inline typename unpacket_traits EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a) { return a; } +template +struct protate_impl +{ + /* Generic fallback used when no architecture-specific protate_impl applies: run() hands the packet back unchanged, so no rotation is performed here. NOTE(review): for a nonzero offset this is a silent no-op rather than a compile error - confirm that callers only rely on rotation where a specialization exists. */ static Packet run(const Packet& a) { return a; } +}; + +/** \internal \returns a packet with the coefficients rotated to the right in little-endian convention, + * by the given offset, e.g. for offset == 1: + * (packet[3], packet[2], packet[1], packet[0]) becomes (packet[0], packet[3], packet[2], packet[1]) + * An offset of 0 returns \a a directly without calling protate_impl. The static assertion below requires offset < unpacket_traits::size and reports ROTATION_BY_ILLEGAL_OFFSET otherwise. NOTE(review): only the upper bound is checked; if offset is a signed template parameter a negative value would pass - confirm the parameter type. */ +template EIGEN_DEVICE_FUNC inline Packet protate(const Packet& a) +{ + EIGEN_STATIC_ASSERT(offset < unpacket_traits::size, ROTATION_BY_ILLEGAL_OFFSET); + return offset ? protate_impl::run(a) : a; +} /** \internal \returns \a a with real and imaginary part flipped (for complex type only) */ template EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 8149aed7f..e9af45f22 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -309,6 +309,23 @@ template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { a_hi = vget_high_s32(a_r64); return vcombine_s32(a_hi, a_lo); } + +template +struct protate_impl +{ + /* NEON Packet4f rotation: forwards to vextq_f32 with the same packet given as both vector operands and offset as the index. */ static Packet4f run(const Packet4f& a) { + return vextq_f32(a, a, offset); + } +}; + +template +struct protate_impl +{ + /* NEON Packet4i rotation: same scheme as the Packet4f version, using vextq_s32. */ static Packet4i run(const Packet4i& a) { + return vextq_s32(a, a, offset); + } +}; + template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); } template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); } @@ -625,6 +642,14 @@ template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { retu template<> EIGEN_STRONG_INLINE Packet2d preverse(const 
Packet2d& a) { return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); } +template +struct protate_impl +{ + /* NEON Packet2d rotation: forwards to vextq_f64 with the same packet given as both vector operands and offset as the index. */ static Packet2d run(const Packet2d& a) { + return vextq_f64(a, a, offset); + } +}; + template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); } #if EIGEN_COMP_CLANG && defined(__apple_build_version__) diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index b5a0ba2bc..3653783fd 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -462,6 +462,29 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return _mm_shuffle_epi32(a,0x1B); } +template +struct protate_impl +{ + /* SSE Packet4f rotation: calls vec4f_swizzle1 with the index arguments offset, (offset + 1) % 4, (offset + 2) % 4 and (offset + 3) % 4 (presumably the source lane for each result lane - confirm against the swizzle macro). */ static Packet4f run(const Packet4f& a) { + return vec4f_swizzle1(a, offset, (offset + 1) % 4, (offset + 2) % 4, (offset + 3) % 4); + } +}; + +template +struct protate_impl +{ + /* SSE Packet4i rotation: same index arguments as the Packet4f version, passed to vec4i_swizzle1. */ static Packet4i run(const Packet4i& a) { + return vec4i_swizzle1(a, offset, (offset + 1) % 4, (offset + 2) % 4, (offset + 3) % 4); + } +}; + +template +struct protate_impl +{ + /* SSE Packet2d rotation: calls vec2d_swizzle1 with the index arguments offset and (offset + 1) % 2. */ static Packet2d run(const Packet2d& a) { + return vec2d_swizzle1(a, offset, (offset + 1) % 2); + } +}; template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index c8a1dcced..6a16aa661 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -771,7 +771,19 @@ void gebp_kernel(&blB[(0+4*K)*RhsProgress]); \ + } else { \ + EIGEN_ASM_COMMENT("Do not reorder code, we're very tight on registers"); \ + B_0 = protate<1>(B_0); \ + } \ + } else { \ + traits.loadRhs(&blB[(N+4*K)*RhsProgress], B_0); \ + } \ + } while (false) + +#define EIGEN_GEBP_ONESTEP(K) \ do { \ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \ EIGEN_ASM_COMMENT("Note: these asm comments work 
around bug 935!"); \ @@ -827,34 +853,34 @@ void gebp_kernel resblock; \ + resblock.packet[0] = res0; \ + resblock.packet[1] = res1; \ + resblock.packet[2] = res2; \ + resblock.packet[3] = res3; \ + ptranspose(resblock); \ + resblock.packet[3] = protate<1>(resblock.packet[3]); \ + resblock.packet[2] = protate<2>(resblock.packet[2]); \ + resblock.packet[1] = protate<3>(resblock.packet[1]); \ + ptranspose(resblock); \ + res0 = resblock.packet[0]; \ + res1 = resblock.packet[1]; \ + res2 = resblock.packet[2]; \ + res3 = resblock.packet[3]; \ + } while (false) + + EIGEN_GEBP_UNROTATE_RESULT(C0, C1, C2, C3); + EIGEN_GEBP_UNROTATE_RESULT(C4, C5, C6, C7); + EIGEN_GEBP_UNROTATE_RESULT(C8, C9, C10, C11); + } + ResPacket R0, R1, R2; ResPacket alphav = pset1(alpha); diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h index 7538a0633..5e16b775b 100644 --- a/Eigen/src/Core/util/StaticAssert.h +++ b/Eigen/src/Core/util/StaticAssert.h @@ -93,7 +93,8 @@ THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH, OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG, IMPLICIT_CONVERSION_TO_SCALAR_IS_FOR_INNER_PRODUCT_ONLY, - STORAGE_LAYOUT_DOES_NOT_MATCH + STORAGE_LAYOUT_DOES_NOT_MATCH, + ROTATION_BY_ILLEGAL_OFFSET }; };