From 7b2901e2aa4d69415d38c0df37ff0d2521c9c64b Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 21 Sep 2022 00:27:23 +0000 Subject: [PATCH] Add vectorized integer division for int32 with AVX512, AVX or SSE. --- Eigen/Core | 4 ++++ Eigen/src/Core/arch/AVX/PacketMath.h | 22 +++++++++++++++++----- Eigen/src/Core/arch/AVX512/PacketMath.h | 16 +++++++++++++++- Eigen/src/Core/arch/SSE/PacketMath.h | 22 ++++++++++++++++++++++ Eigen/src/Core/functors/BinaryFunctors.h | 24 ++++++++++++++++++++++-- 5 files changed, 80 insertions(+), 8 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index 623d735d6..48c212189 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -177,6 +177,10 @@ using std::ptrdiff_t; #include "src/Core/arch/Default/TypeCasting.h" #include "src/Core/arch/Default/GenericPacketMathFunctionsFwd.h" +#ifndef EIGEN_GPU_COMPILE_PHASE + #include +#endif + #if defined EIGEN_VECTORIZE_AVX512 #if defined EIGEN_VECTORIZE_AVX512FP16 #include "src/Core/arch/AVX512/PacketMathFP16.h" diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 8f346f36d..227e88a86 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -212,6 +212,7 @@ template<> struct packet_traits : default_packet_traits Vectorizable = 1, AlignedOnScalar = 1, HasCmp = 1, + HasDiv = 1, size=8 }; }; @@ -545,13 +546,19 @@ template<> EIGEN_STRONG_INLINE Packet8i pmul(const Packet8i& a, const template<> EIGEN_STRONG_INLINE Packet8f pdiv(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d pdiv(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet8i pdiv(const Packet8i& /*a*/, const Packet8i& /*b*/) -{ eigen_assert(false && "packet integer division are not supported by AVX"); - return pset1(0); + +template<> EIGEN_STRONG_INLINE Packet8i pdiv(const Packet8i& a, const Packet8i& b) +{ +#ifdef EIGEN_VECTORIZE_AVX512 + return _mm512_cvttpd_epi32(_mm512_div_pd(_mm512_cvtepi32_pd(a), _mm512_cvtepi32_pd(b))); +#else + Packet4i lo = pdiv(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0)); + Packet4i hi = pdiv(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1)); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); +#endif } #ifdef EIGEN_VECTORIZE_FMA - template <> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) { return _mm256_fmadd_ps(a, b, c); @@ -1226,7 +1233,12 @@ template<> EIGEN_STRONG_INLINE double predux_max(const Packet4d& a) template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x) { - return _mm256_movemask_ps(x)!=0; + return _mm256_movemask_ps(x) != 0; +} + +template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8i& x) +{ + return _mm256_movemask_ps(_mm256_castsi256_ps(x)) != 0; } EIGEN_DEVICE_FUNC inline void diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 7b07149a0..5e9670c52 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -178,6 +178,7 @@ template<> struct packet_traits : default_packet_traits Vectorizable = 1, AlignedOnScalar = 1, HasCmp = 1, + HasDiv = 1, size=16 }; }; @@ -389,12 +390,21 @@ EIGEN_STRONG_INLINE Packet16f pdiv(const Packet16f& a, const Packet16f& b) { return _mm512_div_ps(a, b); } + template <> EIGEN_STRONG_INLINE Packet8d pdiv(const Packet8d& a, const Packet8d& b) { return _mm512_div_pd(a, b); } +template <> +EIGEN_STRONG_INLINE Packet16i pdiv(const Packet16i& a, + const Packet16i& b) { + Packet8i q_lo = pdiv(_mm512_extracti64x4_epi64(a, 0), _mm512_extracti64x4_epi64(b,0)); + Packet8i q_hi = pdiv(_mm512_extracti64x4_epi64(a, 1), _mm512_extracti64x4_epi64(b, 1)); + return _mm512_inserti64x4(_mm512_castsi256_si512(q_lo), q_hi, 1); +} + #ifdef EIGEN_VECTORIZE_FMA template <> EIGEN_STRONG_INLINE Packet16f pmadd(const Packet16f& a, const Packet16f& b, @@ -1378,7 +1388,11 @@ template<> EIGEN_STRONG_INLINE bool predux_any(const Packet16f& x) return !_mm512_kortestz(tmp,tmp); } - +template<> EIGEN_STRONG_INLINE bool predux_any(const Packet16i& x) +{ + __mmask16 tmp = _mm512_test_epi32_mask(x,x); + return !_mm512_kortestz(tmp,tmp); +} #define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \ EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[INDEX], INPUT[INDEX + STRIDE]); diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 0fa43949e..f9426689e 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -195,6 +195,7 @@ template<> struct packet_traits : default_packet_traits Vectorizable = 1, AlignedOnScalar = 1, HasCmp = 1, + HasDiv=1, size=4, HasShift = 1, @@ -369,6 +370,22 @@ template<> EIGEN_STRONG_INLINE Packet16b pmul(const Packet16b& a, con template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); } +template <> +EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& a, + const Packet4i& b) { +#ifdef EIGEN_VECTORIZE_AVX + return _mm256_cvttpd_epi32( + _mm256_div_pd(_mm256_cvtepi32_pd(a), _mm256_cvtepi32_pd(b))); +#else + __m128i q_lo = _mm_cvttpd_epi32(_mm_div_pd(_mm_cvtepi32_pd(a), _mm_cvtepi32_pd(b))); + __m128i q_hi = + _mm_cvttpd_epi32(_mm_div_pd(_mm_cvtepi32_pd(vec4i_swizzle1(a, 2, 3, 0, 1)), + _mm_cvtepi32_pd(vec4i_swizzle1(b, 2, 3, 0, 1)))); + return vec4i_swizzle1(_mm_unpacklo_epi32(q_lo, q_hi), 0, 2, 1, 3); +#endif +} + + // for some weird raisons, it has to be overloaded for packet of integers template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); } #ifdef EIGEN_VECTORIZE_FMA @@ -1142,6 +1159,11 @@ template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) return _mm_movemask_ps(x) != 0x0; } +template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4i& x) +{ + return _mm_movemask_ps(_mm_castsi128_ps(x)) != 0x0; +} + EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]); diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h index 094acb401..9b560e991 100644 --- a/Eigen/src/Core/functors/BinaryFunctors.h +++ b/Eigen/src/Core/functors/BinaryFunctors.h @@ -376,6 +376,24 @@ struct functor_traits > { }; }; +template ::type>::IsInteger> +struct maybe_raise_div_by_zero { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Packet x) { + EIGEN_UNUSED_VARIABLE(x); + } +}; + +#ifndef EIGEN_GPU_COMPILE_PHASE +template +struct maybe_raise_div_by_zero { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Packet x) { + if (EIGEN_PREDICT_FALSE(predux_any(pcmp_eq(x, pzero(x))))) { + std::raise(SIGFPE); + } + } +}; +#endif + /** \internal * \brief Template functor to compute the quotient of two scalars * @@ -392,8 +410,10 @@ struct scalar_quotient_op : binary_op_base #endif EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a / b; } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const - { return internal::pdiv(a,b); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { + maybe_raise_div_by_zero::run(b); + return internal::pdiv(a,b); + } }; template struct functor_traits > {