From 21edea5eddb282f12d38938d657973b1f8720779 Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen
Date: Thu, 15 Oct 2020 18:39:58 -0700
Subject: [PATCH] Fix the specialization of pfrexp for AVX to be faster when AVX2/AVX512DQ is not available, and avoid undefined behavior in C++. Also mask off the sign bit when extracting the exponent.

---
 Eigen/src/Core/arch/AVX/PacketMath.h | 31 ++++++++++++++--------------
 Eigen/src/Core/arch/SSE/PacketMath.h |  7 ++++---
 2 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 5dbb84616..ae4c4aaaa 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -691,26 +691,27 @@ template<> EIGEN_STRONG_INLINE Packet8f pfrexp<Packet8f>(const Packet8f& a, Pack
 template<> EIGEN_STRONG_INLINE Packet4d pfrexp<Packet4d>(const Packet4d& a, Packet4d& exponent) {
   const Packet4d cst_1022d = pset1<Packet4d>(1022.0);
   const Packet4d cst_half = pset1<Packet4d>(0.5);
-  const Packet4d cst_inv_mant_mask = pset1frombits<Packet4d>(static_cast<uint64_t>(~0x7ff0000000000000ull));
-  __m256i a_expo = _mm256_castpd_si256(a);
+  const Packet4d cst_exp_mask = pset1frombits<Packet4d>(static_cast<uint64_t>(0x7ff0000000000000ull));
+  __m256i a_expo = _mm256_castpd_si256(pand(a, cst_exp_mask));
 #ifdef EIGEN_VECTORIZE_AVX2
   a_expo = _mm256_srli_epi64(a_expo, 52);
-#else
-  __m128i lo = _mm_srli_epi64(_mm256_extractf128_si256(a_expo, 0), 52);
-  __m128i hi = _mm_srli_epi64(_mm256_extractf128_si256(a_expo, 1), 52);
-  a_expo = _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
 #endif
-#ifdef EIGEN_VECTORIZE_AVX512DQ
-  // AVX512DQ finally provides an instruction for this
+#if defined(EIGEN_VECTORIZE_AVX2) && defined(EIGEN_VECTORIZE_AVX512DQ)
   exponent = _mm256_cvtepi64_pd(a_expo);
-#else
-  exponent = _mm256_set_pd(static_cast<double>(_mm256_extract_epi64(a_expo, 3)),
-                           static_cast<double>(_mm256_extract_epi64(a_expo, 2)),
-                           static_cast<double>(_mm256_extract_epi64(a_expo, 1)),
-                           static_cast<double>(_mm256_extract_epi64(a_expo, 0)));
-#endif
+#else
+  __m128i lo = _mm256_extractf128_si256(a_expo, 0);
+  __m128i hi = _mm256_extractf128_si256(a_expo, 1);
+#ifndef EIGEN_VECTORIZE_AVX2
+  lo = _mm_srli_epi64(lo, 52);
+  hi = _mm_srli_epi64(hi, 52);
+#endif
+  Packet2d exponent_lo = _mm_cvtepi32_pd(vec4i_swizzle1(lo, 0, 2, 1, 3));
+  Packet2d exponent_hi = _mm_cvtepi32_pd(vec4i_swizzle1(hi, 0, 2, 1, 3));
+  exponent = _mm256_set_m128d(exponent_hi, exponent_lo);
+#endif // EIGEN_VECTORIZE_AVX512DQ
   exponent = psub(exponent, cst_1022d);
-  return por(pand(a, cst_inv_mant_mask), cst_half);
+  const Packet4d cst_mant_mask = pset1frombits<Packet4d>(static_cast<uint64_t>(~0x7ff0000000000000ull));
+  return por(pand(a, cst_mant_mask), cst_half);
 }
 
 template<> EIGEN_STRONG_INLINE Packet8f pldexp<Packet8f>(const Packet8f& a, const Packet8f& exponent) {
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 602adbad3..bd354c407 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -805,10 +805,11 @@ template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Pack
 template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
   const Packet2d cst_1022d = pset1<Packet2d>(1022.0);
   const Packet2d cst_half = pset1<Packet2d>(0.5);
-  const Packet2d cst_inv_mant_mask = pset1frombits<Packet2d>(static_cast<uint64_t>(~0x7ff0000000000000ull));
-  __m128i a_expo = _mm_srli_epi64(_mm_castpd_si128(a), 52);
+  const Packet2d cst_exp_mask = pset1frombits<Packet2d>(static_cast<uint64_t>(0x7ff0000000000000ull));
+  __m128i a_expo = _mm_srli_epi64(_mm_castpd_si128(pand(a, cst_exp_mask)), 52);
   exponent = psub(_mm_cvtepi32_pd(vec4i_swizzle1(a_expo, 0, 2, 1, 3)), cst_1022d);
-  return por(pand(a, cst_inv_mant_mask), cst_half);
+  const Packet2d cst_mant_mask = pset1frombits<Packet2d>(static_cast<uint64_t>(~0x7ff0000000000000ull));
+  return por(pand(a, cst_mant_mask), cst_half);
 }
 
 template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {