diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 3b94af528..4a0c72e04 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -560,7 +560,7 @@ EIGEN_STRONG_INLINE std::enable_if_t<(N >= 32) && (N < 63), Packet4l> parithmeti
 }
 template <int N>
 EIGEN_STRONG_INLINE std::enable_if_t<(N == 63), Packet4l> parithmetic_shift_right(Packet4l a) {
-  return _mm256_shuffle_epi32(_mm256_srai_epi32(a, 31), (shuffle_mask<1, 1, 3, 3>::mask));
+  return _mm256_cmpgt_epi64(_mm256_setzero_si256(), a);
 }
 template <int N>
 EIGEN_STRONG_INLINE std::enable_if_t<(N < 0) || (N > 63), Packet4l> parithmetic_shift_right(Packet4l a) {
@@ -1802,14 +1802,12 @@ EIGEN_STRONG_INLINE Packet4ul preverse(const Packet4ul& a) {
 // pabs should be ok
 template <>
 EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a) {
-  const Packet8f mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF,
-                                                              0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF));
+  const Packet8f mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
   return _mm256_and_ps(a, mask);
 }
 template <>
 EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a) {
-  const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF,
-                                                              0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF));
+  const Packet4d mask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7FFFFFFFFFFFFFFF));
   return _mm256_and_pd(a, mask);
 }
 template <>
@@ -1829,28 +1827,28 @@ EIGEN_STRONG_INLINE Packet8ui pabs(const Packet8ui& a) {
 
 template <>
 EIGEN_STRONG_INLINE Packet8h psignbit(const Packet8h& a) {
-  return _mm_srai_epi16(a, 15);
+  return _mm_cmpgt_epi16(_mm_setzero_si128(), a);
 }
 template <>
 EIGEN_STRONG_INLINE Packet8bf psignbit(const Packet8bf& a) {
-  return _mm_srai_epi16(a, 15);
+  return _mm_cmpgt_epi16(_mm_setzero_si128(), a);
 }
 template <>
 EIGEN_STRONG_INLINE Packet8f psignbit(const Packet8f& a) {
-  return _mm256_castsi256_ps(parithmetic_shift_right<31>((Packet8i)_mm256_castps_si256(a)));
+  return _mm256_castsi256_ps(_mm256_cmpgt_epi32(_mm256_setzero_si256(), _mm256_castps_si256(a)));
 }
 template <>
 EIGEN_STRONG_INLINE Packet8ui psignbit(const Packet8ui& a) {
-  return pzero(a);
+  return _mm256_setzero_si256();
 }
 #ifdef EIGEN_VECTORIZE_AVX2
 template <>
 EIGEN_STRONG_INLINE Packet4d psignbit(const Packet4d& a) {
-  return _mm256_castsi256_pd(parithmetic_shift_right<63>((Packet4l)_mm256_castpd_si256(a)));
+  return _mm256_castsi256_pd(_mm256_cmpgt_epi64(_mm256_setzero_si256(), _mm256_castpd_si256(a)));
 }
 template <>
 EIGEN_STRONG_INLINE Packet4ul psignbit(const Packet4ul& a) {
-  return pzero(a);
+  return _mm256_setzero_si256();
 }
 #endif
 
diff --git a/Eigen/src/Core/arch/AVX/TypeCasting.h b/Eigen/src/Core/arch/AVX/TypeCasting.h
index 2581eff66..9dcd6ef84 100644
--- a/Eigen/src/Core/arch/AVX/TypeCasting.h
+++ b/Eigen/src/Core/arch/AVX/TypeCasting.h
@@ -200,10 +200,38 @@ EIGEN_STRONG_INLINE Packet4l pcast<Packet4d, Packet4l>(const Packet4d& a) {
 #if defined(EIGEN_VECTORIZE_AVX512DQ) && defined(EIGEN_VECTORIZE_AVX512VL)
   return _mm256_cvttpd_epi64(a);
 #else
-  EIGEN_ALIGN16 double aux[4];
-  pstore(aux, a);
-  return _mm256_set_epi64x(static_cast<int64_t>(aux[3]), static_cast<int64_t>(aux[2]), static_cast<int64_t>(aux[1]),
-                           static_cast<int64_t>(aux[0]));
+
+  // if 'a' exceeds the numerical limits of int64_t, the behavior is undefined
+
+  // e <= 0 corresponds to |a| < 1, which should result in zero. incidentally, intel intrinsics with shift arguments
+  // greater than or equal to 64 produce zero. furthermore, negative shifts appear to be interpreted as large positive
+  // shifts (two's complement), which also result in zero. therefore, e does not need to be clamped to [0, 64)
+
+  constexpr int kTotalBits = sizeof(double) * CHAR_BIT, kMantissaBits = std::numeric_limits<double>::digits - 1,
+                kExponentBits = kTotalBits - kMantissaBits - 1, kBias = (1 << (kExponentBits - 1)) - 1;
+
+  const __m256i cst_one = _mm256_set1_epi64x(1);
+  const __m256i cst_total_bits = _mm256_set1_epi64x(kTotalBits);
+  const __m256i cst_bias = _mm256_set1_epi64x(kBias);
+
+  __m256i a_bits = _mm256_castpd_si256(a);
+  // shift left by 1 to clear the sign bit, and shift right by kMantissaBits + 1 to recover biased exponent
+  __m256i biased_e = _mm256_srli_epi64(_mm256_slli_epi64(a_bits, 1), kMantissaBits + 1);
+  __m256i e = _mm256_sub_epi64(biased_e, cst_bias);
+
+  // shift to the left by kExponentBits + 1 to clear the sign and exponent bits
+  __m256i shifted_mantissa = _mm256_slli_epi64(a_bits, kExponentBits + 1);
+  // shift to the right by kTotalBits - e to convert the significand to an integer
+  __m256i result_significand = _mm256_srlv_epi64(shifted_mantissa, _mm256_sub_epi64(cst_total_bits, e));
+
+  // add the implied bit
+  __m256i result_exponent = _mm256_sllv_epi64(cst_one, e);
+  // e <= 0 is interpreted as a large positive shift (2's complement), which also conveniently results in zero
+  __m256i result = _mm256_add_epi64(result_significand, result_exponent);
+  // handle negative arguments
+  __m256i sign_mask = _mm256_cmpgt_epi64(_mm256_setzero_si256(), a_bits);
+  result = _mm256_sub_epi64(_mm256_xor_si256(result, sign_mask), sign_mask);
+  return result;
 #endif
 }
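For reference, every PacketMath.h change above relies on the same identity: for a signed integer, an arithmetic right shift by width-1 produces an all-ones mask exactly when the value is negative, which is also what a signed compare against zero produces. AVX2 has no 64-bit arithmetic shift intrinsic (_mm256_srai_epi64 requires AVX-512VL), which is presumably why the old N == 63 case emulated it with _mm256_srai_epi32 plus a shuffle; _mm256_cmpgt_epi64 against zero yields the same mask in a single instruction. A minimal scalar sketch of that equivalence (illustrative only; the helper name is hypothetical):

#include <cassert>
#include <cstdint>

// Scalar model of one 64-bit lane: what _mm256_cmpgt_epi64(_mm256_setzero_si256(), a)
// computes, versus the arithmetic-shift form the old code emulated.
int64_t signbit_mask(int64_t x) {
  int64_t by_compare = (0 > x) ? int64_t(-1) : int64_t(0);  // all-ones mask iff x is negative
  int64_t by_shift = x >> 63;  // arithmetic right shift by width-1 (sign-extending on mainstream targets; guaranteed since C++20)
  assert(by_compare == by_shift);
  return by_compare;
}

For example, signbit_mask(-5) returns -1 (all bits set) and signbit_mask(7) returns 0, mirroring the per-lane masks psignbit and parithmetic_shift_right<63> produce.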
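The new pcast<Packet4d, Packet4l> fallback decodes the IEEE-754 representation directly: recover the unbiased exponent e, shift the 52 fraction bits into integer position, add the implicit leading bit as 2^e, and negate via two's complement where the sign bit is set. Below is a scalar sketch of the same steps (illustrative only; the function name is hypothetical, and because plain C++ shifts by 64 or more are undefined, unlike _mm256_srlv_epi64 which simply yields zero, the |x| < 1 and e == 0 cases are guarded explicitly here):

#include <climits>
#include <cstdint>
#include <cstring>
#include <limits>

// Scalar model of the vectorized truncation; as in the patch, the behavior is
// undefined if x does not fit in int64_t.
int64_t cast_double_to_int64(double x) {
  constexpr int kTotalBits = sizeof(double) * CHAR_BIT;                   // 64
  constexpr int kMantissaBits = std::numeric_limits<double>::digits - 1;  // 52
  constexpr int kExponentBits = kTotalBits - kMantissaBits - 1;           // 11
  constexpr int kBias = (1 << (kExponentBits - 1)) - 1;                   // 1023

  uint64_t bits;
  std::memcpy(&bits, &x, sizeof(bits));

  // clear the sign bit, shift the biased exponent down, then unbias it
  int64_t e = static_cast<int64_t>((bits << 1) >> (kMantissaBits + 1)) - kBias;
  if (e < 0) return 0;  // |x| < 1 truncates to zero (the AVX code gets this for free from oversized shifts)

  // fraction bits left-aligned, with sign and exponent cleared
  uint64_t shifted_mantissa = bits << (kExponentBits + 1);
  // move the fraction into integer position; guard e == 0 because a scalar shift by 64 is undefined
  uint64_t significand = (e == 0) ? 0 : (shifted_mantissa >> (kTotalBits - e));
  // add the implied leading bit, 2^e
  uint64_t magnitude = significand + (uint64_t{1} << e);

  // two's-complement negation when the sign bit is set
  return (bits >> 63) ? -static_cast<int64_t>(magnitude) : static_cast<int64_t>(magnitude);
}

For example, cast_double_to_int64(-7.9) yields -7, matching truncation toward zero; the vector version reaches the same result by xor-ing with the sign mask and subtracting it.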