diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index fb584c2af..64479f5d4 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -45,11 +45,11 @@
 typedef __m128d Packet2d;
 typedef eigen_packet_wrapper<__m128i, 0> Packet4i;
 typedef eigen_packet_wrapper<__m128i, 1> Packet16b;
-typedef eigen_packet_wrapper<__m128i, 2> Packet2l;
 
 template<> struct is_arithmetic<__m128> { enum { value = true }; };
 template<> struct is_arithmetic<__m128i> { enum { value = true }; };
 template<> struct is_arithmetic<__m128d> { enum { value = true }; };
+template<> struct is_arithmetic<Packet4i> { enum { value = true }; };
 template<> struct is_arithmetic<Packet16b> { enum { value = true }; };
 
 #define EIGEN_SSE_SHUFFLE_MASK(p,q,r,s) ((s)<<6|(r)<<4|(q)<<2|(p))
@@ -194,7 +194,6 @@ template<> struct unpacket_traits<Packet4f> {
 template<> struct unpacket_traits<Packet2d> {
   typedef double type;
   typedef Packet2d half;
-  typedef Packet2l integer_packet;
   enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
 };
 template<> struct unpacket_traits<Packet4i> {
@@ -487,9 +486,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, con
 template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(Packet4i a) { return _mm_srai_epi32(a,N); }
 template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right(Packet4i a) { return _mm_srli_epi32(a,N); }
 template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left(Packet4i a) { return _mm_slli_epi32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet2l plogical_shift_right(Packet2l a) { return _mm_srli_epi64(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet2l plogical_shift_left(Packet2l a) { return _mm_slli_epi64(a,N); }
-
 
 #ifdef EIGEN_VECTORIZE_SSE4_1
 template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
@@ -756,15 +752,29 @@ template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Pack
 }
 
 template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
-  return pfrexp_double(a,exponent);
+  const Packet2d cst_1022d = pset1<Packet2d>(1022.0);
+  const Packet2d cst_half = pset1<Packet2d>(0.5);
+  const Packet2d cst_inv_mant_mask = pset1frombits<Packet2d>(static_cast<uint64_t>(~0x7ff0000000000000ull));
+  __m128i a_expo = _mm_srli_epi64(_mm_castpd_si128(a), 52);
+  exponent = psub(_mm_cvtepi32_pd(vec4i_swizzle1(a_expo, 0, 2, 1, 3)), cst_1022d);
+  return por(pand(a, cst_inv_mant_mask), cst_half);
 }
 
 template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
   return pldexp_float(a,exponent);
 }
 
+// We specialize pldexp here, since the generic implementation uses Packet2l, which is not well
+// supported by SSE, and has more range than is needed for exponents.
 template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
-  return pldexp_double(a,exponent);
+  const Packet2d cst_1023 = pset1<Packet2d>(1023.0);
+  // Add exponent offset.
+  __m64 ei = _mm_cvtpd_pi32(padd(exponent, cst_1023));
+  // Convert the exponents to int64 and swizzle to the low-order 32 bits.
+  __m128i el = _mm_set_epi64(_mm_setzero_si64(), ei);
+  el = vec4i_swizzle1(el, 0, 3, 1, 3);
+  // return a * 2^exponent
+  return pmul(a, _mm_castsi128_pd(_mm_slli_epi64(el, 52)));
 }
 
 // with AVX, the default implementations based on pload1 are faster
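For reference, both new Packet2d kernels above come down to IEEE-754 bit manipulation: pfrexp reads the biased exponent field (bits 52..62) and forces the mantissa into [0.5, 1), while pldexp materializes 2^e by shifting a biased exponent into that field and multiplying. The scalar sketch below shows the same tricks; it is an illustration only, not part of the patch. frexp_via_bits/ldexp_via_bits are hypothetical names; like the patch, the frexp sketch leaves the sign bit in the shifted word (so it assumes positive finite inputs), and the ldexp sketch assumes e + 1023 stays in the normal-exponent range.

#include <cstdint>
#include <cstring>
#include <cstdio>

static std::uint64_t to_bits(double d) { std::uint64_t b; std::memcpy(&b, &d, sizeof b); return b; }
static double from_bits(std::uint64_t b) { double d; std::memcpy(&d, &b, sizeof d); return d; }

// frexp: exponent = biased exponent field - 1022; mantissa = input with its
// exponent field replaced by 0.5's, giving a value in [0.5, 1).
double frexp_via_bits(double a, double* exponent) {
  *exponent = (double)(int)(to_bits(a) >> 52) - 1022.0;   // cf. _mm_srli_epi64(..., 52) and psub
  return from_bits((to_bits(a) & ~0x7ff0000000000000ULL)  // cf. pand with cst_inv_mant_mask
                   | to_bits(0.5));                       // cf. por with cst_half
}

// ldexp: build 2^e directly from its bit pattern, then multiply.
double ldexp_via_bits(double a, int e) {
  return a * from_bits((std::uint64_t)(e + 1023) << 52);  // cf. _mm_slli_epi64(el, 52) and pmul
}

int main() {
  double e;
  double m = frexp_via_bits(48.0, &e);
  std::printf("%g * 2^%g = %g\n", m, e, ldexp_via_bits(m, (int)e));  // prints: 0.75 * 2^6 = 48
  return 0;
}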
diff --git a/Eigen/src/Core/arch/SSE/TypeCasting.h b/Eigen/src/Core/arch/SSE/TypeCasting.h
index bb68986c2..3e6cd90e5 100644
--- a/Eigen/src/Core/arch/SSE/TypeCasting.h
+++ b/Eigen/src/Core/arch/SSE/TypeCasting.h
@@ -69,19 +69,6 @@ template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f
   return _mm_cvtps_pd(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2l pcast<Packet2d, Packet2l>(const Packet2d& a) {
-  // using a[1]/a[0] to get high/low 64 bit from __m128d is faster than _mm_cvtsd_f64() ,but
-  // it will trigger the bug report at https://gitlab.com/libeigen/eigen/-/issues/1997 since the
-  // a[index] ops was not supported by MSVC compiler(supported by gcc).
-#if EIGEN_COMP_MSVC
-  return _mm_set_epi64x(int64_t(_mm_cvtsd_f64(_mm_unpackhi_pd(a,a))), int64_t(_mm_cvtsd_f64(a)));
-#elif ((defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW) && (__GXX_ABI_VERSION < 1004)) || EIGEN_OS_QNX
-  return _mm_set_epi64x(int64_t(a.m_val[1]), int64_t(a.m_val[0]));
-#else
-  return _mm_set_epi64x(int64_t(a[1]), int64_t(a[0]));
-#endif
-}
-
 template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4f>(const Packet4f& a) {
   return _mm_castps_si128(a);
 }
@@ -90,36 +77,6 @@ template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet4i>(const Pa
   return _mm_castsi128_ps(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2l preinterpret<Packet2l, Packet2d>(const Packet2d& a) {
-  return _mm_castpd_si128(a);
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet2l>(const Packet2l& a) {
-  return _mm_castsi128_pd(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2d pcast<Packet2l, Packet2d>(const Packet2l& a) {
-#ifdef EIGEN_VECTORIZE_AVX512DQ
-  // AVX512DQ finally provides an instruction for this
-  return _mm_cvtepi64_pd(a);
-#else
-  // Before AVX512, there is no packed epi64 to double cast instruction
-  // The idea is to convert upper and lower half separately, via bit-twiddling
-  // then add them together, but remove the offsets
-  Packet2d upper = preinterpret<Packet2d>(plogical_shift_right<32>(a));
-  Packet2d lower = pand(pset1frombits<Packet2d>(static_cast<uint64_t>(0xffffffffULL)), preinterpret<Packet2d>(a));
-  // upper = 2**(53+32) + ((a >> 32) + 0x80000000)
-  upper = pxor(pset1frombits<Packet2d>(static_cast<uint64_t>(0x4530000080000000ULL)), upper); // exponent of 52+32, and xor the upper bit of 32bit mantissa
-  // lower = 2**53 + (a & 0xffffffff)
-  lower = pxor(pset1frombits<Packet2d>(static_cast<uint64_t>(0x4330000000000000ULL)), lower); // exponent of 52
-  // adding upper+lower would be 2**84+2**63+2**52 too big. Create the negative of that:
-  Packet2d offset = pset1frombits<Packet2d>(static_cast<uint64_t>(0xC530000080100000ULL));
-  // add everything together, start with the bigger numbers, since the 2**84 will cancel out, giving an exact result
-  return padd(padd(offset, upper), lower);
-#endif
-}
-
 // Disable the following code since it's broken on too many platforms / compilers.
 //#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
 #if 0
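The deleted pcast<Packet2l, Packet2d> above used the classic split-halves trick for int64 -> double conversion before AVX512DQ. Since the magic constants are easy to lose track of, here is a scalar sketch of the same bit-twiddling; it is an illustration only, not part of the patch, and cast_i64_to_f64/from_bits are hypothetical names.

#include <cstdint>
#include <cstring>
#include <cstdio>

static double from_bits(std::uint64_t b) { double d; std::memcpy(&d, &b, sizeof d); return d; }

double cast_i64_to_f64(std::int64_t x) {
  std::uint64_t u = (std::uint64_t)x;
  // upper = 2^84 + ((x >> 32) ^ 0x80000000): exponent 0x453 scales the high
  // 32 bits by 2^32; xoring bit 31 biases away the sign of the high half.
  double upper = from_bits(0x4530000080000000ULL ^ (u >> 32));
  // lower = 2^52 + (x & 0xffffffff): exponent 0x433 holds the low 32 bits.
  double lower = from_bits(0x4330000000000000ULL | (u & 0xffffffffULL));
  // offset = -(2^84 + 2^63 + 2^52) removes both biases; summing the big terms
  // first lets the 2^84 cancel exactly, as the deleted comment noted.
  const double offset = from_bits(0xC530000080100000ULL);
  return (offset + upper) + lower;
}

int main() {
  std::printf("%.1f %.1f\n", cast_i64_to_f64(-42), cast_i64_to_f64((std::int64_t)1 << 40));
  // prints: -42.0 1099511627776.0
  return 0;
}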