diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 4b56f0f7d..58a197f91 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -44,24 +44,30 @@ namespace internal { struct default_packet_traits { enum { + // Ops that are implemented for most types. HasAdd = 1, HasSub = 1, HasShift = 1, HasMul = 1, HasNegate = 1, HasAbs = 1, - HasArg = 0, HasAbs2 = 1, - HasAbsDiff = 0, HasMin = 1, HasMax = 1, HasConj = 1, HasSetLinear = 1, HasSign = 1, + + HasArg = 0, + HasAbsDiff = 0, HasBlend = 0, // This flag is used to indicate whether packet comparison is supported. // pcmp_eq, pcmp_lt and pcmp_le should be defined for it to be true. HasCmp = 0, + HasRound = 0, + HasRint = 0, + HasFloor = 0, + HasCeil = 0, HasDiv = 0, HasReciprocal = 0, @@ -73,7 +79,6 @@ struct default_packet_traits { HasLog1p = 0, HasLog10 = 0, HasPow = 0, - HasSin = 0, HasCos = 0, HasTan = 0, @@ -96,12 +101,7 @@ struct default_packet_traits { HasIGammaDerA = 0, HasGammaSampleDerAlpha = 0, HasIGammac = 0, - HasBetaInc = 0, - - HasRound = 0, - HasRint = 0, - HasFloor = 0, - HasCeil = 0 + HasBetaInc = 0 }; }; diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index bdbf75956..bc4eb9a6a 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -52,6 +52,7 @@ typedef __m128d Packet2d; typedef eigen_packet_wrapper<__m128i, 0> Packet4i; typedef eigen_packet_wrapper<__m128i, 1> Packet16b; typedef eigen_packet_wrapper<__m128i, 4> Packet4ui; +typedef eigen_packet_wrapper<__m128i, 5> Packet2l; template <> struct is_arithmetic<__m128> { @@ -69,6 +70,10 @@ template <> struct is_arithmetic { enum { value = true }; }; +template <> +struct is_arithmetic { + enum { value = true }; +}; // Note that `Packet4ui` uses the underlying type `__m128i`, which is // interpreted as a vector of _signed_ `int32`s, which breaks some arithmetic // operations used in `GenericPacketMath.h`. 
@@ -213,10 +218,10 @@ struct packet_traits : default_packet_traits { enum { Vectorizable = 1, AlignedOnScalar = 1, - HasCmp = 1, - HasDiv = 1, size = 4, + HasCmp = 1, + HasDiv = 1, HasShift = 1, HasBlend = 1 }; @@ -232,10 +237,22 @@ struct packet_traits : default_packet_traits { HasDiv = 0, HasNegate = 0, - HasSqrt = 0, HasCmp = 1, - HasMin = 1, - HasMax = 1, + HasShift = 1, + HasBlend = 1 + }; +}; +template <> +struct packet_traits : default_packet_traits { + typedef Packet2l type; + typedef Packet2l half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 2, + + HasDiv = 0, + HasCmp = 1, HasShift = 1, HasBlend = 1 }; @@ -250,12 +267,8 @@ struct packet_traits : default_packet_traits { AlignedOnScalar = 1, size = 16, - HasAdd = 1, - HasSub = 1, HasCmp = 1, // note -- only pcmp_eq is defined HasShift = 0, - HasMul = 1, - HasNegate = 1, HasAbs = 0, HasAbs2 = 0, HasMin = 0, @@ -283,6 +296,19 @@ template <> struct unpacket_traits { typedef double type; typedef Packet2d half; + typedef Packet2l integer_packet; + enum { + size = 2, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template <> +struct unpacket_traits { + typedef int64_t type; + typedef Packet2l half; enum { size = 2, alignment = Aligned16, @@ -348,6 +374,10 @@ EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return _mm_set1_pd(from); } template <> +EIGEN_STRONG_INLINE Packet2l pset1(const int64_t& from) { + return _mm_set1_epi64x(from); +} +template <> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return _mm_set1_epi32(from); } @@ -374,6 +404,10 @@ EIGEN_STRONG_INLINE Packet4f peven_mask(const Packet4f& /*a*/) { return _mm_castsi128_ps(_mm_set_epi32(0, -1, 0, -1)); } template <> +EIGEN_STRONG_INLINE Packet2l peven_mask(const Packet2l& /*a*/) { + return _mm_set_epi32(0, 0, -1, -1); +} +template <> EIGEN_STRONG_INLINE Packet4i peven_mask(const Packet4i& /*a*/) { return _mm_set_epi32(0, -1, 0, -1); } @@ -395,6 
+429,10 @@ EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /*a*/) { return _mm_setzero_pd(); } template <> +EIGEN_STRONG_INLINE Packet2l pzero(const Packet2l& /*a*/) { + return _mm_setzero_si128(); +} +template <> EIGEN_STRONG_INLINE Packet4i pzero(const Packet4i& /*a*/) { return _mm_setzero_si128(); } @@ -424,6 +462,10 @@ EIGEN_STRONG_INLINE Packet2d plset(const double& a) { return _mm_add_pd(pset1(a), _mm_set_pd(1, 0)); } template <> +EIGEN_STRONG_INLINE Packet2l plset(const int64_t& a) { + return _mm_add_epi64(pset1(a), _mm_set_epi64x(1, 0)); // 64-bit lane add: a 32-bit add would drop the carry from the low into the high word of a + 1 +} +template <> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return _mm_add_epi32(pset1(a), _mm_set_epi32(3, 2, 1, 0)); } @@ -441,6 +483,10 @@ EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b return _mm_add_pd(a, b); } template <> +EIGEN_STRONG_INLINE Packet2l padd(const Packet2l& a, const Packet2l& b) { + return _mm_add_epi64(a, b); +} +template <> EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a, b); } @@ -474,6 +520,10 @@ EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b return _mm_sub_pd(a, b); } template <> +EIGEN_STRONG_INLINE Packet2l psub(const Packet2l& a, const Packet2l& b) { + return _mm_sub_epi64(a, b); +} +template <> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a, b); } @@ -520,9 +570,14 @@ EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0, 0x80000000, 0x0, 0x80000000)); return _mm_xor_pd(a, mask); } +template <> +EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) { + return psub(pzero(a), a); +} + template <> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { - return psub(Packet4i(_mm_setr_epi32(0, 0, 0, 0)), a); + return psub(pzero(a), a); } template <> @@ -539,6 +594,10 @@ EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } template <> +EIGEN_STRONG_INLINE Packet2l
pconj(const Packet2l& a) { + return a; +} +template <> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } @@ -552,6 +611,21 @@ EIGEN_STRONG_INLINE Packet2d pmul(const Packet2d& a, const Packet2d& b return _mm_mul_pd(a, b); } template <> +EIGEN_STRONG_INLINE Packet2l pmul(const Packet2l& a, const Packet2l& b) { + // 64-bit mul requires avx512, so build the low 64 bits of the product from 32-bit multiplications + __m128i upper32_a = _mm_srli_epi64(a, 32); + __m128i upper32_b = _mm_srli_epi64(b, 32); + + // upper * lower cross terms (_mm_mul_epu32 reads only the low 32 bits of each 64-bit lane) + __m128i mul1 = _mm_mul_epu32(upper32_a, b); + __m128i mul2 = _mm_mul_epu32(upper32_b, a); + // lower * lower only; upper * upper would only affect bits >= 64 and is not needed + __m128i mul3 = _mm_mul_epu32(a, b); + + __m128i high = _mm_slli_epi64(_mm_add_epi64(mul1, mul2), 32); + return _mm_add_epi64(high, mul3); +} +template <> EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const Packet4i& b) { #ifdef EIGEN_VECTORIZE_SSE4_1 return _mm_mullo_epi32(a, b); @@ -602,15 +676,6 @@ EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& a, const Packet4i& b #endif } -// for some weird raisons, it has to be overloaded for packet of integers -template <> -EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { - return padd(pmul(a, b), c); -} -template <> -EIGEN_STRONG_INLINE Packet4ui pmadd(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c) { - return padd(pmul(a, b), c); -} #ifdef EIGEN_VECTORIZE_FMA template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { @@ -659,26 +724,35 @@ EIGEN_STRONG_INLINE Packet2d pmadds(const Packet2d& a, const Packet2d& #ifdef EIGEN_VECTORIZE_SSE4_1 template <> -EIGEN_DEVICE_FUNC inline Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) { +EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) { return _mm_blendv_ps(b, a, mask); } template <> -EIGEN_DEVICE_FUNC inline Packet4i pselect(const Packet4i& mask, const
Packet4i& a, const Packet4i& b) { +EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b) { + return _mm_castpd_si128(_mm_blendv_pd(_mm_castsi128_pd(b), _mm_castsi128_pd(a), _mm_castsi128_pd(mask))); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) { return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b), _mm_castsi128_ps(a), _mm_castsi128_ps(mask))); } template <> -EIGEN_DEVICE_FUNC inline Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) { +EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) { return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b), _mm_castsi128_ps(a), _mm_castsi128_ps(mask))); } template <> -EIGEN_DEVICE_FUNC inline Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) { +EIGEN_STRONG_INLINE Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) { return _mm_blendv_pd(b, a, mask); } #endif +template <> +EIGEN_STRONG_INLINE Packet2l ptrue(const Packet2l& a) { + return _mm_cmpeq_epi32(a, a); +} template <> EIGEN_STRONG_INLINE Packet4i ptrue(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); @@ -707,6 +781,10 @@ EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b return _mm_and_pd(a, b); } template <> +EIGEN_STRONG_INLINE Packet2l pand(const Packet2l& a, const Packet2l& b) { + return _mm_and_si128(a, b); +} +template <> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a, b); } @@ -728,6 +806,10 @@ EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) return _mm_or_pd(a, b); } template <> +EIGEN_STRONG_INLINE Packet2l por(const Packet2l& a, const Packet2l& b) { + return _mm_or_si128(a, b); +} +template <> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a, b); } @@ -749,6 +831,10 @@ 
EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b return _mm_xor_pd(a, b); } template <> +EIGEN_STRONG_INLINE Packet2l pxor(const Packet2l& a, const Packet2l& b) { + return _mm_xor_si128(a, b); +} +template <> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a, b); } @@ -770,6 +856,10 @@ EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d return _mm_andnot_pd(b, a); } template <> +EIGEN_STRONG_INLINE Packet2l pandnot(const Packet2l& a, const Packet2l& b) { + return _mm_andnot_si128(b, a); +} +template <> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(b, a); } @@ -811,7 +901,6 @@ template <> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a, b); } - template <> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return _mm_cmplt_epi32(a, b); @@ -821,8 +910,35 @@ EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a, b); } template <> -EIGEN_STRONG_INLINE Packet4ui pcmp_eq(const Packet4ui& a, const Packet4ui& b) { - return _mm_cmpeq_epi32(a, b); +EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { + return por(pcmp_lt(a, b), pcmp_eq(a, b)); +} +template <> +EIGEN_STRONG_INLINE Packet2l pcmp_lt(const Packet2l& a, const Packet2l& b) { +#ifdef EIGEN_VECTORIZE_SSE4_2 + return _mm_cmpgt_epi64(b, a); +#else + Packet4i eq = pcmp_eq(Packet4i(a), Packet4i(b)); + Packet2l hi_eq = Packet2l(_mm_shuffle_epi32(eq, (shuffle_mask<1, 1, 3, 3>::mask))); + Packet4i lt = pcmp_lt(Packet4i(a), Packet4i(b)); + Packet2l hi_lt = Packet2l(_mm_shuffle_epi32(lt, (shuffle_mask<1, 1, 3, 3>::mask))); + Packet2l lo_lt = Packet2l(_mm_shuffle_epi32(lt, (shuffle_mask<0, 0, 2, 2>::mask))); + // return hi(a) < hi(b) || (hi(a) == hi(b) && lo(a) < lo(b)) + return por(hi_lt, pand(hi_eq, lo_lt)); +#endif +} +template <> 
+EIGEN_STRONG_INLINE Packet2l pcmp_eq(const Packet2l& a, const Packet2l& b) { +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_cmpeq_epi64(a, b); +#else + Packet4i tmp = pcmp_eq(Packet4i(a), Packet4i(b)); + return Packet2l(pand(tmp, _mm_shuffle_epi32(tmp, (shuffle_mask<1, 0, 3, 2>::mask)))); +#endif +} +template <> +EIGEN_STRONG_INLINE Packet2l pcmp_le(const Packet2l& a, const Packet2l& b) { + return por(pcmp_lt(a, b), pcmp_eq(a, b)); } template <> EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packet16b& b) { @@ -831,8 +947,8 @@ EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packet16b& b) { return _mm_and_si128(_mm_cmpeq_epi8(a, b), kBoolMask); } template <> -EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { - return por(pcmp_lt(a, b), pcmp_eq(a, b)); +EIGEN_STRONG_INLINE Packet4ui pcmp_eq(const Packet4ui& a, const Packet4ui& b) { + return _mm_cmpeq_epi32(a, b); } template <> @@ -876,6 +992,11 @@ EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b #endif } template <> +EIGEN_STRONG_INLINE Packet2l pmin(const Packet2l& a, const Packet2l& b) { + Packet2l a_lt_mask = pcmp_lt(a, b); + return por(pandnot(b, a_lt_mask), pand(a, a_lt_mask)); +} +template <> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { #ifdef EIGEN_VECTORIZE_SSE4_1 return _mm_min_epi32(a, b); @@ -937,6 +1058,11 @@ EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b #endif } template <> +EIGEN_STRONG_INLINE Packet2l pmax(const Packet2l& a, const Packet2l& b) { + Packet2l a_lt_mask = pcmp_lt(a, b); + return por(pandnot(a, a_lt_mask), pand(b, a_lt_mask)); +} +template <> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { #ifdef EIGEN_VECTORIZE_SSE4_1 return _mm_max_epi32(a, b); @@ -1028,6 +1154,46 @@ EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, con return pminmax_propagate_nan(a, b, pmax); } +template <> +EIGEN_STRONG_INLINE Packet4f psignbit(const 
Packet4f& a) { + return _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(a), 31)); +} +template <> +EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) { + Packet4f tmp = psignbit(_mm_castpd_ps(a)); +#ifdef EIGEN_VECTORIZE_AVX + return _mm_castps_pd(_mm_permute_ps(tmp, (shuffle_mask<1, 1, 3, 3>::mask))); +#else + return _mm_castps_pd(_mm_shuffle_ps(tmp, tmp, (shuffle_mask<1, 1, 3, 3>::mask))); +#endif // EIGEN_VECTORIZE_AVX +} +template <> +EIGEN_STRONG_INLINE Packet4i psignbit(const Packet4i& a) { + return _mm_srai_epi32(a, 31); +} +template <> +EIGEN_STRONG_INLINE Packet4ui psignbit(const Packet4ui& a) { + return pzero(a); +} +template <> +EIGEN_STRONG_INLINE Packet2l psignbit(const Packet2l& a) { + Packet4i tmp = psignbit(Packet4i(a)); + return Packet2l(_mm_shuffle_epi32(tmp, (shuffle_mask<1, 1, 3, 3>::mask))); +} + +template +EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(const Packet2l& a) { + Packet2l signbit = psignbit(a); + return por(_mm_slli_epi64(signbit, 64 - N), _mm_srli_epi64(a, N)); +} +template +EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) { + return _mm_srli_epi64(a, N); +} +template +EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) { + return _mm_slli_epi64(a, N); +} template EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) { return _mm_srai_epi32(a, N); @@ -1040,7 +1206,6 @@ template EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) { return _mm_slli_epi32(a, N); } - template EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(const Packet4ui& a) { return _mm_srli_epi32(a, N); @@ -1065,12 +1230,17 @@ EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return _mm_and_pd(a, mask); } template <> +EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) { + Packet2l signbit = psignbit(a); + return _mm_sub_epi64(_mm_xor_si128(a, signbit), signbit); +} +template <> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { #ifdef EIGEN_VECTORIZE_SSSE3 return 
_mm_abs_epi32(a); #else - Packet4i aux = _mm_srai_epi32(a, 31); - return _mm_sub_epi32(_mm_xor_si128(a, aux), aux); + Packet4i signbit = psignbit(a); + return _mm_sub_epi32(_mm_xor_si128(a, signbit), signbit); #endif } template <> @@ -1078,24 +1248,6 @@ EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) { return a; } -template <> -EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) { - return _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(a), 31)); -} -template <> -EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) { - Packet4f tmp = psignbit(_mm_castpd_ps(a)); -#ifdef EIGEN_VECTORIZE_AVX - return _mm_castps_pd(_mm_permute_ps(tmp, (shuffle_mask<1, 1, 3, 3>::mask))); -#else - return _mm_castps_pd(_mm_shuffle_ps(tmp, tmp, (shuffle_mask<1, 1, 3, 3>::mask))); -#endif // EIGEN_VECTORIZE_AVX -} -template <> -EIGEN_STRONG_INLINE Packet4ui psignbit(const Packet4ui& a) { - return pzero(a); -} - #ifdef EIGEN_VECTORIZE_SSE4_1 template <> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) { @@ -1217,6 +1369,10 @@ EIGEN_STRONG_INLINE Packet2d pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); } template <> +EIGEN_STRONG_INLINE Packet2l pload(const int64_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast(from)); +} +template <> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast(from)); } @@ -1251,6 +1407,11 @@ EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { return _mm_loadu_pd(from); } template <> +EIGEN_STRONG_INLINE Packet2l ploadu(const int64_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD + return _mm_loadu_si128(reinterpret_cast(from)); +} +template <> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast(from)); @@ -1299,6 +1460,10 @@ EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) { return pset1(from[0]); } template <> +EIGEN_STRONG_INLINE Packet2l 
ploaddup(const int64_t* from) { + return pset1(from[0]); +} +template <> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) { Packet4i tmp; tmp = _mm_loadl_epi64(reinterpret_cast(from)); @@ -1337,6 +1502,10 @@ EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); } template <> +EIGEN_STRONG_INLINE void pstore(int64_t* to, const Packet2l& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); +} +template <> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); } @@ -1358,6 +1527,10 @@ EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from); } template <> +EIGEN_STRONG_INLINE void pstoreu(int64_t* to, const Packet2l& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); +} +template <> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); } @@ -1393,25 +1566,142 @@ EIGEN_STRONG_INLINE void pstores(double* to, const Packet2d& from) { } template <> -EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) { +EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { + return _mm_shuffle_ps(a, a, 0x1B); +} +template <> +EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { + return _mm_shuffle_pd(a, a, 0x1); +} +template <> +EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a) { + return _mm_castpd_si128(preverse(_mm_castsi128_pd(a))); +} +template <> +EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { + return _mm_shuffle_epi32(a, 0x1B); +} +template <> +EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) { + return _mm_shuffle_epi32(a, 0x1B); +} +template <> +EIGEN_STRONG_INLINE Packet16b preverse(const Packet16b& a) { +#ifdef 
EIGEN_VECTORIZE_SSSE3 + __m128i mask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + return _mm_shuffle_epi8(a, mask); +#else + Packet16b tmp = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3)); + tmp = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tmp, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1)); + return _mm_or_si128(_mm_slli_epi16(tmp, 8), _mm_srli_epi16(tmp, 8)); +#endif +} + +#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64 +// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010 +// Direct of the struct members fixed bug #62. +template <> +EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { + return a.m128_f32[0]; +} +template <> +EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { + return a.m128d_f64[0]; +} +template <> +EIGEN_STRONG_INLINE int64_t pfirst(const Packet2l& a) { + int64_t x = _mm_cvtsi128_si64(a); + return x; +} +template <> +EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { + int x = _mm_cvtsi128_si32(a); + return x; +} +template <> +EIGEN_STRONG_INLINE uint32_t pfirst(const Packet4ui& a) { + uint32_t x = numext::bit_cast(_mm_cvtsi128_si32(a)); + return x; +} +#elif EIGEN_COMP_MSVC_STRICT +// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010 +template <> +EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { + float x = _mm_cvtss_f32(a); + return x; +} +template <> +EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { + double x = _mm_cvtsd_f64(a); + return x; +} +template <> +EIGEN_STRONG_INLINE int64_t pfirst(const Packet2l& a) { + int64_t x = _mm_cvtsi128_si64(a); + return x; +} +template <> +EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { + int x = _mm_cvtsi128_si32(a); + return x; +} +template <> +EIGEN_STRONG_INLINE uint32_t pfirst(const Packet4ui& a) { + uint32_t x = numext::bit_cast(_mm_cvtsi128_si32(a)); + return x; +} +#else +template <> +EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { + 
return _mm_cvtss_f32(a); +} +template <> +EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { + return _mm_cvtsd_f64(a); +} +template <> +EIGEN_STRONG_INLINE int64_t pfirst(const Packet2l& a) { + return _mm_cvtsi128_si64(a); +} +template <> +EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { + return _mm_cvtsi128_si32(a); +} +template <> +EIGEN_STRONG_INLINE uint32_t pfirst(const Packet4ui& a) { + return numext::bit_cast(_mm_cvtsi128_si32(a)); +} +#endif +template <> +EIGEN_STRONG_INLINE bool pfirst(const Packet16b& a) { + int x = _mm_cvtsi128_si32(a); + return static_cast(x & 1); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pgather(const float* from, Index stride) { return _mm_set_ps(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]); } template <> -EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) { +EIGEN_STRONG_INLINE Packet2d pgather(const double* from, Index stride) { return _mm_set_pd(from[1 * stride], from[0 * stride]); } template <> -EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, Index stride) { +EIGEN_STRONG_INLINE Packet2l pgather(const int64_t* from, Index stride) { + return _mm_set_epi64x(from[1 * stride], from[0 * stride]); +} +template <> +EIGEN_STRONG_INLINE Packet4i pgather(const int* from, Index stride) { return _mm_set_epi32(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]); } template <> -EIGEN_DEVICE_FUNC inline Packet4ui pgather(const uint32_t* from, Index stride) { +EIGEN_STRONG_INLINE Packet4ui pgather(const uint32_t* from, Index stride) { return _mm_set_epi32(numext::bit_cast(from[3 * stride]), numext::bit_cast(from[2 * stride]), numext::bit_cast(from[1 * stride]), numext::bit_cast(from[0 * stride])); } template <> -EIGEN_DEVICE_FUNC inline Packet16b pgather(const bool* from, Index stride) { +EIGEN_STRONG_INLINE Packet16b pgather(const bool* from, Index stride) { return _mm_set_epi8(from[15 * stride], from[14 * stride], from[13 * stride], from[12 * 
stride], from[11 * stride], from[10 * stride], from[9 * stride], from[8 * stride], from[7 * stride], from[6 * stride], from[5 * stride], from[4 * stride], from[3 * stride], from[2 * stride], from[1 * stride], @@ -1419,33 +1709,38 @@ EIGEN_DEVICE_FUNC inline Packet16b pgather(const bool* from, In } template <> -EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) { - to[stride * 0] = _mm_cvtss_f32(from); - to[stride * 1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1)); - to[stride * 2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2)); - to[stride * 3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3)); +EIGEN_STRONG_INLINE void pscatter(float* to, const Packet4f& from, Index stride) { + to[stride * 0] = pfirst(from); + to[stride * 1] = pfirst(_mm_shuffle_ps(from, from, 1)); + to[stride * 2] = pfirst(_mm_shuffle_ps(from, from, 2)); + to[stride * 3] = pfirst(_mm_shuffle_ps(from, from, 3)); } template <> -EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, Index stride) { - to[stride * 0] = _mm_cvtsd_f64(from); - to[stride * 1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1)); +EIGEN_STRONG_INLINE void pscatter(double* to, const Packet2d& from, Index stride) { + to[stride * 0] = pfirst(from); + to[stride * 1] = pfirst(preverse(from)); } template <> -EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet4i& from, Index stride) { +EIGEN_STRONG_INLINE void pscatter(int64_t* to, const Packet2l& from, Index stride) { + to[stride * 0] = pfirst(from); + to[stride * 1] = pfirst(preverse(from)); +} +template <> +EIGEN_STRONG_INLINE void pscatter(int* to, const Packet4i& from, Index stride) { to[stride * 0] = _mm_cvtsi128_si32(from); to[stride * 1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1)); to[stride * 2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2)); to[stride * 3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3)); } template <> -EIGEN_DEVICE_FUNC inline void pscatter(uint32_t* to, const Packet4ui& from, Index stride) { 
+EIGEN_STRONG_INLINE void pscatter(uint32_t* to, const Packet4ui& from, Index stride) { to[stride * 0] = numext::bit_cast(_mm_cvtsi128_si32(from)); to[stride * 1] = numext::bit_cast(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1))); to[stride * 2] = numext::bit_cast(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2))); to[stride * 3] = numext::bit_cast(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3))); } template <> -EIGEN_DEVICE_FUNC inline void pscatter(bool* to, const Packet16b& from, Index stride) { +EIGEN_STRONG_INLINE void pscatter(bool* to, const Packet16b& from, Index stride) { to[4 * stride * 0] = _mm_cvtsi128_si32(from); to[4 * stride * 1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1)); to[4 * stride * 2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2)); @@ -1485,106 +1780,15 @@ EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } template <> +EIGEN_STRONG_INLINE void prefetch(const int64_t* addr) { + _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); +} +template <> EIGEN_STRONG_INLINE void prefetch(const uint32_t* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } #endif -#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64 -// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010 -// Direct of the struct members fixed bug #62. 
-template <> -EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { - return a.m128_f32[0]; -} -template <> -EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { - return a.m128d_f64[0]; -} -template <> -EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { - int x = _mm_cvtsi128_si32(a); - return x; -} -template <> -EIGEN_STRONG_INLINE uint32_t pfirst(const Packet4ui& a) { - uint32_t x = numext::bit_cast(_mm_cvtsi128_si32(a)); - return x; -} -#elif EIGEN_COMP_MSVC_STRICT -// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010 -template <> -EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { - float x = _mm_cvtss_f32(a); - return x; -} -template <> -EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { - double x = _mm_cvtsd_f64(a); - return x; -} -template <> -EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { - int x = _mm_cvtsi128_si32(a); - return x; -} -template <> -EIGEN_STRONG_INLINE uint32_t pfirst(const Packet4ui& a) { - uint32_t x = numext::bit_cast(_mm_cvtsi128_si32(a)); - return x; -} -#else -template <> -EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { - return _mm_cvtss_f32(a); -} -template <> -EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { - return _mm_cvtsd_f64(a); -} -template <> -EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { - return _mm_cvtsi128_si32(a); -} -template <> -EIGEN_STRONG_INLINE uint32_t pfirst(const Packet4ui& a) { - return numext::bit_cast(_mm_cvtsi128_si32(a)); -} -#endif -template <> -EIGEN_STRONG_INLINE bool pfirst(const Packet16b& a) { - int x = _mm_cvtsi128_si32(a); - return static_cast(x & 1); -} - -template <> -EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { - return _mm_shuffle_ps(a, a, 0x1B); -} -template <> -EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { - return _mm_shuffle_pd(a, a, 0x1); -} -template <> -EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { - return _mm_shuffle_epi32(a, 0x1B); -} -template <> 
-EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) { - return _mm_shuffle_epi32(a, 0x1B); -} -template <> -EIGEN_STRONG_INLINE Packet16b preverse(const Packet16b& a) { -#ifdef EIGEN_VECTORIZE_SSSE3 - __m128i mask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - return _mm_shuffle_epi8(a, mask); -#else - Packet16b tmp = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3)); - tmp = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tmp, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1)); - return _mm_or_si128(_mm_slli_epi16(tmp, 8), _mm_srli_epi16(tmp, 8)); -#endif -} - template <> EIGEN_STRONG_INLINE Packet4f pfrexp(const Packet4f& a, Packet4f& exponent) { return pfrexp_generic(a, exponent); @@ -1610,6 +1814,7 @@ EIGEN_STRONG_INLINE Packet4f pldexp(const Packet4f& a, const Packet4f& // We specialize pldexp here, since the generic implementation uses Packet2l, which is not well // supported by SSE, and has more range than is needed for exponents. +// TODO(rmlarsen): Remove this specialization once Packet2l has support for casting.
template <> EIGEN_STRONG_INLINE Packet2d pldexp(const Packet2d& a, const Packet2d& exponent) { // Clamp exponent to [-2099, 2099] @@ -1690,6 +1895,11 @@ EIGEN_STRONG_INLINE double predux(const Packet2d& a) { // #endif } +template <> +EIGEN_STRONG_INLINE int64_t predux(const Packet2l& a) { + return pfirst(_mm_add_epi64(a, _mm_unpackhi_epi64(a, a))); +} + #ifdef EIGEN_VECTORIZE_SSSE3 template <> EIGEN_STRONG_INLINE int predux(const Packet4i& a) { @@ -1701,7 +1911,6 @@ EIGEN_STRONG_INLINE uint32_t predux(const Packet4ui& a) { Packet4ui tmp0 = _mm_hadd_epi32(a, a); return pfirst(_mm_hadd_epi32(tmp0, tmp0)); } - #else template <> EIGEN_STRONG_INLINE int predux(const Packet4i& a) { @@ -1734,9 +1943,15 @@ EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { return pfirst(_mm_mul_sd(a, _mm_unpackhi_pd(a, a))); } template <> +EIGEN_STRONG_INLINE int64_t predux_mul(const Packet2l& a) { + EIGEN_ALIGN16 int64_t aux[2]; + pstore(aux, a); + return aux[0] * aux[1]; +} +template <> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) { // after some experiments, it is seems this is the fastest way to implement it - // for GCC (eg., reusing pmul is very slow !) + // for GCC (e.g., reusing pmul is very slow!) 
// TODO try to call _mm_mul_epu32 directly EIGEN_ALIGN16 int aux[4]; pstore(aux, a); @@ -1846,11 +2061,21 @@ EIGEN_STRONG_INLINE uint32_t predux_max(const Packet4ui& a) { // return _mm_movemask_ps(x) == 0xF; // } +template <> +EIGEN_STRONG_INLINE bool predux_any(const Packet2d& x) { + return _mm_movemask_pd(x) != 0x0; +} + template <> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) { return _mm_movemask_ps(x) != 0x0; } +template <> +EIGEN_STRONG_INLINE bool predux_any(const Packet2l& x) { + return _mm_movemask_pd(_mm_castsi128_pd(x)) != 0x0; +} + template <> EIGEN_STRONG_INLINE bool predux_any(const Packet4i& x) { return _mm_movemask_ps(_mm_castsi128_ps(x)) != 0x0; @@ -1860,17 +2085,23 @@ EIGEN_STRONG_INLINE bool predux_any(const Packet4ui& x) { return _mm_movemask_ps(_mm_castsi128_ps(x)) != 0x0; } -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]); } -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]); kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]); kernel.packet[1] = tmp; } -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + __m128i tmp = _mm_unpackhi_epi64(kernel.packet[0], kernel.packet[1]); + kernel.packet[0] = _mm_unpacklo_epi64(kernel.packet[0], kernel.packet[1]); + kernel.packet[1] = tmp; +} + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]); __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]); __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]); @@ -1881,11 +2112,11 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { 
kernel.packet[2] = _mm_unpacklo_epi64(T2, T3); kernel.packet[3] = _mm_unpackhi_epi64(T2, T3); } -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { ptranspose((PacketBlock&)kernel); } -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { __m128i T0 = _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]); __m128i T1 = _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]); __m128i T2 = _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]); @@ -1896,7 +2127,7 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { kernel.packet[3] = _mm_unpackhi_epi16(T1, T3); } -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { // If we number the elements in the input thus: // kernel.packet[ 0] = {00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 0a, 0b, 0c, 0d, 0e, 0f} // kernel.packet[ 1] = {10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 1a, 1b, 1c, 1d, 1e, 1f} @@ -1982,6 +2213,18 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { kernel.packet[15] = _mm_unpackhi_epi64(u7, uf); } +template <> +EIGEN_STRONG_INLINE Packet2l pblend(const Selector<2>& ifPacket, const Packet2l& thenPacket, + const Packet2l& elsePacket) { + const __m128i zero = _mm_setzero_si128(); + const __m128i select = _mm_set_epi64x(ifPacket.select[1], ifPacket.select[0]); + __m128i false_mask = pcmp_eq(select, zero); +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blendv_epi8(thenPacket, elsePacket, false_mask); +#else + return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket)); +#endif +} template <> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { @@ -2189,11 +2432,6 @@ struct packet_traits : default_packet_traits { HasMax = 0, HasConj = 0, HasSetLinear = 0, - HasSqrt = 0, - 
HasRsqrt = 0, - HasExp = 0, - HasLog = 0, - HasBlend = 0 }; };