Add Packet2l for SSE.

This commit is contained in:
Rasmus Munk Larsen 2024-03-11 19:54:55 +00:00
parent 1d4369c2ff
commit 126ba1a166
2 changed files with 420 additions and 182 deletions

View File

@ -44,24 +44,30 @@ namespace internal {
struct default_packet_traits {
enum {
// Ops that are implemented for most types.
HasAdd = 1,
HasSub = 1,
HasShift = 1,
HasMul = 1,
HasNegate = 1,
HasAbs = 1,
HasArg = 0,
HasAbs2 = 1,
HasAbsDiff = 0,
HasMin = 1,
HasMax = 1,
HasConj = 1,
HasSetLinear = 1,
HasSign = 1,
HasArg = 0,
HasAbsDiff = 0,
HasBlend = 0,
// This flag is used to indicate whether packet comparison is supported.
// pcmp_eq, pcmp_lt and pcmp_le should be defined for it to be true.
HasCmp = 0,
HasRound = 0,
HasRint = 0,
HasFloor = 0,
HasCeil = 0,
HasDiv = 0,
HasReciprocal = 0,
@ -73,7 +79,6 @@ struct default_packet_traits {
HasLog1p = 0,
HasLog10 = 0,
HasPow = 0,
HasSin = 0,
HasCos = 0,
HasTan = 0,
@ -96,12 +101,7 @@ struct default_packet_traits {
HasIGammaDerA = 0,
HasGammaSampleDerAlpha = 0,
HasIGammac = 0,
HasBetaInc = 0,
HasRound = 0,
HasRint = 0,
HasFloor = 0,
HasCeil = 0
HasBetaInc = 0
};
};

View File

@ -52,6 +52,7 @@ typedef __m128d Packet2d;
// Integer packets all wrap the same __m128i register type; the integer tag
// passed to eigen_packet_wrapper makes each alias a distinct C++ type so the
// packet primitives can be specialized per element type.
using Packet4i = eigen_packet_wrapper<__m128i, 0>;
using Packet16b = eigen_packet_wrapper<__m128i, 1>;
using Packet4ui = eigen_packet_wrapper<__m128i, 4>;
using Packet2l = eigen_packet_wrapper<__m128i, 5>;
template <>
struct is_arithmetic<__m128> {
@ -69,6 +70,10 @@ template <>
// Flags Packet4i as an arithmetic type for the is_arithmetic trait.
struct is_arithmetic<Packet4i> {
enum { value = true };
};
// Flags Packet2l as an arithmetic type for the is_arithmetic trait, matching
// the other integer packet types.
template <>
struct is_arithmetic<Packet2l> {
enum { value = true };
};
// Note that `Packet4ui` uses the underlying type `__m128i`, which is
// interpreted as a vector of _signed_ `int32`s, which breaks some arithmetic
// operations used in `GenericPacketMath.h`.
@ -213,10 +218,10 @@ struct packet_traits<int> : default_packet_traits {
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
HasCmp = 1,
HasDiv = 1,
size = 4,
HasCmp = 1,
HasDiv = 1,
HasShift = 1,
HasBlend = 1
};
@ -232,10 +237,22 @@ struct packet_traits<uint32_t> : default_packet_traits {
HasDiv = 0,
HasNegate = 0,
HasSqrt = 0,
HasCmp = 1,
HasMin = 1,
HasMax = 1,
HasShift = 1,
HasBlend = 1
};
};
template <>
struct packet_traits<int64_t> : default_packet_traits {
typedef Packet2l type;
typedef Packet2l half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 2,
HasDiv = 0,
HasCmp = 1,
HasShift = 1,
HasBlend = 1
};
@ -250,12 +267,8 @@ struct packet_traits<bool> : default_packet_traits {
AlignedOnScalar = 1,
size = 16,
HasAdd = 1,
HasSub = 1,
HasCmp = 1, // note -- only pcmp_eq is defined
HasShift = 0,
HasMul = 1,
HasNegate = 1,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
@ -283,6 +296,19 @@ template <>
// Reverse traits for Packet2d: maps the packet type back to its scalar type
// and describes its layout.
struct unpacket_traits<Packet2d> {
typedef double type;
// No narrower packet is named here: the half packet is Packet2d itself.
typedef Packet2d half;
// Integer packet paired with Packet2d (Packet2l holds two int64_t lanes).
// NOTE(review): presumably used by routines that reinterpret double lanes as
// integers -- confirm against the generic packet math code.
typedef Packet2l integer_packet;
enum {
size = 2,
alignment = Aligned16,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
};
template <>
struct unpacket_traits<Packet2l> {
typedef int64_t type;
typedef Packet2l half;
enum {
size = 2,
alignment = Aligned16,
@ -348,6 +374,10 @@ EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
return _mm_set1_pd(from);
}
template <>
EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) {
  // Broadcast the scalar into both 64-bit lanes.
  const __m128i broadcast = _mm_set1_epi64x(from);
  return broadcast;
}
template <>
EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
  // Broadcast the scalar into all four 32-bit lanes.
  const __m128i broadcast = _mm_set1_epi32(from);
  return broadcast;
}
@ -374,6 +404,10 @@ EIGEN_STRONG_INLINE Packet4f peven_mask(const Packet4f& /*a*/) {
return _mm_castsi128_ps(_mm_set_epi32(0, -1, 0, -1));
}
template <>
EIGEN_STRONG_INLINE Packet2l peven_mask(const Packet2l& /*a*/) {
  // _mm_set_epi32 lists dwords from most to least significant, so the two low
  // dwords (64-bit lane 0) are all ones and lane 1 is zero.
  const __m128i lane0_only = _mm_set_epi32(0, 0, -1, -1);
  return lane0_only;
}
template <>
EIGEN_STRONG_INLINE Packet4i peven_mask(const Packet4i& /*a*/) {
  // All ones in the even-indexed 32-bit lanes (0 and 2), zero in lanes 1 and 3.
  const __m128i even_lanes = _mm_set_epi32(0, -1, 0, -1);
  return even_lanes;
}
@ -395,6 +429,10 @@ EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /*a*/) {
return _mm_setzero_pd();
}
template <>
EIGEN_STRONG_INLINE Packet2l pzero(const Packet2l& /*a*/) {
  // The argument only selects the overload; the result is an all-zero register.
  const __m128i zeros = _mm_setzero_si128();
  return zeros;
}
template <>
EIGEN_STRONG_INLINE Packet4i pzero(const Packet4i& /*a*/) {
  // The argument only selects the overload; the result is an all-zero register.
  const __m128i zeros = _mm_setzero_si128();
  return zeros;
}
@ -424,6 +462,10 @@ EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
return _mm_add_pd(pset1<Packet2d>(a), _mm_set_pd(1, 0));
}
// Returns the packet {a, a + 1}.
template <>
EIGEN_STRONG_INLINE Packet2l plset<Packet2l>(const int64_t& a) {
  // The lanes are 64 bits wide, so the {0, 1} offsets must be added with a
  // 64-bit add. A 32-bit add would drop the carry out of the low dword of
  // lane 1 (e.g. a = 0xFFFFFFFF would produce {a, 0} instead of {a, a + 1}).
  const __m128i offsets = _mm_set_epi64x(1, 0);
  return _mm_add_epi64(pset1<Packet2l>(a), offsets);
}
// Returns the packet {a, a + 1, a + 2, a + 3}.
template <>
EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) {
  const __m128i offsets = _mm_set_epi32(3, 2, 1, 0);
  const Packet4i base = pset1<Packet4i>(a);
  return _mm_add_epi32(base, offsets);
}
@ -441,6 +483,10 @@ EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b
return _mm_add_pd(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet2l padd<Packet2l>(const Packet2l& a, const Packet2l& b) {
  // Lane-wise 64-bit integer addition.
  const __m128i sum = _mm_add_epi64(a, b);
  return sum;
}
template <>
EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
  // Lane-wise 32-bit integer addition.
  const __m128i sum = _mm_add_epi32(a, b);
  return sum;
}
@ -474,6 +520,10 @@ EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b
return _mm_sub_pd(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(const Packet2l& a, const Packet2l& b) {
  // Lane-wise 64-bit integer subtraction (a - b).
  const __m128i difference = _mm_sub_epi64(a, b);
  return difference;
}
template <>
EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
  // Lane-wise 32-bit integer subtraction (a - b).
  const __m128i difference = _mm_sub_epi32(a, b);
  return difference;
}
@ -520,9 +570,14 @@ EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0, 0x80000000, 0x0, 0x80000000));
return _mm_xor_pd(a, mask);
}
template <>
EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) {
  // Negate by subtracting from zero: -a == 0 - a in each 64-bit lane.
  const Packet2l zero = pzero(a);
  return psub(zero, a);
}
template <>
EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
return psub(Packet4i(_mm_setr_epi32(0, 0, 0, 0)), a);
return psub(pzero(a), a);
}
template <>
@ -539,6 +594,10 @@ EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
return a;
}
// Conjugate of a packet of 64-bit integers: returns the input unchanged.
template <>
EIGEN_STRONG_INLINE Packet2l pconj(const Packet2l& a) {
return a;
}
// Conjugate of a packet of 32-bit integers: returns the input unchanged.
template <>
EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
return a;
}
@ -552,6 +611,21 @@ EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b
return _mm_mul_pd(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet2l pmul<Packet2l>(const Packet2l& a, const Packet2l& b) {
  // 64-bit mul requires avx512, so build the low 64 bits of each product from
  // 32x32 -> 64-bit partial products. Writing a = ah * 2^32 + al and
  // b = bh * 2^32 + bl per lane:
  //   a * b mod 2^64 = al * bl + ((ah * bl + al * bh) << 32)
  const __m128i a_hi = _mm_srli_epi64(a, 32);
  const __m128i b_hi = _mm_srli_epi64(b, 32);
  // _mm_mul_epu32 multiplies the low dword of each 64-bit lane of its operands.
  const __m128i lo_times_lo = _mm_mul_epu32(a, b);
  const __m128i ahi_times_blo = _mm_mul_epu32(a_hi, b);
  const __m128i bhi_times_alo = _mm_mul_epu32(b_hi, a);
  // Only the low 32 bits of the summed cross terms survive the shift.
  const __m128i cross_terms = _mm_slli_epi64(_mm_add_epi64(ahi_times_blo, bhi_times_alo), 32);
  return _mm_add_epi64(cross_terms, lo_times_lo);
}
template <>
EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
return _mm_mullo_epi32(a, b);
@ -602,15 +676,6 @@ EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b
#endif
}
// for some odd reason, it has to be overloaded for packets of integers
template <>
EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
  // Multiply-add as two separate integer operations: a * b + c.
  const Packet4i product = pmul(a, b);
  return padd(product, c);
}
template <>
EIGEN_STRONG_INLINE Packet4ui pmadd(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c) {
  // Multiply-add as two separate integer operations: a * b + c.
  const Packet4ui product = pmul(a, b);
  return padd(product, c);
}
#ifdef EIGEN_VECTORIZE_FMA
template <>
EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
@ -659,26 +724,35 @@ EIGEN_STRONG_INLINE Packet2d pmadds<Packet2d>(const Packet2d& a, const Packet2d&
#ifdef EIGEN_VECTORIZE_SSE4_1
template <>
EIGEN_DEVICE_FUNC inline Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
return _mm_blendv_ps(b, a, mask);
}
template <>
EIGEN_DEVICE_FUNC inline Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) {
EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b) {
return _mm_castpd_si128(_mm_blendv_pd(_mm_castsi128_pd(b), _mm_castsi128_pd(a), _mm_castsi128_pd(mask)));
}
template <>
EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) {
return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b), _mm_castsi128_ps(a), _mm_castsi128_ps(mask)));
}
template <>
EIGEN_DEVICE_FUNC inline Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) {
EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) {
return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b), _mm_castsi128_ps(a), _mm_castsi128_ps(mask)));
}
template <>
EIGEN_DEVICE_FUNC inline Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) {
EIGEN_STRONG_INLINE Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) {
return _mm_blendv_pd(b, a, mask);
}
#endif
template <>
EIGEN_STRONG_INLINE Packet2l ptrue<Packet2l>(const Packet2l& a) {
  // Comparing a register with itself for equality sets every bit; the dword
  // granularity of the comparison does not matter for an all-ones result.
  const __m128i all_ones = _mm_cmpeq_epi32(a, a);
  return all_ones;
}
template <>
EIGEN_STRONG_INLINE Packet4i ptrue<Packet4i>(const Packet4i& a) {
return _mm_cmpeq_epi32(a, a);
@ -707,6 +781,10 @@ EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b
return _mm_and_pd(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet2l pand<Packet2l>(const Packet2l& a, const Packet2l& b) {
  // Bitwise AND over the full 128-bit register.
  const __m128i bits = _mm_and_si128(a, b);
  return bits;
}
template <>
EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
  // Bitwise AND over the full 128-bit register.
  const __m128i bits = _mm_and_si128(a, b);
  return bits;
}
@ -728,6 +806,10 @@ EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b)
return _mm_or_pd(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet2l por<Packet2l>(const Packet2l& a, const Packet2l& b) {
  // Bitwise OR over the full 128-bit register.
  const __m128i bits = _mm_or_si128(a, b);
  return bits;
}
template <>
EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
  // Bitwise OR over the full 128-bit register.
  const __m128i bits = _mm_or_si128(a, b);
  return bits;
}
@ -749,6 +831,10 @@ EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b
return _mm_xor_pd(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet2l pxor<Packet2l>(const Packet2l& a, const Packet2l& b) {
  // Bitwise XOR over the full 128-bit register.
  const __m128i bits = _mm_xor_si128(a, b);
  return bits;
}
template <>
EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
  // Bitwise XOR over the full 128-bit register.
  const __m128i bits = _mm_xor_si128(a, b);
  return bits;
}
@ -770,6 +856,10 @@ EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d
return _mm_andnot_pd(b, a);
}
template <>
EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(const Packet2l& a, const Packet2l& b) {
  // Computes a & ~b. The intrinsic negates its FIRST operand, hence the swap.
  const __m128i a_without_b = _mm_andnot_si128(b, a);
  return a_without_b;
}
template <>
EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
  // Computes a & ~b. The intrinsic negates its FIRST operand, hence the swap.
  const __m128i a_without_b = _mm_andnot_si128(b, a);
  return a_without_b;
}
@ -811,7 +901,6 @@ template <>
EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) {
  // Lane-wise equality; each lane becomes an all-ones or all-zero mask.
  const Packet2d equal_mask = _mm_cmpeq_pd(a, b);
  return equal_mask;
}
template <>
EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) {
return _mm_cmplt_epi32(a, b);
@ -821,8 +910,35 @@ EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) {
return _mm_cmpeq_epi32(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4ui pcmp_eq(const Packet4ui& a, const Packet4ui& b) {
return _mm_cmpeq_epi32(a, b);
EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) {
return por(pcmp_lt(a, b), pcmp_eq(a, b));
}
// Lane-wise signed 64-bit "a < b"; each lane of the result is an all-ones or
// all-zero mask.
template <>
EIGEN_STRONG_INLINE Packet2l pcmp_lt(const Packet2l& a, const Packet2l& b) {
#ifdef EIGEN_VECTORIZE_SSE4_2
  return _mm_cmpgt_epi64(b, a);
#else
  // Without a 64-bit compare, build the result from 32-bit compares:
  //   hi(a) < hi(b) || (hi(a) == hi(b) && lo(a) < lo(b))
  // The high dwords carry the sign and are compared as signed values, but the
  // low dwords are plain magnitude bits and must be compared as UNSIGNED.
  // SSE2 only offers a signed 32-bit compare, so flip bit 31 of the low dwords
  // first; that maps unsigned order onto signed order. The high dwords are
  // left untouched.
  const __m128i lo_sign_flip = _mm_set_epi64x(0x80000000LL, 0x80000000LL);
  const Packet4i a_biased = Packet4i(_mm_xor_si128(a, lo_sign_flip));
  const Packet4i b_biased = Packet4i(_mm_xor_si128(b, lo_sign_flip));
  Packet4i eq = pcmp_eq<Packet4i>(Packet4i(a), Packet4i(b));
  Packet2l hi_eq = Packet2l(_mm_shuffle_epi32(eq, (shuffle_mask<1, 1, 3, 3>::mask)));
  Packet4i lt = pcmp_lt<Packet4i>(a_biased, b_biased);
  Packet2l hi_lt = Packet2l(_mm_shuffle_epi32(lt, (shuffle_mask<1, 1, 3, 3>::mask)));
  Packet2l lo_lt = Packet2l(_mm_shuffle_epi32(lt, (shuffle_mask<0, 0, 2, 2>::mask)));
  return por(hi_lt, pand(hi_eq, lo_lt));
#endif
}
// Lane-wise 64-bit equality; each lane of the result is an all-ones or
// all-zero mask.
template <>
EIGEN_STRONG_INLINE Packet2l pcmp_eq(const Packet2l& a, const Packet2l& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_cmpeq_epi64(a, b);
#else
  // Fallback: compare dword-wise, then AND each dword's result with the
  // shuffled copy so a 64-bit lane is set only when both halves compare equal.
  const Packet4i dword_eq = pcmp_eq<Packet4i>(Packet4i(a), Packet4i(b));
  const Packet4i partner_eq = _mm_shuffle_epi32(dword_eq, (shuffle_mask<1, 0, 3, 2>::mask));
  return Packet2l(pand<Packet4i>(dword_eq, partner_eq));
#endif
}
template <>
EIGEN_STRONG_INLINE Packet2l pcmp_le(const Packet2l& a, const Packet2l& b) {
  // a <= b is the union of the "less than" and "equal" masks.
  const Packet2l less = pcmp_lt(a, b);
  const Packet2l equal = pcmp_eq(a, b);
  return por(less, equal);
}
template <>
EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packet16b& b) {
@ -831,8 +947,8 @@ EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packet16b& b) {
return _mm_and_si128(_mm_cmpeq_epi8(a, b), kBoolMask);
}
template <>
EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) {
return por(pcmp_lt(a, b), pcmp_eq(a, b));
EIGEN_STRONG_INLINE Packet4ui pcmp_eq(const Packet4ui& a, const Packet4ui& b) {
return _mm_cmpeq_epi32(a, b);
}
template <>
@ -876,6 +992,11 @@ EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b
#endif
}
template <>
EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(const Packet2l& a, const Packet2l& b) {
  // Mask-based select: take a where a < b, otherwise b.
  const Packet2l take_a = pcmp_lt(a, b);
  const Packet2l from_a = pand(a, take_a);
  const Packet2l from_b = pandnot(b, take_a);
  return por(from_b, from_a);
}
template <>
EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
return _mm_min_epi32(a, b);
@ -937,6 +1058,11 @@ EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b
#endif
}
template <>
EIGEN_STRONG_INLINE Packet2l pmax<Packet2l>(const Packet2l& a, const Packet2l& b) {
  // Mask-based select: take b where a < b, otherwise a.
  const Packet2l take_b = pcmp_lt(a, b);
  const Packet2l from_b = pand(b, take_b);
  const Packet2l from_a = pandnot(a, take_b);
  return por(from_a, from_b);
}
template <>
EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
return _mm_max_epi32(a, b);
@ -1028,6 +1154,46 @@ EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, con
return pminmax_propagate_nan(a, b, pmax<Packet2d>);
}
template <>
EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) {
  // An arithmetic right shift by 31 replicates each lane's sign bit across
  // the whole 32-bit lane.
  const __m128i raw_bits = _mm_castps_si128(a);
  const __m128i sign_fill = _mm_srai_epi32(raw_bits, 31);
  return _mm_castsi128_ps(sign_fill);
}
template <>
EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
  // Compute the sign fill per dword, then copy the fill of each lane's upper
  // dword (which holds the double's sign bit) over the whole 64-bit lane.
  const Packet4f dword_signs = psignbit<Packet4f>(_mm_castpd_ps(a));
#ifdef EIGEN_VECTORIZE_AVX
  const Packet4f lane_signs = _mm_permute_ps(dword_signs, (shuffle_mask<1, 1, 3, 3>::mask));
#else
  const Packet4f lane_signs = _mm_shuffle_ps(dword_signs, dword_signs, (shuffle_mask<1, 1, 3, 3>::mask));
#endif  // EIGEN_VECTORIZE_AVX
  return _mm_castps_pd(lane_signs);
}
template <>
EIGEN_STRONG_INLINE Packet4i psignbit(const Packet4i& a) {
  // An arithmetic right shift by 31 replicates each lane's sign bit across
  // the whole 32-bit lane.
  const __m128i sign_fill = _mm_srai_epi32(a, 31);
  return sign_fill;
}
template <>
EIGEN_STRONG_INLINE Packet4ui psignbit(const Packet4ui& a) {
  // Unsigned lanes have no sign bit, so the mask is all zeros.
  const Packet4ui no_sign = pzero(a);
  return no_sign;
}
template <>
EIGEN_STRONG_INLINE Packet2l psignbit(const Packet2l& a) {
  // Compute the sign fill per dword, then copy the fill of each lane's upper
  // dword (which holds the 64-bit sign bit) over the whole lane.
  const Packet4i dword_signs = psignbit<Packet4i>(Packet4i(a));
  const __m128i lane_signs = _mm_shuffle_epi32(dword_signs, (shuffle_mask<1, 1, 3, 3>::mask));
  return Packet2l(lane_signs);
}
// Arithmetic (sign-extending) right shift of each 64-bit lane by N bits.
template <int N>
EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(const Packet2l& a) {
  // This code path has no 64-bit arithmetic shift, so emulate it: shift
  // logically, then fill the vacated top N bits with copies of the sign bit.
  const Packet2l signbit = psignbit(a);
  const __m128i sign_fill = _mm_slli_epi64(signbit, 64 - N);
  const __m128i shifted = _mm_srli_epi64(a, N);
  // Combine with the intrinsic directly. Calling por() on raw __m128i values
  // would deduce Packet = __m128i instead of using the Packet2l
  // specialization.
  return _mm_or_si128(sign_fill, shifted);
}
template <int N>
EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
  // Zero-filling right shift of each 64-bit lane by N bits.
  const __m128i shifted = _mm_srli_epi64(a, N);
  return shifted;
}
template <int N>
EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
  // Zero-filling left shift of each 64-bit lane by N bits.
  const __m128i shifted = _mm_slli_epi64(a, N);
  return shifted;
}
template <int N>
EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) {
return _mm_srai_epi32(a, N);
@ -1040,7 +1206,6 @@ template <int N>
EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) {
  // Zero-filling left shift of each 32-bit lane by N bits.
  const __m128i shifted = _mm_slli_epi32(a, N);
  return shifted;
}
template <int N>
EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(const Packet4ui& a) {
return _mm_srli_epi32(a, N);
@ -1065,12 +1230,17 @@ EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
return _mm_and_pd(a, mask);
}
template <>
EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) {
  // Branch-free absolute value: with s = all-ones for negative lanes and zero
  // otherwise, (a ^ s) - s negates exactly the negative lanes.
  const Packet2l sign_fill = psignbit(a);
  const __m128i flipped = _mm_xor_si128(a, sign_fill);
  return _mm_sub_epi64(flipped, sign_fill);
}
template <>
EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
#ifdef EIGEN_VECTORIZE_SSSE3
return _mm_abs_epi32(a);
#else
Packet4i aux = _mm_srai_epi32(a, 31);
return _mm_sub_epi32(_mm_xor_si128(a, aux), aux);
Packet4i signbit = psignbit(a);
return _mm_sub_epi32(_mm_xor_si128(a, signbit), signbit);
#endif
}
template <>
@ -1078,24 +1248,6 @@ EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) {
return a;
}
template <>
EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) {
return _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(a), 31));
}
template <>
EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
Packet4f tmp = psignbit<Packet4f>(_mm_castpd_ps(a));
#ifdef EIGEN_VECTORIZE_AVX
return _mm_castps_pd(_mm_permute_ps(tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
#else
return _mm_castps_pd(_mm_shuffle_ps(tmp, tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
#endif // EIGEN_VECTORIZE_AVX
}
template <>
EIGEN_STRONG_INLINE Packet4ui psignbit(const Packet4ui& a) {
return pzero(a);
}
#ifdef EIGEN_VECTORIZE_SSE4_1
template <>
EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
@ -1217,6 +1369,10 @@ EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from);
}
// Aligned load of two int64_t values.
template <>
EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(const int64_t* from) {
  EIGEN_DEBUG_ALIGNED_LOAD
  const __m128i* const src = reinterpret_cast<const __m128i*>(from);
  return _mm_load_si128(src);
}
// Aligned load of four int values.
template <>
EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) {
  EIGEN_DEBUG_ALIGNED_LOAD
  const __m128i* const src = reinterpret_cast<const __m128i*>(from);
  return _mm_load_si128(src);
}
@ -1251,6 +1407,11 @@ EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
return _mm_loadu_pd(from);
}
// Unaligned load of two int64_t values.
template <>
EIGEN_STRONG_INLINE Packet2l ploadu<Packet2l>(const int64_t* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD
  const __m128i* const src = reinterpret_cast<const __m128i*>(from);
  return _mm_loadu_si128(src);
}
template <>
EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) {
EIGEN_DEBUG_UNALIGNED_LOAD
return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
@ -1299,6 +1460,10 @@ EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
return pset1<Packet2d>(from[0]);
}
template <>
EIGEN_STRONG_INLINE Packet2l ploaddup<Packet2l>(const int64_t* from) {
  // With only two lanes, duplicating the first element is a plain broadcast.
  const int64_t& first = from[0];
  return pset1<Packet2l>(first);
}
template <>
EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) {
Packet4i tmp;
tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
@ -1337,6 +1502,10 @@ EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from);
}
// Aligned store of two int64_t values.
template <>
EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet2l& from) {
  EIGEN_DEBUG_ALIGNED_STORE
  __m128i* const dst = reinterpret_cast<__m128i*>(to);
  _mm_store_si128(dst, from);
}
// Aligned store of four int values.
template <>
EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) {
  EIGEN_DEBUG_ALIGNED_STORE
  __m128i* const dst = reinterpret_cast<__m128i*>(to);
  _mm_store_si128(dst, from);
}
@ -1358,6 +1527,10 @@ EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from);
}
// Unaligned store of two int64_t values.
template <>
EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet2l& from) {
  EIGEN_DEBUG_UNALIGNED_STORE
  __m128i* const dst = reinterpret_cast<__m128i*>(to);
  _mm_storeu_si128(dst, from);
}
// Unaligned store of four int values.
template <>
EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) {
  EIGEN_DEBUG_UNALIGNED_STORE
  __m128i* const dst = reinterpret_cast<__m128i*>(to);
  _mm_storeu_si128(dst, from);
}
@ -1393,25 +1566,142 @@ EIGEN_STRONG_INLINE void pstores(double* to, const Packet2d& from) {
}
template <>
EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
return _mm_shuffle_ps(a, a, 0x1B);
}
template <>
EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
  // Immediate 0x1 picks a's upper lane for result lane 0 and a's lower lane
  // for result lane 1, i.e. swaps the two doubles.
  const Packet2d swapped = _mm_shuffle_pd(a, a, 0x1);
  return swapped;
}
template <>
EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a) {
  // Reuse the Packet2d lane swap; the casts only reinterpret the bits.
  const Packet2d as_double = _mm_castsi128_pd(a);
  const Packet2d swapped = preverse(as_double);
  return _mm_castpd_si128(swapped);
}
template <>
EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
  // 0x1B == _MM_SHUFFLE(0, 1, 2, 3): reverses the order of the four dwords.
  const __m128i reversed = _mm_shuffle_epi32(a, 0x1B);
  return reversed;
}
template <>
EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) {
  // 0x1B == _MM_SHUFFLE(0, 1, 2, 3): reverses the order of the four dwords.
  const __m128i reversed = _mm_shuffle_epi32(a, 0x1B);
  return reversed;
}
// Reverses the order of the 16 bytes in the packet.
template <>
EIGEN_STRONG_INLINE Packet16b preverse(const Packet16b& a) {
#ifdef EIGEN_VECTORIZE_SSSE3
  // Single byte shuffle: result byte i takes source byte 15 - i.
  const __m128i byte_indices = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
  return _mm_shuffle_epi8(a, byte_indices);
#else
  // Three-stage fallback: reverse the dwords, swap the 16-bit words inside
  // each dword, then swap the two bytes inside each 16-bit word.
  const __m128i dwords_reversed = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3));
  const __m128i low_words_swapped = _mm_shufflelo_epi16(dwords_reversed, _MM_SHUFFLE(2, 3, 0, 1));
  const __m128i words_reversed = _mm_shufflehi_epi16(low_words_swapped, _MM_SHUFFLE(2, 3, 0, 1));
  return _mm_or_si128(_mm_slli_epi16(words_reversed, 8), _mm_srli_epi16(words_reversed, 8));
#endif
}
// pfirst: extracts lane 0 of a packet as a scalar. Three variants are provided
// because some MSVC versions need workarounds (see the per-branch notes).
#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010.
// Direct access to the struct members fixed bug #62.
template <>
EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
  return a.m128_f32[0];
}
template <>
EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
  return a.m128d_f64[0];
}
template <>
EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) {
  int64_t x = _mm_cvtsi128_si64(a);
  return x;
}
template <>
EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
  int x = _mm_cvtsi128_si32(a);
  return x;
}
template <>
EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
  uint32_t x = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
  return x;
}
#elif EIGEN_COMP_MSVC_STRICT
// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
template <>
EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
  float x = _mm_cvtss_f32(a);
  return x;
}
template <>
EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
  double x = _mm_cvtsd_f64(a);
  return x;
}
template <>
EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) {
  // This branch is MSVC without EIGEN_OS_WIN64 (presumably 32-bit x86), where
  // _mm_cvtsi128_si64 is not available. Store the low quadword instead;
  // _mm_storel_epi64 has no alignment requirement.
  int64_t x;
  _mm_storel_epi64(reinterpret_cast<__m128i*>(&x), a);
  return x;
}
template <>
EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
  int x = _mm_cvtsi128_si32(a);
  return x;
}
template <>
EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
  uint32_t x = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
  return x;
}
#else
template <>
EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
  return _mm_cvtss_f32(a);
}
template <>
EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
  return _mm_cvtsd_f64(a);
}
template <>
EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) {
#if defined(__x86_64__) || defined(_M_X64)
  return _mm_cvtsi128_si64(a);
#else
  // _mm_cvtsi128_si64 only exists when targeting 64-bit x86; on 32-bit
  // targets store the low quadword instead.
  int64_t x;
  _mm_storel_epi64(reinterpret_cast<__m128i*>(&x), a);
  return x;
#endif
}
template <>
EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
  return _mm_cvtsi128_si32(a);
}
template <>
EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
  return numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
}
#endif
template <>
EIGEN_STRONG_INLINE bool pfirst<Packet16b>(const Packet16b& a) {
  // Only the lowest bit of the first byte encodes the boolean value.
  const int low_dword = _mm_cvtsi128_si32(a);
  return static_cast<bool>(low_dword & 1);
}
// Strided gather: lane i is read from from[i * stride].
template <>
EIGEN_STRONG_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
  const float lane0 = from[0 * stride];
  const float lane1 = from[1 * stride];
  const float lane2 = from[2 * stride];
  const float lane3 = from[3 * stride];
  // _mm_set_ps takes its arguments from the highest lane down to lane 0.
  return _mm_set_ps(lane3, lane2, lane1, lane0);
}
template <>
EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
EIGEN_STRONG_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
return _mm_set_pd(from[1 * stride], from[0 * stride]);
}
template <>
EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride) {
EIGEN_STRONG_INLINE Packet2l pgather<int64_t, Packet2l>(const int64_t* from, Index stride) {
return _mm_set_epi64x(from[1 * stride], from[0 * stride]);
}
// Strided gather: lane i is read from from[i * stride].
template <>
EIGEN_STRONG_INLINE Packet4i pgather<int, Packet4i>(const int* from, Index stride) {
  const int lane0 = from[0 * stride];
  const int lane1 = from[1 * stride];
  const int lane2 = from[2 * stride];
  const int lane3 = from[3 * stride];
  // _mm_set_epi32 takes its arguments from the highest lane down to lane 0.
  return _mm_set_epi32(lane3, lane2, lane1, lane0);
}
template <>
EIGEN_DEVICE_FUNC inline Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride) {
EIGEN_STRONG_INLINE Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride) {
return _mm_set_epi32(numext::bit_cast<int32_t>(from[3 * stride]), numext::bit_cast<int32_t>(from[2 * stride]),
numext::bit_cast<int32_t>(from[1 * stride]), numext::bit_cast<int32_t>(from[0 * stride]));
}
template <>
EIGEN_DEVICE_FUNC inline Packet16b pgather<bool, Packet16b>(const bool* from, Index stride) {
EIGEN_STRONG_INLINE Packet16b pgather<bool, Packet16b>(const bool* from, Index stride) {
return _mm_set_epi8(from[15 * stride], from[14 * stride], from[13 * stride], from[12 * stride], from[11 * stride],
from[10 * stride], from[9 * stride], from[8 * stride], from[7 * stride], from[6 * stride],
from[5 * stride], from[4 * stride], from[3 * stride], from[2 * stride], from[1 * stride],
@ -1419,33 +1709,38 @@ EIGEN_DEVICE_FUNC inline Packet16b pgather<bool, Packet16b>(const bool* from, In
}
template <>
EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
to[stride * 0] = _mm_cvtss_f32(from);
to[stride * 1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1));
to[stride * 2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2));
to[stride * 3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3));
EIGEN_STRONG_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
to[stride * 0] = pfirst(from);
to[stride * 1] = pfirst(_mm_shuffle_ps(from, from, 1));
to[stride * 2] = pfirst(_mm_shuffle_ps(from, from, 2));
to[stride * 3] = pfirst(_mm_shuffle_ps(from, from, 3));
}
template <>
EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
to[stride * 0] = _mm_cvtsd_f64(from);
to[stride * 1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1));
EIGEN_STRONG_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
to[stride * 0] = pfirst(from);
to[stride * 1] = pfirst(preverse(from));
}
template <>
EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) {
EIGEN_STRONG_INLINE void pscatter<int64_t, Packet2l>(int64_t* to, const Packet2l& from, Index stride) {
to[stride * 0] = pfirst(from);
to[stride * 1] = pfirst(preverse(from));
}
// Strided scatter: lane i is written to to[stride * i].
template <>
EIGEN_STRONG_INLINE void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) {
  // Shuffling with immediate k moves lane k into lane 0, from where
  // _mm_cvtsi128_si32 can extract it.
  const int lane0 = _mm_cvtsi128_si32(from);
  const int lane1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
  const int lane2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
  const int lane3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
  to[stride * 0] = lane0;
  to[stride * 1] = lane1;
  to[stride * 2] = lane2;
  to[stride * 3] = lane3;
}
template <>
EIGEN_DEVICE_FUNC inline void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from, Index stride) {
EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from, Index stride) {
to[stride * 0] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(from));
to[stride * 1] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1)));
to[stride * 2] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2)));
to[stride * 3] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3)));
}
template <>
EIGEN_DEVICE_FUNC inline void pscatter<bool, Packet16b>(bool* to, const Packet16b& from, Index stride) {
EIGEN_STRONG_INLINE void pscatter<bool, Packet16b>(bool* to, const Packet16b& from, Index stride) {
to[4 * stride * 0] = _mm_cvtsi128_si32(from);
to[4 * stride * 1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
to[4 * stride * 2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
@ -1485,106 +1780,15 @@ EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
_mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
}
template <>
EIGEN_STRONG_INLINE void prefetch<int64_t>(const int64_t* addr) {
  // Issue a prefetch with the _MM_HINT_T0 locality hint.
  const SsePrefetchPtrType target = (SsePrefetchPtrType)(addr);
  _mm_prefetch(target, _MM_HINT_T0);
}
template <>
EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) {
  // Issue a prefetch with the _MM_HINT_T0 locality hint.
  const SsePrefetchPtrType target = (SsePrefetchPtrType)(addr);
  _mm_prefetch(target, _MM_HINT_T0);
}
#endif
#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
// Direct of the struct members fixed bug #62.
template <>
EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
return a.m128_f32[0];
}
template <>
EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
return a.m128d_f64[0];
}
template <>
EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
int x = _mm_cvtsi128_si32(a);
return x;
}
template <>
EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
uint32_t x = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
return x;
}
#elif EIGEN_COMP_MSVC_STRICT
// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
template <>
EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
float x = _mm_cvtss_f32(a);
return x;
}
template <>
EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
double x = _mm_cvtsd_f64(a);
return x;
}
template <>
EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
int x = _mm_cvtsi128_si32(a);
return x;
}
template <>
EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
uint32_t x = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
return x;
}
#else
template <>
EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
return _mm_cvtss_f32(a);
}
template <>
EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
return _mm_cvtsd_f64(a);
}
template <>
EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
return _mm_cvtsi128_si32(a);
}
template <>
EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
return numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
}
#endif
template <>
EIGEN_STRONG_INLINE bool pfirst<Packet16b>(const Packet16b& a) {
int x = _mm_cvtsi128_si32(a);
return static_cast<bool>(x & 1);
}
template <>
EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
return _mm_shuffle_ps(a, a, 0x1B);
}
template <>
EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
return _mm_shuffle_pd(a, a, 0x1);
}
template <>
EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
return _mm_shuffle_epi32(a, 0x1B);
}
template <>
EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) {
return _mm_shuffle_epi32(a, 0x1B);
}
template <>
EIGEN_STRONG_INLINE Packet16b preverse(const Packet16b& a) {
#ifdef EIGEN_VECTORIZE_SSSE3
__m128i mask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
return _mm_shuffle_epi8(a, mask);
#else
Packet16b tmp = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3));
tmp = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tmp, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
return _mm_or_si128(_mm_slli_epi16(tmp, 8), _mm_srli_epi16(tmp, 8));
#endif
}
template <>
EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
return pfrexp_generic(a, exponent);
@ -1610,6 +1814,7 @@ EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f&
// We specialize pldexp here, since the generic implementation uses Packet2l, which is not well
// supported by SSE, and has more range than is needed for exponents.
// TODO(rmlarsen): Remove this specialization once Packet2l has support for casting.
template <>
EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
// Clamp exponent to [-2099, 2099]
@ -1690,6 +1895,11 @@ EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
// #endif
}
// Horizontal sum of the two 64-bit lanes.
template <>
EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a) {
  // Copy the upper lane into lane 0, add it to the original, and read lane 0.
  const __m128i upper_in_low = _mm_unpackhi_epi64(a, a);
  const __m128i lane_sum = _mm_add_epi64(a, upper_in_low);
  return pfirst<Packet2l>(lane_sum);
}
#ifdef EIGEN_VECTORIZE_SSSE3
template <>
EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) {
@ -1701,7 +1911,6 @@ EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) {
Packet4ui tmp0 = _mm_hadd_epi32(a, a);
return pfirst<Packet4ui>(_mm_hadd_epi32(tmp0, tmp0));
}
#else
template <>
EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) {
@ -1734,9 +1943,15 @@ EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
return pfirst<Packet2d>(_mm_mul_sd(a, _mm_unpackhi_pd(a, a)));
}
// Horizontal reduction: returns the product of the two 64-bit lanes of `a`.
template <>
EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(const Packet2l& a) {
  // Spill both lanes to an aligned scratch buffer and multiply them as scalars.
  EIGEN_ALIGN16 int64_t lanes[2];
  pstore(lanes, a);
  const int64_t first = lanes[0];
  const int64_t second = lanes[1];
  return first * second;
}
template <>
EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) {
// after some experiments, it seems this is the fastest way to implement it
// for GCC (e.g., reusing pmul is very slow!)
// TODO try to call _mm_mul_epu32 directly
EIGEN_ALIGN16 int aux[4];
pstore(aux, a);
@ -1846,11 +2061,21 @@ EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a) {
// return _mm_movemask_ps(x) == 0xF;
// }
// Returns true when the sign (most significant) bit of at least one double lane of `x` is set.
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet2d& x) {
  // One bit per lane, taken from each lane's sign bit.
  const int lane_bits = _mm_movemask_pd(x);
  return lane_bits != 0x0;
}
// Returns true when the sign (most significant) bit of at least one float lane of `x` is set.
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) {
  // One bit per lane, taken from each lane's sign bit.
  const int lane_bits = _mm_movemask_ps(x);
  return lane_bits != 0x0;
}
// Returns true when the most significant bit of at least one 64-bit lane of `x` is set.
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet2l& x) {
  // Reinterpret the integer lanes as doubles (no conversion) so the
  // double-precision movemask can gather the top bit of each 64-bit lane.
  const __m128d as_double = _mm_castsi128_pd(x);
  const int lane_bits = _mm_movemask_pd(as_double);
  return lane_bits != 0x0;
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet4i& x) {
return _mm_movemask_ps(_mm_castsi128_ps(x)) != 0x0;
@ -1860,17 +2085,23 @@ EIGEN_STRONG_INLINE bool predux_any(const Packet4ui& x) {
return _mm_movemask_ps(_mm_castsi128_ps(x)) != 0x0;
}
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
_MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
}
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
__m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
kernel.packet[1] = tmp;
}
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2l, 2>& kernel) {
__m128i tmp = _mm_unpackhi_epi64(kernel.packet[0], kernel.packet[1]);
kernel.packet[0] = _mm_unpacklo_epi64(kernel.packet[0], kernel.packet[1]);
kernel.packet[1] = tmp;
}
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
__m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
__m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
__m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
@ -1881,11 +2112,11 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
kernel.packet[2] = _mm_unpacklo_epi64(T2, T3);
kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
}
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4ui, 4>& kernel) {
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4ui, 4>& kernel) {
ptranspose((PacketBlock<Packet4i, 4>&)kernel);
}
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16b, 4>& kernel) {
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16b, 4>& kernel) {
__m128i T0 = _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]);
__m128i T1 = _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]);
__m128i T2 = _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]);
@ -1896,7 +2127,7 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16b, 4>& kernel) {
kernel.packet[3] = _mm_unpackhi_epi16(T1, T3);
}
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16b, 16>& kernel) {
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16b, 16>& kernel) {
// If we number the elements in the input thus:
// kernel.packet[ 0] = {00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 0a, 0b, 0c, 0d, 0e, 0f}
// kernel.packet[ 1] = {10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 1a, 1b, 1c, 1d, 1e, 1f}
@ -1982,6 +2213,18 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16b, 16>& kernel) {
kernel.packet[15] = _mm_unpackhi_epi64(u7, uf);
}
// Lane-wise select: for each of the two 64-bit lanes, takes the lane from `thenPacket`
// when the matching ifPacket.select entry is non-zero, and from `elsePacket` when it is zero.
template <>
EIGEN_STRONG_INLINE Packet2l pblend(const Selector<2>& ifPacket, const Packet2l& thenPacket,
                                    const Packet2l& elsePacket) {
  // Place the selector entries in 64-bit lanes: lane 0 = select[0], lane 1 = select[1].
  const __m128i selector = _mm_set_epi64x(ifPacket.select[1], ifPacket.select[0]);
  // Compare against zero: the resulting mask marks the lanes that must come from elsePacket.
  const __m128i pick_else = pcmp_eq<Packet2l>(selector, _mm_setzero_si128());
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_blendv_epi8(thenPacket, elsePacket, pick_else);
#else
  // Manual blend: (~mask & then) | (mask & else).
  const __m128i from_then = _mm_andnot_si128(pick_else, thenPacket);
  const __m128i from_else = _mm_and_si128(pick_else, elsePacket);
  return _mm_or_si128(from_then, from_else);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
const Packet4i& elsePacket) {
@ -2189,11 +2432,6 @@ struct packet_traits<Eigen::half> : default_packet_traits {
HasMax = 0,
HasConj = 0,
HasSetLinear = 0,
HasSqrt = 0,
HasRsqrt = 0,
HasExp = 0,
HasLog = 0,
HasBlend = 0
};
};