mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-08-11 19:29:02 +08:00
Add Packet2l for SSE.
This commit is contained in:
parent
1d4369c2ff
commit
126ba1a166
@ -44,24 +44,30 @@ namespace internal {
|
||||
|
||||
struct default_packet_traits {
|
||||
enum {
|
||||
// Ops that are implemented for most types.
|
||||
HasAdd = 1,
|
||||
HasSub = 1,
|
||||
HasShift = 1,
|
||||
HasMul = 1,
|
||||
HasNegate = 1,
|
||||
HasAbs = 1,
|
||||
HasArg = 0,
|
||||
HasAbs2 = 1,
|
||||
HasAbsDiff = 0,
|
||||
HasMin = 1,
|
||||
HasMax = 1,
|
||||
HasConj = 1,
|
||||
HasSetLinear = 1,
|
||||
HasSign = 1,
|
||||
|
||||
HasArg = 0,
|
||||
HasAbsDiff = 0,
|
||||
HasBlend = 0,
|
||||
// This flag is used to indicate whether packet comparison is supported.
|
||||
// pcmp_eq, pcmp_lt and pcmp_le should be defined for it to be true.
|
||||
HasCmp = 0,
|
||||
HasRound = 0,
|
||||
HasRint = 0,
|
||||
HasFloor = 0,
|
||||
HasCeil = 0,
|
||||
|
||||
HasDiv = 0,
|
||||
HasReciprocal = 0,
|
||||
@ -73,7 +79,6 @@ struct default_packet_traits {
|
||||
HasLog1p = 0,
|
||||
HasLog10 = 0,
|
||||
HasPow = 0,
|
||||
|
||||
HasSin = 0,
|
||||
HasCos = 0,
|
||||
HasTan = 0,
|
||||
@ -96,12 +101,7 @@ struct default_packet_traits {
|
||||
HasIGammaDerA = 0,
|
||||
HasGammaSampleDerAlpha = 0,
|
||||
HasIGammac = 0,
|
||||
HasBetaInc = 0,
|
||||
|
||||
HasRound = 0,
|
||||
HasRint = 0,
|
||||
HasFloor = 0,
|
||||
HasCeil = 0
|
||||
HasBetaInc = 0
|
||||
};
|
||||
};
|
||||
|
||||
|
@ -52,6 +52,7 @@ typedef __m128d Packet2d;
|
||||
typedef eigen_packet_wrapper<__m128i, 0> Packet4i;
|
||||
typedef eigen_packet_wrapper<__m128i, 1> Packet16b;
|
||||
typedef eigen_packet_wrapper<__m128i, 4> Packet4ui;
|
||||
typedef eigen_packet_wrapper<__m128i, 5> Packet2l;
|
||||
|
||||
template <>
|
||||
struct is_arithmetic<__m128> {
|
||||
@ -69,6 +70,10 @@ template <>
|
||||
struct is_arithmetic<Packet4i> {
|
||||
enum { value = true };
|
||||
};
|
||||
template <>
|
||||
struct is_arithmetic<Packet2l> {
|
||||
enum { value = true };
|
||||
};
|
||||
// Note that `Packet4ui` uses the underlying type `__m128i`, which is
|
||||
// interpreted as a vector of _signed_ `int32`s, which breaks some arithmetic
|
||||
// operations used in `GenericPacketMath.h`.
|
||||
@ -213,10 +218,10 @@ struct packet_traits<int> : default_packet_traits {
|
||||
enum {
|
||||
Vectorizable = 1,
|
||||
AlignedOnScalar = 1,
|
||||
HasCmp = 1,
|
||||
HasDiv = 1,
|
||||
size = 4,
|
||||
|
||||
HasCmp = 1,
|
||||
HasDiv = 1,
|
||||
HasShift = 1,
|
||||
HasBlend = 1
|
||||
};
|
||||
@ -232,10 +237,22 @@ struct packet_traits<uint32_t> : default_packet_traits {
|
||||
|
||||
HasDiv = 0,
|
||||
HasNegate = 0,
|
||||
HasSqrt = 0,
|
||||
HasCmp = 1,
|
||||
HasMin = 1,
|
||||
HasMax = 1,
|
||||
HasShift = 1,
|
||||
HasBlend = 1
|
||||
};
|
||||
};
|
||||
template <>
|
||||
struct packet_traits<int64_t> : default_packet_traits {
|
||||
typedef Packet2l type;
|
||||
typedef Packet2l half;
|
||||
enum {
|
||||
Vectorizable = 1,
|
||||
AlignedOnScalar = 1,
|
||||
size = 2,
|
||||
|
||||
HasDiv = 0,
|
||||
HasCmp = 1,
|
||||
HasShift = 1,
|
||||
HasBlend = 1
|
||||
};
|
||||
@ -250,12 +267,8 @@ struct packet_traits<bool> : default_packet_traits {
|
||||
AlignedOnScalar = 1,
|
||||
size = 16,
|
||||
|
||||
HasAdd = 1,
|
||||
HasSub = 1,
|
||||
HasCmp = 1, // note -- only pcmp_eq is defined
|
||||
HasShift = 0,
|
||||
HasMul = 1,
|
||||
HasNegate = 1,
|
||||
HasAbs = 0,
|
||||
HasAbs2 = 0,
|
||||
HasMin = 0,
|
||||
@ -283,6 +296,19 @@ template <>
|
||||
struct unpacket_traits<Packet2d> {
|
||||
typedef double type;
|
||||
typedef Packet2d half;
|
||||
typedef Packet2l integer_packet;
|
||||
enum {
|
||||
size = 2,
|
||||
alignment = Aligned16,
|
||||
vectorizable = true,
|
||||
masked_load_available = false,
|
||||
masked_store_available = false
|
||||
};
|
||||
};
|
||||
template <>
|
||||
struct unpacket_traits<Packet2l> {
|
||||
typedef int64_t type;
|
||||
typedef Packet2l half;
|
||||
enum {
|
||||
size = 2,
|
||||
alignment = Aligned16,
|
||||
@ -348,6 +374,10 @@ EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
|
||||
return _mm_set1_pd(from);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) {
|
||||
return _mm_set1_epi64x(from);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
|
||||
return _mm_set1_epi32(from);
|
||||
}
|
||||
@ -374,6 +404,10 @@ EIGEN_STRONG_INLINE Packet4f peven_mask(const Packet4f& /*a*/) {
|
||||
return _mm_castsi128_ps(_mm_set_epi32(0, -1, 0, -1));
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2l peven_mask(const Packet2l& /*a*/) {
|
||||
return _mm_set_epi32(0, 0, -1, -1);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4i peven_mask(const Packet4i& /*a*/) {
|
||||
return _mm_set_epi32(0, -1, 0, -1);
|
||||
}
|
||||
@ -395,6 +429,10 @@ EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /*a*/) {
|
||||
return _mm_setzero_pd();
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2l pzero(const Packet2l& /*a*/) {
|
||||
return _mm_setzero_si128();
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4i pzero(const Packet4i& /*a*/) {
|
||||
return _mm_setzero_si128();
|
||||
}
|
||||
@ -424,6 +462,10 @@ EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
|
||||
return _mm_add_pd(pset1<Packet2d>(a), _mm_set_pd(1, 0));
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2l plset<Packet2l>(const int64_t& a) {
|
||||
return _mm_add_epi32(pset1<Packet2l>(a), _mm_set_epi64x(1, 0));
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) {
|
||||
return _mm_add_epi32(pset1<Packet4i>(a), _mm_set_epi32(3, 2, 1, 0));
|
||||
}
|
||||
@ -441,6 +483,10 @@ EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b
|
||||
return _mm_add_pd(a, b);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2l padd<Packet2l>(const Packet2l& a, const Packet2l& b) {
|
||||
return _mm_add_epi64(a, b);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
||||
return _mm_add_epi32(a, b);
|
||||
}
|
||||
@ -474,6 +520,10 @@ EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b
|
||||
return _mm_sub_pd(a, b);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(const Packet2l& a, const Packet2l& b) {
|
||||
return _mm_sub_epi64(a, b);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
||||
return _mm_sub_epi32(a, b);
|
||||
}
|
||||
@ -520,9 +570,14 @@ EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
|
||||
const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0, 0x80000000, 0x0, 0x80000000));
|
||||
return _mm_xor_pd(a, mask);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) {
|
||||
return psub(pzero(a), a);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
|
||||
return psub(Packet4i(_mm_setr_epi32(0, 0, 0, 0)), a);
|
||||
return psub(pzero(a), a);
|
||||
}
|
||||
|
||||
template <>
|
||||
@ -539,6 +594,10 @@ EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
|
||||
return a;
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2l pconj(const Packet2l& a) {
|
||||
return a;
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
|
||||
return a;
|
||||
}
|
||||
@ -552,6 +611,21 @@ EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b
|
||||
return _mm_mul_pd(a, b);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2l pmul<Packet2l>(const Packet2l& a, const Packet2l& b) {
|
||||
// 64-bit mul requires avx512, so do this with 32-bit multiplication
|
||||
__m128i upper32_a = _mm_srli_epi64(a, 32);
|
||||
__m128i upper32_b = _mm_srli_epi64(b, 32);
|
||||
|
||||
// upper * lower
|
||||
__m128i mul1 = _mm_mul_epu32(upper32_a, b);
|
||||
__m128i mul2 = _mm_mul_epu32(upper32_b, a);
|
||||
// Gives us both upper*upper and lower*lower
|
||||
__m128i mul3 = _mm_mul_epu32(a, b);
|
||||
|
||||
__m128i high = _mm_slli_epi64(_mm_add_epi64(mul1, mul2), 32);
|
||||
return _mm_add_epi64(high, mul3);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
||||
#ifdef EIGEN_VECTORIZE_SSE4_1
|
||||
return _mm_mullo_epi32(a, b);
|
||||
@ -602,15 +676,6 @@ EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b
|
||||
#endif
|
||||
}
|
||||
|
||||
// for some weird reason, it has to be overloaded for packets of integers
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
|
||||
return padd(pmul(a, b), c);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4ui pmadd(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c) {
|
||||
return padd(pmul(a, b), c);
|
||||
}
|
||||
#ifdef EIGEN_VECTORIZE_FMA
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
|
||||
@ -659,26 +724,35 @@ EIGEN_STRONG_INLINE Packet2d pmadds<Packet2d>(const Packet2d& a, const Packet2d&
|
||||
|
||||
#ifdef EIGEN_VECTORIZE_SSE4_1
|
||||
template <>
|
||||
EIGEN_DEVICE_FUNC inline Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
|
||||
EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
|
||||
return _mm_blendv_ps(b, a, mask);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_DEVICE_FUNC inline Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) {
|
||||
EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b) {
|
||||
return _mm_castpd_si128(_mm_blendv_pd(_mm_castsi128_pd(b), _mm_castsi128_pd(a), _mm_castsi128_pd(mask)));
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) {
|
||||
return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b), _mm_castsi128_ps(a), _mm_castsi128_ps(mask)));
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_DEVICE_FUNC inline Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) {
|
||||
EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) {
|
||||
return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b), _mm_castsi128_ps(a), _mm_castsi128_ps(mask)));
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_DEVICE_FUNC inline Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) {
|
||||
EIGEN_STRONG_INLINE Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) {
|
||||
return _mm_blendv_pd(b, a, mask);
|
||||
}
|
||||
#endif
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2l ptrue<Packet2l>(const Packet2l& a) {
|
||||
return _mm_cmpeq_epi32(a, a);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4i ptrue<Packet4i>(const Packet4i& a) {
|
||||
return _mm_cmpeq_epi32(a, a);
|
||||
@ -707,6 +781,10 @@ EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b
|
||||
return _mm_and_pd(a, b);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2l pand<Packet2l>(const Packet2l& a, const Packet2l& b) {
|
||||
return _mm_and_si128(a, b);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
||||
return _mm_and_si128(a, b);
|
||||
}
|
||||
@ -728,6 +806,10 @@ EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b)
|
||||
return _mm_or_pd(a, b);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2l por<Packet2l>(const Packet2l& a, const Packet2l& b) {
|
||||
return _mm_or_si128(a, b);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
||||
return _mm_or_si128(a, b);
|
||||
}
|
||||
@ -749,6 +831,10 @@ EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b
|
||||
return _mm_xor_pd(a, b);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2l pxor<Packet2l>(const Packet2l& a, const Packet2l& b) {
|
||||
return _mm_xor_si128(a, b);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
||||
return _mm_xor_si128(a, b);
|
||||
}
|
||||
@ -770,6 +856,10 @@ EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d
|
||||
return _mm_andnot_pd(b, a);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(const Packet2l& a, const Packet2l& b) {
|
||||
return _mm_andnot_si128(b, a);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
||||
return _mm_andnot_si128(b, a);
|
||||
}
|
||||
@ -811,7 +901,6 @@ template <>
|
||||
EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) {
|
||||
return _mm_cmpeq_pd(a, b);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) {
|
||||
return _mm_cmplt_epi32(a, b);
|
||||
@ -821,8 +910,35 @@ EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) {
|
||||
return _mm_cmpeq_epi32(a, b);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4ui pcmp_eq(const Packet4ui& a, const Packet4ui& b) {
|
||||
return _mm_cmpeq_epi32(a, b);
|
||||
EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) {
|
||||
return por(pcmp_lt(a, b), pcmp_eq(a, b));
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2l pcmp_lt(const Packet2l& a, const Packet2l& b) {
|
||||
#ifdef EIGEN_VECTORIZE_SSE4_2
|
||||
return _mm_cmpgt_epi64(b, a);
|
||||
#else
|
||||
Packet4i eq = pcmp_eq<Packet4i>(Packet4i(a), Packet4i(b));
|
||||
Packet2l hi_eq = Packet2l(_mm_shuffle_epi32(eq, (shuffle_mask<1, 1, 3, 3>::mask)));
|
||||
Packet4i lt = pcmp_lt<Packet4i>(Packet4i(a), Packet4i(b));
|
||||
Packet2l hi_lt = Packet2l(_mm_shuffle_epi32(lt, (shuffle_mask<1, 1, 3, 3>::mask)));
|
||||
Packet2l lo_lt = Packet2l(_mm_shuffle_epi32(lt, (shuffle_mask<0, 0, 2, 2>::mask)));
|
||||
// return hi(a) < hi(b) || (hi(a) == hi(b) && lo(a) < lo(b))
|
||||
return por(hi_lt, pand(hi_eq, lo_lt));
|
||||
#endif
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2l pcmp_eq(const Packet2l& a, const Packet2l& b) {
|
||||
#ifdef EIGEN_VECTORIZE_SSE4_1
|
||||
return _mm_cmpeq_epi64(a, b);
|
||||
#else
|
||||
Packet4i tmp = pcmp_eq<Packet4i>(Packet4i(a), Packet4i(b));
|
||||
return Packet2l(pand<Packet4i>(tmp, _mm_shuffle_epi32(tmp, (shuffle_mask<1, 0, 3, 2>::mask))));
|
||||
#endif
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2l pcmp_le(const Packet2l& a, const Packet2l& b) {
|
||||
return por(pcmp_lt(a, b), pcmp_eq(a, b));
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packet16b& b) {
|
||||
@ -831,8 +947,8 @@ EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packet16b& b) {
|
||||
return _mm_and_si128(_mm_cmpeq_epi8(a, b), kBoolMask);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) {
|
||||
return por(pcmp_lt(a, b), pcmp_eq(a, b));
|
||||
EIGEN_STRONG_INLINE Packet4ui pcmp_eq(const Packet4ui& a, const Packet4ui& b) {
|
||||
return _mm_cmpeq_epi32(a, b);
|
||||
}
|
||||
|
||||
template <>
|
||||
@ -876,6 +992,11 @@ EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b
|
||||
#endif
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(const Packet2l& a, const Packet2l& b) {
|
||||
Packet2l a_lt_mask = pcmp_lt(a, b);
|
||||
return por(pandnot(b, a_lt_mask), pand(a, a_lt_mask));
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
||||
#ifdef EIGEN_VECTORIZE_SSE4_1
|
||||
return _mm_min_epi32(a, b);
|
||||
@ -937,6 +1058,11 @@ EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b
|
||||
#endif
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2l pmax<Packet2l>(const Packet2l& a, const Packet2l& b) {
|
||||
Packet2l a_lt_mask = pcmp_lt(a, b);
|
||||
return por(pandnot(a, a_lt_mask), pand(b, a_lt_mask));
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
|
||||
#ifdef EIGEN_VECTORIZE_SSE4_1
|
||||
return _mm_max_epi32(a, b);
|
||||
@ -1028,6 +1154,46 @@ EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, con
|
||||
return pminmax_propagate_nan(a, b, pmax<Packet2d>);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) {
|
||||
return _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(a), 31));
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
|
||||
Packet4f tmp = psignbit<Packet4f>(_mm_castpd_ps(a));
|
||||
#ifdef EIGEN_VECTORIZE_AVX
|
||||
return _mm_castps_pd(_mm_permute_ps(tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
|
||||
#else
|
||||
return _mm_castps_pd(_mm_shuffle_ps(tmp, tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
|
||||
#endif // EIGEN_VECTORIZE_AVX
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4i psignbit(const Packet4i& a) {
|
||||
return _mm_srai_epi32(a, 31);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4ui psignbit(const Packet4ui& a) {
|
||||
return pzero(a);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2l psignbit(const Packet2l& a) {
|
||||
Packet4i tmp = psignbit<Packet4i>(Packet4i(a));
|
||||
return Packet2l(_mm_shuffle_epi32(tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
|
||||
}
|
||||
|
||||
template <int N>
|
||||
EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(const Packet2l& a) {
|
||||
Packet2l signbit = psignbit(a);
|
||||
return por(_mm_slli_epi64(signbit, 64 - N), _mm_srli_epi64(a, N));
|
||||
}
|
||||
template <int N>
|
||||
EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
|
||||
return _mm_srli_epi64(a, N);
|
||||
}
|
||||
template <int N>
|
||||
EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
|
||||
return _mm_slli_epi64(a, N);
|
||||
}
|
||||
template <int N>
|
||||
EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) {
|
||||
return _mm_srai_epi32(a, N);
|
||||
@ -1040,7 +1206,6 @@ template <int N>
|
||||
EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) {
|
||||
return _mm_slli_epi32(a, N);
|
||||
}
|
||||
|
||||
template <int N>
|
||||
EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(const Packet4ui& a) {
|
||||
return _mm_srli_epi32(a, N);
|
||||
@ -1065,12 +1230,17 @@ EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
|
||||
return _mm_and_pd(a, mask);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) {
|
||||
Packet2l signbit = psignbit(a);
|
||||
return _mm_sub_epi64(_mm_xor_si128(a, signbit), signbit);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
|
||||
#ifdef EIGEN_VECTORIZE_SSSE3
|
||||
return _mm_abs_epi32(a);
|
||||
#else
|
||||
Packet4i aux = _mm_srai_epi32(a, 31);
|
||||
return _mm_sub_epi32(_mm_xor_si128(a, aux), aux);
|
||||
Packet4i signbit = psignbit(a);
|
||||
return _mm_sub_epi32(_mm_xor_si128(a, signbit), signbit);
|
||||
#endif
|
||||
}
|
||||
template <>
|
||||
@ -1078,24 +1248,6 @@ EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) {
|
||||
return a;
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) {
|
||||
return _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(a), 31));
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
|
||||
Packet4f tmp = psignbit<Packet4f>(_mm_castpd_ps(a));
|
||||
#ifdef EIGEN_VECTORIZE_AVX
|
||||
return _mm_castps_pd(_mm_permute_ps(tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
|
||||
#else
|
||||
return _mm_castps_pd(_mm_shuffle_ps(tmp, tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
|
||||
#endif // EIGEN_VECTORIZE_AVX
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4ui psignbit(const Packet4ui& a) {
|
||||
return pzero(a);
|
||||
}
|
||||
|
||||
#ifdef EIGEN_VECTORIZE_SSE4_1
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
|
||||
@ -1217,6 +1369,10 @@ EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
|
||||
EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(const int64_t* from) {
|
||||
EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) {
|
||||
EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
|
||||
}
|
||||
@ -1251,6 +1407,11 @@ EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
|
||||
return _mm_loadu_pd(from);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2l ploadu<Packet2l>(const int64_t* from) {
|
||||
EIGEN_DEBUG_UNALIGNED_LOAD
|
||||
return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) {
|
||||
EIGEN_DEBUG_UNALIGNED_LOAD
|
||||
return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
|
||||
@ -1299,6 +1460,10 @@ EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
|
||||
return pset1<Packet2d>(from[0]);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2l ploaddup<Packet2l>(const int64_t* from) {
|
||||
return pset1<Packet2l>(from[0]);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) {
|
||||
Packet4i tmp;
|
||||
tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
|
||||
@ -1337,6 +1502,10 @@ EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
|
||||
EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet2l& from) {
|
||||
EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) {
|
||||
EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
|
||||
}
|
||||
@ -1358,6 +1527,10 @@ EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
|
||||
EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet2l& from) {
|
||||
EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) {
|
||||
EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
|
||||
}
|
||||
@ -1393,25 +1566,142 @@ EIGEN_STRONG_INLINE void pstores(double* to, const Packet2d& from) {
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
|
||||
EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
|
||||
return _mm_shuffle_ps(a, a, 0x1B);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
|
||||
return _mm_shuffle_pd(a, a, 0x1);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a) {
|
||||
return _mm_castpd_si128(preverse(_mm_castsi128_pd(a)));
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
|
||||
return _mm_shuffle_epi32(a, 0x1B);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) {
|
||||
return _mm_shuffle_epi32(a, 0x1B);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16b preverse(const Packet16b& a) {
|
||||
#ifdef EIGEN_VECTORIZE_SSSE3
|
||||
__m128i mask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
return _mm_shuffle_epi8(a, mask);
|
||||
#else
|
||||
Packet16b tmp = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3));
|
||||
tmp = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tmp, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
|
||||
return _mm_or_si128(_mm_slli_epi16(tmp, 8), _mm_srli_epi16(tmp, 8));
|
||||
#endif
|
||||
}
|
||||
|
||||
#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
|
||||
// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
|
||||
// Direct access to the struct members fixed bug #62.
|
||||
// Returns lane 0 of the packet. This variant reads the m128_f32 union member
// directly instead of going through an intrinsic; the form is deliberate (see
// the comment at the top of this preprocessor branch).
template <>
|
||||
EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
|
||||
return a.m128_f32[0];
|
||||
}
|
||||
// Returns lane 0 of the packet. This variant reads the m128d_f64 union member
// directly instead of going through an intrinsic; the form is deliberate (see
// the comment at the top of this preprocessor branch).
template <>
|
||||
EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
|
||||
return a.m128d_f64[0];
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) {
|
||||
int64_t x = _mm_cvtsi128_si64(a);
|
||||
return x;
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
|
||||
int x = _mm_cvtsi128_si32(a);
|
||||
return x;
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
|
||||
uint32_t x = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
|
||||
return x;
|
||||
}
|
||||
#elif EIGEN_COMP_MSVC_STRICT
|
||||
// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
|
||||
float x = _mm_cvtss_f32(a);
|
||||
return x;
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
|
||||
double x = _mm_cvtsd_f64(a);
|
||||
return x;
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) {
|
||||
int64_t x = _mm_cvtsi128_si64(a);
|
||||
return x;
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
|
||||
int x = _mm_cvtsi128_si32(a);
|
||||
return x;
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
|
||||
uint32_t x = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
|
||||
return x;
|
||||
}
|
||||
#else
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
|
||||
return _mm_cvtss_f32(a);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
|
||||
return _mm_cvtsd_f64(a);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) {
|
||||
return _mm_cvtsi128_si64(a);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
|
||||
return _mm_cvtsi128_si32(a);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
|
||||
return numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
|
||||
}
|
||||
#endif
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE bool pfirst<Packet16b>(const Packet16b& a) {
|
||||
int x = _mm_cvtsi128_si32(a);
|
||||
return static_cast<bool>(x & 1);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
|
||||
return _mm_set_ps(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
|
||||
}
|
||||
template <>
|
||||
EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
|
||||
EIGEN_STRONG_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
|
||||
return _mm_set_pd(from[1 * stride], from[0 * stride]);
|
||||
}
|
||||
template <>
|
||||
EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride) {
|
||||
EIGEN_STRONG_INLINE Packet2l pgather<int64_t, Packet2l>(const int64_t* from, Index stride) {
|
||||
return _mm_set_epi64x(from[1 * stride], from[0 * stride]);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4i pgather<int, Packet4i>(const int* from, Index stride) {
|
||||
return _mm_set_epi32(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
|
||||
}
|
||||
template <>
|
||||
EIGEN_DEVICE_FUNC inline Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride) {
|
||||
EIGEN_STRONG_INLINE Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride) {
|
||||
return _mm_set_epi32(numext::bit_cast<int32_t>(from[3 * stride]), numext::bit_cast<int32_t>(from[2 * stride]),
|
||||
numext::bit_cast<int32_t>(from[1 * stride]), numext::bit_cast<int32_t>(from[0 * stride]));
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_DEVICE_FUNC inline Packet16b pgather<bool, Packet16b>(const bool* from, Index stride) {
|
||||
EIGEN_STRONG_INLINE Packet16b pgather<bool, Packet16b>(const bool* from, Index stride) {
|
||||
return _mm_set_epi8(from[15 * stride], from[14 * stride], from[13 * stride], from[12 * stride], from[11 * stride],
|
||||
from[10 * stride], from[9 * stride], from[8 * stride], from[7 * stride], from[6 * stride],
|
||||
from[5 * stride], from[4 * stride], from[3 * stride], from[2 * stride], from[1 * stride],
|
||||
@ -1419,33 +1709,38 @@ EIGEN_DEVICE_FUNC inline Packet16b pgather<bool, Packet16b>(const bool* from, In
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
|
||||
to[stride * 0] = _mm_cvtss_f32(from);
|
||||
to[stride * 1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1));
|
||||
to[stride * 2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2));
|
||||
to[stride * 3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3));
|
||||
EIGEN_STRONG_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
|
||||
to[stride * 0] = pfirst(from);
|
||||
to[stride * 1] = pfirst(_mm_shuffle_ps(from, from, 1));
|
||||
to[stride * 2] = pfirst(_mm_shuffle_ps(from, from, 2));
|
||||
to[stride * 3] = pfirst(_mm_shuffle_ps(from, from, 3));
|
||||
}
|
||||
template <>
|
||||
EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
|
||||
to[stride * 0] = _mm_cvtsd_f64(from);
|
||||
to[stride * 1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1));
|
||||
EIGEN_STRONG_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
|
||||
to[stride * 0] = pfirst(from);
|
||||
to[stride * 1] = pfirst(preverse(from));
|
||||
}
|
||||
template <>
|
||||
EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) {
|
||||
EIGEN_STRONG_INLINE void pscatter<int64_t, Packet2l>(int64_t* to, const Packet2l& from, Index stride) {
|
||||
to[stride * 0] = pfirst(from);
|
||||
to[stride * 1] = pfirst(preverse(from));
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) {
|
||||
to[stride * 0] = _mm_cvtsi128_si32(from);
|
||||
to[stride * 1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
|
||||
to[stride * 2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
|
||||
to[stride * 3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
|
||||
}
|
||||
template <>
|
||||
EIGEN_DEVICE_FUNC inline void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from, Index stride) {
|
||||
// Strided store of a Packet4ui: lane k of `from` is written to to[stride * k], k = 0..3.
// _mm_cvtsi128_si32 yields a signed int, so each lane is bit_cast back to uint32_t.
EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from, Index stride) {
|
||||
to[stride * 0] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(from));
|
||||
to[stride * 1] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1)));
|
||||
to[stride * 2] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2)));
|
||||
to[stride * 3] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3)));
|
||||
}
|
||||
template <>
|
||||
EIGEN_DEVICE_FUNC inline void pscatter<bool, Packet16b>(bool* to, const Packet16b& from, Index stride) {
|
||||
EIGEN_STRONG_INLINE void pscatter<bool, Packet16b>(bool* to, const Packet16b& from, Index stride) {
|
||||
to[4 * stride * 0] = _mm_cvtsi128_si32(from);
|
||||
to[4 * stride * 1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
|
||||
to[4 * stride * 2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
|
||||
@ -1485,106 +1780,15 @@ EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
|
||||
_mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
|
||||
}
|
||||
// Issues a prefetch hint (_MM_HINT_T0) for the memory at `addr`.
template <>
|
||||
EIGEN_STRONG_INLINE void prefetch<int64_t>(const int64_t* addr) {
|
||||
_mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
|
||||
}
|
||||
// Issues a prefetch hint (_MM_HINT_T0) for the memory at `addr`.
template <>
|
||||
EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) {
|
||||
_mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
|
||||
// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
|
||||
// Direct access of the struct members fixed bug #62.
|
||||
// Returns the first float of `a` by reading element 0 of the m128_f32 member directly.
template <>
|
||||
EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
|
||||
return a.m128_f32[0];
|
||||
}
|
||||
// Returns the first double of `a` by reading element 0 of the m128d_f64 member directly.
template <>
|
||||
EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
|
||||
return a.m128d_f64[0];
|
||||
}
|
||||
// Returns the low 32-bit lane of `a`, going through a named temporary.
template <>
|
||||
EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
|
||||
int x = _mm_cvtsi128_si32(a);
|
||||
return x;
|
||||
}
|
||||
// Returns the low 32-bit lane of `a` reinterpreted as uint32_t, going through a named temporary.
template <>
|
||||
EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
|
||||
uint32_t x = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
|
||||
return x;
|
||||
}
|
||||
#elif EIGEN_COMP_MSVC_STRICT
|
||||
// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
|
||||
// Returns the lowest float lane of `a`, going through a named temporary.
template <>
|
||||
EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
|
||||
float x = _mm_cvtss_f32(a);
|
||||
return x;
|
||||
}
|
||||
// Returns the lowest double lane of `a`, going through a named temporary.
template <>
|
||||
EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
|
||||
double x = _mm_cvtsd_f64(a);
|
||||
return x;
|
||||
}
|
||||
// Returns the low 32-bit lane of `a`, going through a named temporary.
template <>
|
||||
EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
|
||||
int x = _mm_cvtsi128_si32(a);
|
||||
return x;
|
||||
}
|
||||
// Returns the low 32-bit lane of `a` reinterpreted as uint32_t, going through a named temporary.
template <>
|
||||
EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
|
||||
uint32_t x = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
|
||||
return x;
|
||||
}
|
||||
#else
|
||||
// Returns the lowest float lane of `a`.
template <>
|
||||
EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
|
||||
return _mm_cvtss_f32(a);
|
||||
}
|
||||
// Returns the lowest double lane of `a`.
template <>
|
||||
EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
|
||||
return _mm_cvtsd_f64(a);
|
||||
}
|
||||
// Returns the low 32-bit lane of `a` as a signed int.
template <>
|
||||
EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
|
||||
return _mm_cvtsi128_si32(a);
|
||||
}
|
||||
// Returns the low 32-bit lane of `a`, bit_cast from the signed intrinsic result to uint32_t.
template <>
|
||||
EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
|
||||
return numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
|
||||
}
|
||||
#endif
|
||||
// Returns the first bool of `a`: reads the low 32 bits and keeps only bit 0,
// i.e. the least significant bit of the first byte.
template <>
|
||||
EIGEN_STRONG_INLINE bool pfirst<Packet16b>(const Packet16b& a) {
|
||||
int x = _mm_cvtsi128_si32(a);
|
||||
return static_cast<bool>(x & 1);
|
||||
}
|
||||
|
||||
// Reverses the order of the four float lanes. The immediate 0x1B (binary 00 01 10 11)
// selects source lanes 3, 2, 1, 0 for destination lanes 0, 1, 2, 3.
template <>
|
||||
EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
|
||||
return _mm_shuffle_ps(a, a, 0x1B);
|
||||
}
|
||||
// Swaps the two double lanes: with immediate 0x1 the low result lane takes the high
// lane of the first operand and the high result lane takes the low lane of the second.
template <>
|
||||
EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
|
||||
return _mm_shuffle_pd(a, a, 0x1);
|
||||
}
|
||||
// Reverses the order of the four 32-bit lanes (0x1B selects source lanes 3, 2, 1, 0).
template <>
|
||||
EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
|
||||
return _mm_shuffle_epi32(a, 0x1B);
|
||||
}
|
||||
// Reverses the order of the four 32-bit lanes (0x1B selects source lanes 3, 2, 1, 0).
template <>
|
||||
EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) {
|
||||
return _mm_shuffle_epi32(a, 0x1B);
|
||||
}
|
||||
// Reverses the order of the sixteen bytes of `a`.
template <>
|
||||
EIGEN_STRONG_INLINE Packet16b preverse(const Packet16b& a) {
|
||||
#ifdef EIGEN_VECTORIZE_SSSE3
|
||||
// _mm_set_epi8 lists bytes from most to least significant, so result byte i takes source byte 15 - i.
__m128i mask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
return _mm_shuffle_epi8(a, mask);
|
||||
#else
|
||||
// Without SSSE3: reverse the 32-bit lanes, swap the 16-bit halves inside each
// 32-bit lane, then swap the two bytes inside each 16-bit word.
Packet16b tmp = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3));
|
||||
tmp = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tmp, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
|
||||
return _mm_or_si128(_mm_slli_epi16(tmp, 8), _mm_srli_epi16(tmp, 8));
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
|
||||
return pfrexp_generic(a, exponent);
|
||||
@ -1610,6 +1814,7 @@ EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f&
|
||||
|
||||
// We specialize pldexp here, since the generic implementation uses Packet2l, which is not well
|
||||
// supported by SSE, and has more range than is needed for exponents.
|
||||
// TODO(rmlarsen): Remove this specialization once Packet2l has support for casting.
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
|
||||
// Clamp exponent to [-2099, 2099]
|
||||
@ -1690,6 +1895,11 @@ EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
|
||||
// #endif
|
||||
}
|
||||
|
||||
// Horizontal sum of the two 64-bit lanes of `a`.
// _mm_unpackhi_epi64(a, a) copies the high lane into both lanes, so after the
// lane-wise add the low lane holds a[0] + a[1]; pfirst then extracts the result.
template <>
|
||||
EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a) {
|
||||
return pfirst<Packet2l>(_mm_add_epi64(a, _mm_unpackhi_epi64(a, a)));
|
||||
}
|
||||
|
||||
#ifdef EIGEN_VECTORIZE_SSSE3
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) {
|
||||
@ -1701,7 +1911,6 @@ EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) {
|
||||
Packet4ui tmp0 = _mm_hadd_epi32(a, a);
|
||||
return pfirst<Packet4ui>(_mm_hadd_epi32(tmp0, tmp0));
|
||||
}
|
||||
|
||||
#else
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) {
|
||||
@ -1734,9 +1943,15 @@ EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
|
||||
return pfirst<Packet2d>(_mm_mul_sd(a, _mm_unpackhi_pd(a, a)));
|
||||
}
|
||||
// Product of the two 64-bit lanes of `a`. The packet is stored to a 16-byte-aligned
// scratch array and the multiplication is done in scalar code.
template <>
|
||||
EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(const Packet2l& a) {
|
||||
EIGEN_ALIGN16 int64_t aux[2];
|
||||
pstore(aux, a);
|
||||
return aux[0] * aux[1];
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) {
|
||||
// after some experiments, it is seems this is the fastest way to implement it
|
||||
// for GCC (eg., reusing pmul is very slow !)
|
||||
// for GCC (e.g., reusing pmul is very slow!)
|
||||
// TODO try to call _mm_mul_epu32 directly
|
||||
EIGEN_ALIGN16 int aux[4];
|
||||
pstore(aux, a);
|
||||
@ -1846,11 +2061,21 @@ EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a) {
|
||||
// return _mm_movemask_ps(x) == 0xF;
|
||||
// }
|
||||
|
||||
// Returns true if the most significant (sign) bit of at least one of the two lanes is set.
// NOTE(review): presumably `x` is a comparison mask (all-ones / all-zeros per lane) - confirm.
template <>
|
||||
EIGEN_STRONG_INLINE bool predux_any(const Packet2d& x) {
|
||||
return _mm_movemask_pd(x) != 0x0;
|
||||
}
|
||||
|
||||
// Returns true if the most significant (sign) bit of at least one of the four lanes is set.
// NOTE(review): presumably `x` is a comparison mask (all-ones / all-zeros per lane) - confirm.
template <>
|
||||
EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) {
|
||||
return _mm_movemask_ps(x) != 0x0;
|
||||
}
|
||||
|
||||
// Returns true if the most significant bit of at least one of the two 64-bit lanes is set.
// The integer packet is reinterpreted as two doubles only so _mm_movemask_pd can
// collect the top bit of each 64-bit lane; no value conversion takes place.
template <>
|
||||
EIGEN_STRONG_INLINE bool predux_any(const Packet2l& x) {
|
||||
return _mm_movemask_pd(_mm_castsi128_pd(x)) != 0x0;
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE bool predux_any(const Packet4i& x) {
|
||||
return _mm_movemask_ps(_mm_castsi128_ps(x)) != 0x0;
|
||||
@ -1860,17 +2085,23 @@ EIGEN_STRONG_INLINE bool predux_any(const Packet4ui& x) {
|
||||
return _mm_movemask_ps(_mm_castsi128_ps(x)) != 0x0;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
|
||||
// Transposes, in place, the 4x4 float matrix whose rows are kernel.packet[0..3].
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
|
||||
_MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
|
||||
// Transposes, in place, the 2x2 double matrix whose rows are kernel.packet[0..1].
// The high lanes are gathered into a temporary first because packet[0] is
// overwritten before packet[1] is assigned.
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
|
||||
__m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
|
||||
kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
|
||||
kernel.packet[1] = tmp;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
|
||||
// Transposes, in place, the 2x2 int64 matrix whose rows are kernel.packet[0..1].
// The high lanes are gathered into a temporary first because packet[0] is
// overwritten before packet[1] is assigned.
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2l, 2>& kernel) {
|
||||
__m128i tmp = _mm_unpackhi_epi64(kernel.packet[0], kernel.packet[1]);
|
||||
kernel.packet[0] = _mm_unpacklo_epi64(kernel.packet[0], kernel.packet[1]);
|
||||
kernel.packet[1] = tmp;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
|
||||
__m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
|
||||
__m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
|
||||
__m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
|
||||
@ -1881,11 +2112,11 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
|
||||
kernel.packet[2] = _mm_unpacklo_epi64(T2, T3);
|
||||
kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4ui, 4>& kernel) {
|
||||
// Transposes a 4x4 block of uint32 packets by casting the block reference to
// PacketBlock<Packet4i, 4>& and delegating to the Packet4i overload.
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4ui, 4>& kernel) {
|
||||
ptranspose((PacketBlock<Packet4i, 4>&)kernel);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16b, 4>& kernel) {
|
||||
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16b, 4>& kernel) {
|
||||
__m128i T0 = _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]);
|
||||
__m128i T1 = _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]);
|
||||
__m128i T2 = _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]);
|
||||
@ -1896,7 +2127,7 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16b, 4>& kernel) {
|
||||
kernel.packet[3] = _mm_unpackhi_epi16(T1, T3);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16b, 16>& kernel) {
|
||||
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16b, 16>& kernel) {
|
||||
// If we number the elements in the input thus:
|
||||
// kernel.packet[ 0] = {00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 0a, 0b, 0c, 0d, 0e, 0f}
|
||||
// kernel.packet[ 1] = {10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 1a, 1b, 1c, 1d, 1e, 1f}
|
||||
@ -1982,6 +2213,18 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16b, 16>& kernel) {
|
||||
kernel.packet[15] = _mm_unpackhi_epi64(u7, uf);
|
||||
}
|
||||
|
||||
// Lane-wise select for Packet2l: lane i of the result comes from thenPacket when
// ifPacket.select[i] is non-zero and from elsePacket when it is zero.
template <>
|
||||
EIGEN_STRONG_INLINE Packet2l pblend(const Selector<2>& ifPacket, const Packet2l& thenPacket,
|
||||
const Packet2l& elsePacket) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
// _mm_set_epi64x takes the high lane first, so select[0] lands in lane 0.
const __m128i select = _mm_set_epi64x(ifPacket.select[1], ifPacket.select[0]);
|
||||
// NOTE(review): presumably pcmp_eq<Packet2l> yields all-ones in lanes where select == 0 - confirm.
__m128i false_mask = pcmp_eq<Packet2l>(select, zero);
|
||||
#ifdef EIGEN_VECTORIZE_SSE4_1
|
||||
return _mm_blendv_epi8(thenPacket, elsePacket, false_mask);
|
||||
#else
|
||||
// Fallback without SSE4.1: (~false_mask & thenPacket) | (false_mask & elsePacket).
return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket));
|
||||
#endif
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
|
||||
const Packet4i& elsePacket) {
|
||||
@ -2189,11 +2432,6 @@ struct packet_traits<Eigen::half> : default_packet_traits {
|
||||
HasMax = 0,
|
||||
HasConj = 0,
|
||||
HasSetLinear = 0,
|
||||
HasSqrt = 0,
|
||||
HasRsqrt = 0,
|
||||
HasExp = 0,
|
||||
HasLog = 0,
|
||||
HasBlend = 0
|
||||
};
|
||||
};
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user