mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-08-11 11:19:02 +08:00
Unify SSE and AVX pexp for double.
This commit is contained in:
parent
4a347a0054
commit
502f92fa10
@ -123,82 +123,8 @@ ptanh<Packet8f>(const Packet8f& x) {
|
|||||||
|
|
||||||
template <>
|
template <>
|
||||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
|
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
|
||||||
pexp<Packet4d>(const Packet4d& _x) {
|
pexp<Packet4d>(const Packet4d& x) {
|
||||||
Packet4d x = _x;
|
return pexp_double(x);
|
||||||
|
|
||||||
_EIGEN_DECLARE_CONST_Packet4d(1, 1.0);
|
|
||||||
_EIGEN_DECLARE_CONST_Packet4d(2, 2.0);
|
|
||||||
_EIGEN_DECLARE_CONST_Packet4d(half, 0.5);
|
|
||||||
|
|
||||||
_EIGEN_DECLARE_CONST_Packet4d(exp_hi, 709.437);
|
|
||||||
_EIGEN_DECLARE_CONST_Packet4d(exp_lo, -709.436139303);
|
|
||||||
|
|
||||||
_EIGEN_DECLARE_CONST_Packet4d(cephes_LOG2EF, 1.4426950408889634073599);
|
|
||||||
|
|
||||||
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p0, 1.26177193074810590878e-4);
|
|
||||||
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p1, 3.02994407707441961300e-2);
|
|
||||||
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p2, 9.99999999999999999910e-1);
|
|
||||||
|
|
||||||
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q0, 3.00198505138664455042e-6);
|
|
||||||
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q1, 2.52448340349684104192e-3);
|
|
||||||
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q2, 2.27265548208155028766e-1);
|
|
||||||
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q3, 2.00000000000000000009e0);
|
|
||||||
|
|
||||||
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C1, 0.693145751953125);
|
|
||||||
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C2, 1.42860682030941723212e-6);
|
|
||||||
_EIGEN_DECLARE_CONST_Packet4i(1023, 1023);
|
|
||||||
|
|
||||||
Packet4d tmp, fx;
|
|
||||||
|
|
||||||
// clamp x
|
|
||||||
x = pmax(pmin(x, p4d_exp_hi), p4d_exp_lo);
|
|
||||||
// Express exp(x) as exp(g + n*log(2)).
|
|
||||||
fx = pmadd(p4d_cephes_LOG2EF, x, p4d_half);
|
|
||||||
|
|
||||||
// Get the integer modulus of log(2), i.e. the "n" described above.
|
|
||||||
fx = _mm256_floor_pd(fx);
|
|
||||||
|
|
||||||
// Get the remainder modulo log(2), i.e. the "g" described above. Subtract
|
|
||||||
// n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
|
|
||||||
// digits right.
|
|
||||||
tmp = pmul(fx, p4d_cephes_exp_C1);
|
|
||||||
Packet4d z = pmul(fx, p4d_cephes_exp_C2);
|
|
||||||
x = psub(x, tmp);
|
|
||||||
x = psub(x, z);
|
|
||||||
|
|
||||||
Packet4d x2 = pmul(x, x);
|
|
||||||
|
|
||||||
// Evaluate the numerator polynomial of the rational interpolant.
|
|
||||||
Packet4d px = p4d_cephes_exp_p0;
|
|
||||||
px = pmadd(px, x2, p4d_cephes_exp_p1);
|
|
||||||
px = pmadd(px, x2, p4d_cephes_exp_p2);
|
|
||||||
px = pmul(px, x);
|
|
||||||
|
|
||||||
// Evaluate the denominator polynomial of the rational interpolant.
|
|
||||||
Packet4d qx = p4d_cephes_exp_q0;
|
|
||||||
qx = pmadd(qx, x2, p4d_cephes_exp_q1);
|
|
||||||
qx = pmadd(qx, x2, p4d_cephes_exp_q2);
|
|
||||||
qx = pmadd(qx, x2, p4d_cephes_exp_q3);
|
|
||||||
|
|
||||||
// I don't really get this bit, copied from the SSE2 routines, so...
|
|
||||||
// TODO(gonnet): Figure out what is going on here, perhaps find a better
|
|
||||||
// rational interpolant?
|
|
||||||
x = _mm256_div_pd(px, psub(qx, px));
|
|
||||||
x = pmadd(p4d_2, x, p4d_1);
|
|
||||||
|
|
||||||
// Build e=2^n by constructing the exponents in a 128-bit vector and
|
|
||||||
// shifting them to where they belong in double-precision values.
|
|
||||||
__m128i emm0 = _mm256_cvtpd_epi32(fx);
|
|
||||||
emm0 = _mm_add_epi32(emm0, p4i_1023);
|
|
||||||
emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0));
|
|
||||||
__m128i lo = _mm_slli_epi64(emm0, 52);
|
|
||||||
__m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52);
|
|
||||||
__m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0);
|
|
||||||
e = _mm256_insertf128_si256(e, hi, 1);
|
|
||||||
|
|
||||||
// Construct the result 2^n * exp(g) = e * x. The max is used to catch
|
|
||||||
// non-finite values in the input.
|
|
||||||
return pmax(pmul(x, _mm256_castsi256_pd(e)), _x);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Functions for sqrt.
|
// Functions for sqrt.
|
||||||
|
@ -405,6 +405,20 @@ template<> EIGEN_STRONG_INLINE Packet8f pldexp<Packet8f>(const Packet8f& a, cons
|
|||||||
return pldexp_float(a,exponent);
|
return pldexp_float(a,exponent);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// AVX specialization of pldexp for packets of 4 doubles.
// Returns a * 2^exponent element-wise. 2^exponent is not computed
// arithmetically: its IEEE-754 bit pattern is assembled with integer ops
// and then multiplied into `a`.
// `exponent` carries the powers of two as doubles; they are converted to
// 32-bit integers by _mm256_cvtpd_epi32.
// NOTE(review): no range check on the biased exponent is visible here;
// presumably callers keep `exponent` inside the normal double range - confirm.
template<> EIGEN_STRONG_INLINE Packet4d pldexp<Packet4d>(const Packet4d& a, const Packet4d& exponent) {
|
||||||
|
// Build e=2^n by constructing the exponents in a 128-bit vector and
|
||||||
|
// shifting them to where they belong in double-precision values.
|
||||||
|
// 1023 is the exponent bias of IEEE-754 double precision.
Packet4i cst_1023 = pset1<Packet4i>(1023);
|
||||||
|
// emm0 = [n0, n1, n2, n3] as 32-bit integers.
__m128i emm0 = _mm256_cvtpd_epi32(exponent);
|
||||||
|
// Add the bias: emm0 = [n0+1023, n1+1023, n2+1023, n3+1023].
emm0 = _mm_add_epi32(emm0, cst_1023);
|
||||||
|
// Reorder the 32-bit lanes to [0, 2, 1, 3] so that each 64-bit lane holds
// one "low half" exponent (0 or 1) in its lower 32 bits and one "high half"
// exponent (2 or 3) in its upper 32 bits.
emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0));
|
||||||
|
// Shifting each 64-bit lane left by 52 drops the upper 32 bits and moves
// exponents 0 and 1 into the exponent field of a double.
__m128i lo = _mm_slli_epi64(emm0, 52);
|
||||||
|
// Shifting right by 32 first selects exponents 2 and 3 instead.
__m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52);
|
||||||
|
// Assemble the 256-bit vector: lower 128 bits = lo, upper 128 bits = hi.
__m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0);
|
||||||
|
e = _mm256_insertf128_si256(e, hi, 1);
|
||||||
|
// Reinterpret the integer bit patterns as doubles and scale `a` by them.
return pmul(a,_mm256_castsi256_pd(e));
|
||||||
|
}
|
||||||
|
|
||||||
// preduxp should be ok
|
// preduxp should be ok
|
||||||
// FIXME: why is this ok? why isn't the simple implementation working as expected?
|
// FIXME: why is this ok? why isn't the simple implementation working as expected?
|
||||||
template<> EIGEN_STRONG_INLINE Packet8f preduxp<Packet8f>(const Packet8f* vecs)
|
template<> EIGEN_STRONG_INLINE Packet8f preduxp<Packet8f>(const Packet8f* vecs)
|
||||||
|
@ -160,5 +160,74 @@ Packet pexp_float(const Packet _x)
|
|||||||
return pmax(pldexp(y,m), _x);
|
return pmax(pldexp(y,m), _x);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Generic, packet-type-independent exponential for packets of doubles.
//
// Template parameter:
//   Packet - a packet type for which pset1, pmin, pmax, pmadd, pfloor, pmul,
//            psub, pdiv and pldexp are available (those are the only
//            primitives used below).
// Parameter:
//   _x     - input packet.
// Returns a packet holding the approximation of exp(_x), computed by range
// reduction exp(x) = 2^n * exp(g) followed by a rational approximation of
// exp(g).
// NOTE(review): the cst_cephes_* names suggest the coefficients come from the
// Cephes math library - confirm against the Cephes exp() source.
template <typename Packet>
|
||||||
|
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||||
|
EIGEN_UNUSED
|
||||||
|
Packet pexp_double(const Packet _x)
|
||||||
|
{
|
||||||
|
// Working copy; _x itself is kept untouched for the final pmax.
Packet x = _x;
|
||||||
|
|
||||||
|
const Packet cst_1 = pset1<Packet>(1.0);
|
||||||
|
const Packet cst_2 = pset1<Packet>(2.0);
|
||||||
|
const Packet cst_half = pset1<Packet>(0.5);
|
||||||
|
|
||||||
|
// Bounds used to clamp the input before range reduction.
const Packet cst_exp_hi = pset1<Packet>(709.437);
|
||||||
|
const Packet cst_exp_lo = pset1<Packet>(-709.436139303);
|
||||||
|
|
||||||
|
// log2(e), used to turn x into a multiple of log(2).
const Packet cst_cephes_LOG2EF = pset1<Packet>(1.4426950408889634073599);
|
||||||
|
// Coefficients of the numerator polynomial P (highest degree first).
const Packet cst_cephes_exp_p0 = pset1<Packet>(1.26177193074810590878e-4);
|
||||||
|
const Packet cst_cephes_exp_p1 = pset1<Packet>(3.02994407707441961300e-2);
|
||||||
|
const Packet cst_cephes_exp_p2 = pset1<Packet>(9.99999999999999999910e-1);
|
||||||
|
// Coefficients of the denominator polynomial Q (highest degree first).
const Packet cst_cephes_exp_q0 = pset1<Packet>(3.00198505138664455042e-6);
|
||||||
|
const Packet cst_cephes_exp_q1 = pset1<Packet>(2.52448340349684104192e-3);
|
||||||
|
const Packet cst_cephes_exp_q2 = pset1<Packet>(2.27265548208155028766e-1);
|
||||||
|
const Packet cst_cephes_exp_q3 = pset1<Packet>(2.00000000000000000009e0);
|
||||||
|
// log(2) split into a high part C1 and a low part C2 (C1 + C2 = log(2)).
const Packet cst_cephes_exp_C1 = pset1<Packet>(0.693145751953125);
|
||||||
|
const Packet cst_cephes_exp_C2 = pset1<Packet>(1.42860682030941723212e-6);
|
||||||
|
|
||||||
|
Packet tmp, fx;
|
||||||
|
|
||||||
|
// Clamp x to [cst_exp_lo, cst_exp_hi].
|
||||||
|
x = pmax(pmin(x, cst_exp_hi), cst_exp_lo);
|
||||||
|
// Express exp(x) as exp(g + n*log(2)).
|
||||||
|
// fx = x*log2(e) + 0.5 (assuming pmadd(a,b,c) computes a*b+c).
fx = pmadd(cst_cephes_LOG2EF, x, cst_half);
|
||||||
|
|
||||||
|
// Get the integer modulus of log(2), i.e. the "n" described above.
|
||||||
|
// floor(x*log2(e) + 0.5) rounds x*log2(e) to the nearest integer.
fx = pfloor(fx);
|
||||||
|
|
||||||
|
// Get the remainder modulo log(2), i.e. the "g" described above. Subtract
|
||||||
|
// n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
|
||||||
|
// digits right.
|
||||||
|
tmp = pmul(fx, cst_cephes_exp_C1);
|
||||||
|
Packet z = pmul(fx, cst_cephes_exp_C2);
|
||||||
|
x = psub(x, tmp);
|
||||||
|
x = psub(x, z);
|
||||||
|
|
||||||
|
// From here on x holds the reduced argument g; x2 = g^2.
Packet x2 = pmul(x, x);
|
||||||
|
|
||||||
|
// Evaluate the numerator polynomial of the rational interpolant.
|
||||||
|
// Horner scheme in x2, then multiplied by x: px = g * P(g^2).
Packet px = cst_cephes_exp_p0;
|
||||||
|
px = pmadd(px, x2, cst_cephes_exp_p1);
|
||||||
|
px = pmadd(px, x2, cst_cephes_exp_p2);
|
||||||
|
px = pmul(px, x);
|
||||||
|
|
||||||
|
// Evaluate the denominator polynomial of the rational interpolant.
|
||||||
|
// Horner scheme in x2: qx = Q(g^2).
Packet qx = cst_cephes_exp_q0;
|
||||||
|
qx = pmadd(qx, x2, cst_cephes_exp_q1);
|
||||||
|
qx = pmadd(qx, x2, cst_cephes_exp_q2);
|
||||||
|
qx = pmadd(qx, x2, cst_cephes_exp_q3);
|
||||||
|
|
||||||
|
// Combine numerator and denominator: with px = g*P(g^2) and qx = Q(g^2),
|
||||||
|
// exp(g) is approximated by 1 + 2*px/(qx - px).
// NOTE(review): this looks like the rational form used by Cephes' exp() - confirm.
|
||||||
|
// TODO(gonnet): perhaps find a better rational interpolant?
|
||||||
|
x = pdiv(px, psub(qx, px));
|
||||||
|
x = pmadd(cst_2, x, cst_1);
|
||||||
|
|
||||||
|
// Construct the result 2^n * exp(g): pldexp scales x by 2^fx. The max is used
|
||||||
|
// to catch non-finite values in the input.
|
||||||
|
// (pldexp stands in for building the 2^n bit pattern inline with intrinsics.)
|
||||||
|
return pmax(pldexp(x,fx), _x);
|
||||||
|
}
|
||||||
|
|
||||||
} // end namespace internal
|
} // end namespace internal
|
||||||
} // end namespace Eigen
|
} // end namespace Eigen
|
||||||
|
@ -32,78 +32,11 @@ Packet4f pexp<Packet4f>(const Packet4f& _x)
|
|||||||
{
|
{
|
||||||
return pexp_float(_x);
|
return pexp_float(_x);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
|
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
|
||||||
Packet2d pexp<Packet2d>(const Packet2d& _x)
|
Packet2d pexp<Packet2d>(const Packet2d& x)
|
||||||
{
|
{
|
||||||
Packet2d x = _x;
|
return pexp_double(x);
|
||||||
|
|
||||||
_EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
|
|
||||||
_EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
|
|
||||||
_EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
|
|
||||||
|
|
||||||
_EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437);
|
|
||||||
_EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
|
|
||||||
|
|
||||||
_EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
|
|
||||||
|
|
||||||
_EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
|
|
||||||
_EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
|
|
||||||
_EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
|
|
||||||
|
|
||||||
_EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
|
|
||||||
_EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
|
|
||||||
_EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
|
|
||||||
_EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
|
|
||||||
|
|
||||||
_EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
|
|
||||||
_EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
|
|
||||||
static const __m128i p4i_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0);
|
|
||||||
|
|
||||||
Packet2d tmp, fx;
|
|
||||||
Packet4i emm0;
|
|
||||||
|
|
||||||
// clamp x
|
|
||||||
x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo);
|
|
||||||
/* express exp(x) as exp(g + n*log(2)) */
|
|
||||||
fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half);
|
|
||||||
|
|
||||||
#ifdef EIGEN_VECTORIZE_SSE4_1
|
|
||||||
fx = _mm_floor_pd(fx);
|
|
||||||
#else
|
|
||||||
emm0 = _mm_cvttpd_epi32(fx);
|
|
||||||
tmp = _mm_cvtepi32_pd(emm0);
|
|
||||||
/* if greater, substract 1 */
|
|
||||||
Packet2d mask = _mm_cmpgt_pd(tmp, fx);
|
|
||||||
mask = _mm_and_pd(mask, p2d_1);
|
|
||||||
fx = psub(tmp, mask);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
tmp = pmul(fx, p2d_cephes_exp_C1);
|
|
||||||
Packet2d z = pmul(fx, p2d_cephes_exp_C2);
|
|
||||||
x = psub(x, tmp);
|
|
||||||
x = psub(x, z);
|
|
||||||
|
|
||||||
Packet2d x2 = pmul(x,x);
|
|
||||||
|
|
||||||
Packet2d px = p2d_cephes_exp_p0;
|
|
||||||
px = pmadd(px, x2, p2d_cephes_exp_p1);
|
|
||||||
px = pmadd(px, x2, p2d_cephes_exp_p2);
|
|
||||||
px = pmul (px, x);
|
|
||||||
|
|
||||||
Packet2d qx = p2d_cephes_exp_q0;
|
|
||||||
qx = pmadd(qx, x2, p2d_cephes_exp_q1);
|
|
||||||
qx = pmadd(qx, x2, p2d_cephes_exp_q2);
|
|
||||||
qx = pmadd(qx, x2, p2d_cephes_exp_q3);
|
|
||||||
|
|
||||||
x = pdiv(px,psub(qx,px));
|
|
||||||
x = pmadd(p2d_2,x,p2d_1);
|
|
||||||
|
|
||||||
// build 2^n
|
|
||||||
emm0 = _mm_cvttpd_epi32(fx);
|
|
||||||
emm0 = _mm_add_epi32(emm0, p4i_1023_0);
|
|
||||||
emm0 = _mm_slli_epi32(emm0, 20);
|
|
||||||
emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3));
|
|
||||||
return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* evaluation of 4 sines at once, using SSE2 intrinsics.
|
/* evaluation of 4 sines at once, using SSE2 intrinsics.
|
||||||
|
@ -338,6 +338,21 @@ template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4
|
|||||||
template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); }
|
template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); }
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); }
|
template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); }
|
||||||
|
|
||||||
|
// Bitwise logical operations on SSE packets (Packet4f, Packet2d, Packet4i).
// Each specialization forwards directly to the matching _mm_* intrinsic.
// NOTE(review): presumably placed ahead of pfloor because the pfloor
// fallback in this file calls pand - confirm.

// pand: bitwise a AND b.
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
|
||||||
|
|
||||||
|
// por: bitwise a OR b.
template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }
|
||||||
|
|
||||||
|
// pxor: bitwise a XOR b.
template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }
|
||||||
|
|
||||||
|
// pandnot: bitwise a AND (NOT b). The _mm_andnot_* intrinsics negate their
// FIRST operand, hence the swapped (b,a) argument order below.
template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(b,a); }
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); }
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(b,a); }
|
||||||
|
|
||||||
#ifdef EIGEN_VECTORIZE_SSE4_1
|
#ifdef EIGEN_VECTORIZE_SSE4_1
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, 0); }
|
template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, 0); }
|
||||||
@ -356,27 +371,24 @@ template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
|
|||||||
Packet4f tmp = _mm_cvtepi32_ps(emm0);
|
Packet4f tmp = _mm_cvtepi32_ps(emm0);
|
||||||
/* if greater, substract 1 */
|
/* if greater, substract 1 */
|
||||||
Packet4f mask = _mm_cmpgt_ps(tmp, a);
|
Packet4f mask = _mm_cmpgt_ps(tmp, a);
|
||||||
mask = _mm_and_ps(mask, cst_1);
|
mask = pand(mask, cst_1);
|
||||||
|
return psub(tmp, mask);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Floor of a packet of 2 doubles, implemented with a truncating conversion
// to 32-bit integers followed by a correction step.
// WARNING: this pfloor implementation makes sense for small inputs only,
|
||||||
|
// It is currently only used by pexp and not exposed through HasFloor.
// (The round trip goes through 32-bit integers via _mm_cvttpd_epi32, so the
// input has to fit in that range.)
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
|
||||||
|
{
|
||||||
|
const Packet2d cst_1 = pset1<Packet2d>(1.0);
|
||||||
|
// Truncate toward zero to int32 ...
Packet4i emm0 = _mm_cvttpd_epi32(a);
|
||||||
|
// ... and convert back to double: tmp = trunc(a).
Packet2d tmp = _mm_cvtepi32_pd(emm0);
|
||||||
|
/* if greater, subtract 1 */
// Truncation rounds toward zero, so tmp > a exactly for negative
// non-integer inputs; those lanes need tmp - 1 to obtain the floor.
|
||||||
|
// The comparison yields an all-ones bit mask in lanes where tmp > a.
Packet2d mask = _mm_cmpgt_pd(tmp, a);
|
||||||
|
// AND-ing with the bits of 1.0 turns the mask into 1.0 / 0.0 per lane.
mask = pand(mask, cst_1);
|
||||||
return psub(tmp, mask);
|
return psub(tmp, mask);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
|
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }
|
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }
|
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(b,a); }
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); }
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(b,a); }
|
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
|
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
|
||||||
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
|
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
|
||||||
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
|
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
|
||||||
@ -557,6 +569,15 @@ template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, cons
|
|||||||
return pldexp_float(a,exponent);
|
return pldexp_float(a,exponent);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// SSE specialization of pldexp for packets of 2 doubles.
// Returns a * 2^exponent element-wise; 2^exponent is assembled directly as
// an IEEE-754 bit pattern using 32-bit integer operations.
// `exponent` carries the powers of two as doubles; they are converted to
// 32-bit integers with the truncating _mm_cvttpd_epi32.
// NOTE(review): no range check on the biased exponent is visible here;
// presumably callers keep `exponent` inside the normal double range - confirm.
template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
|
||||||
|
// Exponent bias (1023) for the two lanes that will carry exponents; the two
// remaining 32-bit lanes stay zero.
const __m128i cst_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0);
|
||||||
|
// emm0 = [n0, n1, 0, 0] as 32-bit integers.
Packet4i emm0 = _mm_cvttpd_epi32(exponent);
|
||||||
|
// emm0 = [n0+1023, n1+1023, 0, 0].
emm0 = padd(emm0, cst_1023_0);
|
||||||
|
// Within the upper 32-bit word of a double the exponent field starts at
// bit 20 (bit 52 of the full 64-bit value).
emm0 = _mm_slli_epi32(emm0, 20);
|
||||||
|
// Reorder the 32-bit lanes to [3, 0, 2, 1], i.e. [0, e0, 0, e1]: each biased
// exponent lands in the upper word of its 64-bit lane, over a zero lower word.
emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3));
|
||||||
|
// Reinterpret the bit patterns as doubles (2^n0, 2^n1) and scale `a`.
return pmul(a, Packet2d(_mm_castsi128_pd(emm0)));
|
||||||
|
}
|
||||||
|
|
||||||
// with AVX, the default implementations based on pload1 are faster
|
// with AVX, the default implementations based on pload1 are faster
|
||||||
#ifndef __AVX__
|
#ifndef __AVX__
|
||||||
template<> EIGEN_STRONG_INLINE void
|
template<> EIGEN_STRONG_INLINE void
|
||||||
|
Loading…
x
Reference in New Issue
Block a user