Unify SSE and AVX pexp for double.

This commit is contained in:
Gael Guennebaud 2018-11-26 23:12:44 +01:00
parent 4a347a0054
commit 502f92fa10
5 changed files with 126 additions and 163 deletions

View File

@ -123,82 +123,8 @@ ptanh<Packet8f>(const Packet8f& x) {
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
pexp<Packet4d>(const Packet4d& _x) {
Packet4d x = _x;
_EIGEN_DECLARE_CONST_Packet4d(1, 1.0);
_EIGEN_DECLARE_CONST_Packet4d(2, 2.0);
_EIGEN_DECLARE_CONST_Packet4d(half, 0.5);
_EIGEN_DECLARE_CONST_Packet4d(exp_hi, 709.437);
_EIGEN_DECLARE_CONST_Packet4d(exp_lo, -709.436139303);
_EIGEN_DECLARE_CONST_Packet4d(cephes_LOG2EF, 1.4426950408889634073599);
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p0, 1.26177193074810590878e-4);
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p1, 3.02994407707441961300e-2);
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p2, 9.99999999999999999910e-1);
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q0, 3.00198505138664455042e-6);
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q1, 2.52448340349684104192e-3);
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q2, 2.27265548208155028766e-1);
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q3, 2.00000000000000000009e0);
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C1, 0.693145751953125);
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C2, 1.42860682030941723212e-6);
_EIGEN_DECLARE_CONST_Packet4i(1023, 1023);
Packet4d tmp, fx;
// clamp x
x = pmax(pmin(x, p4d_exp_hi), p4d_exp_lo);
// Express exp(x) as exp(g + n*log(2)).
fx = pmadd(p4d_cephes_LOG2EF, x, p4d_half);
// Get the integer modulus of log(2), i.e. the "n" described above.
fx = _mm256_floor_pd(fx);
// Get the remainder modulo log(2), i.e. the "g" described above. Subtract
// n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
// digits right.
tmp = pmul(fx, p4d_cephes_exp_C1);
Packet4d z = pmul(fx, p4d_cephes_exp_C2);
x = psub(x, tmp);
x = psub(x, z);
Packet4d x2 = pmul(x, x);
// Evaluate the numerator polynomial of the rational interpolant.
Packet4d px = p4d_cephes_exp_p0;
px = pmadd(px, x2, p4d_cephes_exp_p1);
px = pmadd(px, x2, p4d_cephes_exp_p2);
px = pmul(px, x);
// Evaluate the denominator polynomial of the rational interpolant.
Packet4d qx = p4d_cephes_exp_q0;
qx = pmadd(qx, x2, p4d_cephes_exp_q1);
qx = pmadd(qx, x2, p4d_cephes_exp_q2);
qx = pmadd(qx, x2, p4d_cephes_exp_q3);
// I don't really get this bit, copied from the SSE2 routines, so...
// TODO(gonnet): Figure out what is going on here, perhaps find a better
// rational interpolant?
x = _mm256_div_pd(px, psub(qx, px));
x = pmadd(p4d_2, x, p4d_1);
// Build e=2^n by constructing the exponents in a 128-bit vector and
// shifting them to where they belong in double-precision values.
__m128i emm0 = _mm256_cvtpd_epi32(fx);
emm0 = _mm_add_epi32(emm0, p4i_1023);
emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0));
__m128i lo = _mm_slli_epi64(emm0, 52);
__m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52);
__m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0);
e = _mm256_insertf128_si256(e, hi, 1);
// Construct the result 2^n * exp(g) = e * x. The max is used to catch
// non-finite values in the input.
return pmax(pmul(x, _mm256_castsi256_pd(e)), _x);
pexp<Packet4d>(const Packet4d& x) {
return pexp_double(x);
}
// Functions for sqrt.

View File

@ -405,6 +405,20 @@ template<> EIGEN_STRONG_INLINE Packet8f pldexp<Packet8f>(const Packet8f& a, cons
return pldexp_float(a,exponent);
}
template<> EIGEN_STRONG_INLINE Packet4d pldexp<Packet4d>(const Packet4d& a, const Packet4d& exponent) {
// Build e=2^n by constructing the exponents in a 128-bit vector and
// shifting them to where they belong in double-precision values.
Packet4i cst_1023 = pset1<Packet4i>(1023);
__m128i emm0 = _mm256_cvtpd_epi32(exponent);
emm0 = _mm_add_epi32(emm0, cst_1023);
emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0));
__m128i lo = _mm_slli_epi64(emm0, 52);
__m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52);
__m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0);
e = _mm256_insertf128_si256(e, hi, 1);
return pmul(a,_mm256_castsi256_pd(e));
}
// preduxp should be ok
// FIXME: why is this ok? why isn't the simply implementation working as expected?
template<> EIGEN_STRONG_INLINE Packet8f preduxp<Packet8f>(const Packet8f* vecs)

View File

@ -160,5 +160,74 @@ Packet pexp_float(const Packet _x)
return pmax(pldexp(y,m), _x);
}
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
EIGEN_UNUSED
Packet pexp_double(const Packet _x)
{
Packet x = _x;
const Packet cst_1 = pset1<Packet>(1.0);
const Packet cst_2 = pset1<Packet>(2.0);
const Packet cst_half = pset1<Packet>(0.5);
const Packet cst_exp_hi = pset1<Packet>(709.437);
const Packet cst_exp_lo = pset1<Packet>(-709.436139303);
const Packet cst_cephes_LOG2EF = pset1<Packet>(1.4426950408889634073599);
const Packet cst_cephes_exp_p0 = pset1<Packet>(1.26177193074810590878e-4);
const Packet cst_cephes_exp_p1 = pset1<Packet>(3.02994407707441961300e-2);
const Packet cst_cephes_exp_p2 = pset1<Packet>(9.99999999999999999910e-1);
const Packet cst_cephes_exp_q0 = pset1<Packet>(3.00198505138664455042e-6);
const Packet cst_cephes_exp_q1 = pset1<Packet>(2.52448340349684104192e-3);
const Packet cst_cephes_exp_q2 = pset1<Packet>(2.27265548208155028766e-1);
const Packet cst_cephes_exp_q3 = pset1<Packet>(2.00000000000000000009e0);
const Packet cst_cephes_exp_C1 = pset1<Packet>(0.693145751953125);
const Packet cst_cephes_exp_C2 = pset1<Packet>(1.42860682030941723212e-6);
Packet tmp, fx;
// clamp x
x = pmax(pmin(x, cst_exp_hi), cst_exp_lo);
// Express exp(x) as exp(g + n*log(2)).
fx = pmadd(cst_cephes_LOG2EF, x, cst_half);
// Get the integer modulus of log(2), i.e. the "n" described above.
fx = pfloor(fx);
// Get the remainder modulo log(2), i.e. the "g" described above. Subtract
// n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
// digits right.
tmp = pmul(fx, cst_cephes_exp_C1);
Packet z = pmul(fx, cst_cephes_exp_C2);
x = psub(x, tmp);
x = psub(x, z);
Packet x2 = pmul(x, x);
// Evaluate the numerator polynomial of the rational interpolant.
Packet px = cst_cephes_exp_p0;
px = pmadd(px, x2, cst_cephes_exp_p1);
px = pmadd(px, x2, cst_cephes_exp_p2);
px = pmul(px, x);
// Evaluate the denominator polynomial of the rational interpolant.
Packet qx = cst_cephes_exp_q0;
qx = pmadd(qx, x2, cst_cephes_exp_q1);
qx = pmadd(qx, x2, cst_cephes_exp_q2);
qx = pmadd(qx, x2, cst_cephes_exp_q3);
// I don't really get this bit, copied from the SSE2 routines, so...
// TODO(gonnet): Figure out what is going on here, perhaps find a better
// rational interpolant?
x = pdiv(px, psub(qx, px));
x = pmadd(cst_2, x, cst_1);
// Construct the result 2^n * exp(g) = e * x. The max is used to catch
// non-finite values in the input.
//return pmax(pmul(x, _mm256_castsi256_pd(e)), _x);
return pmax(pldexp(x,fx), _x);
}
} // end namespace internal
} // end namespace Eigen

View File

@ -32,78 +32,11 @@ Packet4f pexp<Packet4f>(const Packet4f& _x)
{
return pexp_float(_x);
}
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet2d pexp<Packet2d>(const Packet2d& _x)
Packet2d pexp<Packet2d>(const Packet2d& x)
{
Packet2d x = _x;
_EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
_EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
_EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
_EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437);
_EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
_EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
_EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
_EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
_EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
_EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
_EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
_EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
_EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
_EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
_EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
static const __m128i p4i_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0);
Packet2d tmp, fx;
Packet4i emm0;
// clamp x
x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo);
/* express exp(x) as exp(g + n*log(2)) */
fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half);
#ifdef EIGEN_VECTORIZE_SSE4_1
fx = _mm_floor_pd(fx);
#else
emm0 = _mm_cvttpd_epi32(fx);
tmp = _mm_cvtepi32_pd(emm0);
/* if greater, substract 1 */
Packet2d mask = _mm_cmpgt_pd(tmp, fx);
mask = _mm_and_pd(mask, p2d_1);
fx = psub(tmp, mask);
#endif
tmp = pmul(fx, p2d_cephes_exp_C1);
Packet2d z = pmul(fx, p2d_cephes_exp_C2);
x = psub(x, tmp);
x = psub(x, z);
Packet2d x2 = pmul(x,x);
Packet2d px = p2d_cephes_exp_p0;
px = pmadd(px, x2, p2d_cephes_exp_p1);
px = pmadd(px, x2, p2d_cephes_exp_p2);
px = pmul (px, x);
Packet2d qx = p2d_cephes_exp_q0;
qx = pmadd(qx, x2, p2d_cephes_exp_q1);
qx = pmadd(qx, x2, p2d_cephes_exp_q2);
qx = pmadd(qx, x2, p2d_cephes_exp_q3);
x = pdiv(px,psub(qx,px));
x = pmadd(p2d_2,x,p2d_1);
// build 2^n
emm0 = _mm_cvttpd_epi32(fx);
emm0 = _mm_add_epi32(emm0, p4i_1023_0);
emm0 = _mm_slli_epi32(emm0, 20);
emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3));
return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x);
return pexp_double(x);
}
/* evaluation of 4 sines at once, using SSE2 intrinsics.

View File

@ -338,6 +338,21 @@ template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4
template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(b,a); }
template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); }
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(b,a); }
#ifdef EIGEN_VECTORIZE_SSE4_1
template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, 0); }
@ -356,27 +371,24 @@ template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
Packet4f tmp = _mm_cvtepi32_ps(emm0);
/* if greater, substract 1 */
Packet4f mask = _mm_cmpgt_ps(tmp, a);
mask = _mm_and_ps(mask, cst_1);
mask = pand(mask, cst_1);
return psub(tmp, mask);
}
// WARNING: this pfloor implementation makes sense for small inputs only,
// It is currently only used by pexp and not exposed through HasFloor.
template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
{
const Packet2d cst_1 = pset1<Packet2d>(1.0);
Packet4i emm0 = _mm_cvttpd_epi32(a);
Packet2d tmp = _mm_cvtepi32_pd(emm0);
/* if greater, substract 1 */
Packet2d mask = _mm_cmpgt_pd(tmp, a);
mask = pand(mask, cst_1);
return psub(tmp, mask);
}
#endif
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(b,a); }
template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); }
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(b,a); }
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
@ -557,6 +569,15 @@ template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, cons
return pldexp_float(a,exponent);
}
template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
const __m128i cst_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0);
Packet4i emm0 = _mm_cvttpd_epi32(exponent);
emm0 = padd(emm0, cst_1023_0);
emm0 = _mm_slli_epi32(emm0, 20);
emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3));
return pmul(a, Packet2d(_mm_castsi128_pd(emm0)));
}
// with AVX, the default implementations based on pload1 are faster
#ifndef __AVX__
template<> EIGEN_STRONG_INLINE void