Unify SSE and AVX pexp for double.

2025-08-11 11:19:02 +08:00 · 2018-11-26 23:12:44 +01:00 · 2018-11-26 23:12:44 +01:00 · 502f92fa10
commit 502f92fa10
parent 4a347a0054
5 changed files with 126 additions and 163 deletions
--- a/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX/MathFunctions.h
@ -123,82 +123,8 @@ ptanh<Packet8f>(const Packet8f& x) {
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
-pexp<Packet4d>(const Packet4d& _x) {
+pexp<Packet4d>(const Packet4d& x) {
-  Packet4d x = _x;
+  return pexp_double(x);
  _EIGEN_DECLARE_CONST_Packet4d(1, 1.0);
  _EIGEN_DECLARE_CONST_Packet4d(2, 2.0);
  _EIGEN_DECLARE_CONST_Packet4d(half, 0.5);
  _EIGEN_DECLARE_CONST_Packet4d(exp_hi, 709.437);
  _EIGEN_DECLARE_CONST_Packet4d(exp_lo, -709.436139303);
  _EIGEN_DECLARE_CONST_Packet4d(cephes_LOG2EF, 1.4426950408889634073599);
  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p0, 1.26177193074810590878e-4);
  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p1, 3.02994407707441961300e-2);
  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p2, 9.99999999999999999910e-1);
  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q0, 3.00198505138664455042e-6);
  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q1, 2.52448340349684104192e-3);
  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q2, 2.27265548208155028766e-1);
  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q3, 2.00000000000000000009e0);
  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C1, 0.693145751953125);
  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C2, 1.42860682030941723212e-6);
  _EIGEN_DECLARE_CONST_Packet4i(1023, 1023);
  Packet4d tmp, fx;
  // clamp x
  x = pmax(pmin(x, p4d_exp_hi), p4d_exp_lo);
  // Express exp(x) as exp(g + n*log(2)).
  fx = pmadd(p4d_cephes_LOG2EF, x, p4d_half);
  // Get the integer modulus of log(2), i.e. the "n" described above.
  fx = _mm256_floor_pd(fx);
  // Get the remainder modulo log(2), i.e. the "g" described above. Subtract
  // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
  // digits right.
  tmp = pmul(fx, p4d_cephes_exp_C1);
  Packet4d z = pmul(fx, p4d_cephes_exp_C2);
  x = psub(x, tmp);
  x = psub(x, z);
  Packet4d x2 = pmul(x, x);
  // Evaluate the numerator polynomial of the rational interpolant.
  Packet4d px = p4d_cephes_exp_p0;
  px = pmadd(px, x2, p4d_cephes_exp_p1);
  px = pmadd(px, x2, p4d_cephes_exp_p2);
  px = pmul(px, x);
  // Evaluate the denominator polynomial of the rational interpolant.
  Packet4d qx = p4d_cephes_exp_q0;
  qx = pmadd(qx, x2, p4d_cephes_exp_q1);
  qx = pmadd(qx, x2, p4d_cephes_exp_q2);
  qx = pmadd(qx, x2, p4d_cephes_exp_q3);
  // I don't really get this bit, copied from the SSE2 routines, so...
  // TODO(gonnet): Figure out what is going on here, perhaps find a better
  // rational interpolant?
  x = _mm256_div_pd(px, psub(qx, px));
  x = pmadd(p4d_2, x, p4d_1);
  // Build e=2^n by constructing the exponents in a 128-bit vector and
  // shifting them to where they belong in double-precision values.
  __m128i emm0 = _mm256_cvtpd_epi32(fx);
  emm0 = _mm_add_epi32(emm0, p4i_1023);
  emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0));
  __m128i lo = _mm_slli_epi64(emm0, 52);
  __m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52);
  __m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0);
  e = _mm256_insertf128_si256(e, hi, 1);
  // Construct the result 2^n * exp(g) = e * x. The max is used to catch
  // non-finite values in the input.
  return pmax(pmul(x, _mm256_castsi256_pd(e)), _x);
 }
 // Functions for sqrt.
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@ -405,6 +405,20 @@ template<> EIGEN_STRONG_INLINE Packet8f pldexp<Packet8f>(const Packet8f& a, cons
  return pldexp_float(a,exponent);
 }
 template<> EIGEN_STRONG_INLINE Packet4d pldexp<Packet4d>(const Packet4d& a, const Packet4d& exponent) {
  // Build e=2^n by constructing the exponents in a 128-bit vector and
  // shifting them to where they belong in double-precision values.
  Packet4i cst_1023 = pset1<Packet4i>(1023);
  __m128i emm0 = _mm256_cvtpd_epi32(exponent);
  emm0 = _mm_add_epi32(emm0, cst_1023);
  emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0));
  __m128i lo = _mm_slli_epi64(emm0, 52);
  __m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52);
  __m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0);
  e = _mm256_insertf128_si256(e, hi, 1);
  return pmul(a,_mm256_castsi256_pd(e));
 }
 // preduxp should be ok
 // FIXME: why is this ok? why isn't the simply implementation working as expected?
 template<> EIGEN_STRONG_INLINE Packet8f preduxp<Packet8f>(const Packet8f* vecs)
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@ -160,5 +160,74 @@ Packet pexp_float(const Packet _x)
  return pmax(pldexp(y,m), _x);
 }
 template <typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 EIGEN_UNUSED
 Packet pexp_double(const Packet _x)
 {
  Packet x = _x;
  const Packet cst_1 = pset1<Packet>(1.0);
  const Packet cst_2 = pset1<Packet>(2.0);
  const Packet cst_half = pset1<Packet>(0.5);
  const Packet cst_exp_hi = pset1<Packet>(709.437);
  const Packet cst_exp_lo = pset1<Packet>(-709.436139303);
  const Packet cst_cephes_LOG2EF = pset1<Packet>(1.4426950408889634073599);
  const Packet cst_cephes_exp_p0 = pset1<Packet>(1.26177193074810590878e-4);
  const Packet cst_cephes_exp_p1 = pset1<Packet>(3.02994407707441961300e-2);
  const Packet cst_cephes_exp_p2 = pset1<Packet>(9.99999999999999999910e-1);
  const Packet cst_cephes_exp_q0 = pset1<Packet>(3.00198505138664455042e-6);
  const Packet cst_cephes_exp_q1 = pset1<Packet>(2.52448340349684104192e-3);
  const Packet cst_cephes_exp_q2 = pset1<Packet>(2.27265548208155028766e-1);
  const Packet cst_cephes_exp_q3 = pset1<Packet>(2.00000000000000000009e0);
  const Packet cst_cephes_exp_C1 = pset1<Packet>(0.693145751953125);
  const Packet cst_cephes_exp_C2 = pset1<Packet>(1.42860682030941723212e-6);
  Packet tmp, fx;
  // clamp x
  x = pmax(pmin(x, cst_exp_hi), cst_exp_lo);
  // Express exp(x) as exp(g + n*log(2)).
  fx = pmadd(cst_cephes_LOG2EF, x, cst_half);
  // Get the integer modulus of log(2), i.e. the "n" described above.
  fx = pfloor(fx);
  // Get the remainder modulo log(2), i.e. the "g" described above. Subtract
  // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
  // digits right.
  tmp = pmul(fx, cst_cephes_exp_C1);
  Packet z = pmul(fx, cst_cephes_exp_C2);
  x = psub(x, tmp);
  x = psub(x, z);
  Packet x2 = pmul(x, x);
  // Evaluate the numerator polynomial of the rational interpolant.
  Packet px = cst_cephes_exp_p0;
  px = pmadd(px, x2, cst_cephes_exp_p1);
  px = pmadd(px, x2, cst_cephes_exp_p2);
  px = pmul(px, x);
  // Evaluate the denominator polynomial of the rational interpolant.
  Packet qx = cst_cephes_exp_q0;
  qx = pmadd(qx, x2, cst_cephes_exp_q1);
  qx = pmadd(qx, x2, cst_cephes_exp_q2);
  qx = pmadd(qx, x2, cst_cephes_exp_q3);
  // I don't really get this bit, copied from the SSE2 routines, so...
  // TODO(gonnet): Figure out what is going on here, perhaps find a better
  // rational interpolant?
  x = pdiv(px, psub(qx, px));
  x = pmadd(cst_2, x, cst_1);
  // Construct the result 2^n * exp(g) = e * x. The max is used to catch
  // non-finite values in the input.
  //return pmax(pmul(x, _mm256_castsi256_pd(e)), _x);
  return pmax(pldexp(x,fx), _x);
 }
 } // end namespace internal
 } // end namespace Eigen
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@ -32,78 +32,11 @@ Packet4f pexp<Packet4f>(const Packet4f& _x)
 {
  return pexp_float(_x);
 }
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet2d pexp<Packet2d>(const Packet2d& _x)
+Packet2d pexp<Packet2d>(const Packet2d& x)
 {
-  Packet2d x = _x;
+  return pexp_double(x);
  _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
  _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
  _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
  _EIGEN_DECLARE_CONST_Packet2d(exp_hi,  709.437);
  _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
  _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
  static const __m128i p4i_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0);
  Packet2d tmp, fx;
  Packet4i emm0;
  // clamp x
  x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo);
  /* express exp(x) as exp(g + n*log(2)) */
  fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half);
 #ifdef EIGEN_VECTORIZE_SSE4_1
  fx = _mm_floor_pd(fx);
 #else
  emm0 = _mm_cvttpd_epi32(fx);
  tmp  = _mm_cvtepi32_pd(emm0);
  /* if greater, substract 1 */
  Packet2d mask = _mm_cmpgt_pd(tmp, fx);
  mask = _mm_and_pd(mask, p2d_1);
  fx = psub(tmp, mask);
 #endif
  tmp = pmul(fx, p2d_cephes_exp_C1);
  Packet2d z = pmul(fx, p2d_cephes_exp_C2);
  x = psub(x, tmp);
  x = psub(x, z);
  Packet2d x2 = pmul(x,x);
  Packet2d px = p2d_cephes_exp_p0;
  px = pmadd(px, x2, p2d_cephes_exp_p1);
  px = pmadd(px, x2, p2d_cephes_exp_p2);
  px = pmul (px, x);
  Packet2d qx = p2d_cephes_exp_q0;
  qx = pmadd(qx, x2, p2d_cephes_exp_q1);
  qx = pmadd(qx, x2, p2d_cephes_exp_q2);
  qx = pmadd(qx, x2, p2d_cephes_exp_q3);
  x = pdiv(px,psub(qx,px));
  x = pmadd(p2d_2,x,p2d_1);
  // build 2^n
  emm0 = _mm_cvttpd_epi32(fx);
  emm0 = _mm_add_epi32(emm0, p4i_1023_0);
  emm0 = _mm_slli_epi32(emm0, 20);
  emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3));
  return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x);
 }
 /* evaluation of 4 sines at once, using SSE2 intrinsics.
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@ -338,6 +338,21 @@ template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4
 template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(b,a); }
 template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); }
 template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(b,a); }
 #ifdef EIGEN_VECTORIZE_SSE4_1
 template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, 0); }
@ -356,27 +371,24 @@ template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
  Packet4f tmp  = _mm_cvtepi32_ps(emm0);
  /* if greater, substract 1 */
  Packet4f mask = _mm_cmpgt_ps(tmp, a);
-  mask = _mm_and_ps(mask, cst_1);
+  mask = pand(mask, cst_1);
  return psub(tmp, mask);
 }
 // WARNING: this pfloor implementation makes sense for small inputs only,
 // It is currently only used by pexp and not exposed through HasFloor.
 template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
 {
  const Packet2d cst_1 = pset1<Packet2d>(1.0);
  Packet4i emm0 = _mm_cvttpd_epi32(a);
  Packet2d tmp  = _mm_cvtepi32_pd(emm0);
  /* if greater, substract 1 */
  Packet2d mask = _mm_cmpgt_pd(tmp, a);
  mask = pand(mask, cst_1);
  return psub(tmp, mask);
 }
 #endif
 template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(b,a); }
 template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); }
 template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(b,a); }
 template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*   from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
 template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
 template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
@ -557,6 +569,15 @@ template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, cons
  return pldexp_float(a,exponent);
 }
 template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
  const __m128i cst_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0);
  Packet4i emm0 = _mm_cvttpd_epi32(exponent);
  emm0 = padd(emm0, cst_1023_0);
  emm0 = _mm_slli_epi32(emm0, 20);
  emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3));
  return pmul(a, Packet2d(_mm_castsi128_pd(emm0)));
 }
 // with AVX, the default implementations based on pload1 are faster
 #ifndef __AVX__
 template<> EIGEN_STRONG_INLINE void