Vectorize erfc(x) for double and improve erfc(x) for float.

2025-10-09 22:56:31 +08:00 · 2024-11-08 17:21:11 +00:00 · 2024-11-08 17:21:11 +00:00 · 0d366f6532
commit 0d366f6532
parent 8adf43640e
9 changed files with 141 additions and 18 deletions
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@ -124,6 +124,7 @@ struct packet_traits<float> : default_packet_traits {
    HasRsqrt = 1,
    HasTanh = EIGEN_FAST_MATH,
    HasErf = EIGEN_FAST_MATH,
+    HasErfc = EIGEN_FAST_MATH,
    HasBlend = 1
  };
 };
@ -144,6 +145,7 @@ struct packet_traits<double> : default_packet_traits {
 #endif
    HasTanh = EIGEN_FAST_MATH,
    HasLog = 1,
+    HasErfc = 1,
    HasExp = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@ -133,6 +133,7 @@ struct packet_traits<float> : default_packet_traits {
    HasReciprocal = EIGEN_FAST_MATH,
    HasTanh = EIGEN_FAST_MATH,
    HasErf = EIGEN_FAST_MATH,
+    HasErfc = EIGEN_FAST_MATH,
    HasCmp = 1,
    HasDiv = 1
  };
@ -154,6 +155,7 @@ struct packet_traits<double> : default_packet_traits {
    HasExp = 1,
    HasATan = 1,
    HasTanh = EIGEN_FAST_MATH,
+    HasErfc = EIGEN_FAST_MATH,
    HasATanh = 1,
    HasCmp = 1,
    HasDiv = 1
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@ -193,6 +193,7 @@ struct packet_traits<float> : default_packet_traits {
 #endif
    HasTanh = EIGEN_FAST_MATH,
    HasErf = EIGEN_FAST_MATH,
+    HasErfc = EIGEN_FAST_MATH,
 #else
    HasSqrt = 0,
    HasRsqrt = 0,
@ -3182,6 +3183,7 @@ struct packet_traits<double> : default_packet_traits {
    HasSin = EIGEN_FAST_MATH,
    HasCos = EIGEN_FAST_MATH,
    HasTanh = EIGEN_FAST_MATH,
+    HasErfc = EIGEN_FAST_MATH,
    HasATanh = 1,
    HasATan = 0,
    HasLog = 0,
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@ -1659,6 +1659,13 @@ EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi,
  p_lo = pmsub(x, y, p_hi);
 }

+// A version of twoprod that takes x, y, and fl(x*y) as input and returns the p_lo such that
+// x * y = xy + p_lo holds exactly.
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet twoprod_low(const Packet& x, const Packet& y, const Packet& xy) {
+  return pmsub(x, y, xy);
+}
+
 #else

 // This function implements the Veltkamp splitting. Given a floating point
@ -1694,6 +1701,21 @@ EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi,
  p_lo = pmadd(x_lo, y_lo, p_lo);
 }

+// A version of twoprod that takes x, y, and fl(x*y) as input and returns the p_lo such that
+// x * y = xy + p_lo holds exactly.
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet twoprod_low(const Packet& x, const Packet& y, const Packet& xy) {
+  Packet x_hi, x_lo, y_hi, y_lo;
+  veltkamp_splitting(x, x_hi, x_lo);
+  veltkamp_splitting(y, y_hi, y_lo);
+
+  Packet p_lo = pmadd(x_hi, y_hi, pnegate(xy));
+  p_lo = pmadd(x_hi, y_lo, p_lo);
+  p_lo = pmadd(x_lo, y_hi, p_lo);
+  p_lo = pmadd(x_lo, y_lo, p_lo);
+  return p_lo;
+}
+
 #endif  // EIGEN_VECTORIZE_FMA

 // This function implements Dekker's algorithm for the addition
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@ -209,6 +209,7 @@ struct packet_traits<float> : default_packet_traits {
    HasRsqrt = 1,
    HasTanh = EIGEN_FAST_MATH,
    HasErf = EIGEN_FAST_MATH,
+    HasErfc = EIGEN_FAST_MATH,
    HasBessel = 0,  // Issues with accuracy.
    HasNdtri = 0
  };
@ -5140,7 +5141,8 @@ struct packet_traits<double> : default_packet_traits {
    HasSqrt = 1,
    HasRsqrt = 1,
    HasTanh = EIGEN_FAST_MATH,
-    HasErf = 0
+    HasErf = 0,
+    HasErfc = EIGEN_FAST_MATH
  };
 };

--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@ -197,6 +197,7 @@ struct packet_traits<float> : default_packet_traits {
    HasRsqrt = 1,
    HasTanh = EIGEN_FAST_MATH,
    HasErf = EIGEN_FAST_MATH,
+    HasErfc = EIGEN_FAST_MATH,
    HasBlend = 1,
    HasSign = 0  // The manually vectorized version is slightly slower for SSE.
  };
@ -216,6 +217,7 @@ struct packet_traits<double> : default_packet_traits {
    HasCos = EIGEN_FAST_MATH,
    HasTanh = EIGEN_FAST_MATH,
    HasLog = 1,
+    HasErfc = EIGEN_FAST_MATH,
    HasExp = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
--- a/Eigen/src/Core/arch/SVE/PacketMath.h
+++ b/Eigen/src/Core/arch/SVE/PacketMath.h
@ -360,7 +360,8 @@ struct packet_traits<float> : default_packet_traits {
    HasExp = 1,
    HasSqrt = 1,
    HasTanh = EIGEN_FAST_MATH,
-    HasErf = EIGEN_FAST_MATH
+    HasErf = EIGEN_FAST_MATH,
+    HasErfc = EIGEN_FAST_MATH
  };
 };

--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
@ -345,11 +345,17 @@ struct erf_impl<double> {
 /***************************************************************************
 * Implementation of erfc, requires C++11/C99                               *
 ****************************************************************************/
+template <typename Scalar>
+struct generic_fast_erfc {
+  template <typename T>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T run(const T& x_in);
+};
+
+template <>
 template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erfc_float(const T& x) {
-  const T x_abs = pmin(pabs(x), pset1<T>(10.0f));
-  const T one = pset1<T>(1.0f);
-  const T x_abs_gt_one_mask = pcmp_lt(one, x_abs);
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erfc<float>::run(const T& x_in) {
+  constexpr float kClamp = 11.0f;
+  const T x = pmin(pmax(x_in, pset1<T>(-kClamp)), pset1<T>(kClamp));

  // erfc(x) = 1 + x * S(x^2), |x| <= 1.
  //
@ -360,10 +366,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erfc_float(const T& x) {
                             2.67075151205062866210937500000e-02, -1.12800106406211853027343750000e-01,
                             3.76122951507568359375000000000e-01, -1.12837910652160644531250000000e+00};
  const T x2 = pmul(x, x);
+  const T one = pset1<T>(1.0);
  const T erfc_small = pmadd(x, ppolevl<T, 5>::run(x2, alpha), one);

  // Return early if we don't need the more expensive approximation for any
  // entry in a.
+  const T x_abs_gt_one_mask = pcmp_lt(one, x2);
  if (!predux_any(x_abs_gt_one_mask)) return erfc_small;

  // erfc(x) = exp(-x^2) * 1/x * P(1/x^2) / Q(1/x^2), 1 < x < 9.
@ -377,23 +385,103 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erfc_float(const T& x) {
  constexpr float delta[] = {1.7251677811145782470703125e-02f, 3.9137163758277893066406250e-01f,
                             1.0000000000000000000000000e+00f, 6.2173241376876831054687500e-01f,
                             9.5662862062454223632812500e-02f};
-  const T z = pexp(pnegate(x2));
+  const T x2_lo = twoprod_low(x, x, x2);
+  // Here we use that
+  //   exp(-x^2) = exp(-(x2+x2_lo)^2) ~= exp(-x2)*exp(-x2_lo) ~= exp(-x2)*(1-x2_lo)
+  // since x2_lo < kClamp * eps << 1 in the region we care about. This trick reduces the max error
+  // from 34 ulps to below 5 ulps.
+  const T exp2_hi = pexp(pnegate(x2));
+  const T z = pnmadd(exp2_hi, x2_lo, exp2_hi);
  const T q2 = preciprocal(x2);
  const T num = ppolevl<T, 3>::run(q2, gamma);
-  const T denom = pmul(x_abs, ppolevl<T, 4>::run(q2, delta));
+  const T denom = pmul(x, ppolevl<T, 4>::run(q2, delta));
  const T r = pdiv(num, denom);
-  // If x < -1 then use erfc(x) = 2 - erfc(|x|).
-  const T x_negative = pcmp_lt(x, pset1<T>(0.0f));
-  const T erfc_large = pselect(x_negative, pnmadd(z, r, pset1<T>(2.0f)), pmul(z, r));
-
+  const T maybe_two = pand(pcmp_lt(x, pset1<T>(0.0)), pset1<T>(2.0));
+  const T erfc_large = pmadd(z, r, maybe_two);
  return pselect(x_abs_gt_one_mask, erfc_large, erfc_small);
 }

-template <typename Scalar>
-struct erfc_impl {
-  EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), THIS_TYPE_IS_NOT_SUPPORTED)
+template <>
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erfc<double>::run(const T& x_in) {
+  // Clamp x to [-27:27] beyond which erfc(x) is either two or zero (below the underflow threshold).
+  // This avoids having to deal with twoprod(x,x) producing NaN for sufficiently large x.
+  constexpr double kClamp = 28.0;
+  const T x = pmin(pmax(x_in, pset1<T>(-kClamp)), pset1<T>(kClamp));

-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Scalar) { return Scalar(0); }
+  // erfc(x) = 1 + x * S(x^2) / T(x^2), |x| <= 1.
+  //
+  // Coefficients for S and T generated with Rminimax command:
+  //  ./ratapprox --function="erfc(x)-1" --dom='[-1,1]' --type=[9,10]
+  //  --num="odd" --numF="[D]" --den="even" --denF="[D]" --log --dispCoeff="dec"
+  constexpr double alpha[] = {-1.9493725660006057018823477644531294572516344487667083740234375e-04,
+                              -1.8272566210022942682217328425053892715368419885635375976562500e-03,
+                              -4.5303363351690106863856044583371840417385101318359375000000000e-02,
+                              -1.4215015503619179981775744181504705920815467834472656250000000e-01,
+                              -1.1283791670955125585606992899556644260883331298828125000000000e+00};
+  constexpr double beta[] = {2.0294484101083099089526257108317963684385176748037338256835938e-05,
+                             6.8117805899186819641732970609382391558028757572174072265625000e-04,
+                             1.0582026056098614921752165685120417037978768348693847656250000e-02,
+                             9.3252603143757495374188692949246615171432495117187500000000000e-02,
+                             4.5931062818368939559832142549566924571990966796875000000000000e-01,
+                             1.0};
+  const T x2 = pmul(x, x);
+  const T num_small = ppolevl<T, 4>::run(x2, alpha);
+  const T denom_small = ppolevl<T, 5>::run(x2, beta);
+  const T one = pset1<T>(1.0);
+  const T erfc_small = pmadd(x, pdiv(num_small, denom_small), one);
+
+  // Return early if we don't need the more expensive approximation for any
+  // entry in a.
+  const T x_abs_gt_one_mask = pcmp_lt(one, x2);
+  if (!predux_any(x_abs_gt_one_mask)) return erfc_small;
+
+  // erfc(x) = exp(-x^2) * 1/x * P(x) / Q(x), 1 < x < 27.
+  //
+  // Coefficients for P and Q generated with Rminimax command:
+  //  ./ratapprox --function="erfc(1/sqrt(x))*exp(1/x)/sqrt(x)"  --dom='[0.0013717,1]' --type=[9,9] --numF="[D]"
+  //  --denF="[D]" --log --dispCoeff="dec"
+  constexpr double gamma[] = {1.5252844933226974316088642158462107545346952974796295166015625e-04,
+                              1.0909912393738931124520519233556115068495273590087890625000000e-02,
+                              1.0628604636755033252537572252549580298364162445068359375000000e-01,
+                              3.3492472973137982217295416376146022230386734008789062500000000e-01,
+                              4.5065776215933289750026347064704168587923049926757812500000000e-01,
+                              2.9433039130294824659017649537418037652969360351562500000000000e-01,
+                              9.8792676360600226170838311645638896152377128601074218750000000e-02,
+                              1.7095935395503719655962981960328761488199234008789062500000000e-02,
+                              1.4249109729504577659398023570247460156679153442382812500000000e-03,
+                              4.4567378313647954771875570045835956989321857690811157226562500e-05};
+  constexpr double delta[] = {2.041985103115789845773520028160419315099716186523437500000000e-03,
+                              5.316030659946043707142493417450168635696172714233398437500000e-02,
+                              3.426242193784684864077405563875799998641014099121093750000000e-01,
+                              8.565637124308049799026321124983951449394226074218750000000000e-01,
+                              1.000000000000000000000000000000000000000000000000000000000000e+00,
+                              5.968805280570776972126623149961233139038085937500000000000000e-01,
+                              1.890922854723317836356244470152887515723705291748046875000000e-01,
+                              3.152505418656005586885981983868987299501895904541015625000000e-02,
+                              2.565085751861882583380047861965067568235099315643310546875000e-03,
+                              7.899362131678837697403017248376499992446042597293853759765625e-05};
+
+  const T x2_lo = twoprod_low(x, x, x2);
+  // Here we use that
+  //   exp(-x^2) = exp(-(x2+x2_lo)^2) ~= exp(-x2)*exp(-x2_lo) ~= exp(-x2)*(1-x2_lo)
+  // since x2_lo < kClamp *eps << 1 in the region we care about. This trick reduces the max error
+  // from 258 ulps to below 7 ulps.
+  const T exp2_hi = pexp(pnegate(x2));
+  const T z = pnmadd(exp2_hi, x2_lo, exp2_hi);
+  const T q2 = preciprocal(x2);
+  const T num_large = ppolevl<T, 9>::run(q2, gamma);
+  const T denom_large = pmul(x, ppolevl<T, 9>::run(q2, delta));
+  const T r = pdiv(num_large, denom_large);
+  const T maybe_two = pand(pcmp_lt(x, pset1<T>(0.0)), pset1<T>(2.0));
+  const T erfc_large = pmadd(z, r, maybe_two);
+  return pselect(x_abs_gt_one_mask, erfc_large, erfc_small);
+}
+
+template <typename T>
+struct erfc_impl {
+  typedef typename unpacket_traits<T>::type Scalar;
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) { return generic_fast_erfc<Scalar>::run(x); }
 };

 template <typename Scalar>
@ -408,7 +496,7 @@ struct erfc_impl<float> {
 #if defined(SYCL_DEVICE_ONLY)
    return cl::sycl::erfc(x);
 #else
-    return generic_fast_erfc_float(x);
+    return generic_fast_erfc<float>::run(x);
 #endif
  }
 };
@ -419,7 +507,7 @@ struct erfc_impl<double> {
 #if defined(SYCL_DEVICE_ONLY)
    return cl::sycl::erfc(x);
 #else
-    return ::erfc(x);
+    return generic_fast_erfc<double>::run(x);
 #endif
  }
 };
--- a/unsupported/test/special_packetmath.cpp
+++ b/unsupported/test/special_packetmath.cpp
@ -117,6 +117,8 @@ void packetmath_real() {
 #if EIGEN_HAS_C99_MATH
  CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasLGamma, std::lgamma, internal::plgamma);
  CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasErf, std::erf, internal::perf);
+  // FIXME(rmlarsen): This test occasionally fails due to difference in tiny subnormal results
+  // near the underflow boundary. I am not sure which version is correct.
  CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasErfc, std::erfc, internal::perfc);
 #endif
 }