Vectorize erfc(x) for double and improve erfc(x) for float.

This commit is contained in:
Rasmus Munk Larsen 2024-11-08 17:21:11 +00:00
parent 8adf43640e
commit 0d366f6532
9 changed files with 141 additions and 18 deletions

View File

@ -124,6 +124,7 @@ struct packet_traits<float> : default_packet_traits {
HasRsqrt = 1,
HasTanh = EIGEN_FAST_MATH,
HasErf = EIGEN_FAST_MATH,
HasErfc = EIGEN_FAST_MATH,
HasBlend = 1
};
};
@ -144,6 +145,7 @@ struct packet_traits<double> : default_packet_traits {
#endif
HasTanh = EIGEN_FAST_MATH,
HasLog = 1,
HasErfc = 1,
HasExp = 1,
HasSqrt = 1,
HasRsqrt = 1,

View File

@ -133,6 +133,7 @@ struct packet_traits<float> : default_packet_traits {
HasReciprocal = EIGEN_FAST_MATH,
HasTanh = EIGEN_FAST_MATH,
HasErf = EIGEN_FAST_MATH,
HasErfc = EIGEN_FAST_MATH,
HasCmp = 1,
HasDiv = 1
};
@ -154,6 +155,7 @@ struct packet_traits<double> : default_packet_traits {
HasExp = 1,
HasATan = 1,
HasTanh = EIGEN_FAST_MATH,
HasErfc = EIGEN_FAST_MATH,
HasATanh = 1,
HasCmp = 1,
HasDiv = 1

View File

@ -193,6 +193,7 @@ struct packet_traits<float> : default_packet_traits {
#endif
HasTanh = EIGEN_FAST_MATH,
HasErf = EIGEN_FAST_MATH,
HasErfc = EIGEN_FAST_MATH,
#else
HasSqrt = 0,
HasRsqrt = 0,
@ -3182,6 +3183,7 @@ struct packet_traits<double> : default_packet_traits {
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
HasTanh = EIGEN_FAST_MATH,
HasErfc = EIGEN_FAST_MATH,
HasATanh = 1,
HasATan = 0,
HasLog = 0,

View File

@ -1659,6 +1659,13 @@ EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi,
p_lo = pmsub(x, y, p_hi);
}
// A version of twoprod that takes x, y, and fl(x*y) as input and returns the p_lo such that
// x * y = xy + p_lo holds exactly.
template <typename Packet>
EIGEN_STRONG_INLINE Packet twoprod_low(const Packet& x, const Packet& y, const Packet& xy) {
return pmsub(x, y, xy);
}
#else
// This function implements the Veltkamp splitting. Given a floating point
@ -1694,6 +1701,21 @@ EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi,
p_lo = pmadd(x_lo, y_lo, p_lo);
}
// A version of twoprod that takes x, y, and fl(x*y) as input and returns the p_lo such that
// x * y = xy + p_lo holds exactly.
template <typename Packet>
EIGEN_STRONG_INLINE Packet twoprod_low(const Packet& x, const Packet& y, const Packet& xy) {
Packet x_hi, x_lo, y_hi, y_lo;
veltkamp_splitting(x, x_hi, x_lo);
veltkamp_splitting(y, y_hi, y_lo);
Packet p_lo = pmadd(x_hi, y_hi, pnegate(xy));
p_lo = pmadd(x_hi, y_lo, p_lo);
p_lo = pmadd(x_lo, y_hi, p_lo);
p_lo = pmadd(x_lo, y_lo, p_lo);
return p_lo;
}
#endif // EIGEN_VECTORIZE_FMA
// This function implements Dekker's algorithm for the addition

View File

@ -209,6 +209,7 @@ struct packet_traits<float> : default_packet_traits {
HasRsqrt = 1,
HasTanh = EIGEN_FAST_MATH,
HasErf = EIGEN_FAST_MATH,
HasErfc = EIGEN_FAST_MATH,
HasBessel = 0, // Issues with accuracy.
HasNdtri = 0
};
@ -5140,7 +5141,8 @@ struct packet_traits<double> : default_packet_traits {
HasSqrt = 1,
HasRsqrt = 1,
HasTanh = EIGEN_FAST_MATH,
HasErf = 0
HasErf = 0,
HasErfc = EIGEN_FAST_MATH
};
};

View File

@ -197,6 +197,7 @@ struct packet_traits<float> : default_packet_traits {
HasRsqrt = 1,
HasTanh = EIGEN_FAST_MATH,
HasErf = EIGEN_FAST_MATH,
HasErfc = EIGEN_FAST_MATH,
HasBlend = 1,
HasSign = 0 // The manually vectorized version is slightly slower for SSE.
};
@ -216,6 +217,7 @@ struct packet_traits<double> : default_packet_traits {
HasCos = EIGEN_FAST_MATH,
HasTanh = EIGEN_FAST_MATH,
HasLog = 1,
HasErfc = EIGEN_FAST_MATH,
HasExp = 1,
HasSqrt = 1,
HasRsqrt = 1,

View File

@ -360,7 +360,8 @@ struct packet_traits<float> : default_packet_traits {
HasExp = 1,
HasSqrt = 1,
HasTanh = EIGEN_FAST_MATH,
HasErf = EIGEN_FAST_MATH
HasErf = EIGEN_FAST_MATH,
HasErfc = EIGEN_FAST_MATH
};
};

View File

@ -345,11 +345,17 @@ struct erf_impl<double> {
/***************************************************************************
* Implementation of erfc, requires C++11/C99 *
****************************************************************************/
template <typename Scalar>
struct generic_fast_erfc {
template <typename T>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T run(const T& x_in);
};
template <>
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erfc_float(const T& x) {
const T x_abs = pmin(pabs(x), pset1<T>(10.0f));
const T one = pset1<T>(1.0f);
const T x_abs_gt_one_mask = pcmp_lt(one, x_abs);
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erfc<float>::run(const T& x_in) {
constexpr float kClamp = 11.0f;
const T x = pmin(pmax(x_in, pset1<T>(-kClamp)), pset1<T>(kClamp));
// erfc(x) = 1 + x * S(x^2), |x| <= 1.
//
@ -360,10 +366,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erfc_float(const T& x) {
2.67075151205062866210937500000e-02, -1.12800106406211853027343750000e-01,
3.76122951507568359375000000000e-01, -1.12837910652160644531250000000e+00};
const T x2 = pmul(x, x);
const T one = pset1<T>(1.0);
const T erfc_small = pmadd(x, ppolevl<T, 5>::run(x2, alpha), one);
// Return early if we don't need the more expensive approximation for any
// entry in a.
const T x_abs_gt_one_mask = pcmp_lt(one, x2);
if (!predux_any(x_abs_gt_one_mask)) return erfc_small;
// erfc(x) = exp(-x^2) * 1/x * P(1/x^2) / Q(1/x^2), 1 < x < 9.
@ -377,23 +385,103 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erfc_float(const T& x) {
constexpr float delta[] = {1.7251677811145782470703125e-02f, 3.9137163758277893066406250e-01f,
1.0000000000000000000000000e+00f, 6.2173241376876831054687500e-01f,
9.5662862062454223632812500e-02f};
const T z = pexp(pnegate(x2));
const T x2_lo = twoprod_low(x, x, x2);
// Here we use that
// exp(-x^2) = exp(-(x2+x2_lo)^2) ~= exp(-x2)*exp(-x2_lo) ~= exp(-x2)*(1-x2_lo)
// since x2_lo < kClamp * eps << 1 in the region we care about. This trick reduces the max error
// from 34 ulps to below 5 ulps.
const T exp2_hi = pexp(pnegate(x2));
const T z = pnmadd(exp2_hi, x2_lo, exp2_hi);
const T q2 = preciprocal(x2);
const T num = ppolevl<T, 3>::run(q2, gamma);
const T denom = pmul(x_abs, ppolevl<T, 4>::run(q2, delta));
const T denom = pmul(x, ppolevl<T, 4>::run(q2, delta));
const T r = pdiv(num, denom);
// If x < -1 then use erfc(x) = 2 - erfc(|x|).
const T x_negative = pcmp_lt(x, pset1<T>(0.0f));
const T erfc_large = pselect(x_negative, pnmadd(z, r, pset1<T>(2.0f)), pmul(z, r));
const T maybe_two = pand(pcmp_lt(x, pset1<T>(0.0)), pset1<T>(2.0));
const T erfc_large = pmadd(z, r, maybe_two);
return pselect(x_abs_gt_one_mask, erfc_large, erfc_small);
}
template <typename Scalar>
struct erfc_impl {
EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), THIS_TYPE_IS_NOT_SUPPORTED)
template <>
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erfc<double>::run(const T& x_in) {
// Clamp x to [-27:27] beyond which erfc(x) is either two or zero (below the underflow threshold).
// This avoids having to deal with twoprod(x,x) producing NaN for sufficiently large x.
constexpr double kClamp = 28.0;
const T x = pmin(pmax(x_in, pset1<T>(-kClamp)), pset1<T>(kClamp));
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Scalar) { return Scalar(0); }
// erfc(x) = 1 + x * S(x^2) / T(x^2), |x| <= 1.
//
// Coefficients for S and T generated with Rminimax command:
// ./ratapprox --function="erfc(x)-1" --dom='[-1,1]' --type=[9,10]
// --num="odd" --numF="[D]" --den="even" --denF="[D]" --log --dispCoeff="dec"
constexpr double alpha[] = {-1.9493725660006057018823477644531294572516344487667083740234375e-04,
-1.8272566210022942682217328425053892715368419885635375976562500e-03,
-4.5303363351690106863856044583371840417385101318359375000000000e-02,
-1.4215015503619179981775744181504705920815467834472656250000000e-01,
-1.1283791670955125585606992899556644260883331298828125000000000e+00};
constexpr double beta[] = {2.0294484101083099089526257108317963684385176748037338256835938e-05,
6.8117805899186819641732970609382391558028757572174072265625000e-04,
1.0582026056098614921752165685120417037978768348693847656250000e-02,
9.3252603143757495374188692949246615171432495117187500000000000e-02,
4.5931062818368939559832142549566924571990966796875000000000000e-01,
1.0};
const T x2 = pmul(x, x);
const T num_small = ppolevl<T, 4>::run(x2, alpha);
const T denom_small = ppolevl<T, 5>::run(x2, beta);
const T one = pset1<T>(1.0);
const T erfc_small = pmadd(x, pdiv(num_small, denom_small), one);
// Return early if we don't need the more expensive approximation for any
// entry in a.
const T x_abs_gt_one_mask = pcmp_lt(one, x2);
if (!predux_any(x_abs_gt_one_mask)) return erfc_small;
// erfc(x) = exp(-x^2) * 1/x * P(x) / Q(x), 1 < x < 27.
//
// Coefficients for P and Q generated with Rminimax command:
// ./ratapprox --function="erfc(1/sqrt(x))*exp(1/x)/sqrt(x)" --dom='[0.0013717,1]' --type=[9,9] --numF="[D]"
// --denF="[D]" --log --dispCoeff="dec"
constexpr double gamma[] = {1.5252844933226974316088642158462107545346952974796295166015625e-04,
1.0909912393738931124520519233556115068495273590087890625000000e-02,
1.0628604636755033252537572252549580298364162445068359375000000e-01,
3.3492472973137982217295416376146022230386734008789062500000000e-01,
4.5065776215933289750026347064704168587923049926757812500000000e-01,
2.9433039130294824659017649537418037652969360351562500000000000e-01,
9.8792676360600226170838311645638896152377128601074218750000000e-02,
1.7095935395503719655962981960328761488199234008789062500000000e-02,
1.4249109729504577659398023570247460156679153442382812500000000e-03,
4.4567378313647954771875570045835956989321857690811157226562500e-05};
constexpr double delta[] = {2.041985103115789845773520028160419315099716186523437500000000e-03,
5.316030659946043707142493417450168635696172714233398437500000e-02,
3.426242193784684864077405563875799998641014099121093750000000e-01,
8.565637124308049799026321124983951449394226074218750000000000e-01,
1.000000000000000000000000000000000000000000000000000000000000e+00,
5.968805280570776972126623149961233139038085937500000000000000e-01,
1.890922854723317836356244470152887515723705291748046875000000e-01,
3.152505418656005586885981983868987299501895904541015625000000e-02,
2.565085751861882583380047861965067568235099315643310546875000e-03,
7.899362131678837697403017248376499992446042597293853759765625e-05};
const T x2_lo = twoprod_low(x, x, x2);
// Here we use that
// exp(-x^2) = exp(-(x2+x2_lo)^2) ~= exp(-x2)*exp(-x2_lo) ~= exp(-x2)*(1-x2_lo)
// since x2_lo < kClamp *eps << 1 in the region we care about. This trick reduces the max error
// from 258 ulps to below 7 ulps.
const T exp2_hi = pexp(pnegate(x2));
const T z = pnmadd(exp2_hi, x2_lo, exp2_hi);
const T q2 = preciprocal(x2);
const T num_large = ppolevl<T, 9>::run(q2, gamma);
const T denom_large = pmul(x, ppolevl<T, 9>::run(q2, delta));
const T r = pdiv(num_large, denom_large);
const T maybe_two = pand(pcmp_lt(x, pset1<T>(0.0)), pset1<T>(2.0));
const T erfc_large = pmadd(z, r, maybe_two);
return pselect(x_abs_gt_one_mask, erfc_large, erfc_small);
}
template <typename T>
struct erfc_impl {
typedef typename unpacket_traits<T>::type Scalar;
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) { return generic_fast_erfc<Scalar>::run(x); }
};
template <typename Scalar>
@ -408,7 +496,7 @@ struct erfc_impl<float> {
#if defined(SYCL_DEVICE_ONLY)
return cl::sycl::erfc(x);
#else
return generic_fast_erfc_float(x);
return generic_fast_erfc<float>::run(x);
#endif
}
};
@ -419,7 +507,7 @@ struct erfc_impl<double> {
#if defined(SYCL_DEVICE_ONLY)
return cl::sycl::erfc(x);
#else
return ::erfc(x);
return generic_fast_erfc<double>::run(x);
#endif
}
};

View File

@ -117,6 +117,8 @@ void packetmath_real() {
#if EIGEN_HAS_C99_MATH
CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasLGamma, std::lgamma, internal::plgamma);
CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasErf, std::erf, internal::perf);
// FIXME(rmlarsen): This test occasionally fails due to difference in tiny subnormal results
// near the underflow boundary. I am not sure which version is correct.
CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasErfc, std::erfc, internal::perfc);
#endif
}