diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 58fdb08d8..1980e928b 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -124,6 +124,7 @@ struct packet_traits : default_packet_traits { HasRsqrt = 1, HasTanh = EIGEN_FAST_MATH, HasErf = EIGEN_FAST_MATH, + HasErfc = EIGEN_FAST_MATH, HasBlend = 1 }; }; @@ -144,6 +145,7 @@ struct packet_traits : default_packet_traits { #endif HasTanh = EIGEN_FAST_MATH, HasLog = 1, + HasErfc = 1, HasExp = 1, HasSqrt = 1, HasRsqrt = 1, diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 0681a0a0a..78d17d537 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -133,6 +133,7 @@ struct packet_traits : default_packet_traits { HasReciprocal = EIGEN_FAST_MATH, HasTanh = EIGEN_FAST_MATH, HasErf = EIGEN_FAST_MATH, + HasErfc = EIGEN_FAST_MATH, HasCmp = 1, HasDiv = 1 }; @@ -154,6 +155,7 @@ struct packet_traits : default_packet_traits { HasExp = 1, HasATan = 1, HasTanh = EIGEN_FAST_MATH, + HasErfc = EIGEN_FAST_MATH, HasATanh = 1, HasCmp = 1, HasDiv = 1 diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 7401c0b39..da26cd437 100644 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -193,6 +193,7 @@ struct packet_traits : default_packet_traits { #endif HasTanh = EIGEN_FAST_MATH, HasErf = EIGEN_FAST_MATH, + HasErfc = EIGEN_FAST_MATH, #else HasSqrt = 0, HasRsqrt = 0, @@ -3182,6 +3183,7 @@ struct packet_traits : default_packet_traits { HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, HasTanh = EIGEN_FAST_MATH, + HasErfc = EIGEN_FAST_MATH, HasATanh = 1, HasATan = 0, HasLog = 0, diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index a1849312b..d3c067e45 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -1659,6 +1659,13 @@ EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi, p_lo = pmsub(x, y, p_hi); } +// A version of twoprod that takes x, y, and fl(x*y) as input and returns the p_lo such that +// x * y = xy + p_lo holds exactly. +template +EIGEN_STRONG_INLINE Packet twoprod_low(const Packet& x, const Packet& y, const Packet& xy) { + return pmsub(x, y, xy); +} + #else // This function implements the Veltkamp splitting. Given a floating point @@ -1694,6 +1701,21 @@ EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi, p_lo = pmadd(x_lo, y_lo, p_lo); } +// A version of twoprod that takes x, y, and fl(x*y) as input and returns the p_lo such that +// x * y = xy + p_lo holds exactly. +template +EIGEN_STRONG_INLINE Packet twoprod_low(const Packet& x, const Packet& y, const Packet& xy) { + Packet x_hi, x_lo, y_hi, y_lo; + veltkamp_splitting(x, x_hi, x_lo); + veltkamp_splitting(y, y_hi, y_lo); + + Packet p_lo = pmadd(x_hi, y_hi, pnegate(xy)); + p_lo = pmadd(x_hi, y_lo, p_lo); + p_lo = pmadd(x_lo, y_hi, p_lo); + p_lo = pmadd(x_lo, y_lo, p_lo); + return p_lo; +} + #endif // EIGEN_VECTORIZE_FMA // This function implements Dekker's algorithm for the addition diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 414f0f587..2f401fdff 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -209,6 +209,7 @@ struct packet_traits : default_packet_traits { HasRsqrt = 1, HasTanh = EIGEN_FAST_MATH, HasErf = EIGEN_FAST_MATH, + HasErfc = EIGEN_FAST_MATH, HasBessel = 0, // Issues with accuracy. HasNdtri = 0 }; @@ -5140,7 +5141,8 @@ struct packet_traits : default_packet_traits { HasSqrt = 1, HasRsqrt = 1, HasTanh = EIGEN_FAST_MATH, - HasErf = 0 + HasErf = 0, + HasErfc = EIGEN_FAST_MATH }; }; diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 37f6048e7..c749763df 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -197,6 +197,7 @@ struct packet_traits : default_packet_traits { HasRsqrt = 1, HasTanh = EIGEN_FAST_MATH, HasErf = EIGEN_FAST_MATH, + HasErfc = EIGEN_FAST_MATH, HasBlend = 1, HasSign = 0 // The manually vectorized version is slightly slower for SSE. }; @@ -216,6 +217,7 @@ struct packet_traits : default_packet_traits { HasCos = EIGEN_FAST_MATH, HasTanh = EIGEN_FAST_MATH, HasLog = 1, + HasErfc = EIGEN_FAST_MATH, HasExp = 1, HasSqrt = 1, HasRsqrt = 1, diff --git a/Eigen/src/Core/arch/SVE/PacketMath.h b/Eigen/src/Core/arch/SVE/PacketMath.h index 51bbfe097..952d7561b 100644 --- a/Eigen/src/Core/arch/SVE/PacketMath.h +++ b/Eigen/src/Core/arch/SVE/PacketMath.h @@ -360,7 +360,8 @@ struct packet_traits : default_packet_traits { HasExp = 1, HasSqrt = 1, HasTanh = EIGEN_FAST_MATH, - HasErf = EIGEN_FAST_MATH + HasErf = EIGEN_FAST_MATH, + HasErfc = EIGEN_FAST_MATH }; }; diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h index 5169f1cb6..0b266f96e 100644 --- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h @@ -345,11 +345,17 @@ struct erf_impl { /*************************************************************************** * Implementation of erfc, requires C++11/C99 * ****************************************************************************/ +template +struct generic_fast_erfc { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T run(const T& x_in); +}; + +template <> template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erfc_float(const T& x) { - const T x_abs = pmin(pabs(x), pset1(10.0f)); - const T one = pset1(1.0f); - const T x_abs_gt_one_mask = pcmp_lt(one, x_abs); +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erfc::run(const T& x_in) { + constexpr float kClamp = 11.0f; + const T x = pmin(pmax(x_in, pset1(-kClamp)), pset1(kClamp)); // erfc(x) = 1 + x * S(x^2), |x| <= 1. // @@ -360,10 +366,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erfc_float(const T& x) { 2.67075151205062866210937500000e-02, -1.12800106406211853027343750000e-01, 3.76122951507568359375000000000e-01, -1.12837910652160644531250000000e+00}; const T x2 = pmul(x, x); + const T one = pset1(1.0); const T erfc_small = pmadd(x, ppolevl::run(x2, alpha), one); // Return early if we don't need the more expensive approximation for any // entry in a. + const T x_abs_gt_one_mask = pcmp_lt(one, x2); if (!predux_any(x_abs_gt_one_mask)) return erfc_small; // erfc(x) = exp(-x^2) * 1/x * P(1/x^2) / Q(1/x^2), 1 < x < 9. @@ -377,23 +385,103 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erfc_float(const T& x) { constexpr float delta[] = {1.7251677811145782470703125e-02f, 3.9137163758277893066406250e-01f, 1.0000000000000000000000000e+00f, 6.2173241376876831054687500e-01f, 9.5662862062454223632812500e-02f}; - const T z = pexp(pnegate(x2)); + const T x2_lo = twoprod_low(x, x, x2); + // Here we use that + // exp(-x^2) = exp(-(x2+x2_lo)^2) ~= exp(-x2)*exp(-x2_lo) ~= exp(-x2)*(1-x2_lo) + // since x2_lo < kClamp * eps << 1 in the region we care about. This trick reduces the max error + // from 34 ulps to below 5 ulps. + const T exp2_hi = pexp(pnegate(x2)); + const T z = pnmadd(exp2_hi, x2_lo, exp2_hi); const T q2 = preciprocal(x2); const T num = ppolevl::run(q2, gamma); - const T denom = pmul(x_abs, ppolevl::run(q2, delta)); + const T denom = pmul(x, ppolevl::run(q2, delta)); const T r = pdiv(num, denom); - // If x < -1 then use erfc(x) = 2 - erfc(|x|). - const T x_negative = pcmp_lt(x, pset1(0.0f)); - const T erfc_large = pselect(x_negative, pnmadd(z, r, pset1(2.0f)), pmul(z, r)); - + const T maybe_two = pand(pcmp_lt(x, pset1(0.0)), pset1(2.0)); + const T erfc_large = pmadd(z, r, maybe_two); return pselect(x_abs_gt_one_mask, erfc_large, erfc_small); } -template -struct erfc_impl { - EIGEN_STATIC_ASSERT((internal::is_same::value == false), THIS_TYPE_IS_NOT_SUPPORTED) +template <> +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erfc::run(const T& x_in) { + // Clamp x to [-27:27] beyond which erfc(x) is either two or zero (below the underflow threshold). + // This avoids having to deal with twoprod(x,x) producing NaN for sufficiently large x. + constexpr double kClamp = 28.0; + const T x = pmin(pmax(x_in, pset1(-kClamp)), pset1(kClamp)); - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Scalar) { return Scalar(0); } + // erfc(x) = 1 + x * S(x^2) / T(x^2), |x| <= 1. + // + // Coefficients for S and T generated with Rminimax command: + // ./ratapprox --function="erfc(x)-1" --dom='[-1,1]' --type=[9,10] + // --num="odd" --numF="[D]" --den="even" --denF="[D]" --log --dispCoeff="dec" + constexpr double alpha[] = {-1.9493725660006057018823477644531294572516344487667083740234375e-04, + -1.8272566210022942682217328425053892715368419885635375976562500e-03, + -4.5303363351690106863856044583371840417385101318359375000000000e-02, + -1.4215015503619179981775744181504705920815467834472656250000000e-01, + -1.1283791670955125585606992899556644260883331298828125000000000e+00}; + constexpr double beta[] = {2.0294484101083099089526257108317963684385176748037338256835938e-05, + 6.8117805899186819641732970609382391558028757572174072265625000e-04, + 1.0582026056098614921752165685120417037978768348693847656250000e-02, + 9.3252603143757495374188692949246615171432495117187500000000000e-02, + 4.5931062818368939559832142549566924571990966796875000000000000e-01, + 1.0}; + const T x2 = pmul(x, x); + const T num_small = ppolevl::run(x2, alpha); + const T denom_small = ppolevl::run(x2, beta); + const T one = pset1(1.0); + const T erfc_small = pmadd(x, pdiv(num_small, denom_small), one); + + // Return early if we don't need the more expensive approximation for any + // entry in a. + const T x_abs_gt_one_mask = pcmp_lt(one, x2); + if (!predux_any(x_abs_gt_one_mask)) return erfc_small; + + // erfc(x) = exp(-x^2) * 1/x * P(x) / Q(x), 1 < x < 27. + // + // Coefficients for P and Q generated with Rminimax command: + // ./ratapprox --function="erfc(1/sqrt(x))*exp(1/x)/sqrt(x)" --dom='[0.0013717,1]' --type=[9,9] --numF="[D]" + // --denF="[D]" --log --dispCoeff="dec" + constexpr double gamma[] = {1.5252844933226974316088642158462107545346952974796295166015625e-04, + 1.0909912393738931124520519233556115068495273590087890625000000e-02, + 1.0628604636755033252537572252549580298364162445068359375000000e-01, + 3.3492472973137982217295416376146022230386734008789062500000000e-01, + 4.5065776215933289750026347064704168587923049926757812500000000e-01, + 2.9433039130294824659017649537418037652969360351562500000000000e-01, + 9.8792676360600226170838311645638896152377128601074218750000000e-02, + 1.7095935395503719655962981960328761488199234008789062500000000e-02, + 1.4249109729504577659398023570247460156679153442382812500000000e-03, + 4.4567378313647954771875570045835956989321857690811157226562500e-05}; + constexpr double delta[] = {2.041985103115789845773520028160419315099716186523437500000000e-03, + 5.316030659946043707142493417450168635696172714233398437500000e-02, + 3.426242193784684864077405563875799998641014099121093750000000e-01, + 8.565637124308049799026321124983951449394226074218750000000000e-01, + 1.000000000000000000000000000000000000000000000000000000000000e+00, + 5.968805280570776972126623149961233139038085937500000000000000e-01, + 1.890922854723317836356244470152887515723705291748046875000000e-01, + 3.152505418656005586885981983868987299501895904541015625000000e-02, + 2.565085751861882583380047861965067568235099315643310546875000e-03, + 7.899362131678837697403017248376499992446042597293853759765625e-05}; + + const T x2_lo = twoprod_low(x, x, x2); + // Here we use that + // exp(-x^2) = exp(-(x2+x2_lo)^2) ~= exp(-x2)*exp(-x2_lo) ~= exp(-x2)*(1-x2_lo) + // since x2_lo < kClamp *eps << 1 in the region we care about. This trick reduces the max error + // from 258 ulps to below 7 ulps. + const T exp2_hi = pexp(pnegate(x2)); + const T z = pnmadd(exp2_hi, x2_lo, exp2_hi); + const T q2 = preciprocal(x2); + const T num_large = ppolevl::run(q2, gamma); + const T denom_large = pmul(x, ppolevl::run(q2, delta)); + const T r = pdiv(num_large, denom_large); + const T maybe_two = pand(pcmp_lt(x, pset1(0.0)), pset1(2.0)); + const T erfc_large = pmadd(z, r, maybe_two); + return pselect(x_abs_gt_one_mask, erfc_large, erfc_small); +} + +template +struct erfc_impl { + typedef typename unpacket_traits::type Scalar; + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) { return generic_fast_erfc::run(x); } }; template @@ -408,7 +496,7 @@ struct erfc_impl { #if defined(SYCL_DEVICE_ONLY) return cl::sycl::erfc(x); #else - return generic_fast_erfc_float(x); + return generic_fast_erfc::run(x); #endif } }; @@ -419,7 +507,7 @@ struct erfc_impl { #if defined(SYCL_DEVICE_ONLY) return cl::sycl::erfc(x); #else - return ::erfc(x); + return generic_fast_erfc::run(x); #endif } }; diff --git a/unsupported/test/special_packetmath.cpp b/unsupported/test/special_packetmath.cpp index 1ec5c3d34..044ce671e 100644 --- a/unsupported/test/special_packetmath.cpp +++ b/unsupported/test/special_packetmath.cpp @@ -117,6 +117,8 @@ void packetmath_real() { #if EIGEN_HAS_C99_MATH CHECK_CWISE1_IF(internal::packet_traits::HasLGamma, std::lgamma, internal::plgamma); CHECK_CWISE1_IF(internal::packet_traits::HasErf, std::erf, internal::perf); + // FIXME(rmlarsen): This test occasionally fails due to difference in tiny subnormal results + // near the underflow boundary. I am not sure which version is correct. CHECK_CWISE1_IF(internal::packet_traits::HasErfc, std::erfc, internal::perfc); #endif }