From 5133c836c032a2efd0636ddc0da31cc4a7848328 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Sat, 16 Nov 2024 19:05:16 +0000 Subject: [PATCH] Vectorize erf(x) for double. --- Eigen/src/Core/arch/AVX/PacketMath.h | 2 + Eigen/src/Core/arch/AVX512/PacketMath.h | 1 + Eigen/src/Core/arch/AltiVec/PacketMath.h | 1 + Eigen/src/Core/arch/NEON/PacketMath.h | 2 +- Eigen/src/Core/arch/SSE/PacketMath.h | 1 + .../SpecialFunctions/SpecialFunctionsImpl.h | 220 ++++++++++-------- 6 files changed, 134 insertions(+), 93 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 1980e928b..e3dcfae70 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -1,3 +1,4 @@ + // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // @@ -145,6 +146,7 @@ struct packet_traits : default_packet_traits { #endif HasTanh = EIGEN_FAST_MATH, HasLog = 1, + HasErf = 1, HasErfc = 1, HasExp = 1, HasSqrt = 1, diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 78d17d537..5d869e42b 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -155,6 +155,7 @@ struct packet_traits : default_packet_traits { HasExp = 1, HasATan = 1, HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH, HasErfc = EIGEN_FAST_MATH, HasATanh = 1, HasCmp = 1, diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index da26cd437..49220cafc 100644 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -3183,6 +3183,7 @@ struct packet_traits : default_packet_traits { HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH, HasErfc = EIGEN_FAST_MATH, HasATanh = 1, HasATan = 0, diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 2f401fdff..3f2d9d51a 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -5141,7 +5141,7 @@ struct packet_traits : default_packet_traits { HasSqrt = 1, HasRsqrt = 1, HasTanh = EIGEN_FAST_MATH, - HasErf = 0, + HasErf = EIGEN_FAST_MATH, HasErfc = EIGEN_FAST_MATH }; }; diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index c749763df..b3c526fbe 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -217,6 +217,7 @@ struct packet_traits : default_packet_traits { HasCos = EIGEN_FAST_MATH, HasTanh = EIGEN_FAST_MATH, HasLog = 1, + HasErf = EIGEN_FAST_MATH, HasErfc = EIGEN_FAST_MATH, HasExp = 1, HasSqrt = 1, diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h index 0b266f96e..e8fa32b25 100644 --- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h @@ -269,81 +269,8 @@ struct digamma_impl { } }; -/**************************************************************************** - * Implementation of erf, requires C++11/C99 * - ****************************************************************************/ - -/** \internal \returns the error function of \a a (coeff-wise) - This uses a 11/10-degree rational interpolantand is accurate to 3 ulp for - normalized floats. - - This implementation works on both scalars and SIMD "packets". -*/ -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erf_float(const T& x) { - // The monomial coefficients of the numerator polynomial (odd). - constexpr float alpha[] = {2.123732201653183437883853912353515625e-06f, 2.861979592125862836837768554687500000e-04f, - 3.658048342913389205932617187500000000e-03f, 5.243302136659622192382812500000000000e-02f, - 1.874160766601562500000000000000000000e-01f, 1.128379106521606445312500000000000000e+00f}; - - // The monomial coefficients of the denominator polynomial (even). - constexpr float beta[] = {3.89185734093189239501953125000e-05f, 1.14329601638019084930419921875e-03f, - 1.47520881146192550659179687500e-02f, 1.12945675849914550781250000000e-01f, - 4.99425798654556274414062500000e-01f, 1.0f}; - - // Since the polynomials are odd/even, we need x^2. - // Since erf(4) == 1 in float, we clamp x^2 to 16 to avoid - // computing Inf/Inf below. - const T x2 = pmin(pset1(16.0f), pmul(x, x)); - - // Evaluate the numerator polynomial p. - T p = ppolevl::run(x2, alpha); - p = pmul(x, p); - - // Evaluate the denominator polynomial p. - T q = ppolevl::run(x2, beta); - const T r = pdiv(p, q); - - // Clamp to [-1:1]. - return pmax(pmin(r, pset1(1.0f)), pset1(-1.0f)); -} - -template -struct erf_impl { - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) { return generic_fast_erf_float(x); } -}; - -template -struct erf_retval { - typedef Scalar type; -}; - -#if EIGEN_HAS_C99_MATH -template <> -struct erf_impl { - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float run(float x) { -#if defined(SYCL_DEVICE_ONLY) - return cl::sycl::erf(x); -#else - return generic_fast_erf_float(x); -#endif - } -}; - -template <> -struct erf_impl { - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE double run(double x) { -#if defined(SYCL_DEVICE_ONLY) - return cl::sycl::erf(x); -#else - return ::erf(x); -#endif - } -}; -#endif // EIGEN_HAS_C99_MATH - /*************************************************************************** - * Implementation of erfc, requires C++11/C99 * + * Implementation of erfc. ****************************************************************************/ template struct generic_fast_erfc { @@ -366,7 +293,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erfc::run(const T& x 2.67075151205062866210937500000e-02, -1.12800106406211853027343750000e-01, 3.76122951507568359375000000000e-01, -1.12837910652160644531250000000e+00}; const T x2 = pmul(x, x); - const T one = pset1(1.0); + const T one = pset1(1.0f); const T erfc_small = pmadd(x, ppolevl::run(x2, alpha), one); // Return early if we don't need the more expensive approximation for any @@ -401,42 +328,53 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erfc::run(const T& x return pselect(x_abs_gt_one_mask, erfc_large, erfc_small); } -template <> -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erfc::run(const T& x_in) { - // Clamp x to [-27:27] beyond which erfc(x) is either two or zero (below the underflow threshold). - // This avoids having to deal with twoprod(x,x) producing NaN for sufficiently large x. - constexpr double kClamp = 28.0; - const T x = pmin(pmax(x_in, pset1(-kClamp)), pset1(kClamp)); - // erfc(x) = 1 + x * S(x^2) / T(x^2), |x| <= 1. +// Computes erf(x)/x for |x| <= 1. Used by both erf and erfc implementations. +// Takes x2 = x^2 as input. +// +// PRECONDITION: x2 <= 1. +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T erf_over_x_double_small(const T& x2) { + // erf(x)/x = S(x^2) / T(x^2), x^2 <= 1. // // Coefficients for S and T generated with Rminimax command: - // ./ratapprox --function="erfc(x)-1" --dom='[-1,1]' --type=[9,10] + // ./ratapprox --function="erf(x)" --dom='[-1,1]' --type=[9,10] // --num="odd" --numF="[D]" --den="even" --denF="[D]" --log --dispCoeff="dec" - constexpr double alpha[] = {-1.9493725660006057018823477644531294572516344487667083740234375e-04, - -1.8272566210022942682217328425053892715368419885635375976562500e-03, - -4.5303363351690106863856044583371840417385101318359375000000000e-02, - -1.4215015503619179981775744181504705920815467834472656250000000e-01, - -1.1283791670955125585606992899556644260883331298828125000000000e+00}; + constexpr double alpha[] = {1.9493725660006057018823477644531294572516344487667083740234375e-04, + 1.8272566210022942682217328425053892715368419885635375976562500e-03, + 4.5303363351690106863856044583371840417385101318359375000000000e-02, + 1.4215015503619179981775744181504705920815467834472656250000000e-01, + 1.1283791670955125585606992899556644260883331298828125000000000e+00}; constexpr double beta[] = {2.0294484101083099089526257108317963684385176748037338256835938e-05, 6.8117805899186819641732970609382391558028757572174072265625000e-04, 1.0582026056098614921752165685120417037978768348693847656250000e-02, 9.3252603143757495374188692949246615171432495117187500000000000e-02, 4.5931062818368939559832142549566924571990966796875000000000000e-01, 1.0}; - const T x2 = pmul(x, x); const T num_small = ppolevl::run(x2, alpha); const T denom_small = ppolevl::run(x2, beta); + return pdiv(num_small, denom_small); +} + +template <> +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erfc::run(const T& x_in) { + // Clamp x to [-28:28] beyond which erfc(x) is either two or zero (below the underflow threshold). + // This avoids having to deal with twoprod(x,x) producing NaN for sufficiently large x. + constexpr double kClamp = 28.0; + const T x = pmin(pmax(x_in, pset1(-kClamp)), pset1(kClamp)); + + // For |x| < 1, we use erfc(x) = 1 - erf(x). + const T x2 = pmul(x, x); const T one = pset1(1.0); - const T erfc_small = pmadd(x, pdiv(num_small, denom_small), one); + const T erfc_small = pnmadd(x, erf_over_x_double_small(x2), one); // Return early if we don't need the more expensive approximation for any // entry in a. const T x_abs_gt_one_mask = pcmp_lt(one, x2); if (!predux_any(x_abs_gt_one_mask)) return erfc_small; - // erfc(x) = exp(-x^2) * 1/x * P(x) / Q(x), 1 < x < 27. + // erfc(x) = exp(-x^2) * 1/x * P(x) / Q(x), 1 < x < 28. // // Coefficients for P and Q generated with Rminimax command: // ./ratapprox --function="erfc(1/sqrt(x))*exp(1/x)/sqrt(x)" --dom='[0.0013717,1]' --type=[9,9] --numF="[D]" @@ -513,6 +451,104 @@ struct erfc_impl { }; #endif // EIGEN_HAS_C99_MATH + +/**************************************************************************** + * Implementation of erf. + ****************************************************************************/ + +template +struct generic_fast_erf { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T run(const T& x_in); +}; + +/** \internal \returns the error function of \a a (coeff-wise) + This uses a 11/10-degree rational interpolantand is accurate to 3 ulp for + normalized floats. + + This implementation works on both scalars and SIMD "packets". +*/ +template <> +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erf::run(const T& x) { + // The monomial coefficients of the numerator polynomial (odd). + constexpr float alpha[] = {2.123732201653183437883853912353515625e-06f, 2.861979592125862836837768554687500000e-04f, + 3.658048342913389205932617187500000000e-03f, 5.243302136659622192382812500000000000e-02f, + 1.874160766601562500000000000000000000e-01f, 1.128379106521606445312500000000000000e+00f}; + + // The monomial coefficients of the denominator polynomial (even). + constexpr float beta[] = {3.89185734093189239501953125000e-05f, 1.14329601638019084930419921875e-03f, + 1.47520881146192550659179687500e-02f, 1.12945675849914550781250000000e-01f, + 4.99425798654556274414062500000e-01f, 1.0f}; + + // Since the polynomials are odd/even, we need x^2. + // Since erf(4) == 1 in float, we clamp x^2 to 16 to avoid + // computing Inf/Inf below. + const T x2 = pmin(pset1(16.0f), pmul(x, x)); + + // Evaluate the numerator polynomial p. + T p = ppolevl::run(x2, alpha); + p = pmul(x, p); + + // Evaluate the denominator polynomial p. + T q = ppolevl::run(x2, beta); + const T r = pdiv(p, q); + + // Clamp to [-1:1]. + return pmax(pmin(r, pset1(1.0f)), pset1(-1.0f)); +} + +template<> +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erf::run(const T& x) { + T x2 = pmul(x, x); + T erf_small = pmul(x, erf_over_x_double_small(x2)); + + // Return early if we don't need the more expensive approximation for any + // entry in a. + const T one = pset1(1.0); + const T x_abs_gt_one_mask = pcmp_lt(one, x2); + if (!predux_any(x_abs_gt_one_mask)) return erf_small; + + // For |x| > 1, use erf(x) = 1 - erfc(x). + return psub(one, generic_fast_erfc::run(x)); +} + +template +struct erf_impl { + typedef typename unpacket_traits::type Scalar; + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) { return generic_fast_erf::run(x); } +}; + +template +struct erf_retval { + typedef Scalar type; +}; + +#if EIGEN_HAS_C99_MATH +template <> +struct erf_impl { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float run(const float x) { +#if defined(SYCL_DEVICE_ONLY) + return cl::sycl::erf(x); +#else + return generic_fast_erf::run(x); +#endif + } +}; + +template <> +struct erf_impl { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE double run(const double x) { +#if defined(SYCL_DEVICE_ONLY) + return cl::sycl::erf(x); +#else + return generic_fast_erf::run(x); +#endif + } +}; +#endif // EIGEN_HAS_C99_MATH + /*************************************************************************** * Implementation of ndtri. * ****************************************************************************/