From 8eab7b6886662a0b370eab5af96e8a26fa19f6df Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Tue, 28 Dec 2021 15:00:19 +0000 Subject: [PATCH] Improve exp(): Don't flush denormal results +4% speedup. 1. Speed up exp(x) by reducing the polynomial approximant from degree 7 to degree 6. With exactly representable coefficients computed by the Sollya tool, this still gives a maximum relative error of 1 ulp, i.e. faithfully rounded, for arguments where exp(x) is a normalized float. This change results in a speedup of about 4% for AVX2. 2. Extend the range where exp(x) returns a non-zero result to from ~[-88;88] to ~[-104;88] i.e. return denormalized values for large negative arguments instead of zero. Compared to exp(x) the denormalized results gradually decrease in accuracy down to 0.033 relative error for arguments around x = -104 where exp(x) is ~std::numeric::denorm_min(). This is expected and acceptable. --- .../arch/Default/GenericPacketMathFunctions.h | 39 +++++++++---------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index d3c314b40..6934e2a30 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -435,24 +435,24 @@ Packet generic_expm1(const Packet& x) // Exponential function. Works by writing "x = m*log(2) + r" where // "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then // "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1). +// exp(r) is computed using a 6th order minimax polynomial approximation. template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet pexp_float(const Packet _x) { const Packet cst_zero = pset1(0.0f); - const Packet cst_1 = pset1(1.0f); + const Packet cst_one = pset1(1.0f); const Packet cst_half = pset1(0.5f); - const Packet cst_exp_hi = pset1( 88.723f); - const Packet cst_exp_lo = pset1(-88.723f); + const Packet cst_exp_hi = pset1(88.723f); + const Packet cst_exp_lo = pset1(-104.f); const Packet cst_cephes_LOG2EF = pset1(1.44269504088896341f); - const Packet cst_cephes_exp_p0 = pset1(1.9875691500E-4f); - const Packet cst_cephes_exp_p1 = pset1(1.3981999507E-3f); - const Packet cst_cephes_exp_p2 = pset1(8.3334519073E-3f); - const Packet cst_cephes_exp_p3 = pset1(4.1665795894E-2f); - const Packet cst_cephes_exp_p4 = pset1(1.6666665459E-1f); - const Packet cst_cephes_exp_p5 = pset1(5.0000001201E-1f); + const Packet cst_p2 = pset1(0.49999988079071044921875f); + const Packet cst_p3 = pset1(0.16666518151760101318359375f); + const Packet cst_p4 = pset1(4.166965186595916748046875e-2f); + const Packet cst_p5 = pset1(8.36894474923610687255859375e-3f); + const Packet cst_p6 = pset1(1.37449637986719608306884765625e-3f); // Clamp x. Packet zero_mask = pcmp_lt(_x, cst_exp_lo); @@ -470,18 +470,15 @@ Packet pexp_float(const Packet _x) Packet r = pmadd(m, cst_cephes_exp_C1, x); r = pmadd(m, cst_cephes_exp_C2, r); - Packet r2 = pmul(r, r); - Packet r3 = pmul(r2, r); - - // Evaluate the polynomial approximant,improved by instruction-level parallelism. - Packet y, y1, y2; - y = pmadd(cst_cephes_exp_p0, r, cst_cephes_exp_p1); - y1 = pmadd(cst_cephes_exp_p3, r, cst_cephes_exp_p4); - y2 = padd(r, cst_1); - y = pmadd(y, r, cst_cephes_exp_p2); - y1 = pmadd(y1, r, cst_cephes_exp_p5); - y = pmadd(y, r3, y1); - y = pmadd(y, r2, y2); + // Evaluate the 6th order polynomial approximation to exp(r) + // with r in the interval [-ln(2)/2;ln(2)/2]. + const Packet r2 = pmul(r, r); + Packet p_even = pmadd(r2, cst_p6, cst_p4); + const Packet p_odd = pmadd(r2, cst_p5, cst_p3); + p_even = pmadd(r2, p_even, cst_p2); + const Packet p_low = padd(r, cst_one); + Packet y = pmadd(r, p_odd, p_even); + y = pmadd(r2, y, p_low); // Return 2^m * exp(r). // TODO: replace pldexp with faster implementation since y in [-1, 1).