From 8eab7b6886662a0b370eab5af96e8a26fa19f6df Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen <rmlarsen@google.com>
Date: Tue, 28 Dec 2021 15:00:19 +0000
Subject: [PATCH] Improve exp<float>(): Don't flush denormal results +4%
 speedup.

1. Speed up exp(x) by reducing the polynomial approximant from degree 7 to
degree 6. With exactly representable coefficients computed by the Sollya tool,
this still gives a maximum relative error of 1 ulp, i.e. faithfully rounded, for
arguments where exp(x) is a normalized float. This change results in a speedup
of about 4% for AVX2.


2. Extend the range where exp(x) returns a non-zero result to from ~[-88;88] to
~[-104;88] i.e. return denormalized values for large negative arguments instead
of zero. Compared to exp<double>(x) the denormalized results gradually decrease
in accuracy down to 0.033 relative error for arguments around x = -104 where
exp(x) is ~std::numeric<float>::denorm_min(). This is expected and acceptable.
---
 .../arch/Default/GenericPacketMathFunctions.h | 39 +++++++++----------
 1 file changed, 18 insertions(+), 21 deletions(-)
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index d3c314b40..6934e2a30 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -435,24 +435,24 @@ Packet generic_expm1(const Packet& x)
 // Exponential function. Works by writing "x = m*log(2) + r" where
 // "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
 // "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1).
+// exp(r) is computed using a 6th order minimax polynomial approximation.
 template <typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 EIGEN_UNUSED
 Packet pexp_float(const Packet _x)
 {
   const Packet cst_zero   = pset1<Packet>(0.0f);
-  const Packet cst_1      = pset1<Packet>(1.0f);
+  const Packet cst_one    = pset1<Packet>(1.0f);
   const Packet cst_half   = pset1<Packet>(0.5f);
-  const Packet cst_exp_hi = pset1<Packet>( 88.723f);
-  const Packet cst_exp_lo = pset1<Packet>(-88.723f);
+  const Packet cst_exp_hi = pset1<Packet>(88.723f);
+  const Packet cst_exp_lo = pset1<Packet>(-104.f);
 
   const Packet cst_cephes_LOG2EF = pset1<Packet>(1.44269504088896341f);
-  const Packet cst_cephes_exp_p0 = pset1<Packet>(1.9875691500E-4f);
-  const Packet cst_cephes_exp_p1 = pset1<Packet>(1.3981999507E-3f);
-  const Packet cst_cephes_exp_p2 = pset1<Packet>(8.3334519073E-3f);
-  const Packet cst_cephes_exp_p3 = pset1<Packet>(4.1665795894E-2f);
-  const Packet cst_cephes_exp_p4 = pset1<Packet>(1.6666665459E-1f);
-  const Packet cst_cephes_exp_p5 = pset1<Packet>(5.0000001201E-1f);
+  const Packet cst_p2 = pset1<Packet>(0.49999988079071044921875f);
+  const Packet cst_p3 = pset1<Packet>(0.16666518151760101318359375f);
+  const Packet cst_p4 = pset1<Packet>(4.166965186595916748046875e-2f);
+  const Packet cst_p5 = pset1<Packet>(8.36894474923610687255859375e-3f);
+  const Packet cst_p6 = pset1<Packet>(1.37449637986719608306884765625e-3f);
 
   // Clamp x.
   Packet zero_mask = pcmp_lt(_x, cst_exp_lo);
@@ -470,18 +470,15 @@ Packet pexp_float(const Packet _x)
   Packet r = pmadd(m, cst_cephes_exp_C1, x);
   r = pmadd(m, cst_cephes_exp_C2, r);
 
-  Packet r2 = pmul(r, r);
-  Packet r3 = pmul(r2, r);
-
-  // Evaluate the polynomial approximant,improved by instruction-level parallelism.
-  Packet y, y1, y2;
-  y  = pmadd(cst_cephes_exp_p0, r, cst_cephes_exp_p1);
-  y1 = pmadd(cst_cephes_exp_p3, r, cst_cephes_exp_p4);
-  y2 = padd(r, cst_1);
-  y  = pmadd(y, r, cst_cephes_exp_p2);
-  y1 = pmadd(y1, r, cst_cephes_exp_p5);
-  y  = pmadd(y, r3, y1);
-  y  = pmadd(y, r2, y2);
+  // Evaluate the 6th order polynomial approximation to exp(r)
+  // with r in the interval [-ln(2)/2;ln(2)/2].
+  const Packet r2 = pmul(r, r);
+  Packet p_even = pmadd(r2, cst_p6, cst_p4);
+  const Packet p_odd = pmadd(r2, cst_p5, cst_p3);
+  p_even = pmadd(r2, p_even, cst_p2);
+  const Packet p_low = padd(r, cst_one);
+  Packet y = pmadd(r, p_odd, p_even);
+  y = pmadd(r2, y, p_low);
 
   // Return 2^m * exp(r).
   // TODO: replace pldexp with faster implementation since y in [-1, 1).