Improve exp<float>(): Don't flush denormal results +4% speedup.

1. Speed up exp(x) by reducing the polynomial approximant from degree 7 to
degree 6. With exactly representable coefficients computed by the Sollya tool,
this still gives a maximum relative error of 1 ulp, i.e. faithfully rounded, for
arguments where exp(x) is a normalized float. This change results in a speedup
of about 4% for AVX2.


2. Extend the range where exp(x) returns a non-zero result to from ~[-88;88] to
~[-104;88] i.e. return denormalized values for large negative arguments instead
of zero. Compared to exp<double>(x) the denormalized results gradually decrease
in accuracy down to 0.033 relative error for arguments around x = -104 where
exp(x) is ~std::numeric<float>::denorm_min(). This is expected and acceptable.
This commit is contained in:
Rasmus Munk Larsen 2021-12-28 15:00:19 +00:00 committed by David Tellenbach
parent 6e95c0cd9a
commit 8eab7b6886

View File

@ -435,24 +435,24 @@ Packet generic_expm1(const Packet& x)
// Exponential function. Works by writing "x = m*log(2) + r" where
// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
// "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1).
// exp(r) is computed using a 6th order minimax polynomial approximation.
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
EIGEN_UNUSED
Packet pexp_float(const Packet _x)
{
const Packet cst_zero = pset1<Packet>(0.0f);
const Packet cst_1 = pset1<Packet>(1.0f);
const Packet cst_one = pset1<Packet>(1.0f);
const Packet cst_half = pset1<Packet>(0.5f);
const Packet cst_exp_hi = pset1<Packet>( 88.723f);
const Packet cst_exp_lo = pset1<Packet>(-88.723f);
const Packet cst_exp_hi = pset1<Packet>(88.723f);
const Packet cst_exp_lo = pset1<Packet>(-104.f);
const Packet cst_cephes_LOG2EF = pset1<Packet>(1.44269504088896341f);
const Packet cst_cephes_exp_p0 = pset1<Packet>(1.9875691500E-4f);
const Packet cst_cephes_exp_p1 = pset1<Packet>(1.3981999507E-3f);
const Packet cst_cephes_exp_p2 = pset1<Packet>(8.3334519073E-3f);
const Packet cst_cephes_exp_p3 = pset1<Packet>(4.1665795894E-2f);
const Packet cst_cephes_exp_p4 = pset1<Packet>(1.6666665459E-1f);
const Packet cst_cephes_exp_p5 = pset1<Packet>(5.0000001201E-1f);
const Packet cst_p2 = pset1<Packet>(0.49999988079071044921875f);
const Packet cst_p3 = pset1<Packet>(0.16666518151760101318359375f);
const Packet cst_p4 = pset1<Packet>(4.166965186595916748046875e-2f);
const Packet cst_p5 = pset1<Packet>(8.36894474923610687255859375e-3f);
const Packet cst_p6 = pset1<Packet>(1.37449637986719608306884765625e-3f);
// Clamp x.
Packet zero_mask = pcmp_lt(_x, cst_exp_lo);
@ -470,18 +470,15 @@ Packet pexp_float(const Packet _x)
Packet r = pmadd(m, cst_cephes_exp_C1, x);
r = pmadd(m, cst_cephes_exp_C2, r);
Packet r2 = pmul(r, r);
Packet r3 = pmul(r2, r);
// Evaluate the polynomial approximant,improved by instruction-level parallelism.
Packet y, y1, y2;
y = pmadd(cst_cephes_exp_p0, r, cst_cephes_exp_p1);
y1 = pmadd(cst_cephes_exp_p3, r, cst_cephes_exp_p4);
y2 = padd(r, cst_1);
y = pmadd(y, r, cst_cephes_exp_p2);
y1 = pmadd(y1, r, cst_cephes_exp_p5);
y = pmadd(y, r3, y1);
y = pmadd(y, r2, y2);
// Evaluate the 6th order polynomial approximation to exp(r)
// with r in the interval [-ln(2)/2;ln(2)/2].
const Packet r2 = pmul(r, r);
Packet p_even = pmadd(r2, cst_p6, cst_p4);
const Packet p_odd = pmadd(r2, cst_p5, cst_p3);
p_even = pmadd(r2, p_even, cst_p2);
const Packet p_low = padd(r, cst_one);
Packet y = pmadd(r, p_odd, p_even);
y = pmadd(r2, y, p_low);
// Return 2^m * exp(r).
// TODO: replace pldexp with faster implementation since y in [-1, 1).