From 7ac8897431e4e7f8da4537be73a2ad51d9cd7029 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Sun, 4 Jun 2023 22:25:33 +0000 Subject: [PATCH] Reduce max relative error of prsqrt from 3 to 2 ulps. --- Eigen/src/Core/MathFunctionsImpl.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h index e5ae03db7..5782db45e 100644 --- a/Eigen/src/Core/MathFunctionsImpl.h +++ b/Eigen/src/Core/MathFunctionsImpl.h @@ -80,26 +80,26 @@ struct generic_rsqrt_newton_step { using Scalar = typename unpacket_traits::type; EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet run(const Packet& a, const Packet& approx_rsqrt) { + constexpr Scalar kMinusHalf = Scalar(-1)/Scalar(2); + const Packet cst_minus_half = pset1(kMinusHalf); const Packet cst_minus_one = pset1(Scalar(-1)); - const Packet cst_minus_half = pset1(Scalar(-1)/Scalar(2)); - - // Refine the approximation using one Newton-Raphson step: - // The approximation is expressed this way to avoid over/under-flows. - // x' = x - (x/2) * ( (a*x)*x - 1) - Packet x = approx_rsqrt; + Packet inv_sqrt = approx_rsqrt; for (int step = 0; step < Steps; ++step) { - Packet minushalfx = pmul(cst_minus_half, x); - Packet ax = pmul(a, x); - Packet ax2m1 = pmadd(ax, x, cst_minus_one); - x = pmadd(ax2m1, minushalfx, x); + // Refine the approximation using one Newton-Raphson step: + // h_n = x * (inv_sqrt * inv_sqrt) - 1 (so that h_n is nearly 0). + // inv_sqrt = inv_sqrt - 0.5 * inv_sqrt * h_n + Packet r2 = pmul(inv_sqrt, inv_sqrt); + Packet half_r = pmul(inv_sqrt, cst_minus_half); + Packet h_n = pmadd(a, r2, cst_minus_one); + inv_sqrt = pmadd(half_r, h_n, inv_sqrt); } // If x is NaN, then either: // 1) the input is NaN // 2) zero and infinity were multiplied // In either of these cases, return approx_rsqrt - return pselect(pisnan(x), approx_rsqrt, x); + return pselect(pisnan(inv_sqrt), approx_rsqrt, inv_sqrt); } };