From 8f2c6f0aa6a59e24df245ef36abfb45ea190766b Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 26 Jan 2022 20:38:05 +0000 Subject: [PATCH] Make preciprocal IEEE compliant w.r.t. 1/0 and 1/inf. --- Eigen/src/Core/MathFunctionsImpl.h | 26 ++++++++++++++++++------- Eigen/src/Core/arch/SSE/MathFunctions.h | 10 ++++++++-- test/packetmath.cpp | 17 ++++++++++++++++ 3 files changed, 44 insertions(+), 9 deletions(-) diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h index 6b248d59c..58f14a9b0 100644 --- a/Eigen/src/Core/MathFunctionsImpl.h +++ b/Eigen/src/Core/MathFunctionsImpl.h @@ -18,9 +18,18 @@ namespace Eigen { namespace internal { /** \internal Fast reciprocal using Newton-Raphson's method. - We assume that the starting guess provided in approx_a_recip has at least - half the leading mantissa bits in the correct result, such that a single - Newton-Raphson step is sufficient to get within 1-2 ulps of the currect result. + + Preconditions: + 1. The starting guess provided in approx_a_recip must have at least half + the leading mantissa bits in the correct result, such that a single + Newton-Raphson step is sufficient to get within 1-2 ulps of the currect + result. + 2. If a is zero, approx_a_recip must be infinite with the same sign as a. + 3. If a is infinite, approx_a_recip must be zero with the same sign as a. + + If the preconditions are satisfied, which they are for for the _*_rcp_ps + instructions on x86, the result has a maximum relative error of 2 ulps, + and correctly handles reciprocals of zero and infinity. */ template struct generic_reciprocal_newton_step { @@ -29,12 +38,15 @@ struct generic_reciprocal_newton_step { run(const Packet& a, const Packet& approx_a_recip) { using Scalar = typename unpacket_traits::type; const Packet two = pset1(Scalar(2)); - const Packet neg_a = pnegate(a); // Refine the approximation using one Newton-Raphson step: // x_{i} = x_{i-1} * (2 - a * x_{i-1}) - const Packet x = - generic_reciprocal_newton_step::run(a, approx_a_recip); - return pmul(x, pmadd(neg_a, x, two)); + const Packet x = + generic_reciprocal_newton_step::run(a, approx_a_recip); + const Packet tmp = pnmadd(a, x, two); + // If tmp is NaN, it means that a is either +/-0 or +/-Inf. + // In this case return the approximation directly. + const Packet is_not_nan = pcmp_eq(tmp, tmp); + return pselect(is_not_nan, pmul(x, tmp), x); } }; diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index 10bc107e0..10e495ec0 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -148,11 +148,17 @@ Packet4f prsqrt(const Packet4f& _x) { return pselect(not_normal_finite_mask, y_approx, y_newton); } -#endif - +#ifdef EIGEN_VECTORIZE_FMA +// Trying to speed up reciprocal using Newton-Raphson is counterproductive +// unless FMA is available. Without FMA pdiv(pset1(Scalar(1),a) is +// 30% faster. template<> EIGEN_STRONG_INLINE Packet4f preciprocal(const Packet4f& a) { return generic_reciprocal_newton_step::run(a, _mm_rcp_ps(a)); } +#endif + +#endif + // Hyperbolic Tangent function. diff --git a/test/packetmath.cpp b/test/packetmath.cpp index e60308a90..c4cc448e2 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -1007,6 +1007,23 @@ void packetmath_real() { VERIFY_IS_EQUAL(data2[0], Scalar(1)); } } + if (PacketTraits::HasReciprocal && PacketSize >= 2) { + test::packet_helper h; + const Scalar inf = NumTraits::infinity(); + const Scalar zero = Scalar(0); + data1[0] = zero; + data1[1] = -zero; + h.store(data2, internal::preciprocal(h.load(data1))); + VERIFY_IS_EQUAL(data2[0], inf); + VERIFY_IS_EQUAL(data2[1], -inf); + + data1[0] = inf; + data1[1] = -inf; + h.store(data2, internal::preciprocal(h.load(data1))); + VERIFY_IS_EQUAL(data2[0], zero); + VERIFY_IS_EQUAL(data2[1], -zero); + + } } #define CAST_CHECK_CWISE1_IF(COND, REFOP, POP, SCALAR, REFTYPE) if(COND) { \