Make preciprocal IEEE compliant w.r.t. 1/0 and 1/inf.

2025-04-19 08:09:36 +08:00 · 2022-01-26 20:38:05 +00:00 · 2022-01-26 20:38:05 +00:00 · 8f2c6f0aa6
commit 8f2c6f0aa6
parent d271a7d545
3 changed files with 44 additions and 9 deletions
--- a/Eigen/src/Core/MathFunctionsImpl.h
+++ b/Eigen/src/Core/MathFunctionsImpl.h
@ -18,9 +18,18 @@ namespace Eigen {
 namespace internal {

 /** \internal Fast reciprocal using Newton-Raphson's method.
- We assume that the starting guess provided in approx_a_recip has at least
- half the leading mantissa bits in the correct result, such that a single
- Newton-Raphson step is sufficient to get within 1-2 ulps of the currect result.
+
+ Preconditions:
+   1. The starting guess provided in approx_a_recip must have at least half
+      the leading mantissa bits in the correct result, such that a single
+      Newton-Raphson step is sufficient to get within 1-2 ulps of the currect
+      result.
+   2. If a is zero, approx_a_recip must be infinite with the same sign as a.
+   3. If a is infinite, approx_a_recip must be zero with the same sign as a.
+
+   If the preconditions are satisfied, which they are for for the _*_rcp_ps
+   instructions on x86, the result has a maximum relative error of 2 ulps,
+   and correctly handles reciprocals of zero and infinity.
 */
 template <typename Packet, int Steps>
 struct generic_reciprocal_newton_step {
@ -29,12 +38,15 @@ struct generic_reciprocal_newton_step {
  run(const Packet& a, const Packet& approx_a_recip) {
    using Scalar = typename unpacket_traits<Packet>::type;
    const Packet two = pset1<Packet>(Scalar(2));
-    const Packet neg_a = pnegate(a);
    // Refine the approximation using one Newton-Raphson step:
    //   x_{i} = x_{i-1} * (2 - a * x_{i-1})
-    const Packet x =
-        generic_reciprocal_newton_step<Packet,Steps - 1>::run(a, approx_a_recip);
-    return pmul(x, pmadd(neg_a, x, two));
+     const Packet x =
+         generic_reciprocal_newton_step<Packet,Steps - 1>::run(a, approx_a_recip);
+     const Packet tmp = pnmadd(a, x, two);
+     // If tmp is NaN, it means that a is either +/-0 or +/-Inf.
+     // In this case return the approximation directly.
+     const Packet is_not_nan = pcmp_eq(tmp, tmp);
+     return pselect(is_not_nan, pmul(x, tmp), x);
  }
 };

--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@ -148,11 +148,17 @@ Packet4f prsqrt<Packet4f>(const Packet4f& _x) {
  return pselect<Packet4f>(not_normal_finite_mask, y_approx, y_newton);
 }

-#endif
-
+#ifdef EIGEN_VECTORIZE_FMA
+// Trying to speed up reciprocal using Newton-Raphson is counterproductive
+// unless FMA is available. Without FMA pdiv(pset1<Packet>(Scalar(1),a) is
+// 30% faster.
 template<> EIGEN_STRONG_INLINE Packet4f preciprocal<Packet4f>(const Packet4f& a) {
  return generic_reciprocal_newton_step<Packet4f, /*Steps=*/1>::run(a, _mm_rcp_ps(a));
 }
+#endif
+
+#endif
+


 // Hyperbolic Tangent function.
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@ -1007,6 +1007,23 @@ void packetmath_real() {
      VERIFY_IS_EQUAL(data2[0], Scalar(1));
    }
  }
+  if (PacketTraits::HasReciprocal && PacketSize >= 2) {
+    test::packet_helper<PacketTraits::HasReciprocal, Packet> h;
+    const Scalar inf = NumTraits<Scalar>::infinity();
+    const Scalar zero = Scalar(0);
+    data1[0] = zero;
+    data1[1] = -zero;
+    h.store(data2, internal::preciprocal(h.load(data1)));
+    VERIFY_IS_EQUAL(data2[0], inf);
+    VERIFY_IS_EQUAL(data2[1], -inf);
+
+    data1[0] = inf;
+    data1[1] = -inf;
+    h.store(data2, internal::preciprocal(h.load(data1)));
+    VERIFY_IS_EQUAL(data2[0], zero);
+    VERIFY_IS_EQUAL(data2[1], -zero);
+
+  }
 }

 #define CAST_CHECK_CWISE1_IF(COND, REFOP, POP, SCALAR, REFTYPE) if(COND) { \