From 8f2c6f0aa6a59e24df245ef36abfb45ea190766b Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen <rmlarsen@google.com>
Date: Wed, 26 Jan 2022 20:38:05 +0000
Subject: [PATCH] Make preciprocal IEEE compliant w.r.t. 1/0 and 1/inf.

---
 Eigen/src/Core/MathFunctionsImpl.h      | 26 ++++++++++++++++++-------
 Eigen/src/Core/arch/SSE/MathFunctions.h | 10 ++++++++--
 test/packetmath.cpp                     | 17 ++++++++++++++++
 3 files changed, 44 insertions(+), 9 deletions(-)
diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h
index 6b248d59c..58f14a9b0 100644
--- a/Eigen/src/Core/MathFunctionsImpl.h
+++ b/Eigen/src/Core/MathFunctionsImpl.h
@@ -18,9 +18,18 @@ namespace Eigen {
 namespace internal {
 
 /** \internal Fast reciprocal using Newton-Raphson's method.
- We assume that the starting guess provided in approx_a_recip has at least
- half the leading mantissa bits in the correct result, such that a single
- Newton-Raphson step is sufficient to get within 1-2 ulps of the currect result.
+
+ Preconditions:
+   1. The starting guess provided in approx_a_recip must have at least half
+      the leading mantissa bits in the correct result, such that a single
+      Newton-Raphson step is sufficient to get within 1-2 ulps of the currect
+      result.
+   2. If a is zero, approx_a_recip must be infinite with the same sign as a.
+   3. If a is infinite, approx_a_recip must be zero with the same sign as a.
+
+   If the preconditions are satisfied, which they are for for the _*_rcp_ps
+   instructions on x86, the result has a maximum relative error of 2 ulps,
+   and correctly handles reciprocals of zero and infinity.
 */
 template <typename Packet, int Steps>
 struct generic_reciprocal_newton_step {
@@ -29,12 +38,15 @@ struct generic_reciprocal_newton_step {
   run(const Packet& a, const Packet& approx_a_recip) {
     using Scalar = typename unpacket_traits<Packet>::type;
     const Packet two = pset1<Packet>(Scalar(2));
-    const Packet neg_a = pnegate(a);
     // Refine the approximation using one Newton-Raphson step:
     //   x_{i} = x_{i-1} * (2 - a * x_{i-1})
-    const Packet x =
-        generic_reciprocal_newton_step<Packet,Steps - 1>::run(a, approx_a_recip);
-    return pmul(x, pmadd(neg_a, x, two));
+     const Packet x =
+         generic_reciprocal_newton_step<Packet,Steps - 1>::run(a, approx_a_recip);
+     const Packet tmp = pnmadd(a, x, two);
+     // If tmp is NaN, it means that a is either +/-0 or +/-Inf.
+     // In this case return the approximation directly.
+     const Packet is_not_nan = pcmp_eq(tmp, tmp);
+     return pselect(is_not_nan, pmul(x, tmp), x);
   }
 };
 
diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
index 10bc107e0..10e495ec0 100644
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -148,11 +148,17 @@ Packet4f prsqrt<Packet4f>(const Packet4f& _x) {
   return pselect<Packet4f>(not_normal_finite_mask, y_approx, y_newton);
 }
 
-#endif
-
+#ifdef EIGEN_VECTORIZE_FMA
+// Trying to speed up reciprocal using Newton-Raphson is counterproductive
+// unless FMA is available. Without FMA pdiv(pset1<Packet>(Scalar(1),a) is
+// 30% faster.
 template<> EIGEN_STRONG_INLINE Packet4f preciprocal<Packet4f>(const Packet4f& a) {
   return generic_reciprocal_newton_step<Packet4f, /*Steps=*/1>::run(a, _mm_rcp_ps(a));
 }
+#endif
+
+#endif
+
 
 
 // Hyperbolic Tangent function.
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index e60308a90..c4cc448e2 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -1007,6 +1007,23 @@ void packetmath_real() {
       VERIFY_IS_EQUAL(data2[0], Scalar(1));
     }
   }
+  if (PacketTraits::HasReciprocal && PacketSize >= 2) {
+    test::packet_helper<PacketTraits::HasReciprocal, Packet> h;
+    const Scalar inf = NumTraits<Scalar>::infinity();
+    const Scalar zero = Scalar(0);
+    data1[0] = zero;
+    data1[1] = -zero;
+    h.store(data2, internal::preciprocal(h.load(data1)));
+    VERIFY_IS_EQUAL(data2[0], inf);
+    VERIFY_IS_EQUAL(data2[1], -inf);
+
+    data1[0] = inf;
+    data1[1] = -inf;
+    h.store(data2, internal::preciprocal(h.load(data1)));
+    VERIFY_IS_EQUAL(data2[0], zero);
+    VERIFY_IS_EQUAL(data2[1], -zero);
+
+  }
 }
 
 #define CAST_CHECK_CWISE1_IF(COND, REFOP, POP, SCALAR, REFTYPE) if(COND) { \