From 81b48065ea673cd352d11ef9b6a3d86778ac962d Mon Sep 17 00:00:00 2001
From: Charles Schlosser <cs.schlosser@gmail.com>
Date: Tue, 29 Aug 2023 00:36:07 +0000
Subject: [PATCH] Fix arm32 float division and related bugs

---
 Eigen/src/Core/arch/NEON/PacketMath.h | 158 ++++++++++++++++----------
 test/array_cwise.cpp                  |  16 ++-
 test/packetmath.cpp                   |   4 +-
 test/sparse_permutations.cpp          |  19 ----
 4 files changed, 115 insertions(+), 82 deletions(-)
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index adb1342b4..e70f8b0e2 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -956,57 +956,6 @@ template<> EIGEN_STRONG_INLINE Packet2ul pmul<Packet2ul>(const Packet2ul& a, con
     vdup_n_u64(vgetq_lane_u64(a, 1)*vgetq_lane_u64(b, 1)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet2f pdiv<Packet2f>(const Packet2f& a, const Packet2f& b)
-{
-#if EIGEN_ARCH_ARM64
-  return vdiv_f32(a,b);
-#else
-  Packet2f inv, restep, div;
-
-  // NEON does not offer a divide instruction, we have to do a reciprocal approximation
-  // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers
-  // a reciprocal estimate AND a reciprocal step -which saves a few instructions
-  // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with
-  // Newton-Raphson and vrecpsq_f32()
-  inv = vrecpe_f32(b);
-
-  // This returns a differential, by which we will have to multiply inv to get a better
-  // approximation of 1/b.
-  restep = vrecps_f32(b, inv);
-  inv = vmul_f32(restep, inv);
-
-  // Finally, multiply a by 1/b and get the wanted result of the division.
-  div = vmul_f32(a, inv);
-
-  return div;
-#endif
-}
-template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
-#if EIGEN_ARCH_ARM64
-  return vdivq_f32(a,b);
-#else
-  Packet4f inv, restep, div;
-
-  // NEON does not offer a divide instruction, we have to do a reciprocal approximation
-  // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers
-  // a reciprocal estimate AND a reciprocal step -which saves a few instructions
-  // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with
-  // Newton-Raphson and vrecpsq_f32()
-  inv = vrecpeq_f32(b);
-
-  // This returns a differential, by which we will have to multiply inv to get a better
-  // approximation of 1/b.
-  restep = vrecpsq_f32(b, inv);
-  inv = vmulq_f32(restep, inv);
-
-  // Finally, multiply a by 1/b and get the wanted result of the division.
-  div = vmulq_f32(a, inv);
-
-  return div;
-#endif
-}
-
 template<> EIGEN_STRONG_INLINE Packet4c pdiv<Packet4c>(const Packet4c& /*a*/, const Packet4c& /*b*/)
 {
   eigen_assert(false && "packet integer division are not supported by NEON");
@@ -3362,26 +3311,115 @@ template<> EIGEN_STRONG_INLINE Packet4ui psqrt(const Packet4ui& a) {
   return res;
 }
 
+EIGEN_STRONG_INLINE Packet4f prsqrt_float_unsafe(const Packet4f& a) {
+  // Compute approximate reciprocal sqrt.
+  // Does not correctly handle +/- 0 or +inf
+  float32x4_t result = vrsqrteq_f32(a);
+  result = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, result), result), result);
+  result = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, result), result), result);
+  return result;
+}
+
+EIGEN_STRONG_INLINE Packet2f prsqrt_float_unsafe(const Packet2f& a) {
+  // Compute approximate reciprocal sqrt.
+  // Does not correctly handle +/- 0 or +inf
+  float32x2_t result = vrsqrte_f32(a);
+  result = vmul_f32(vrsqrts_f32(vmul_f32(a, result), result), result);
+  result = vmul_f32(vrsqrts_f32(vmul_f32(a, result), result), result);
+  return result;
+}
+
+template<typename Packet> Packet prsqrt_float_common(const Packet& a) {
+  const Packet cst_zero = pzero(a);
+  const Packet cst_inf = pset1<Packet>(NumTraits<float>::infinity());
+  Packet return_zero = pcmp_eq(a, cst_inf);
+  Packet return_inf = pcmp_eq(a, cst_zero);
+  Packet result = prsqrt_float_unsafe(a);
+  result = pselect(return_inf, por(cst_inf, a), result);
+  result = pandnot(result, return_zero);
+  return result;
+}
+
 template<> EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
-  // Do Newton iterations for 1/sqrt(x).
-  return generic_rsqrt_newton_step<Packet4f, /*Steps=*/2>::run(a, vrsqrteq_f32(a));
+  return prsqrt_float_common(a);
 }
 
 template<> EIGEN_STRONG_INLINE Packet2f prsqrt(const Packet2f& a) {
-  // Compute approximate reciprocal sqrt.
-  return generic_rsqrt_newton_step<Packet2f, /*Steps=*/2>::run(a, vrsqrte_f32(a));
+  return prsqrt_float_common(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preciprocal<Packet4f>(const Packet4f& a)
+{
+  // Compute approximate reciprocal.
+  float32x4_t result = vrecpeq_f32(a);
+  result = vmulq_f32(vrecpsq_f32(a, result), result);
+  result = vmulq_f32(vrecpsq_f32(a, result), result);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f preciprocal<Packet2f>(const Packet2f& a)
+{
+  // Compute approximate reciprocal.
+  float32x2_t result = vrecpe_f32(a);
+  result = vmul_f32(vrecps_f32(a, result), result);
+  result = vmul_f32(vrecps_f32(a, result), result);
+  return result;
 }
 
 // Unfortunately vsqrt_f32 is only available for A64.
 #if EIGEN_ARCH_ARM64
-template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& _x){return vsqrtq_f32(_x);}
-template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& _x){return vsqrt_f32(_x); }
+template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) { return vsqrtq_f32(a); }
+
+template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) { return vsqrt_f32(a); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { return vdivq_f32(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pdiv(const Packet2f& a, const Packet2f& b) { return vdiv_f32(a, b); }
 #else
-template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
-  return generic_sqrt_newton_step<Packet4f>::run(a, prsqrt(a));
+template<typename Packet>
+EIGEN_STRONG_INLINE Packet psqrt_float_common(const Packet& a) {
+  const Packet cst_zero = pzero(a);
+  const Packet cst_inf = pset1<Packet>(NumTraits<float>::infinity());
+  
+  Packet result = pmul(a, prsqrt_float_unsafe(a));  
+  Packet a_is_zero = pcmp_eq(a, cst_zero);
+  Packet a_is_inf = pcmp_eq(a, cst_inf);
+  Packet return_a = por(a_is_zero, a_is_inf);
+  
+  result = pselect(return_a, a, result);
+  return result;
 }
+
+template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
+  return psqrt_float_common(a);
+}
+
 template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) {
-  return generic_sqrt_newton_step<Packet2f>::run(a, prsqrt(a));
+  return psqrt_float_common(a);
+}
+
+template<typename Packet>
+EIGEN_STRONG_INLINE Packet pdiv_float_common(const Packet& a, const Packet& b) {
+  // if b is large, NEON intrinsics will flush preciprocal(b) to zero
+  // avoid underflow with the following manipulation:
+  // a / b = f * (a * reciprocal(f * b))
+
+  const Packet cst_one = pset1<Packet>(1.0f);
+  const Packet cst_quarter = pset1<Packet>(0.25f);
+  const Packet cst_thresh = pset1<Packet>(NumTraits<float>::highest() / 4.0f);
+  
+  Packet b_will_underflow = pcmp_le(cst_thresh, pabs(b));
+  Packet f = pselect(b_will_underflow, cst_quarter, cst_one);
+  Packet result = pmul(f, pmul(a, preciprocal(pmul(b, f))));
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return pdiv_float_common(a, b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f pdiv<Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return pdiv_float_common(a, b);
 }
 #endif
 
diff --git a/test/array_cwise.cpp b/test/array_cwise.cpp
index 49e667252..058b721fa 100644
--- a/test/array_cwise.cpp
+++ b/test/array_cwise.cpp
@@ -47,7 +47,7 @@ std::vector<Scalar> special_values() {
   const Scalar sqrt2 = Scalar(std::sqrt(2));    
   const Scalar inf = Eigen::NumTraits<Scalar>::infinity();
   const Scalar nan = Eigen::NumTraits<Scalar>::quiet_NaN();
-  const Scalar denorm_min = std::numeric_limits<Scalar>::denorm_min();
+  const Scalar denorm_min = EIGEN_ARCH_ARM ? zero : std::numeric_limits<Scalar>::denorm_min();
   const Scalar min = (std::numeric_limits<Scalar>::min)();
   const Scalar max = (std::numeric_limits<Scalar>::max)();
   const Scalar max_exp = (static_cast<Scalar>(int(Eigen::NumTraits<Scalar>::max_exponent())) * Scalar(EIGEN_LN2)) / eps;
@@ -97,6 +97,12 @@ void binary_op_test(std::string name, Fn fun, RefFn ref) {
     for (Index j = 0; j < lhs.cols(); ++j) {
       Scalar e = static_cast<Scalar>(ref(lhs(i,j), rhs(i,j)));
       Scalar a = actual(i, j);
+      #if EIGEN_ARCH_ARM
+      // Work around NEON flush-to-zero mode
+      // if ref returns denormalized value and Eigen returns 0, then skip the test
+      int ref_fpclass = std::fpclassify(e);
+      if (a == Scalar(0) && ref_fpclass == FP_SUBNORMAL) continue;
+      #endif
       bool success = (a==e) || ((numext::isfinite)(e) && internal::isApprox(a, e, tol)) || ((numext::isnan)(a) && (numext::isnan)(e));
       if ((a == a) && (e == e)) success &= (bool)numext::signbit(e) == (bool)numext::signbit(a);
       all_pass &= success;
@@ -767,7 +773,12 @@ template<typename ArrayType> void array_real(const ArrayType& m)
             m3(rows, cols),
             m4 = m1;
 
-  m4 = (m4.abs()==Scalar(0)).select(Scalar(1),m4);
+  // avoid denormalized values so verification doesn't fail on platforms that don't support them
+  // denormalized behavior is tested elsewhere (unary_op_test, binary_ops_test)
+  const Scalar min = (std::numeric_limits<Scalar>::min)();
+  m1 = (m1.abs()<min).select(Scalar(0),m1);
+  m2 = (m2.abs()<min).select(Scalar(0),m2);
+  m4 = (m4.abs()<min).select(Scalar(1),m4);
 
   Scalar  s1 = internal::random<Scalar>();
 
@@ -808,6 +819,7 @@ template<typename ArrayType> void array_real(const ArrayType& m)
 
   // avoid inf and NaNs so verification doesn't fail
   m3 = m4.abs();
+
   VERIFY_IS_APPROX(m3.sqrt(), sqrt(abs(m3)));
   VERIFY_IS_APPROX(m3.rsqrt(), Scalar(1)/sqrt(abs(m3)));
   VERIFY_IS_APPROX(rsqrt(m3), Scalar(1)/sqrt(abs(m3)));
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index 5dd4cbc3d..b5b040187 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -754,7 +754,7 @@ void packetmath_test_IEEE_corner_cases(const RefFunctorT& ref_fun,
   }
 
   // Test for subnormals.
-  if (Cond && std::numeric_limits<Scalar>::has_denorm == std::denorm_present) {
+  if (Cond && std::numeric_limits<Scalar>::has_denorm == std::denorm_present && !EIGEN_ARCH_ARM) {
 
     for (int scale = 1; scale < 5; ++scale) {
       // When EIGEN_FAST_MATH is 1 we relax the conditions slightly, and allow the function
@@ -912,12 +912,14 @@ void packetmath_real() {
   CHECK_CWISE1_BYREF1_IF(PacketTraits::HasExp, REF_FREXP, internal::pfrexp);
   if (PacketTraits::HasExp) {
     // Check denormals:
+    #if !EIGEN_ARCH_ARM
     for (int j=0; j<3; ++j) {
       data1[0] = Scalar(std::ldexp(1, NumTraits<Scalar>::min_exponent()-j));
       CHECK_CWISE1_BYREF1_IF(PacketTraits::HasExp, REF_FREXP, internal::pfrexp);
       data1[0] = -data1[0];
       CHECK_CWISE1_BYREF1_IF(PacketTraits::HasExp, REF_FREXP, internal::pfrexp);
     }
+    #endif
 
     // zero
     data1[0] = Scalar(0);
diff --git a/test/sparse_permutations.cpp b/test/sparse_permutations.cpp
index 775c5608b..3e46bfa35 100644
--- a/test/sparse_permutations.cpp
+++ b/test/sparse_permutations.cpp
@@ -113,25 +113,6 @@ template<int OtherStorage, typename SparseMatrixType> void sparse_permutations(c
   res_d = p.inverse()*mat_d;
   VERIFY(res.isApprox(res_d) && "inv(p)*mat");
 
-  // test non-plaintype expressions that require additional temporary
-  const Scalar alpha(2.34);
-
-  res_d = p * (alpha * mat_d);
-  VERIFY_TEMPORARY_COUNT( res = p * (alpha * mat), 2);
-  VERIFY( res.isApprox(res_d) && "p*(alpha*mat)" );
-
-  res_d = (alpha * mat_d) * p;
-  VERIFY_TEMPORARY_COUNT( res = (alpha * mat) * p, 2);
-  VERIFY( res.isApprox(res_d) && "(alpha*mat)*p" );
-
-  res_d = p.inverse() * (alpha * mat_d);
-  VERIFY_TEMPORARY_COUNT( res = p.inverse() * (alpha * mat), 2);
-  VERIFY( res.isApprox(res_d) && "inv(p)*(alpha*mat)" );
-
-  res_d = (alpha * mat_d) * p.inverse();
-  VERIFY_TEMPORARY_COUNT( res = (alpha * mat) * p.inverse(), 2);
-  VERIFY( res.isApprox(res_d) && "(alpha*mat)*inv(p)" );
-
   //
 
   VERIFY( is_sorted( (p * mat * p.inverse()).eval() ));