Add generic fast psqrt and prsqrt impls and make them correct for 0, +Inf, NaN, and negative arguments.

2025-10-10 07:06:32 +08:00 · 2022-02-05 00:20:13 +00:00 · 2022-02-05 00:20:13 +00:00 · 979fdd58a4
commit 979fdd58a4
parent 4bffbe84f9
5 changed files with 135 additions and 169 deletions
--- a/Eigen/src/Core/MathFunctionsImpl.h
+++ b/Eigen/src/Core/MathFunctionsImpl.h
@ -29,7 +29,7 @@ namespace internal {
   If the preconditions are satisfied, which they are for for the _*_rcp_ps
   instructions on x86, the result has a maximum relative error of 2 ulps,
-   and correctly handles reciprocals of zero and infinity.
+   and correctly handles reciprocals of zero, infinity, and NaN.
 */
 template <typename Packet, int Steps>
 struct generic_reciprocal_newton_step {
@ -53,11 +53,109 @@ struct generic_reciprocal_newton_step {
 template<typename Packet>
 struct generic_reciprocal_newton_step<Packet, 0> {
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet
-   run(const Packet& /*unused*/, const Packet& approx_a_recip) {
+   run(const Packet& /*unused*/, const Packet& approx_rsqrt) {
-    return approx_a_recip;
+    return approx_rsqrt;
  }
 };
 /** \internal Fast reciprocal sqrt using Newton-Raphson's method.
 Preconditions:
   1. The starting guess provided in approx_a_recip must have at least half
      the leading mantissa bits in the correct result, such that a single
      Newton-Raphson step is sufficient to get within 1-2 ulps of the currect
      result.
   2. If a is zero, approx_a_recip must be infinite with the same sign as a.
   3. If a is infinite, approx_a_recip must be zero with the same sign as a.
   If the preconditions are satisfied, which they are for for the _*_rcp_ps
   instructions on x86, the result has a maximum relative error of 2 ulps,
   and correctly handles zero, infinity, and NaN. Positive denormals are
   treated as zero.
 */
 template <typename Packet, int Steps>
 struct generic_rsqrt_newton_step {
  static_assert(Steps > 0, "Steps must be at least 1.");
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE  Packet
  run(const Packet& a, const Packet& approx_rsqrt) {
    using Scalar = typename unpacket_traits<Packet>::type;
    const Packet one_point_five = pset1<Packet>(Scalar(1.5));
    const Packet minus_half = pset1<Packet>(Scalar(-0.5));
    const Packet minus_half_a = pmul(minus_half, a);
    const Packet neg_mask = pcmp_lt(a, pzero(a));
    Packet x =
         generic_rsqrt_newton_step<Packet,Steps - 1>::run(a, approx_rsqrt);
     const Packet tmp = pmul(minus_half_a, x);
     // If tmp is NaN, it means that a is either 0 or Inf.
     // In this case return the approximation directly.
     const Packet is_not_nan = pcmp_eq(tmp, tmp);
     // If a is negative, return NaN.
     x = por(x, neg_mask);
    // Refine the approximation using one Newton-Raphson step:
    //   x_{n+1} = x_n * (1.5 - x_n * ((0.5 * a) * x_n)).
     const Packet x_newton  = pmul(x, pmadd(tmp, x, one_point_five));
     return pselect(is_not_nan, x_newton, x);
  }
 };
 template<typename Packet>
 struct generic_rsqrt_newton_step<Packet, 0> {
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet
   run(const Packet& /*unused*/, const Packet& approx_rsqrt) {
    return approx_rsqrt;
  }
 };
 /** \internal Fast sqrt using Newton-Raphson's method.
 Preconditions:
   1. The starting guess for the reciprocal sqrt provided in approx_rsqrt must
      have at least half the leading mantissa bits in the correct result, such
      that a single Newton-Raphson step is sufficient to get within 1-2 ulps of
      the currect result.
   2. If a is zero, approx_rsqrt must be infinite.
   3. If a is infinite, approx_rsqrt must be zero.
   If the preconditions are satisfied, which they are for for the _*_rsqrt_ps
   instructions on x86, the result has a maximum relative error of 2 ulps,
   and correctly handles zero and infinity, and NaN. Positive denormal inputs
   are treated as zero.
 */
 template <typename Packet, int Steps=1>
 struct generic_sqrt_newton_step {
  static_assert(Steps > 0, "Steps must be at least 1.");
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE  Packet
  run(const Packet& a, const Packet& approx_rsqrt) {
    using Scalar = typename unpacket_traits<Packet>::type;
    const Packet one_point_five = pset1<Packet>(Scalar(1.5));
    const Packet negative_mask = pcmp_lt(a, pzero(a));
    const Packet minus_half_a = pmul(a, pset1<Packet>(Scalar(-0.5)));
    // Set negative arguments to NaN.
    const Packet a_poisoned = por(a, negative_mask);
    // Do a single step of Newton's iteration for reciprocal square root:
    //   x_{n+1} = x_n * (1.5 - x_n * ((0.5 * a) * x_n)).
    const Packet tmp = pmul(approx_rsqrt, minus_half_a);
    // If tmp is NaN, it means that the argument was either 0 or +inf,
    // and we should return the argument itself as the result.
    const Packet return_rsqrt = pcmp_eq(tmp, tmp);
    Packet rsqrt = pmul(approx_rsqrt, pmadd(tmp, approx_rsqrt, one_point_five));
    for (int step = 1; step < Steps; ++step) {
      rsqrt = pmul(rsqrt, pmadd(pmul(rsqrt, minus_half_a), rsqrt, one_point_five));
    }
    // Return sqrt(x) = x * rsqrt(x) for non-zero finite positive arguments.
    // Return a itself for 0 or +inf, NaN for negative arguments.
    return pselect(return_rsqrt, pmul(a_poisoned, rsqrt), por(a, negative_mask));
  }
 };
 /** \internal \returns the hyperbolic tan of \a a (coeff-wise)
    Doesn't do anything fancy, just a 13/6-degree rational interpolant which
    is accurate up to a couple of ulps in the (approximate) range [-8, 8],
--- a/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX/MathFunctions.h
@ -89,34 +89,12 @@ pexp<Packet4d>(const Packet4d& _x) {
  return pexp_double(_x);
 }
 // Functions for sqrt.
 // The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
 // of Newton's method, at a cost of 1-2 bits of precision as opposed to the
 // exact solution. It does not handle +inf, or denormalized numbers correctly.
 // The main advantage of this approach is not just speed, but also the fact that
 // it can be inlined and pipelined with other computations, further reducing its
 // effective latency. This is similar to Quake3's fast inverse square root.
 // For detail see here: http://www.beyond3d.com/content/articles/8/
 #if EIGEN_FAST_MATH
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet8f psqrt<Packet8f>(const Packet8f& _x) {
-  const Packet8f minus_half_x = pmul(_x, pset1<Packet8f>(-0.5f));
+  return generic_sqrt_newton_step<Packet8f>::run(_x, _mm256_rsqrt_ps(_x));
  const Packet8f negative_mask = pcmp_lt(_x, pzero(_x));
  const Packet8f denormal_mask =
      pandnot(pcmp_lt(_x, pset1<Packet8f>((std::numeric_limits<float>::min)())),
              negative_mask);
  // Compute approximate reciprocal sqrt.
  Packet8f rs = _mm256_rsqrt_ps(_x);
  // Flush negative arguments to zero. This is a workaround which ensures
  // that sqrt of a negative denormal returns -NaN, despite _mm256_rsqrt_ps
  // returning -Inf for such values.
  const Packet8f x_flushed = pandnot(_x, negative_mask);
  // Do a single step of Newton's iteration.
  rs = pmul(rs, pmadd(minus_half_x, pmul(rs,rs), pset1<Packet8f>(1.5f)));
  // Flush results for denormals to zero.
  return pandnot(pmul(x_flushed, rs), denormal_mask);
 }
 #else
@ -135,35 +113,8 @@ Packet4d psqrt<Packet4d>(const Packet4d& _x) {
 #if EIGEN_FAST_MATH
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet8f prsqrt<Packet8f>(const Packet8f& _x) {
+Packet8f prsqrt<Packet8f>(const Packet8f& a) {
-  EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inf, 0x7f800000);
+  return generic_rsqrt_newton_step<Packet8f, /*Steps=*/1>::run(a, _mm256_rsqrt_ps(a));
  EIGEN_DECLARE_CONST_Packet8f(one_point_five, 1.5f);
  EIGEN_DECLARE_CONST_Packet8f(minus_half, -0.5f);
  EIGEN_DECLARE_CONST_Packet8f_FROM_INT(flt_min, 0x00800000);
  Packet8f neg_half = pmul(_x, p8f_minus_half);
  // select only the inverse sqrt of positive normal inputs (denormals are
  // flushed to zero and cause infs as well).
  Packet8f lt_min_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ);
  Packet8f inf_mask =  _mm256_cmp_ps(_x, p8f_inf, _CMP_EQ_OQ);
  Packet8f not_normal_finite_mask = _mm256_or_ps(lt_min_mask, inf_mask);
  // Compute an approximate result using the rsqrt intrinsic.
  Packet8f y_approx = _mm256_rsqrt_ps(_x);
  // Do a single step of Newton-Raphson iteration to improve the approximation.
  // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n).
  // It is essential to evaluate the inner term like this because forming
  // y_n^2 may over- or underflow.
  Packet8f y_newton = pmul(y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p8f_one_point_five));
  // Select the result of the Newton-Raphson step for positive normal arguments.
  // For other arguments, choose the output of the intrinsic. This will
  // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(x) = +inf if
  // x is zero or a positive denormalized float (equivalent to flushing positive
  // denormalized inputs to zero).
  return pselect<Packet8f>(not_normal_finite_mask, y_approx, y_newton);
 }
 template<> EIGEN_STRONG_INLINE Packet8f preciprocal<Packet8f>(const Packet8f& a) {
--- a/Eigen/src/Core/arch/AVX512/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h
@ -155,49 +155,18 @@ EIGEN_STRONG_INLINE Packet16bf pldexp(const Packet16bf& a, const Packet16bf& exp
  return F32ToBf16(pldexp<Packet16f>(Bf16ToF32(a), Bf16ToF32(exponent)));
 }
 // Functions for sqrt.
 // The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
 // of Newton's method, at a cost of 1-2 bits of precision as opposed to the
 // exact solution. The main advantage of this approach is not just speed, but
 // also the fact that it can be inlined and pipelined with other computations,
 // further reducing its effective latency.
 #if EIGEN_FAST_MATH
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
 psqrt<Packet16f>(const Packet16f& _x) {
-  Packet16f neg_half = pmul(_x, pset1<Packet16f>(-.5f));
+  return generic_sqrt_newton_step<Packet16f>::run(_x, _mm512_rsqrt14_ps(_x));
  __mmask16 denormal_mask = _mm512_kand(
      _mm512_cmp_ps_mask(_x, pset1<Packet16f>((std::numeric_limits<float>::min)()),
                        _CMP_LT_OQ),
      _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_GE_OQ));
  Packet16f x = _mm512_rsqrt14_ps(_x);
  // Do a single step of Newton's iteration.
  x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet16f>(1.5f)));
  // Flush results for denormals to zero.
  return _mm512_mask_blend_ps(denormal_mask, pmul(_x,x), _mm512_setzero_ps());
 }
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
 psqrt<Packet8d>(const Packet8d& _x) {
-  Packet8d neg_half = pmul(_x, pset1<Packet8d>(-.5));
+  // Double requires 2 Newton-Raphson steps for convergence.
-  __mmask16 denormal_mask = _mm512_kand(
+  return generic_sqrt_newton_step<Packet8d, /*Steps=*/2>::run(_x, _mm512_rsqrt14_pd(_x));
      _mm512_cmp_pd_mask(_x, pset1<Packet8d>((std::numeric_limits<double>::min)()),
                        _CMP_LT_OQ),
      _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_GE_OQ));
  Packet8d x = _mm512_rsqrt14_pd(_x);
  // Do a single step of Newton's iteration.
  x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5)));
  // Do a second step of Newton's iteration.
  x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5)));
  return _mm512_mask_blend_pd(denormal_mask, pmul(_x,x), _mm512_setzero_pd());
 }
 #else
 template <>
@ -226,31 +195,7 @@ EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
 prsqrt<Packet16f>(const Packet16f& _x) {
-  EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inf, 0x7f800000);
+  return generic_rsqrt_newton_step<Packet16f, /*Steps=*/1>::run(_x, _mm512_rsqrt14_ps(_x));
  EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
  EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
  Packet16f neg_half = pmul(_x, p16f_minus_half);
  // Identity infinite, negative and denormal arguments.
  __mmask16 inf_mask = _mm512_cmp_ps_mask(_x, p16f_inf, _CMP_EQ_OQ);
  __mmask16 not_pos_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LE_OQ);
  __mmask16 not_finite_pos_mask = not_pos_mask | inf_mask;
  // Compute an approximate result using the rsqrt intrinsic, forcing +inf
  // for denormals for consistency with AVX and SSE implementations.
  Packet16f y_approx = _mm512_rsqrt14_ps(_x);
  // Do a single step of Newton-Raphson iteration to improve the approximation.
  // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n).
  // It is essential to evaluate the inner term like this because forming
  // y_n^2 may over- or underflow.
  Packet16f y_newton = pmul(y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p16f_one_point_five));
  // Select the result of the Newton-Raphson step for positive finite arguments.
  // For other arguments, choose the output of the intrinsic. This will
  // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(0) = +inf.
  return _mm512_mask_blend_ps(not_finite_pos_mask, y_newton, y_approx);
 }
 #endif
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@ -75,30 +75,12 @@ Packet4f pcos<Packet4f>(const Packet4f& _x)
  return pcos_float(_x);
 }
 // Functions for sqrt.
 // The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
 // of Newton's method, at a cost of 1-2 bits of precision as opposed to the
 // exact solution. It does not handle +inf, or denormalized numbers correctly.
 // The main advantage of this approach is not just speed, but also the fact that
 // it can be inlined and pipelined with other computations, further reducing its
 // effective latency. This is similar to Quake3's fast inverse square root.
 // For detail see here: http://www.beyond3d.com/content/articles/8/
 #if EIGEN_FAST_MATH
 template<>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f psqrt<Packet4f>(const Packet4f& _x)
 {
-  const Packet4f minus_half_x = pmul(_x, pset1<Packet4f>(-0.5f));
+  return generic_sqrt_newton_step<Packet4f>::run(_x, _mm_rsqrt_ps(_x));
  const Packet4f denormal_mask = pandnot(
      pcmp_lt(_x, pset1<Packet4f>((std::numeric_limits<float>::min)())),
      pcmp_lt(_x, pzero(_x)));
  // Compute approximate reciprocal sqrt.
  Packet4f x = _mm_rsqrt_ps(_x);
  // Do a single step of Newton's iteration.
  x = pmul(x, pmadd(minus_half_x, pmul(x,x), pset1<Packet4f>(1.5f)));
  // Flush results for denormals to zero.
  return pandnot(pmul(_x,x), denormal_mask);
 }
 #else
@ -117,43 +99,16 @@ Packet16b psqrt<Packet16b>(const Packet16b& x) { return x; }
 #if EIGEN_FAST_MATH
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f prsqrt<Packet4f>(const Packet4f& _x) {
+Packet4f prsqrt<Packet4f>(const Packet4f& x) {
-  EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f);
+  return generic_rsqrt_newton_step<Packet4f, /*Steps=*/1>::run(x, _mm_rsqrt_ps(x));
  EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f);
  EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000u);
  EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000u);
  Packet4f neg_half = pmul(_x, p4f_minus_half);
  // Identity infinite, zero, negative and denormal arguments.
  Packet4f lt_min_mask = _mm_cmplt_ps(_x, p4f_flt_min);
  Packet4f inf_mask = _mm_cmpeq_ps(_x, p4f_inf);
  Packet4f not_normal_finite_mask = _mm_or_ps(lt_min_mask, inf_mask);
  // Compute an approximate result using the rsqrt intrinsic.
  Packet4f y_approx = _mm_rsqrt_ps(_x);
  // Do a single step of Newton-Raphson iteration to improve the approximation.
  // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n).
  // It is essential to evaluate the inner term like this because forming
  // y_n^2 may over- or underflow.
  Packet4f y_newton = pmul(
      y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p4f_one_point_five));
  // Select the result of the Newton-Raphson step for positive normal arguments.
  // For other arguments, choose the output of the intrinsic. This will
  // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(x) = +inf if
  // x is zero or a positive denormalized float (equivalent to flushing positive
  // denormalized inputs to zero).
  return pselect<Packet4f>(not_normal_finite_mask, y_approx, y_newton);
 }
 #ifdef EIGEN_VECTORIZE_FMA
 // Trying to speed up reciprocal using Newton-Raphson is counterproductive
 // unless FMA is available. Without FMA pdiv(pset1<Packet>(Scalar(1),a) is
 // 30% faster.
-template<> EIGEN_STRONG_INLINE Packet4f preciprocal<Packet4f>(const Packet4f& a) {
+template<> EIGEN_STRONG_INLINE Packet4f preciprocal<Packet4f>(const Packet4f& x) {
-  return generic_reciprocal_newton_step<Packet4f, /*Steps=*/1>::run(a, _mm_rcp_ps(a));
+  return generic_reciprocal_newton_step<Packet4f, /*Steps=*/1>::run(x, _mm_rcp_ps(x));
 }
 #endif
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@ -946,18 +946,35 @@ void packetmath_real() {
      VERIFY((numext::isnan)(data2[0]));
      VERIFY((numext::isnan)(data2[1]));
    }
    if (PacketTraits::HasSqrt) {
      test::packet_helper<PacketTraits::HasSqrt, Packet> h;
      data1[0] = Scalar(-1.0f);
      if (std::numeric_limits<Scalar>::has_denorm == std::denorm_present) {
        data1[1] = -std::numeric_limits<Scalar>::denorm_min();
      } else {
        data1[1] = -((std::numeric_limits<Scalar>::min)());
      }
-      h.store(data2, internal::psqrt(h.load(data1)));
+      CHECK_CWISE1(numext::sqrt, internal::psqrt);
-      VERIFY((numext::isnan)(data2[0]));
+
-      VERIFY((numext::isnan)(data2[1]));
+      data1[0] = Scalar(0.0f);
      data1[1] = NumTraits<Scalar>::infinity();
      CHECK_CWISE1(numext::sqrt, internal::psqrt);
    }
    if (PacketTraits::HasRsqrt) {
      data1[0] = Scalar(-1.0f);
      if (std::numeric_limits<Scalar>::has_denorm == std::denorm_present) {
        data1[1] = -std::numeric_limits<Scalar>::denorm_min();
      } else {
        data1[1] = -((std::numeric_limits<Scalar>::min)());
      }
      CHECK_CWISE1(numext::rsqrt, internal::prsqrt);
      data1[0] = Scalar(0.0f);
      data1[1] = NumTraits<Scalar>::infinity();
      CHECK_CWISE1(numext::rsqrt, internal::prsqrt);
    }
    // TODO(rmlarsen): Re-enable for half and bfloat16.
    if (PacketTraits::HasCos
        && !internal::is_same<Scalar, half>::value