diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h index 55557b7d4..8a6121c59 100644 --- a/Eigen/src/Core/MathFunctionsImpl.h +++ b/Eigen/src/Core/MathFunctionsImpl.h @@ -84,19 +84,19 @@ struct generic_rsqrt_newton_step { const Packet one_point_five = pset1(Scalar(1.5)); const Packet minus_half = pset1(Scalar(-0.5)); const Packet minus_half_a = pmul(minus_half, a); - const Packet neg_mask = pcmp_lt(a, pzero(a)); + const Scalar norm_min = (std::numeric_limits::min)(); + const Packet denorm_mask = pcmp_lt(a, pset1(norm_min)); Packet x = generic_rsqrt_newton_step::run(a, approx_rsqrt); const Packet tmp = pmul(minus_half_a, x); // If tmp is NaN, it means that a is either 0 or Inf. - // In this case return the approximation directly. - const Packet is_not_nan = pcmp_eq(tmp, tmp); - // If a is negative, return NaN. - x = por(x, neg_mask); + // In this case return the approximation directly. Do the same for + // positive subnormals. Otherwise return the Newton iterate. + const Packet return_x_newton = pandnot(pcmp_eq(tmp, tmp), denorm_mask); // Refine the approximation using one Newton-Raphson step: // x_{n+1} = x_n * (1.5 - x_n * ((0.5 * a) * x_n)). const Packet x_newton = pmul(x, pmadd(tmp, x, one_point_five)); - return pselect(is_not_nan, x_newton, x); + return pselect(return_x_newton, x_newton, x); } }; @@ -133,9 +133,11 @@ struct generic_sqrt_newton_step { using Scalar = typename unpacket_traits::type; const Packet one_point_five = pset1(Scalar(1.5)); const Packet negative_mask = pcmp_lt(a, pzero(a)); - const Packet minus_half_a = pmul(a, pset1(Scalar(-0.5))); - // Set negative arguments to NaN. - const Packet a_poisoned = por(a, negative_mask); + const Scalar norm_min = (std::numeric_limits::min)(); + const Packet denorm_mask = pcmp_lt(a, pset1(norm_min)); + // Set negative arguments to NaN and positive subnormals to zero. + const Packet a_poisoned = por(pandnot(a, denorm_mask), negative_mask); + const Packet minus_half_a = pmul(a_poisoned, pset1(Scalar(-0.5))); // Do a single step of Newton's iteration for reciprocal square root: // x_{n+1} = x_n * (1.5 - x_n * ((0.5 * a) * x_n)). @@ -150,12 +152,10 @@ struct generic_sqrt_newton_step { // Return sqrt(x) = x * rsqrt(x) for non-zero finite positive arguments. // Return a itself for 0 or +inf, NaN for negative arguments. - return pselect(return_rsqrt, pmul(a_poisoned, rsqrt), por(a, negative_mask)); + return pselect(return_rsqrt, pmul(a_poisoned, rsqrt), a_poisoned); } }; - - /** \internal \returns the hyperbolic tan of \a a (coeff-wise) Doesn't do anything fancy, just a 13/6-degree rational interpolant which is accurate up to a couple of ulps in the (approximate) range [-8, 8], diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h index cecf1aade..6ea3861ca 100644 --- a/Eigen/src/Core/arch/AVX/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX/MathFunctions.h @@ -89,28 +89,22 @@ pexp(const Packet4d& _x) { return pexp_double(_x); } -#if EIGEN_FAST_MATH - -template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet8f psqrt(const Packet8f& _x) { - return generic_sqrt_newton_step::run(_x, _mm256_rsqrt_ps(_x)); -} - -#else +// Notice that for newer processors, it is counterproductive to use Newton +// iteration for square root. In particular, Skylake and Zen2 processors +// have approximately doubled throughput of the _mm_sqrt_ps instruction +// compared to their predecessors. template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f psqrt(const Packet8f& _x) { return _mm256_sqrt_ps(_x); } - -#endif - template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d psqrt(const Packet4d& _x) { return _mm256_sqrt_pd(_x); } + +// Even on Skylake, using Newton iteration is a win for reciprocal square root. #if EIGEN_FAST_MATH template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f prsqrt(const Packet8f& a) { diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index 8f4a66f5d..15db02bcf 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -75,29 +75,19 @@ Packet4f pcos(const Packet4f& _x) return pcos_float(_x); } -#if EIGEN_FAST_MATH -template<> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f psqrt(const Packet4f& _x) -{ - return generic_sqrt_newton_step::run(_x, _mm_rsqrt_ps(_x)); -} - -#else - +// Notice that for newer processors, it is counterproductive to use Newton +// iteration for square root. In particular, Skylake and Zen2 processors +// have approximately doubled throughput of the _mm_sqrt_ps instruction +// compared to their predecessors. template<>EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f psqrt(const Packet4f& x) { return _mm_sqrt_ps(x); } - -#endif - template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d psqrt(const Packet2d& x) { return _mm_sqrt_pd(x); } - template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16b psqrt(const Packet16b& x) { return x; } #if EIGEN_FAST_MATH - +// Even on Skylake, using Newton iteration is a win for reciprocal square root. template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f prsqrt(const Packet4f& x) { return generic_rsqrt_newton_step::run(x, _mm_rsqrt_ps(x)); @@ -105,7 +95,7 @@ Packet4f prsqrt(const Packet4f& x) { #ifdef EIGEN_VECTORIZE_FMA // Trying to speed up reciprocal using Newton-Raphson is counterproductive -// unless FMA is available. Without FMA pdiv(pset1(Scalar(1),a) is +// unless FMA is available. Without FMA pdiv(pset1(Scalar(1),a)) is // 30% faster. template<> EIGEN_STRONG_INLINE Packet4f preciprocal(const Packet4f& x) { return generic_reciprocal_newton_step::run(x, _mm_rcp_ps(x)); diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 71ccb86f7..12d524385 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -657,6 +657,72 @@ Scalar log2(Scalar x) { return Scalar(EIGEN_LOG2E) * std::log(x); } +// TODO(rmlarsen): Run this test for more functions. +template +void packetmath_test_IEEE_corner_cases(const REF_FUNCTOR_T& ref_fun, + const FUNCTOR_T& fun) { + const int PacketSize = internal::unpacket_traits::size; + const Scalar norm_min = (std::numeric_limits::min)(); + + constexpr int size = PacketSize * 2; + EIGEN_ALIGN_MAX Scalar data1[size]; + EIGEN_ALIGN_MAX Scalar data2[size]; + EIGEN_ALIGN_MAX Scalar ref[size]; + for (int i = 0; i < size; ++i) { + data1[i] = data2[i] = ref[i] = Scalar(0); + } + + // Test for subnormals. + if (std::numeric_limits::has_denorm == std::denorm_present) { + + // When EIGEN_FAST_MATH is 1 we relax the conditions slightly, and allow the function + // to return the same value for subnormals as the reference would return for zero with + // the same sign as the input. + // TODO(rmlarsen): Currently we ignore the error that occurs if the input is equal to + // denorm_min. Specifically, the term 0.5*x in the Newton iteration for reciprocal sqrt + // underflows to zero and the result ends up a factor of 2 too large. +#if EIGEN_FAST_MATH + // TODO(rmlarsen): Remove factor of 2 here if we can fix the underflow in reciprocal sqrt. + data1[0] = Scalar(2) * std::numeric_limits::denorm_min(); + data1[1] = -data1[0]; + test::packet_helper h; + h.store(data2, fun(h.load(data1))); + for (int i=0; i < 2; ++i) { + const Scalar ref_zero = ref_fun(data1[i] < 0 ? -Scalar(0) : Scalar(0)); + const Scalar ref_val = ref_fun(data1[i]); + // TODO(rmlarsen): Remove debug cruft. + // std::cerr << "x = " << data1[i] << "y = " << data2[i] << ", ref_val = " << ref_val << ", ref_zero " << ref_zero << std::endl; + VERIFY(((std::isnan)(data2[i]) && (std::isnan)(ref_val)) || data2[i] == ref_zero || + verifyIsApprox(data2[i], ref_val)); + } +#else + data1[0] = std::numeric_limits::denorm_min(); + data1[1] = -data1[0]; + CHECK_CWISE1_IF(Cond, ref_fun, fun); +#endif + } + + // Test for smallest normalized floats. + data1[0] = norm_min; + data1[1] = -data1[0]; + CHECK_CWISE1_IF(Cond, ref_fun, fun); + + // Test for zeros. + data1[0] = Scalar(0.0); + data1[1] = -data1[0]; + CHECK_CWISE1_IF(Cond, ref_fun, fun); + + // Test for infinities. + data1[0] = NumTraits::infinity(); + data1[1] = -data1[0]; + CHECK_CWISE1_IF(Cond, ref_fun, fun); + + // Test for quiet NaNs. + data1[0] = std::numeric_limits::quiet_NaN(); + data1[1] = -std::numeric_limits::quiet_NaN(); + CHECK_CWISE1_IF(Cond, ref_fun, fun); +} + template void packetmath_real() { typedef internal::packet_traits PacketTraits; @@ -947,33 +1013,8 @@ void packetmath_real() { VERIFY((numext::isnan)(data2[1])); } - if (PacketTraits::HasSqrt) { - data1[0] = Scalar(-1.0f); - if (std::numeric_limits::has_denorm == std::denorm_present) { - data1[1] = -std::numeric_limits::denorm_min(); - } else { - data1[1] = -((std::numeric_limits::min)()); - } - CHECK_CWISE1_IF(PacketTraits::HasSqrt, numext::sqrt, internal::psqrt); - - data1[0] = Scalar(0.0f); - data1[1] = NumTraits::infinity(); - CHECK_CWISE1_IF(PacketTraits::HasSqrt, numext::sqrt, internal::psqrt); - } - - if (PacketTraits::HasRsqrt) { - data1[0] = Scalar(-1.0f); - if (std::numeric_limits::has_denorm == std::denorm_present) { - data1[1] = -std::numeric_limits::denorm_min(); - } else { - data1[1] = -((std::numeric_limits::min)()); - } - CHECK_CWISE1_IF(PacketTraits::HasRsqrt, numext::rsqrt, internal::prsqrt); - - data1[0] = Scalar(0.0f); - data1[1] = NumTraits::infinity(); - CHECK_CWISE1_IF(PacketTraits::HasRsqrt, numext::rsqrt, internal::prsqrt); - } + packetmath_test_IEEE_corner_cases(numext::sqrt, internal::psqrt); + packetmath_test_IEEE_corner_cases(numext::rsqrt, internal::prsqrt); // TODO(rmlarsen): Re-enable for half and bfloat16. if (PacketTraits::HasCos