From f1e83073082f2733eec6235f2fdf251217a54ade Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 15 Nov 2019 17:09:46 -0800 Subject: [PATCH] 1. Fix a bug in psqrt and make it return 0 for +inf arguments. 2. Simplify handling of special cases by taking advantage of the fact that the builtin vrsqrt approximation handles negative, zero and +inf arguments correctly. This speeds up the SSE and AVX implementations by ~20%. 3. Make the Newton-Raphson formula used for rsqrt more numerically robust: Before: y = y * (1.5 - x/2 * y^2) After: y = y * (1.5 - y * (x/2) * y) Forming y^2 can overflow for very large or very small (denormalized) values of x, while x*y ~= 1. For AVX512, this makes it possible to compute accurate results for denormal inputs down to ~1e-42 in single precision. 4. Add a faster double precision implementation for Knights Landing using the vrsqrt28 instruction and a single Newton-Raphson iteration. Benchmark results: https://bitbucket.org/snippets/rmlarsen/5LBq9o --- Eigen/src/Core/arch/AVX/MathFunctions.h | 28 ++--- Eigen/src/Core/arch/AVX512/MathFunctions.h | 113 +++++++++++++-------- Eigen/src/Core/arch/SSE/MathFunctions.h | 34 ++++--- test/packetmath.cpp | 6 +- 4 files changed, 109 insertions(+), 72 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h index da1b1e3f8..c5394430f 100644 --- a/Eigen/src/Core/arch/AVX/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX/MathFunctions.h @@ -109,7 +109,6 @@ Packet4d psqrt(const Packet4d& x) { template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f prsqrt(const Packet8f& _x) { _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inf, 0x7f800000); - _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(nan, 0x7fc00000); _EIGEN_DECLARE_CONST_Packet8f(one_point_five, 1.5f); _EIGEN_DECLARE_CONST_Packet8f(minus_half, -0.5f); _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(flt_min, 0x00800000); @@ -118,20 +117,25 @@ Packet8f prsqrt(const Packet8f& _x) { // select only the inverse sqrt of positive normal inputs (denormals are // flushed to zero and cause infs as well). - Packet8f le_zero_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ); - Packet8f x = _mm256_andnot_ps(le_zero_mask, _mm256_rsqrt_ps(_x)); + Packet8f lt_min_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ); + Packet8f inf_mask = _mm256_cmp_ps(_x, p8f_inf, _CMP_EQ_OQ); + Packet8f not_normal_finite_mask = _mm256_or_ps(lt_min_mask, inf_mask); - // Fill in NaNs and Infs for the negative/zero entries. - Packet8f neg_mask = _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_LT_OQ); - Packet8f zero_mask = _mm256_andnot_ps(neg_mask, le_zero_mask); - Packet8f infs_and_nans = _mm256_or_ps(_mm256_and_ps(neg_mask, p8f_nan), - _mm256_and_ps(zero_mask, p8f_inf)); + // Compute an approximate result using the rsqrt intrinsic. + Packet8f y_approx = _mm256_rsqrt_ps(_x); - // Do a single step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p8f_one_point_five)); + // Do a single step of Newton-Raphson iteration to improve the approximation. + // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n). + // It is essential to evaluate the inner term like this because forming + // y_n^2 may over- or underflow. + Packet8f y_newton = pmul(y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p8f_one_point_five)); - // Insert NaNs and Infs in all the right places. - return _mm256_or_ps(x, infs_and_nans); + // Select the result of the Newton-Raphson step for positive normal arguments. + // For other arguments, choose the output of the intrinsic. This will + // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(x) = +inf if + // x is zero or a positive denormalized float (equivalent to flushing positive + // denormalized inputs to zero). + return pselect(not_normal_finite_mask, y_approx, y_newton); } #else diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h index 273c22c18..0346c1d95 100644 --- a/Eigen/src/Core/arch/AVX512/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h @@ -253,6 +253,7 @@ pexp(const Packet8d& _x) { return pmax(pmul(x, e), _x); }*/ + // Functions for sqrt. // The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step // of Newton's method, at a cost of 1-2 bits of precision as opposed to the @@ -308,74 +309,100 @@ EIGEN_STRONG_INLINE Packet8d psqrt(const Packet8d& x) { } #endif -// Functions for rsqrt. -// Almost identical to the sqrt routine, just leave out the last multiplication -// and fill in NaN/Inf where needed. Note that this function only exists as an -// iterative version for doubles since there is no instruction for diretly -// computing the reciprocal square root in AVX-512. -#ifdef EIGEN_FAST_MATH +// prsqrt for float. +#if defined(EIGEN_VECTORIZE_AVX512ER) + +template <> +EIGEN_STRONG_INLINE Packet16f prsqrt(const Packet16f& x) { + return _mm512_rsqrt28_ps(x); +} + +#elif EIGEN_FAST_MATH + template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f prsqrt(const Packet16f& _x) { _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inf, 0x7f800000); - _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000); _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f); _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f); - _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000); Packet16f neg_half = pmul(_x, p16f_minus_half); - // select only the inverse sqrt of positive normal inputs (denormals are - // flushed to zero and cause infs as well). - __mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ); - Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_rsqrt14_ps(_x), _mm512_setzero_ps()); + // Identity infinite, negative and denormal arguments. + __mmask16 inf_mask = _mm512_cmp_ps_mask(_x, p16f_inf, _CMP_EQ_OQ); + __mmask16 not_pos_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LE_OQ); + __mmask16 not_finite_pos_mask = not_pos_mask | inf_mask; + + // Compute an approximate result using the rsqrt intrinsic, forcing +inf + // for denormals for consistency with AVX and SSE implementations. + Packet16f y_approx = _mm512_rsqrt14_ps(_x); - // Fill in NaNs and Infs for the negative/zero entries. - __mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ); - Packet16f infs_and_nans = _mm512_mask_blend_ps( - neg_mask, _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), p16f_inf), p16f_nan); + // Do a single step of Newton-Raphson iteration to improve the approximation. + // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n). + // It is essential to evaluate the inner term like this because forming + // y_n^2 may over- or underflow. + Packet16f y_newton = pmul(y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p16f_one_point_five)); - // Do a single step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five)); + // Select the result of the Newton-Raphson step for positive finite arguments. + // For other arguments, choose the output of the intrinsic. This will + // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(0) = +inf. + return _mm512_mask_blend_ps(not_finite_pos_mask, y_newton, y_approx); + } - // Insert NaNs and Infs in all the right places. - return _mm512_mask_blend_ps(le_zero_mask, x, infs_and_nans); +#else + +template <> +EIGEN_STRONG_INLINE Packet16f prsqrt(const Packet16f& x) { + _EIGEN_DECLARE_CONST_Packet16f(one, 1.0f); + return _mm512_div_ps(p16f_one, _mm512_sqrt_ps(x)); } +#endif + +// prsqrt for double. +#if EIGEN_FAST_MATH template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d prsqrt(const Packet8d& _x) { - _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(inf, 0x7ff0000000000000LL); - _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(nan, 0x7ff1000000000000LL); _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5); _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5); - _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL); + _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(inf, 0x7ff0000000000000LL); Packet8d neg_half = pmul(_x, p8d_minus_half); - // select only the inverse sqrt of positive normal inputs (denormals are - // flushed to zero and cause infs as well). - __mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ); - Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_rsqrt14_pd(_x), _mm512_setzero_pd()); + // Identity infinite, negative and denormal arguments. + __mmask8 inf_mask = _mm512_cmp_pd_mask(_x, p8d_inf, _CMP_EQ_OQ); + __mmask8 not_pos_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LE_OQ); + __mmask8 not_finite_pos_mask = not_pos_mask | inf_mask; - // Fill in NaNs and Infs for the negative/zero entries. - __mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ); - Packet8d infs_and_nans = _mm512_mask_blend_pd( - neg_mask, _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), p8d_inf), p8d_nan); - - // Do a first step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); - - // Do a second step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); - - // Insert NaNs and Infs in all the right places. - return _mm512_mask_blend_pd(le_zero_mask, x, infs_and_nans); + // Compute an approximate result using the rsqrt intrinsic, forcing +inf + // for denormals for consistency with AVX and SSE implementations. +#if defined(EIGEN_VECTORIZE_AVX512ER) + Packet8d y_approx = _mm512_rsqrt28_pd(_x); +#else + Packet8d y_approx = _mm512_rsqrt14_pd(_x); +#endif + // Do one or two steps of Newton-Raphson's to improve the approximation, depending on the + // starting accuracy (either 2^-14 or 2^-28, depending on whether AVX512ER is available). + // The Newton-Raphson algorithm has quadratic convergence and roughly doubles the number + // of correct digits for each step. + // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n). + // It is essential to evaluate the inner term like this because forming + // y_n^2 may over- or underflow. + Packet8d y_newton = pmul(y_approx, pmadd(neg_half, pmul(y_approx, y_approx), p8d_one_point_five)); +#if !defined(EIGEN_VECTORIZE_AVX512ER) + y_newton = pmul(y_newton, pmadd(y_newton, pmul(neg_half, y_newton), p8d_one_point_five)); +#endif + // Select the result of the Newton-Raphson step for positive finite arguments. + // For other arguments, choose the output of the intrinsic. This will + // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(0) = +inf. + return _mm512_mask_blend_pd(not_finite_pos_mask, y_newton, y_approx); } -#elif defined(EIGEN_VECTORIZE_AVX512ER) +#else template <> -EIGEN_STRONG_INLINE Packet16f prsqrt(const Packet16f& x) { - return _mm512_rsqrt28_ps(x); +EIGEN_STRONG_INLINE Packet8d prsqrt(const Packet8d& x) { + _EIGEN_DECLARE_CONST_Packet8d(one, 1.0f); + return _mm512_div_pd(p8d_one, _mm512_sqrt_pd(x)); } #endif diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index b21bb93bf..85255ad23 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -98,30 +98,34 @@ Packet2d psqrt(const Packet2d& x) { return _mm_sqrt_pd(x); } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f prsqrt(const Packet4f& _x) { - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000u); - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(nan, 0x7fc00000u); _EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f); _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f); + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000u); _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000u); Packet4f neg_half = pmul(_x, p4f_minus_half); - // select only the inverse sqrt of positive normal inputs (denormals are - // flushed to zero and cause infs as well). - Packet4f le_zero_mask = _mm_cmple_ps(_x, p4f_flt_min); - Packet4f x = _mm_andnot_ps(le_zero_mask, _mm_rsqrt_ps(_x)); + // Identity infinite, zero, negative and denormal arguments. + Packet4f lt_min_mask = _mm_cmplt_ps(_x, p4f_flt_min); + Packet4f inf_mask = _mm_cmpeq_ps(_x, p4f_inf); + Packet4f not_normal_finite_mask = _mm_or_ps(lt_min_mask, inf_mask); - // Fill in NaNs and Infs for the negative/zero entries. - Packet4f neg_mask = _mm_cmplt_ps(_x, _mm_setzero_ps()); - Packet4f zero_mask = _mm_andnot_ps(neg_mask, le_zero_mask); - Packet4f infs_and_nans = _mm_or_ps(_mm_and_ps(neg_mask, p4f_nan), - _mm_and_ps(zero_mask, p4f_inf)); + // Compute an approximate result using the rsqrt intrinsic. + Packet4f y_approx = _mm_rsqrt_ps(_x); - // Do a single step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p4f_one_point_five)); + // Do a single step of Newton-Raphson iteration to improve the approximation. + // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n). + // It is essential to evaluate the inner term like this because forming + // y_n^2 may over- or underflow. + Packet4f y_newton = pmul( + y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p4f_one_point_five)); - // Insert NaNs and Infs in all the right places. - return _mm_or_ps(x, infs_and_nans); + // Select the result of the Newton-Raphson step for positive normal arguments. + // For other arguments, choose the output of the intrinsic. This will + // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(x) = +inf if + // x is zero or a positive denormalized float (equivalent to flushing positive + // denormalized inputs to zero). + return pselect(not_normal_finite_mask, y_approx, y_newton); } #else diff --git a/test/packetmath.cpp b/test/packetmath.cpp index d652082b0..64dd3dbf6 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -605,9 +605,8 @@ template void packetmath_real() } if(internal::random(0,1)<0.1f) - data1[internal::random(0, PacketSize)] = 0; + data1[internal::random(0, PacketSize)] = 0; CHECK_CWISE1_IF(PacketTraits::HasSqrt, std::sqrt, internal::psqrt); - CHECK_CWISE1_IF(PacketTraits::HasSqrt, Scalar(1)/std::sqrt, internal::prsqrt); CHECK_CWISE1_IF(PacketTraits::HasLog, std::log, internal::plog); CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_i0, internal::pbessel_i0); CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_i0e, internal::pbessel_i0e); @@ -616,6 +615,9 @@ template void packetmath_real() CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_j0, internal::pbessel_j0); CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_j1, internal::pbessel_j1); + data1[0] = std::numeric_limits::infinity(); + CHECK_CWISE1_IF(PacketTraits::HasRsqrt, Scalar(1)/std::sqrt, internal::prsqrt); + // Use a smaller data range for the positive bessel operations as these // can have much more error at very small and very large values. for (int i=0; i