1. Fix a bug in psqrt and make it return 0 for +inf arguments.

2. Simplify handling of special cases by taking advantage of the fact that the
   builtin vrsqrt approximation handles negative, zero and +inf arguments correctly.
   This speeds up the SSE and AVX implementations by ~20%.
3. Make the Newton-Raphson formula used for rsqrt more numerically robust:

Before: y = y * (1.5 - x/2 * y^2)
After: y = y * (1.5 - y * (x/2) * y)

Forming y^2 can overflow for very large or very small (denormalized) values of x, while x*y ~= 1. For AVX512, this makes it possible to compute accurate results for denormal inputs down to ~1e-42 in single precision.

4. Add a faster double precision implementation for Knights Landing using the vrsqrt28 instruction and a single Newton-Raphson iteration.

Benchmark results: https://bitbucket.org/snippets/rmlarsen/5LBq9o
This commit is contained in:
Rasmus Munk Larsen 2019-11-15 17:09:46 -08:00
parent 2cb2915f90
commit f1e8307308
4 changed files with 109 additions and 72 deletions

View File

@ -109,7 +109,6 @@ Packet4d psqrt<Packet4d>(const Packet4d& x) {
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet8f prsqrt<Packet8f>(const Packet8f& _x) {
_EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inf, 0x7f800000);
_EIGEN_DECLARE_CONST_Packet8f_FROM_INT(nan, 0x7fc00000);
_EIGEN_DECLARE_CONST_Packet8f(one_point_five, 1.5f);
_EIGEN_DECLARE_CONST_Packet8f(minus_half, -0.5f);
_EIGEN_DECLARE_CONST_Packet8f_FROM_INT(flt_min, 0x00800000);
@ -118,20 +117,25 @@ Packet8f prsqrt<Packet8f>(const Packet8f& _x) {
// select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well).
Packet8f le_zero_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ);
Packet8f x = _mm256_andnot_ps(le_zero_mask, _mm256_rsqrt_ps(_x));
Packet8f lt_min_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ);
Packet8f inf_mask = _mm256_cmp_ps(_x, p8f_inf, _CMP_EQ_OQ);
Packet8f not_normal_finite_mask = _mm256_or_ps(lt_min_mask, inf_mask);
// Fill in NaNs and Infs for the negative/zero entries.
Packet8f neg_mask = _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_LT_OQ);
Packet8f zero_mask = _mm256_andnot_ps(neg_mask, le_zero_mask);
Packet8f infs_and_nans = _mm256_or_ps(_mm256_and_ps(neg_mask, p8f_nan),
_mm256_and_ps(zero_mask, p8f_inf));
// Compute an approximate result using the rsqrt intrinsic.
Packet8f y_approx = _mm256_rsqrt_ps(_x);
// Do a single step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p8f_one_point_five));
// Do a single step of Newton-Raphson iteration to improve the approximation.
// This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n).
// It is essential to evaluate the inner term like this because forming
// y_n^2 may over- or underflow.
Packet8f y_newton = pmul(y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p8f_one_point_five));
// Insert NaNs and Infs in all the right places.
return _mm256_or_ps(x, infs_and_nans);
// Select the result of the Newton-Raphson step for positive normal arguments.
// For other arguments, choose the output of the intrinsic. This will
// return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(x) = +inf if
// x is zero or a positive denormalized float (equivalent to flushing positive
// denormalized inputs to zero).
return pselect<Packet8f>(not_normal_finite_mask, y_approx, y_newton);
}
#else

View File

@ -253,6 +253,7 @@ pexp<Packet8d>(const Packet8d& _x) {
return pmax(pmul(x, e), _x);
}*/
// Functions for sqrt.
// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
// of Newton's method, at a cost of 1-2 bits of precision as opposed to the
@ -308,74 +309,100 @@ EIGEN_STRONG_INLINE Packet8d psqrt<Packet8d>(const Packet8d& x) {
}
#endif
// Functions for rsqrt.
// Almost identical to the sqrt routine, just leave out the last multiplication
// and fill in NaN/Inf where needed. Note that this function only exists as an
// iterative version for doubles since there is no instruction for diretly
// computing the reciprocal square root in AVX-512.
#ifdef EIGEN_FAST_MATH
// prsqrt for float.
#if defined(EIGEN_VECTORIZE_AVX512ER)
template <>
EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
return _mm512_rsqrt28_ps(x);
}
#elif EIGEN_FAST_MATH
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
prsqrt<Packet16f>(const Packet16f& _x) {
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inf, 0x7f800000);
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);
_EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
_EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000);
Packet16f neg_half = pmul(_x, p16f_minus_half);
// select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well).
__mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ);
Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_rsqrt14_ps(_x), _mm512_setzero_ps());
// Identity infinite, negative and denormal arguments.
__mmask16 inf_mask = _mm512_cmp_ps_mask(_x, p16f_inf, _CMP_EQ_OQ);
__mmask16 not_pos_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LE_OQ);
__mmask16 not_finite_pos_mask = not_pos_mask | inf_mask;
// Compute an approximate result using the rsqrt intrinsic, forcing +inf
// for denormals for consistency with AVX and SSE implementations.
Packet16f y_approx = _mm512_rsqrt14_ps(_x);
// Fill in NaNs and Infs for the negative/zero entries.
__mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ);
Packet16f infs_and_nans = _mm512_mask_blend_ps(
neg_mask, _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), p16f_inf), p16f_nan);
// Do a single step of Newton-Raphson iteration to improve the approximation.
// This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n).
// It is essential to evaluate the inner term like this because forming
// y_n^2 may over- or underflow.
Packet16f y_newton = pmul(y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p16f_one_point_five));
// Do a single step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
// Select the result of the Newton-Raphson step for positive finite arguments.
// For other arguments, choose the output of the intrinsic. This will
// return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(0) = +inf.
return _mm512_mask_blend_ps(not_finite_pos_mask, y_newton, y_approx);
}
// Insert NaNs and Infs in all the right places.
return _mm512_mask_blend_ps(le_zero_mask, x, infs_and_nans);
#else
template <>
EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
_EIGEN_DECLARE_CONST_Packet16f(one, 1.0f);
return _mm512_div_ps(p16f_one, _mm512_sqrt_ps(x));
}
#endif
// prsqrt for double.
#if EIGEN_FAST_MATH
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
prsqrt<Packet8d>(const Packet8d& _x) {
_EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(inf, 0x7ff0000000000000LL);
_EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(nan, 0x7ff1000000000000LL);
_EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
_EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
_EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL);
_EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(inf, 0x7ff0000000000000LL);
Packet8d neg_half = pmul(_x, p8d_minus_half);
// select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well).
__mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ);
Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_rsqrt14_pd(_x), _mm512_setzero_pd());
// Identity infinite, negative and denormal arguments.
__mmask8 inf_mask = _mm512_cmp_pd_mask(_x, p8d_inf, _CMP_EQ_OQ);
__mmask8 not_pos_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LE_OQ);
__mmask8 not_finite_pos_mask = not_pos_mask | inf_mask;
// Fill in NaNs and Infs for the negative/zero entries.
__mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ);
Packet8d infs_and_nans = _mm512_mask_blend_pd(
neg_mask, _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), p8d_inf), p8d_nan);
// Do a first step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
// Do a second step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
// Insert NaNs and Infs in all the right places.
return _mm512_mask_blend_pd(le_zero_mask, x, infs_and_nans);
// Compute an approximate result using the rsqrt intrinsic, forcing +inf
// for denormals for consistency with AVX and SSE implementations.
#if defined(EIGEN_VECTORIZE_AVX512ER)
Packet8d y_approx = _mm512_rsqrt28_pd(_x);
#else
Packet8d y_approx = _mm512_rsqrt14_pd(_x);
#endif
// Do one or two steps of Newton-Raphson's to improve the approximation, depending on the
// starting accuracy (either 2^-14 or 2^-28, depending on whether AVX512ER is available).
// The Newton-Raphson algorithm has quadratic convergence and roughly doubles the number
// of correct digits for each step.
// This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n).
// It is essential to evaluate the inner term like this because forming
// y_n^2 may over- or underflow.
Packet8d y_newton = pmul(y_approx, pmadd(neg_half, pmul(y_approx, y_approx), p8d_one_point_five));
#if !defined(EIGEN_VECTORIZE_AVX512ER)
y_newton = pmul(y_newton, pmadd(y_newton, pmul(neg_half, y_newton), p8d_one_point_five));
#endif
// Select the result of the Newton-Raphson step for positive finite arguments.
// For other arguments, choose the output of the intrinsic. This will
// return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(0) = +inf.
return _mm512_mask_blend_pd(not_finite_pos_mask, y_newton, y_approx);
}
#elif defined(EIGEN_VECTORIZE_AVX512ER)
#else
template <>
EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
return _mm512_rsqrt28_ps(x);
EIGEN_STRONG_INLINE Packet8d prsqrt<Packet8d>(const Packet8d& x) {
_EIGEN_DECLARE_CONST_Packet8d(one, 1.0f);
return _mm512_div_pd(p8d_one, _mm512_sqrt_pd(x));
}
#endif

View File

@ -98,30 +98,34 @@ Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f prsqrt<Packet4f>(const Packet4f& _x) {
_EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000u);
_EIGEN_DECLARE_CONST_Packet4f_FROM_INT(nan, 0x7fc00000u);
_EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f);
_EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f);
_EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000u);
_EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000u);
Packet4f neg_half = pmul(_x, p4f_minus_half);
// select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well).
Packet4f le_zero_mask = _mm_cmple_ps(_x, p4f_flt_min);
Packet4f x = _mm_andnot_ps(le_zero_mask, _mm_rsqrt_ps(_x));
// Identity infinite, zero, negative and denormal arguments.
Packet4f lt_min_mask = _mm_cmplt_ps(_x, p4f_flt_min);
Packet4f inf_mask = _mm_cmpeq_ps(_x, p4f_inf);
Packet4f not_normal_finite_mask = _mm_or_ps(lt_min_mask, inf_mask);
// Fill in NaNs and Infs for the negative/zero entries.
Packet4f neg_mask = _mm_cmplt_ps(_x, _mm_setzero_ps());
Packet4f zero_mask = _mm_andnot_ps(neg_mask, le_zero_mask);
Packet4f infs_and_nans = _mm_or_ps(_mm_and_ps(neg_mask, p4f_nan),
_mm_and_ps(zero_mask, p4f_inf));
// Compute an approximate result using the rsqrt intrinsic.
Packet4f y_approx = _mm_rsqrt_ps(_x);
// Do a single step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p4f_one_point_five));
// Do a single step of Newton-Raphson iteration to improve the approximation.
// This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n).
// It is essential to evaluate the inner term like this because forming
// y_n^2 may over- or underflow.
Packet4f y_newton = pmul(
y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p4f_one_point_five));
// Insert NaNs and Infs in all the right places.
return _mm_or_ps(x, infs_and_nans);
// Select the result of the Newton-Raphson step for positive normal arguments.
// For other arguments, choose the output of the intrinsic. This will
// return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(x) = +inf if
// x is zero or a positive denormalized float (equivalent to flushing positive
// denormalized inputs to zero).
return pselect<Packet4f>(not_normal_finite_mask, y_approx, y_newton);
}
#else

View File

@ -605,9 +605,8 @@ template<typename Scalar,typename Packet> void packetmath_real()
}
if(internal::random<float>(0,1)<0.1f)
data1[internal::random<int>(0, PacketSize)] = 0;
data1[internal::random<int>(0, PacketSize)] = 0;
CHECK_CWISE1_IF(PacketTraits::HasSqrt, std::sqrt, internal::psqrt);
CHECK_CWISE1_IF(PacketTraits::HasSqrt, Scalar(1)/std::sqrt, internal::prsqrt);
CHECK_CWISE1_IF(PacketTraits::HasLog, std::log, internal::plog);
CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_i0, internal::pbessel_i0);
CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_i0e, internal::pbessel_i0e);
@ -616,6 +615,9 @@ template<typename Scalar,typename Packet> void packetmath_real()
CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_j0, internal::pbessel_j0);
CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_j1, internal::pbessel_j1);
data1[0] = std::numeric_limits<Scalar>::infinity();
CHECK_CWISE1_IF(PacketTraits::HasRsqrt, Scalar(1)/std::sqrt, internal::prsqrt);
// Use a smaller data range for the positive bessel operations as these
// can have much more error at very small and very large values.
for (int i=0; i<size; ++i) {