mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-08-11 11:19:02 +08:00
Modified sqrt/rsqrt for denormal handling.
This commit is contained in:
parent
1c2690ed24
commit
9c07e201ff
@ -83,20 +83,20 @@ struct generic_rsqrt_newton_step {
|
||||
using Scalar = typename unpacket_traits<Packet>::type;
|
||||
const Packet one_point_five = pset1<Packet>(Scalar(1.5));
|
||||
const Packet minus_half = pset1<Packet>(Scalar(-0.5));
|
||||
const Packet minus_half_a = pmul(minus_half, a);
|
||||
const Scalar norm_min = (std::numeric_limits<Scalar>::min)();
|
||||
const Packet denorm_mask = pcmp_lt(a, pset1<Packet>(norm_min));
|
||||
Packet x =
|
||||
generic_rsqrt_newton_step<Packet,Steps - 1>::run(a, approx_rsqrt);
|
||||
const Packet tmp = pmul(minus_half_a, x);
|
||||
// If tmp is NaN, it means that a is either 0 or Inf.
|
||||
// In this case return the approximation directly. Do the same for
|
||||
// positive subnormals. Otherwise return the Newton iterate.
|
||||
const Packet return_x_newton = pandnot(pcmp_eq(tmp, tmp), denorm_mask);
|
||||
|
||||
// Refine the approximation using one Newton-Raphson step:
|
||||
// x_{n+1} = x_n * (1.5 - x_n * ((0.5 * a) * x_n)).
|
||||
const Packet x_newton = pmul(x, pmadd(tmp, x, one_point_five));
|
||||
return pselect(return_x_newton, x_newton, x);
|
||||
// x_{n+1} = x_n * (1.5 + (-0.5 * x_n) * (a * x_n)).
|
||||
// The approximation is expressed this way to avoid over/under-flows.
|
||||
Packet x_newton = pmul(approx_rsqrt, pmadd(pmul(minus_half, approx_rsqrt), pmul(a, approx_rsqrt), one_point_five));
|
||||
for (int step = 1; step < Steps; ++step) {
|
||||
x_newton = pmul(x_newton, pmadd(pmul(minus_half, x_newton), pmul(a, x_newton), one_point_five));
|
||||
}
|
||||
|
||||
// If approx_rsqrt is 0 or +/-inf, we should return it as is. Note:
|
||||
// on intel, approx_rsqrt can be inf for small denormal values.
|
||||
const Packet return_approx = por(pcmp_eq(approx_rsqrt, pzero(a)),
|
||||
pcmp_eq(pabs(approx_rsqrt), pset1<Packet>(NumTraits<Scalar>::infinity())));
|
||||
return pselect(return_approx, approx_rsqrt, x_newton);
|
||||
}
|
||||
};
|
||||
|
||||
@ -132,27 +132,21 @@ struct generic_sqrt_newton_step {
|
||||
run(const Packet& a, const Packet& approx_rsqrt) {
|
||||
using Scalar = typename unpacket_traits<Packet>::type;
|
||||
const Packet one_point_five = pset1<Packet>(Scalar(1.5));
|
||||
const Packet negative_mask = pcmp_lt(a, pzero(a));
|
||||
const Scalar norm_min = (std::numeric_limits<Scalar>::min)();
|
||||
const Packet denorm_mask = pcmp_lt(a, pset1<Packet>(norm_min));
|
||||
// Set negative arguments to NaN and positive subnormals to zero.
|
||||
const Packet a_poisoned = por(pandnot(a, denorm_mask), negative_mask);
|
||||
const Packet minus_half_a = pmul(a_poisoned, pset1<Packet>(Scalar(-0.5)));
|
||||
|
||||
const Packet minus_half = pset1<Packet>(Scalar(-0.5));
|
||||
// If a is inf or zero, return a directly.
|
||||
const Packet inf_mask = pcmp_eq(a, pset1<Packet>(NumTraits<Scalar>::infinity()));
|
||||
const Packet return_a = por(pcmp_eq(a, pzero(a)), inf_mask);
|
||||
// Do a single step of Newton's iteration for reciprocal square root:
|
||||
// x_{n+1} = x_n * (1.5 - x_n * ((0.5 * a) * x_n)).
|
||||
const Packet tmp = pmul(approx_rsqrt, minus_half_a);
|
||||
// If tmp is NaN, it means that the argument was either 0 or +inf,
|
||||
// and we should return the argument itself as the result.
|
||||
const Packet return_rsqrt = pcmp_eq(tmp, tmp);
|
||||
Packet rsqrt = pmul(approx_rsqrt, pmadd(tmp, approx_rsqrt, one_point_five));
|
||||
// x_{n+1} = x_n * (1.5 + (-0.5 * x_n) * (a * x_n))).
|
||||
// The Newton's step is computed this way to avoid over/under-flows.
|
||||
Packet rsqrt = pmul(approx_rsqrt, pmadd(pmul(minus_half, approx_rsqrt), pmul(a, approx_rsqrt), one_point_five));
|
||||
for (int step = 1; step < Steps; ++step) {
|
||||
rsqrt = pmul(rsqrt, pmadd(pmul(rsqrt, minus_half_a), rsqrt, one_point_five));
|
||||
rsqrt = pmul(rsqrt, pmadd(pmul(minus_half, rsqrt), pmul(a, rsqrt), one_point_five));
|
||||
}
|
||||
|
||||
// Return sqrt(x) = x * rsqrt(x) for non-zero finite positive arguments.
|
||||
// Return a itself for 0 or +inf, NaN for negative arguments.
|
||||
return pselect(return_rsqrt, pmul(a_poisoned, rsqrt), a_poisoned);
|
||||
return pselect(return_a, a, pmul(a, rsqrt));
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -108,6 +108,12 @@ Packet4d psqrt<Packet4d>(const Packet4d& _x) {
|
||||
#if EIGEN_FAST_MATH
|
||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
|
||||
Packet8f prsqrt<Packet8f>(const Packet8f& a) {
|
||||
// _mm256_rsqrt_ps returns -inf for negative denormals.
|
||||
// _mm512_rsqrt**_ps returns -NaN for negative denormals. We may want
|
||||
// consistency here.
|
||||
// const Packet8f rsqrt = pselect(pcmp_lt(a, pzero(a)),
|
||||
// pset1<Packet8f>(-NumTraits<float>::quiet_NaN()),
|
||||
// _mm256_rsqrt_ps(a));
|
||||
return generic_rsqrt_newton_step<Packet8f, /*Steps=*/1>::run(a, _mm256_rsqrt_ps(a));
|
||||
}
|
||||
|
||||
|
@ -207,44 +207,16 @@ BF16_PACKET_FUNCTION(Packet16f, Packet16bf, prsqrt)
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
|
||||
prsqrt<Packet8d>(const Packet8d& _x) {
|
||||
EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
|
||||
EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
|
||||
EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(inf, 0x7ff0000000000000LL);
|
||||
|
||||
Packet8d neg_half = pmul(_x, p8d_minus_half);
|
||||
|
||||
// Identity infinite, negative and denormal arguments.
|
||||
__mmask8 inf_mask = _mm512_cmp_pd_mask(_x, p8d_inf, _CMP_EQ_OQ);
|
||||
__mmask8 not_pos_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LE_OQ);
|
||||
__mmask8 not_finite_pos_mask = not_pos_mask | inf_mask;
|
||||
|
||||
// Compute an approximate result using the rsqrt intrinsic, forcing +inf
|
||||
// for denormals for consistency with AVX and SSE implementations.
|
||||
#if defined(EIGEN_VECTORIZE_AVX512ER)
|
||||
Packet8d y_approx = _mm512_rsqrt28_pd(_x);
|
||||
#else
|
||||
Packet8d y_approx = _mm512_rsqrt14_pd(_x);
|
||||
#endif
|
||||
// Do one or two steps of Newton-Raphson's to improve the approximation, depending on the
|
||||
// starting accuracy (either 2^-14 or 2^-28, depending on whether AVX512ER is available).
|
||||
// The Newton-Raphson algorithm has quadratic convergence and roughly doubles the number
|
||||
// of correct digits for each step.
|
||||
// This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n).
|
||||
// It is essential to evaluate the inner term like this because forming
|
||||
// y_n^2 may over- or underflow.
|
||||
Packet8d y_newton = pmul(y_approx, pmadd(neg_half, pmul(y_approx, y_approx), p8d_one_point_five));
|
||||
#if !defined(EIGEN_VECTORIZE_AVX512ER)
|
||||
y_newton = pmul(y_newton, pmadd(y_newton, pmul(neg_half, y_newton), p8d_one_point_five));
|
||||
#endif
|
||||
// Select the result of the Newton-Raphson step for positive finite arguments.
|
||||
// For other arguments, choose the output of the intrinsic. This will
|
||||
// return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(0) = +inf.
|
||||
return _mm512_mask_blend_pd(not_finite_pos_mask, y_newton, y_approx);
|
||||
#ifdef EIGEN_VECTORIZE_AVX512ER
|
||||
return generic_rsqrt_newton_step<Packet8d, /*Steps=*/1>::run(_x, _mm512_rsqrt28_pd(_x));
|
||||
#else
|
||||
return generic_rsqrt_newton_step<Packet8d, /*Steps=*/2>::run(_x, _mm512_rsqrt14_pd(_x));
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet16f preciprocal<Packet16f>(const Packet16f& a) {
|
||||
#ifdef EIGEN_VECTORIZE_AVX512ER
|
||||
return _mm512_rcp28_ps(a));
|
||||
return _mm512_rcp28_ps(a);
|
||||
#else
|
||||
return generic_reciprocal_newton_step<Packet16f, /*Steps=*/1>::run(a, _mm512_rcp14_ps(a));
|
||||
#endif
|
||||
|
@ -682,6 +682,7 @@ void packetmath_test_IEEE_corner_cases(const RefFunctorT& ref_fun,
|
||||
const FunctorT& fun) {
|
||||
const int PacketSize = internal::unpacket_traits<Packet>::size;
|
||||
const Scalar norm_min = (std::numeric_limits<Scalar>::min)();
|
||||
const Scalar norm_max = (std::numeric_limits<Scalar>::max)();
|
||||
|
||||
constexpr int size = PacketSize * 2;
|
||||
EIGEN_ALIGN_MAX Scalar data1[size];
|
||||
@ -694,37 +695,36 @@ void packetmath_test_IEEE_corner_cases(const RefFunctorT& ref_fun,
|
||||
// Test for subnormals.
|
||||
if (std::numeric_limits<Scalar>::has_denorm == std::denorm_present) {
|
||||
|
||||
// When EIGEN_FAST_MATH is 1 we relax the conditions slightly, and allow the function
|
||||
// to return the same value for subnormals as the reference would return for zero with
|
||||
// the same sign as the input.
|
||||
// TODO(rmlarsen): Currently we ignore the error that occurs if the input is equal to
|
||||
// denorm_min. Specifically, the term 0.5*x in the Newton iteration for reciprocal sqrt
|
||||
// underflows to zero and the result ends up a factor of 2 too large.
|
||||
#if EIGEN_FAST_MATH
|
||||
// TODO(rmlarsen): Remove factor of 2 here if we can fix the underflow in reciprocal sqrt.
|
||||
data1[0] = Scalar(2) * std::numeric_limits<Scalar>::denorm_min();
|
||||
data1[1] = -data1[0];
|
||||
test::packet_helper<Cond, Packet> h;
|
||||
h.store(data2, fun(h.load(data1)));
|
||||
for (int i=0; i < 2; ++i) {
|
||||
const Scalar ref_zero = ref_fun(data1[i] < 0 ? -Scalar(0) : Scalar(0));
|
||||
const Scalar ref_val = ref_fun(data1[i]);
|
||||
// TODO(rmlarsen): Remove debug cruft.
|
||||
// std::cerr << "x = " << data1[i] << "y = " << data2[i] << ", ref_val = " << ref_val << ", ref_zero " << ref_zero << std::endl;
|
||||
VERIFY(((std::isnan)(data2[i]) && (std::isnan)(ref_val)) || data2[i] == ref_zero ||
|
||||
verifyIsApprox(data2[i], ref_val));
|
||||
for (int scale = 1; scale < 5; ++scale) {
|
||||
// When EIGEN_FAST_MATH is 1 we relax the conditions slightly, and allow the function
|
||||
// to return the same value for subnormals as the reference would return for zero with
|
||||
// the same sign as the input.
|
||||
#if EIGEN_FAST_MATH
|
||||
data1[0] = Scalar(scale) * std::numeric_limits<Scalar>::denorm_min();
|
||||
data1[1] = -data1[0];
|
||||
test::packet_helper<Cond, Packet> h;
|
||||
h.store(data2, fun(h.load(data1)));
|
||||
for (int i=0; i < PacketSize; ++i) {
|
||||
const Scalar ref_zero = ref_fun(data1[i] < 0 ? -Scalar(0) : Scalar(0));
|
||||
const Scalar ref_val = ref_fun(data1[i]);
|
||||
VERIFY(((std::isnan)(data2[i]) && (std::isnan)(ref_val)) || data2[i] == ref_zero ||
|
||||
verifyIsApprox(data2[i], ref_val));
|
||||
}
|
||||
#else
|
||||
CHECK_CWISE1_IF(Cond, ref_fun, fun);
|
||||
#endif
|
||||
}
|
||||
#else
|
||||
data1[0] = std::numeric_limits<Scalar>::denorm_min();
|
||||
data1[1] = -data1[0];
|
||||
CHECK_CWISE1_IF(Cond, ref_fun, fun);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Test for smallest normalized floats.
|
||||
data1[0] = norm_min;
|
||||
data1[1] = -data1[0];
|
||||
CHECK_CWISE1_IF(Cond, ref_fun, fun);
|
||||
|
||||
// Test for largest floats.
|
||||
data1[0] = norm_max;
|
||||
data1[1] = -data1[0];
|
||||
CHECK_CWISE1_IF(Cond, ref_fun, fun);
|
||||
|
||||
// Test for zeros.
|
||||
data1[0] = Scalar(0.0);
|
||||
|
Loading…
x
Reference in New Issue
Block a user