mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-08-12 19:59:05 +08:00
Changes to fast SQRT/RSQRT
This commit is contained in:
parent
f9b7564faa
commit
8b875dbef1
@ -84,19 +84,19 @@ struct generic_rsqrt_newton_step {
|
|||||||
const Packet one_point_five = pset1<Packet>(Scalar(1.5));
|
const Packet one_point_five = pset1<Packet>(Scalar(1.5));
|
||||||
const Packet minus_half = pset1<Packet>(Scalar(-0.5));
|
const Packet minus_half = pset1<Packet>(Scalar(-0.5));
|
||||||
const Packet minus_half_a = pmul(minus_half, a);
|
const Packet minus_half_a = pmul(minus_half, a);
|
||||||
const Packet neg_mask = pcmp_lt(a, pzero(a));
|
const Scalar norm_min = (std::numeric_limits<Scalar>::min)();
|
||||||
|
const Packet denorm_mask = pcmp_lt(a, pset1<Packet>(norm_min));
|
||||||
Packet x =
|
Packet x =
|
||||||
generic_rsqrt_newton_step<Packet,Steps - 1>::run(a, approx_rsqrt);
|
generic_rsqrt_newton_step<Packet,Steps - 1>::run(a, approx_rsqrt);
|
||||||
const Packet tmp = pmul(minus_half_a, x);
|
const Packet tmp = pmul(minus_half_a, x);
|
||||||
// If tmp is NaN, it means that a is either 0 or Inf.
|
// If tmp is NaN, it means that a is either 0 or Inf.
|
||||||
// In this case return the approximation directly.
|
// In this case return the approximation directly. Do the same for
|
||||||
const Packet is_not_nan = pcmp_eq(tmp, tmp);
|
// positive subnormals. Otherwise return the Newton iterate.
|
||||||
// If a is negative, return NaN.
|
const Packet return_x_newton = pandnot(pcmp_eq(tmp, tmp), denorm_mask);
|
||||||
x = por(x, neg_mask);
|
|
||||||
// Refine the approximation using one Newton-Raphson step:
|
// Refine the approximation using one Newton-Raphson step:
|
||||||
// x_{n+1} = x_n * (1.5 - x_n * ((0.5 * a) * x_n)).
|
// x_{n+1} = x_n * (1.5 - x_n * ((0.5 * a) * x_n)).
|
||||||
const Packet x_newton = pmul(x, pmadd(tmp, x, one_point_five));
|
const Packet x_newton = pmul(x, pmadd(tmp, x, one_point_five));
|
||||||
return pselect(is_not_nan, x_newton, x);
|
return pselect(return_x_newton, x_newton, x);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -133,9 +133,11 @@ struct generic_sqrt_newton_step {
|
|||||||
using Scalar = typename unpacket_traits<Packet>::type;
|
using Scalar = typename unpacket_traits<Packet>::type;
|
||||||
const Packet one_point_five = pset1<Packet>(Scalar(1.5));
|
const Packet one_point_five = pset1<Packet>(Scalar(1.5));
|
||||||
const Packet negative_mask = pcmp_lt(a, pzero(a));
|
const Packet negative_mask = pcmp_lt(a, pzero(a));
|
||||||
const Packet minus_half_a = pmul(a, pset1<Packet>(Scalar(-0.5)));
|
const Scalar norm_min = (std::numeric_limits<Scalar>::min)();
|
||||||
// Set negative arguments to NaN.
|
const Packet denorm_mask = pcmp_lt(a, pset1<Packet>(norm_min));
|
||||||
const Packet a_poisoned = por(a, negative_mask);
|
// Set negative arguments to NaN and positive subnormals to zero.
|
||||||
|
const Packet a_poisoned = por(pandnot(a, denorm_mask), negative_mask);
|
||||||
|
const Packet minus_half_a = pmul(a_poisoned, pset1<Packet>(Scalar(-0.5)));
|
||||||
|
|
||||||
// Do a single step of Newton's iteration for reciprocal square root:
|
// Do a single step of Newton's iteration for reciprocal square root:
|
||||||
// x_{n+1} = x_n * (1.5 - x_n * ((0.5 * a) * x_n)).
|
// x_{n+1} = x_n * (1.5 - x_n * ((0.5 * a) * x_n)).
|
||||||
@ -150,12 +152,10 @@ struct generic_sqrt_newton_step {
|
|||||||
|
|
||||||
// Return sqrt(x) = x * rsqrt(x) for non-zero finite positive arguments.
|
// Return sqrt(x) = x * rsqrt(x) for non-zero finite positive arguments.
|
||||||
// Return a itself for 0 or +inf, NaN for negative arguments.
|
// Return a itself for 0 or +inf, NaN for negative arguments.
|
||||||
return pselect(return_rsqrt, pmul(a_poisoned, rsqrt), por(a, negative_mask));
|
return pselect(return_rsqrt, pmul(a_poisoned, rsqrt), a_poisoned);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/** \internal \returns the hyperbolic tan of \a a (coeff-wise)
|
/** \internal \returns the hyperbolic tan of \a a (coeff-wise)
|
||||||
Doesn't do anything fancy, just a 13/6-degree rational interpolant which
|
Doesn't do anything fancy, just a 13/6-degree rational interpolant which
|
||||||
is accurate up to a couple of ulps in the (approximate) range [-8, 8],
|
is accurate up to a couple of ulps in the (approximate) range [-8, 8],
|
||||||
|
@ -89,28 +89,22 @@ pexp<Packet4d>(const Packet4d& _x) {
|
|||||||
return pexp_double(_x);
|
return pexp_double(_x);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if EIGEN_FAST_MATH
|
|
||||||
|
|
||||||
template <>
|
|
||||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
|
|
||||||
Packet8f psqrt<Packet8f>(const Packet8f& _x) {
|
|
||||||
return generic_sqrt_newton_step<Packet8f>::run(_x, _mm256_rsqrt_ps(_x));
|
|
||||||
}
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
|
// Notice that for newer processors, it is counterproductive to use Newton
|
||||||
|
// iteration for square root. In particular, Skylake and Zen2 processors
|
||||||
|
// have approximately doubled throughput of the _mm_sqrt_ps instruction
|
||||||
|
// compared to their predecessors.
|
||||||
template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
|
template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
|
||||||
Packet8f psqrt<Packet8f>(const Packet8f& _x) {
|
Packet8f psqrt<Packet8f>(const Packet8f& _x) {
|
||||||
return _mm256_sqrt_ps(_x);
|
return _mm256_sqrt_ps(_x);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
|
template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
|
||||||
Packet4d psqrt<Packet4d>(const Packet4d& _x) {
|
Packet4d psqrt<Packet4d>(const Packet4d& _x) {
|
||||||
return _mm256_sqrt_pd(_x);
|
return _mm256_sqrt_pd(_x);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Even on Skylake, using Newton iteration is a win for reciprocal square root.
|
||||||
#if EIGEN_FAST_MATH
|
#if EIGEN_FAST_MATH
|
||||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
|
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
|
||||||
Packet8f prsqrt<Packet8f>(const Packet8f& a) {
|
Packet8f prsqrt<Packet8f>(const Packet8f& a) {
|
||||||
|
@ -75,29 +75,19 @@ Packet4f pcos<Packet4f>(const Packet4f& _x)
|
|||||||
return pcos_float(_x);
|
return pcos_float(_x);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if EIGEN_FAST_MATH
|
// Notice that for newer processors, it is counterproductive to use Newton
|
||||||
template<>
|
// iteration for square root. In particular, Skylake and Zen2 processors
|
||||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
|
// have approximately doubled throughput of the _mm_sqrt_ps instruction
|
||||||
Packet4f psqrt<Packet4f>(const Packet4f& _x)
|
// compared to their predecessors.
|
||||||
{
|
|
||||||
return generic_sqrt_newton_step<Packet4f>::run(_x, _mm_rsqrt_ps(_x));
|
|
||||||
}
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
template<>EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
|
template<>EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
|
||||||
Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); }
|
Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); }
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
|
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
|
||||||
Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }
|
Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }
|
||||||
|
|
||||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
|
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
|
||||||
Packet16b psqrt<Packet16b>(const Packet16b& x) { return x; }
|
Packet16b psqrt<Packet16b>(const Packet16b& x) { return x; }
|
||||||
|
|
||||||
#if EIGEN_FAST_MATH
|
#if EIGEN_FAST_MATH
|
||||||
|
// Even on Skylake, using Newton iteration is a win for reciprocal square root.
|
||||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
|
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
|
||||||
Packet4f prsqrt<Packet4f>(const Packet4f& x) {
|
Packet4f prsqrt<Packet4f>(const Packet4f& x) {
|
||||||
return generic_rsqrt_newton_step<Packet4f, /*Steps=*/1>::run(x, _mm_rsqrt_ps(x));
|
return generic_rsqrt_newton_step<Packet4f, /*Steps=*/1>::run(x, _mm_rsqrt_ps(x));
|
||||||
@ -105,7 +95,7 @@ Packet4f prsqrt<Packet4f>(const Packet4f& x) {
|
|||||||
|
|
||||||
#ifdef EIGEN_VECTORIZE_FMA
|
#ifdef EIGEN_VECTORIZE_FMA
|
||||||
// Trying to speed up reciprocal using Newton-Raphson is counterproductive
|
// Trying to speed up reciprocal using Newton-Raphson is counterproductive
|
||||||
// unless FMA is available. Without FMA pdiv(pset1<Packet>(Scalar(1),a) is
|
// unless FMA is available. Without FMA pdiv(pset1<Packet>(Scalar(1),a)) is
|
||||||
// 30% faster.
|
// 30% faster.
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f preciprocal<Packet4f>(const Packet4f& x) {
|
template<> EIGEN_STRONG_INLINE Packet4f preciprocal<Packet4f>(const Packet4f& x) {
|
||||||
return generic_reciprocal_newton_step<Packet4f, /*Steps=*/1>::run(x, _mm_rcp_ps(x));
|
return generic_reciprocal_newton_step<Packet4f, /*Steps=*/1>::run(x, _mm_rcp_ps(x));
|
||||||
|
@ -657,6 +657,72 @@ Scalar log2(Scalar x) {
|
|||||||
return Scalar(EIGEN_LOG2E) * std::log(x);
|
return Scalar(EIGEN_LOG2E) * std::log(x);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO(rmlarsen): Run this test for more functions.
|
||||||
|
template <bool Cond, typename Scalar, typename Packet, typename REF_FUNCTOR_T, typename FUNCTOR_T>
|
||||||
|
void packetmath_test_IEEE_corner_cases(const REF_FUNCTOR_T& ref_fun,
|
||||||
|
const FUNCTOR_T& fun) {
|
||||||
|
const int PacketSize = internal::unpacket_traits<Packet>::size;
|
||||||
|
const Scalar norm_min = (std::numeric_limits<Scalar>::min)();
|
||||||
|
|
||||||
|
constexpr int size = PacketSize * 2;
|
||||||
|
EIGEN_ALIGN_MAX Scalar data1[size];
|
||||||
|
EIGEN_ALIGN_MAX Scalar data2[size];
|
||||||
|
EIGEN_ALIGN_MAX Scalar ref[size];
|
||||||
|
for (int i = 0; i < size; ++i) {
|
||||||
|
data1[i] = data2[i] = ref[i] = Scalar(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test for subnormals.
|
||||||
|
if (std::numeric_limits<Scalar>::has_denorm == std::denorm_present) {
|
||||||
|
|
||||||
|
// When EIGEN_FAST_MATH is 1 we relax the conditions slightly, and allow the function
|
||||||
|
// to return the same value for subnormals as the reference would return for zero with
|
||||||
|
// the same sign as the input.
|
||||||
|
// TODO(rmlarsen): Currently we ignore the error that occurs if the input is equal to
|
||||||
|
// denorm_min. Specifically, the term 0.5*x in the Newton iteration for reciprocal sqrt
|
||||||
|
// underflows to zero and the result ends up a factor of 2 too large.
|
||||||
|
#if EIGEN_FAST_MATH
|
||||||
|
// TODO(rmlarsen): Remove factor of 2 here if we can fix the underflow in reciprocal sqrt.
|
||||||
|
data1[0] = Scalar(2) * std::numeric_limits<Scalar>::denorm_min();
|
||||||
|
data1[1] = -data1[0];
|
||||||
|
test::packet_helper<Cond, Packet> h;
|
||||||
|
h.store(data2, fun(h.load(data1)));
|
||||||
|
for (int i=0; i < 2; ++i) {
|
||||||
|
const Scalar ref_zero = ref_fun(data1[i] < 0 ? -Scalar(0) : Scalar(0));
|
||||||
|
const Scalar ref_val = ref_fun(data1[i]);
|
||||||
|
// TODO(rmlarsen): Remove debug cruft.
|
||||||
|
// std::cerr << "x = " << data1[i] << "y = " << data2[i] << ", ref_val = " << ref_val << ", ref_zero " << ref_zero << std::endl;
|
||||||
|
VERIFY(((std::isnan)(data2[i]) && (std::isnan)(ref_val)) || data2[i] == ref_zero ||
|
||||||
|
verifyIsApprox(data2[i], ref_val));
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
data1[0] = std::numeric_limits<Scalar>::denorm_min();
|
||||||
|
data1[1] = -data1[0];
|
||||||
|
CHECK_CWISE1_IF(Cond, ref_fun, fun);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test for smallest normalized floats.
|
||||||
|
data1[0] = norm_min;
|
||||||
|
data1[1] = -data1[0];
|
||||||
|
CHECK_CWISE1_IF(Cond, ref_fun, fun);
|
||||||
|
|
||||||
|
// Test for zeros.
|
||||||
|
data1[0] = Scalar(0.0);
|
||||||
|
data1[1] = -data1[0];
|
||||||
|
CHECK_CWISE1_IF(Cond, ref_fun, fun);
|
||||||
|
|
||||||
|
// Test for infinities.
|
||||||
|
data1[0] = NumTraits<Scalar>::infinity();
|
||||||
|
data1[1] = -data1[0];
|
||||||
|
CHECK_CWISE1_IF(Cond, ref_fun, fun);
|
||||||
|
|
||||||
|
// Test for quiet NaNs.
|
||||||
|
data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
|
||||||
|
data1[1] = -std::numeric_limits<Scalar>::quiet_NaN();
|
||||||
|
CHECK_CWISE1_IF(Cond, ref_fun, fun);
|
||||||
|
}
|
||||||
|
|
||||||
template <typename Scalar, typename Packet>
|
template <typename Scalar, typename Packet>
|
||||||
void packetmath_real() {
|
void packetmath_real() {
|
||||||
typedef internal::packet_traits<Scalar> PacketTraits;
|
typedef internal::packet_traits<Scalar> PacketTraits;
|
||||||
@ -947,33 +1013,8 @@ void packetmath_real() {
|
|||||||
VERIFY((numext::isnan)(data2[1]));
|
VERIFY((numext::isnan)(data2[1]));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (PacketTraits::HasSqrt) {
|
packetmath_test_IEEE_corner_cases<PacketTraits::HasSqrt, Scalar, Packet>(numext::sqrt<Scalar>, internal::psqrt<Packet>);
|
||||||
data1[0] = Scalar(-1.0f);
|
packetmath_test_IEEE_corner_cases<PacketTraits::HasRsqrt, Scalar, Packet>(numext::rsqrt<Scalar>, internal::prsqrt<Packet>);
|
||||||
if (std::numeric_limits<Scalar>::has_denorm == std::denorm_present) {
|
|
||||||
data1[1] = -std::numeric_limits<Scalar>::denorm_min();
|
|
||||||
} else {
|
|
||||||
data1[1] = -((std::numeric_limits<Scalar>::min)());
|
|
||||||
}
|
|
||||||
CHECK_CWISE1_IF(PacketTraits::HasSqrt, numext::sqrt, internal::psqrt);
|
|
||||||
|
|
||||||
data1[0] = Scalar(0.0f);
|
|
||||||
data1[1] = NumTraits<Scalar>::infinity();
|
|
||||||
CHECK_CWISE1_IF(PacketTraits::HasSqrt, numext::sqrt, internal::psqrt);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (PacketTraits::HasRsqrt) {
|
|
||||||
data1[0] = Scalar(-1.0f);
|
|
||||||
if (std::numeric_limits<Scalar>::has_denorm == std::denorm_present) {
|
|
||||||
data1[1] = -std::numeric_limits<Scalar>::denorm_min();
|
|
||||||
} else {
|
|
||||||
data1[1] = -((std::numeric_limits<Scalar>::min)());
|
|
||||||
}
|
|
||||||
CHECK_CWISE1_IF(PacketTraits::HasRsqrt, numext::rsqrt, internal::prsqrt);
|
|
||||||
|
|
||||||
data1[0] = Scalar(0.0f);
|
|
||||||
data1[1] = NumTraits<Scalar>::infinity();
|
|
||||||
CHECK_CWISE1_IF(PacketTraits::HasRsqrt, numext::rsqrt, internal::prsqrt);
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO(rmlarsen): Re-enable for half and bfloat16.
|
// TODO(rmlarsen): Re-enable for half and bfloat16.
|
||||||
if (PacketTraits::HasCos
|
if (PacketTraits::HasCos
|
||||||
|
Loading…
x
Reference in New Issue
Block a user