mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-07-30 16:52:01 +08:00
Fix arm32 float division and related bugs
(cherry picked from commit 81b48065ea673cd352d11ef9b6a3d86778ac962d)
This commit is contained in:
parent
526a6328e2
commit
72f77ccb3e
@ -960,57 +960,6 @@ template<> EIGEN_STRONG_INLINE Packet2ul pmul<Packet2ul>(const Packet2ul& a, con
|
|||||||
vdup_n_u64(vgetq_lane_u64(a, 1)*vgetq_lane_u64(b, 1)));
|
vdup_n_u64(vgetq_lane_u64(a, 1)*vgetq_lane_u64(b, 1)));
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet2f pdiv<Packet2f>(const Packet2f& a, const Packet2f& b)
|
|
||||||
{
|
|
||||||
#if EIGEN_ARCH_ARM64
|
|
||||||
return vdiv_f32(a,b);
|
|
||||||
#else
|
|
||||||
Packet2f inv, restep, div;
|
|
||||||
|
|
||||||
// NEON does not offer a divide instruction, we have to do a reciprocal approximation
|
|
||||||
// However NEON in contrast to other SIMD engines (AltiVec/SSE), offers
|
|
||||||
// a reciprocal estimate AND a reciprocal step -which saves a few instructions
|
|
||||||
// vrecpeq_f32() returns an estimate to 1/b, which we will finetune with
|
|
||||||
// Newton-Raphson and vrecpsq_f32()
|
|
||||||
inv = vrecpe_f32(b);
|
|
||||||
|
|
||||||
// This returns a differential, by which we will have to multiply inv to get a better
|
|
||||||
// approximation of 1/b.
|
|
||||||
restep = vrecps_f32(b, inv);
|
|
||||||
inv = vmul_f32(restep, inv);
|
|
||||||
|
|
||||||
// Finally, multiply a by 1/b and get the wanted result of the division.
|
|
||||||
div = vmul_f32(a, inv);
|
|
||||||
|
|
||||||
return div;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
|
|
||||||
{
|
|
||||||
#if EIGEN_ARCH_ARM64
|
|
||||||
return vdivq_f32(a,b);
|
|
||||||
#else
|
|
||||||
Packet4f inv, restep, div;
|
|
||||||
|
|
||||||
// NEON does not offer a divide instruction, we have to do a reciprocal approximation
|
|
||||||
// However NEON in contrast to other SIMD engines (AltiVec/SSE), offers
|
|
||||||
// a reciprocal estimate AND a reciprocal step -which saves a few instructions
|
|
||||||
// vrecpeq_f32() returns an estimate to 1/b, which we will finetune with
|
|
||||||
// Newton-Raphson and vrecpsq_f32()
|
|
||||||
inv = vrecpeq_f32(b);
|
|
||||||
|
|
||||||
// This returns a differential, by which we will have to multiply inv to get a better
|
|
||||||
// approximation of 1/b.
|
|
||||||
restep = vrecpsq_f32(b, inv);
|
|
||||||
inv = vmulq_f32(restep, inv);
|
|
||||||
|
|
||||||
// Finally, multiply a by 1/b and get the wanted result of the division.
|
|
||||||
div = vmulq_f32(a, inv);
|
|
||||||
|
|
||||||
return div;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4c pdiv<Packet4c>(const Packet4c& /*a*/, const Packet4c& /*b*/)
|
template<> EIGEN_STRONG_INLINE Packet4c pdiv<Packet4c>(const Packet4c& /*a*/, const Packet4c& /*b*/)
|
||||||
{
|
{
|
||||||
eigen_assert(false && "packet integer division are not supported by NEON");
|
eigen_assert(false && "packet integer division are not supported by NEON");
|
||||||
@ -3289,40 +3238,115 @@ template<> EIGEN_STRONG_INLINE Packet4ui psqrt(const Packet4ui& a) {
|
|||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
|
EIGEN_STRONG_INLINE Packet4f prsqrt_float_unsafe(const Packet4f& a) {
|
||||||
// Compute approximate reciprocal sqrt.
|
// Compute approximate reciprocal sqrt.
|
||||||
Packet4f x = vrsqrteq_f32(a);
|
// Does not correctly handle +/- 0 or +inf
|
||||||
// Do Newton iterations for 1/sqrt(x).
|
float32x4_t result = vrsqrteq_f32(a);
|
||||||
x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, x), x), x);
|
result = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, result), result), result);
|
||||||
x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, x), x), x);
|
result = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, result), result), result);
|
||||||
const Packet4f infinity = pset1<Packet4f>(NumTraits<float>::infinity());
|
return result;
|
||||||
return pselect(pcmp_eq(a, pzero(a)), infinity, x);
|
}
|
||||||
|
|
||||||
|
EIGEN_STRONG_INLINE Packet2f prsqrt_float_unsafe(const Packet2f& a) {
|
||||||
|
// Compute approximate reciprocal sqrt.
|
||||||
|
// Does not correctly handle +/- 0 or +inf
|
||||||
|
float32x2_t result = vrsqrte_f32(a);
|
||||||
|
result = vmul_f32(vrsqrts_f32(vmul_f32(a, result), result), result);
|
||||||
|
result = vmul_f32(vrsqrts_f32(vmul_f32(a, result), result), result);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename Packet> Packet prsqrt_float_common(const Packet& a) {
|
||||||
|
const Packet cst_zero = pzero(a);
|
||||||
|
const Packet cst_inf = pset1<Packet>(NumTraits<float>::infinity());
|
||||||
|
Packet return_zero = pcmp_eq(a, cst_inf);
|
||||||
|
Packet return_inf = pcmp_eq(a, cst_zero);
|
||||||
|
Packet result = prsqrt_float_unsafe(a);
|
||||||
|
result = pselect(return_inf, por(cst_inf, a), result);
|
||||||
|
result = pandnot(result, return_zero);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
|
||||||
|
return prsqrt_float_common(a);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet2f prsqrt(const Packet2f& a) {
|
template<> EIGEN_STRONG_INLINE Packet2f prsqrt(const Packet2f& a) {
|
||||||
// Compute approximate reciprocal sqrt.
|
return prsqrt_float_common(a);
|
||||||
Packet2f x = vrsqrte_f32(a);
|
}
|
||||||
// Do Newton iterations for 1/sqrt(x).
|
|
||||||
x = vmul_f32(vrsqrts_f32(vmul_f32(a, x), x), x);
|
EIGEN_STRONG_INLINE Packet4f preciprocal(const Packet4f& a)
|
||||||
x = vmul_f32(vrsqrts_f32(vmul_f32(a, x), x), x);
|
{
|
||||||
const Packet2f infinity = pset1<Packet2f>(NumTraits<float>::infinity());
|
// Compute approximate reciprocal.
|
||||||
return pselect(pcmp_eq(a, pzero(a)), infinity, x);
|
float32x4_t result = vrecpeq_f32(a);
|
||||||
|
result = vmulq_f32(vrecpsq_f32(a, result), result);
|
||||||
|
result = vmulq_f32(vrecpsq_f32(a, result), result);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
EIGEN_STRONG_INLINE Packet2f preciprocal(const Packet2f& a)
|
||||||
|
{
|
||||||
|
// Compute approximate reciprocal.
|
||||||
|
float32x2_t result = vrecpe_f32(a);
|
||||||
|
result = vmul_f32(vrecps_f32(a, result), result);
|
||||||
|
result = vmul_f32(vrecps_f32(a, result), result);
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Unfortunately vsqrt_f32 is only available for A64.
|
// Unfortunately vsqrt_f32 is only available for A64.
|
||||||
#if EIGEN_ARCH_ARM64
|
#if EIGEN_ARCH_ARM64
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& _x){return vsqrtq_f32(_x);}
|
template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) { return vsqrtq_f32(a); }
|
||||||
template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& _x){return vsqrt_f32(_x); }
|
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) { return vsqrt_f32(a); }
|
||||||
|
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { return vdivq_f32(a, b); }
|
||||||
|
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2f pdiv(const Packet2f& a, const Packet2f& b) { return vdiv_f32(a, b); }
|
||||||
#else
|
#else
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
|
template<typename Packet>
|
||||||
const Packet4f infinity = pset1<Packet4f>(NumTraits<float>::infinity());
|
EIGEN_STRONG_INLINE Packet psqrt_float_common(const Packet& a) {
|
||||||
const Packet4f is_zero_or_inf = por(pcmp_eq(a, pzero(a)), pcmp_eq(a, infinity));
|
const Packet cst_zero = pzero(a);
|
||||||
return pselect(is_zero_or_inf, a, pmul(a, prsqrt(a)));
|
const Packet cst_inf = pset1<Packet>(NumTraits<float>::infinity());
|
||||||
|
|
||||||
|
Packet result = pmul(a, prsqrt_float_unsafe(a));
|
||||||
|
Packet a_is_zero = pcmp_eq(a, cst_zero);
|
||||||
|
Packet a_is_inf = pcmp_eq(a, cst_inf);
|
||||||
|
Packet return_a = por(a_is_zero, a_is_inf);
|
||||||
|
|
||||||
|
result = pselect(return_a, a, result);
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
|
||||||
|
return psqrt_float_common(a);
|
||||||
|
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) {
|
template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) {
|
||||||
const Packet2f infinity = pset1<Packet2f>(NumTraits<float>::infinity());
|
return psqrt_float_common(a);
|
||||||
const Packet2f is_zero_or_inf = por(pcmp_eq(a, pzero(a)), pcmp_eq(a, infinity));
|
}
|
||||||
return pselect(is_zero_or_inf, a, pmul(a, prsqrt(a)));
|
|
||||||
|
template<typename Packet>
|
||||||
|
EIGEN_STRONG_INLINE Packet pdiv_float_common(const Packet& a, const Packet& b) {
|
||||||
|
// if b is large, NEON intrinsics will flush preciprocal(b) to zero
|
||||||
|
// avoid underflow with the following manipulation:
|
||||||
|
// a / b = f * (a * reciprocal(f * b))
|
||||||
|
|
||||||
|
const Packet cst_one = pset1<Packet>(1.0f);
|
||||||
|
const Packet cst_quarter = pset1<Packet>(0.25f);
|
||||||
|
const Packet cst_thresh = pset1<Packet>(NumTraits<float>::highest() / 4.0f);
|
||||||
|
|
||||||
|
Packet b_will_underflow = pcmp_le(cst_thresh, pabs(b));
|
||||||
|
Packet f = pselect(b_will_underflow, cst_quarter, cst_one);
|
||||||
|
Packet result = pmul(f, pmul(a, preciprocal(pmul(b, f))));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
||||||
|
return pdiv_float_common(a, b);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2f pdiv<Packet2f>(const Packet2f& a, const Packet2f& b) {
|
||||||
|
return pdiv_float_common(a, b);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -22,7 +22,7 @@ void pow_test() {
|
|||||||
const Scalar sqrt2 = Scalar(std::sqrt(2));
|
const Scalar sqrt2 = Scalar(std::sqrt(2));
|
||||||
const Scalar inf = Eigen::NumTraits<Scalar>::infinity();
|
const Scalar inf = Eigen::NumTraits<Scalar>::infinity();
|
||||||
const Scalar nan = Eigen::NumTraits<Scalar>::quiet_NaN();
|
const Scalar nan = Eigen::NumTraits<Scalar>::quiet_NaN();
|
||||||
const Scalar denorm_min = std::numeric_limits<Scalar>::denorm_min();
|
const Scalar denorm_min = EIGEN_ARCH_ARM ? zero : std::numeric_limits<Scalar>::denorm_min();
|
||||||
const Scalar min = (std::numeric_limits<Scalar>::min)();
|
const Scalar min = (std::numeric_limits<Scalar>::min)();
|
||||||
const Scalar max = (std::numeric_limits<Scalar>::max)();
|
const Scalar max = (std::numeric_limits<Scalar>::max)();
|
||||||
const Scalar max_exp = (static_cast<Scalar>(int(Eigen::NumTraits<Scalar>::max_exponent())) * Scalar(EIGEN_LN2)) / eps;
|
const Scalar max_exp = (static_cast<Scalar>(int(Eigen::NumTraits<Scalar>::max_exponent())) * Scalar(EIGEN_LN2)) / eps;
|
||||||
@ -356,7 +356,12 @@ template<typename ArrayType> void array_real(const ArrayType& m)
|
|||||||
m3(rows, cols),
|
m3(rows, cols),
|
||||||
m4 = m1;
|
m4 = m1;
|
||||||
|
|
||||||
m4 = (m4.abs()==Scalar(0)).select(Scalar(1),m4);
|
// avoid denormalized values so verification doesn't fail on platforms that don't support them
|
||||||
|
// denormalized behavior is tested elsewhere (unary_op_test, binary_ops_test)
|
||||||
|
const Scalar min = (std::numeric_limits<Scalar>::min)();
|
||||||
|
m1 = (m1.abs()<min).select(Scalar(0),m1);
|
||||||
|
m2 = (m2.abs()<min).select(Scalar(0),m2);
|
||||||
|
m4 = (m4.abs()<min).select(Scalar(1),m4);
|
||||||
|
|
||||||
Scalar s1 = internal::random<Scalar>();
|
Scalar s1 = internal::random<Scalar>();
|
||||||
|
|
||||||
@ -396,6 +401,7 @@ template<typename ArrayType> void array_real(const ArrayType& m)
|
|||||||
|
|
||||||
// avoid inf and NaNs so verification doesn't fail
|
// avoid inf and NaNs so verification doesn't fail
|
||||||
m3 = m4.abs();
|
m3 = m4.abs();
|
||||||
|
|
||||||
VERIFY_IS_APPROX(m3.sqrt(), sqrt(abs(m3)));
|
VERIFY_IS_APPROX(m3.sqrt(), sqrt(abs(m3)));
|
||||||
VERIFY_IS_APPROX(m3.rsqrt(), Scalar(1)/sqrt(abs(m3)));
|
VERIFY_IS_APPROX(m3.rsqrt(), Scalar(1)/sqrt(abs(m3)));
|
||||||
VERIFY_IS_APPROX(rsqrt(m3), Scalar(1)/sqrt(abs(m3)));
|
VERIFY_IS_APPROX(rsqrt(m3), Scalar(1)/sqrt(abs(m3)));
|
||||||
|
@ -631,6 +631,85 @@ Scalar log2(Scalar x) {
|
|||||||
return Scalar(EIGEN_LOG2E) * std::log(x);
|
return Scalar(EIGEN_LOG2E) * std::log(x);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Create a functor out of a function so it can be passed (with overloads)
|
||||||
|
// to another function as an input argument.
|
||||||
|
#define CREATE_FUNCTOR(Name, Func) \
|
||||||
|
struct Name { \
|
||||||
|
template<typename T> \
|
||||||
|
T operator()(const T& val) const { \
|
||||||
|
return Func(val); \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
|
CREATE_FUNCTOR(psqrt_functor, internal::psqrt);
|
||||||
|
CREATE_FUNCTOR(prsqrt_functor, internal::prsqrt);
|
||||||
|
|
||||||
|
// TODO(rmlarsen): Run this test for more functions.
|
||||||
|
template <bool Cond, typename Scalar, typename Packet, typename RefFunctorT, typename FunctorT>
|
||||||
|
void packetmath_test_IEEE_corner_cases(const RefFunctorT& ref_fun,
|
||||||
|
const FunctorT& fun) {
|
||||||
|
const int PacketSize = internal::unpacket_traits<Packet>::size;
|
||||||
|
const Scalar norm_min = (std::numeric_limits<Scalar>::min)();
|
||||||
|
const Scalar norm_max = (std::numeric_limits<Scalar>::max)();
|
||||||
|
|
||||||
|
constexpr int size = PacketSize * 2;
|
||||||
|
EIGEN_ALIGN_MAX Scalar data1[size];
|
||||||
|
EIGEN_ALIGN_MAX Scalar data2[size];
|
||||||
|
EIGEN_ALIGN_MAX Scalar ref[size];
|
||||||
|
for (int i = 0; i < size; ++i) {
|
||||||
|
data1[i] = data2[i] = ref[i] = Scalar(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test for subnormals.
|
||||||
|
if (Cond && std::numeric_limits<Scalar>::has_denorm == std::denorm_present && !EIGEN_ARCH_ARM) {
|
||||||
|
|
||||||
|
for (int scale = 1; scale < 5; ++scale) {
|
||||||
|
// When EIGEN_FAST_MATH is 1 we relax the conditions slightly, and allow the function
|
||||||
|
// to return the same value for subnormals as the reference would return for zero with
|
||||||
|
// the same sign as the input.
|
||||||
|
#if EIGEN_FAST_MATH
|
||||||
|
data1[0] = Scalar(scale) * std::numeric_limits<Scalar>::denorm_min();
|
||||||
|
data1[1] = -data1[0];
|
||||||
|
test::packet_helper<Cond, Packet> h;
|
||||||
|
h.store(data2, fun(h.load(data1)));
|
||||||
|
for (int i=0; i < PacketSize; ++i) {
|
||||||
|
const Scalar ref_zero = ref_fun(data1[i] < 0 ? -Scalar(0) : Scalar(0));
|
||||||
|
const Scalar ref_val = ref_fun(data1[i]);
|
||||||
|
VERIFY(((std::isnan)(data2[i]) && (std::isnan)(ref_val)) || data2[i] == ref_zero ||
|
||||||
|
verifyIsApprox(data2[i], ref_val));
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
CHECK_CWISE1_IF(Cond, ref_fun, fun);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test for smallest normalized floats.
|
||||||
|
data1[0] = norm_min;
|
||||||
|
data1[1] = -data1[0];
|
||||||
|
CHECK_CWISE1_IF(Cond, ref_fun, fun);
|
||||||
|
|
||||||
|
// Test for largest floats.
|
||||||
|
data1[0] = norm_max;
|
||||||
|
data1[1] = -data1[0];
|
||||||
|
CHECK_CWISE1_IF(Cond, ref_fun, fun);
|
||||||
|
|
||||||
|
// Test for zeros.
|
||||||
|
data1[0] = Scalar(0.0);
|
||||||
|
data1[1] = -data1[0];
|
||||||
|
CHECK_CWISE1_IF(Cond, ref_fun, fun);
|
||||||
|
|
||||||
|
// Test for infinities.
|
||||||
|
data1[0] = NumTraits<Scalar>::infinity();
|
||||||
|
data1[1] = -data1[0];
|
||||||
|
CHECK_CWISE1_IF(Cond, ref_fun, fun);
|
||||||
|
|
||||||
|
// Test for quiet NaNs.
|
||||||
|
data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
|
||||||
|
data1[1] = -std::numeric_limits<Scalar>::quiet_NaN();
|
||||||
|
CHECK_CWISE1_IF(Cond, ref_fun, fun);
|
||||||
|
}
|
||||||
|
|
||||||
template <typename Scalar, typename Packet>
|
template <typename Scalar, typename Packet>
|
||||||
void packetmath_real() {
|
void packetmath_real() {
|
||||||
typedef internal::packet_traits<Scalar> PacketTraits;
|
typedef internal::packet_traits<Scalar> PacketTraits;
|
||||||
@ -735,12 +814,14 @@ void packetmath_real() {
|
|||||||
CHECK_CWISE1_BYREF1_IF(PacketTraits::HasExp, REF_FREXP, internal::pfrexp);
|
CHECK_CWISE1_BYREF1_IF(PacketTraits::HasExp, REF_FREXP, internal::pfrexp);
|
||||||
if (PacketTraits::HasExp) {
|
if (PacketTraits::HasExp) {
|
||||||
// Check denormals:
|
// Check denormals:
|
||||||
|
#if !EIGEN_ARCH_ARM
|
||||||
for (int j=0; j<3; ++j) {
|
for (int j=0; j<3; ++j) {
|
||||||
data1[0] = Scalar(std::ldexp(1, NumTraits<Scalar>::min_exponent()-j));
|
data1[0] = Scalar(std::ldexp(1, NumTraits<Scalar>::min_exponent()-j));
|
||||||
CHECK_CWISE1_BYREF1_IF(PacketTraits::HasExp, REF_FREXP, internal::pfrexp);
|
CHECK_CWISE1_BYREF1_IF(PacketTraits::HasExp, REF_FREXP, internal::pfrexp);
|
||||||
data1[0] = -data1[0];
|
data1[0] = -data1[0];
|
||||||
CHECK_CWISE1_BYREF1_IF(PacketTraits::HasExp, REF_FREXP, internal::pfrexp);
|
CHECK_CWISE1_BYREF1_IF(PacketTraits::HasExp, REF_FREXP, internal::pfrexp);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// zero
|
// zero
|
||||||
data1[0] = Scalar(0);
|
data1[0] = Scalar(0);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user