mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-08-12 03:39:01 +08:00
Fix rint SSE/NEON again, using optimization barrier.
This is a new version of !423, which failed for MSVC. Defined `EIGEN_OPTIMIZATION_BARRIER(X)` that uses inline assembly to prevent operations involving `X` from crossing that barrier. Should work on most `GNUC` compatible compilers (MSVC doesn't seem to need this). This is a modified version adapted from what was used in `psincos_float` and tested on more platforms (see #1674, https://godbolt.org/z/73ezTG). Modified `rint` to use the barrier to prevent the add/subtract rounding trick from being optimized away. Also fixed an edge case for large inputs that get bumped up a power of two and ends up rounding away more than just the fractional part. If we are over `2^digits` then just return the input. This edge case was missed in the test since the test was comparing approximate equality, which was still satisfied. Adding a strict equality option catches it.
This commit is contained in:
parent
5f0b4a4010
commit
82d61af3a4
@ -630,14 +630,6 @@ __attribute__((optimize("-fno-unsafe-math-optimizations")))
|
||||
#endif
|
||||
Packet psincos_float(const Packet& _x)
|
||||
{
|
||||
// Workaround -ffast-math aggressive optimizations
|
||||
// See bug 1674
|
||||
#if EIGEN_COMP_CLANG && defined(EIGEN_VECTORIZE_SSE)
|
||||
#define EIGEN_SINCOS_DONT_OPT(X) __asm__ ("" : "+x" (X));
|
||||
#else
|
||||
#define EIGEN_SINCOS_DONT_OPT(X)
|
||||
#endif
|
||||
|
||||
typedef typename unpacket_traits<Packet>::integer_packet PacketI;
|
||||
|
||||
const Packet cst_2oPI = pset1<Packet>(0.636619746685028076171875f); // 2/PI
|
||||
@ -652,7 +644,7 @@ Packet psincos_float(const Packet& _x)
|
||||
|
||||
// Rounding trick:
|
||||
Packet y_round = padd(y, cst_rounding_magic);
|
||||
EIGEN_SINCOS_DONT_OPT(y_round)
|
||||
EIGEN_OPTIMIZATION_BARRIER(y_round)
|
||||
PacketI y_int = preinterpret<PacketI>(y_round); // last 23 digits represent integer (if abs(x)<2^24)
|
||||
y = psub(y_round, cst_rounding_magic); // nearest integer to x*4/pi
|
||||
|
||||
@ -674,9 +666,9 @@ Packet psincos_float(const Packet& _x)
|
||||
// and 2 ULP up to:
|
||||
const float huge_th = ComputeSine ? 25966.f : 18838.f;
|
||||
x = pmadd(y, pset1<Packet>(-1.5703125), x); // = 0xbfc90000
|
||||
EIGEN_SINCOS_DONT_OPT(x)
|
||||
EIGEN_OPTIMIZATION_BARRIER(x)
|
||||
x = pmadd(y, pset1<Packet>(-0.000483989715576171875), x); // = 0xb9fdc000
|
||||
EIGEN_SINCOS_DONT_OPT(x)
|
||||
EIGEN_OPTIMIZATION_BARRIER(x)
|
||||
x = pmadd(y, pset1<Packet>(1.62865035235881805419921875e-07), x); // = 0x342ee000
|
||||
x = pmadd(y, pset1<Packet>(5.5644315544167710640977020375430583953857421875e-11), x); // = 0x2e74b9ee
|
||||
|
||||
@ -753,8 +745,6 @@ Packet psincos_float(const Packet& _x)
|
||||
|
||||
// Update the sign and filter huge inputs
|
||||
return pxor(y, sign_bit);
|
||||
|
||||
#undef EIGEN_SINCOS_DONT_OPT
|
||||
}
|
||||
|
||||
template<typename Packet>
|
||||
|
@ -3207,20 +3207,30 @@ template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) {
|
||||
// Adds and subtracts signum(a) * 2^23 to force rounding.
|
||||
const Packet4f offset =
|
||||
pselect(pcmp_lt(a, pzero(a)),
|
||||
pset1<Packet4f>(-static_cast<float>(1<<23)),
|
||||
pset1<Packet4f>(+static_cast<float>(1<<23)));
|
||||
return psub(padd(a, offset), offset);
|
||||
const Packet4f limit = pset1<Packet4f>(static_cast<float>(1<<23));
|
||||
const Packet4f abs_a = pabs(a);
|
||||
Packet4f r = padd(abs_a, limit);
|
||||
// Don't compile-away addition and subtraction.
|
||||
EIGEN_OPTIMIZATION_BARRIER(r);
|
||||
r = psub(r, limit);
|
||||
// If greater than limit, simply return a. Otherwise, account for sign.
|
||||
r = pselect(pcmp_lt(abs_a, limit),
|
||||
pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
|
||||
return r;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2f print(const Packet2f& a) {
|
||||
// Adds and subtracts signum(a) * 2^23 to force rounding.
|
||||
const Packet2f offset =
|
||||
pselect(pcmp_lt(a, pzero(a)),
|
||||
pset1<Packet2f>(-static_cast<float>(1<<23)),
|
||||
pset1<Packet2f>(+static_cast<float>(1<<23)));
|
||||
return psub(padd(a, offset), offset);
|
||||
const Packet2f limit = pset1<Packet2f>(static_cast<float>(1<<23));
|
||||
const Packet2f abs_a = pabs(a);
|
||||
Packet2f r = padd(abs_a, limit);
|
||||
// Don't compile-away addition and subtraction.
|
||||
EIGEN_OPTIMIZATION_BARRIER(r);
|
||||
r = psub(r, limit);
|
||||
// If greater than limit, simply return a. Otherwise, account for sign.
|
||||
r = pselect(pcmp_lt(abs_a, limit),
|
||||
pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
|
||||
return r;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
|
||||
|
@ -646,20 +646,30 @@ template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { re
|
||||
#else
|
||||
template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) {
|
||||
// Adds and subtracts signum(a) * 2^23 to force rounding.
|
||||
const Packet4f offset =
|
||||
pselect(pcmp_lt(a, pzero(a)),
|
||||
pset1<Packet4f>(-static_cast<float>(1<<23)),
|
||||
pset1<Packet4f>(+static_cast<float>(1<<23)));
|
||||
return psub(padd(a, offset), offset);
|
||||
const Packet4f limit = pset1<Packet4f>(static_cast<float>(1<<23));
|
||||
const Packet4f abs_a = pabs(a);
|
||||
Packet4f r = padd(abs_a, limit);
|
||||
// Don't compile-away addition and subtraction.
|
||||
EIGEN_OPTIMIZATION_BARRIER(r);
|
||||
r = psub(r, limit);
|
||||
// If greater than limit, simply return a. Otherwise, account for sign.
|
||||
r = pselect(pcmp_lt(abs_a, limit),
|
||||
pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
|
||||
return r;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) {
|
||||
// Adds and subtracts signum(a) * 2^52 to force rounding.
|
||||
const Packet2d offset =
|
||||
pselect(pcmp_lt(a, pzero(a)),
|
||||
pset1<Packet2d>(-static_cast<double>(1ull<<52)),
|
||||
pset1<Packet2d>(+static_cast<double>(1ull<<52)));
|
||||
return psub(padd(a, offset), offset);
|
||||
const Packet2d limit = pset1<Packet2d>(static_cast<double>(1ull<<52));
|
||||
const Packet2d abs_a = pabs(a);
|
||||
Packet2d r = padd(abs_a, limit);
|
||||
// Don't compile-away addition and subtraction.
|
||||
EIGEN_OPTIMIZATION_BARRIER(r);
|
||||
r = psub(r, limit);
|
||||
// If greater than limit, simply return a. Otherwise, account for sign.
|
||||
r = pselect(pcmp_lt(abs_a, limit),
|
||||
pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
|
||||
return r;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
|
||||
|
@ -51,7 +51,11 @@
|
||||
|
||||
#ifndef EIGEN_STACK_ALLOCATION_LIMIT
|
||||
// 131072 == 128 KB
|
||||
#define EIGEN_STACK_ALLOCATION_LIMIT 131072
|
||||
#if defined(__AVX512F__)
|
||||
#define EIGEN_STACK_ALLOCATION_LIMIT 0
|
||||
#else
|
||||
#define EIGEN_STACK_ALLOCATION_LIMIT 16384
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//------------------------------------------------------------------------------------------
|
||||
@ -1063,6 +1067,64 @@ namespace Eigen {
|
||||
#endif
|
||||
|
||||
|
||||
// Acts as a barrier preventing operations involving `X` from crossing. This
|
||||
// occurs, for example, in the fast rounding trick where a magic constant is
|
||||
// added then subtracted, which is otherwise compiled away with -ffast-math.
|
||||
//
|
||||
// See bug 1674
|
||||
#if !defined(EIGEN_OPTIMIZATION_BARRIER)
|
||||
#if EIGEN_COMP_GNUC
|
||||
// According to https://gcc.gnu.org/onlinedocs/gcc/Constraints.html:
|
||||
// X: Any operand whatsoever.
|
||||
// r: A register operand is allowed provided that it is in a general
|
||||
// register.
|
||||
// g: Any register, memory or immediate integer operand is allowed, except
|
||||
// for registers that are not general registers.
|
||||
// w: (AArch32/AArch64) Floating point register, Advanced SIMD vector
|
||||
// register or SVE vector register.
|
||||
// x: (SSE) Any SSE register.
|
||||
// (AArch64) Like w, but restricted to registers 0 to 15 inclusive.
|
||||
// v: (PowerPC) An Altivec vector register.
|
||||
// wa:(PowerPC) A VSX register.
|
||||
//
|
||||
// "X" (uppercase) should work for all cases, though this seems to fail for
|
||||
// some versions of GCC for arm/aarch64 with
|
||||
// "error: inconsistent operand constraints in an 'asm'"
|
||||
// Clang x86_64/arm/aarch64 seems to require "g" to support both scalars and
|
||||
// vectors, otherwise
|
||||
// "error: non-trivial scalar-to-vector conversion, possible invalid
|
||||
// constraint for vector type"
|
||||
//
|
||||
// GCC for ppc64le generates an internal compiler error with x/X/g.
|
||||
// GCC for AVX generates an internal compiler error with X.
|
||||
//
|
||||
// Tested on icc/gcc/clang for sse, avx, avx2, avx512dq
|
||||
// gcc for arm, aarch64,
|
||||
// gcc for ppc64le,
|
||||
// both vectors and scalars.
|
||||
//
|
||||
// Note that this is restricted to plain types - this will not work
|
||||
// directly for std::complex<T>, Eigen::half, Eigen::bfloat16. For these,
|
||||
// you will need to apply to the underlying POD type.
|
||||
#if EIGEN_ARCH_PPC
|
||||
// General, Altivec, VSX.
|
||||
#define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+r,v,wa" (X));
|
||||
#elif EIGEN_ARCH_ARM_OR_ARM64
|
||||
// General, NEON.
|
||||
#define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+g,w" (X));
|
||||
#elif EIGEN_ARCH_i386_OR_x86_64
|
||||
// General, SSE.
|
||||
#define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+g,x" (X));
|
||||
#else
|
||||
// Not implemented for other architectures.
|
||||
#define EIGEN_OPTIMIZATION_BARRIER(X)
|
||||
#endif
|
||||
#else
|
||||
// Not implemented for other compilers.
|
||||
#define EIGEN_OPTIMIZATION_BARRIER(X)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if EIGEN_COMP_MSVC
|
||||
// NOTE MSVC often gives C4127 warnings with compiletime if statements. See bug 1362.
|
||||
// This workaround is ugly, but it does the job.
|
||||
|
@ -299,6 +299,29 @@ void packetmath_minus_zero_add() {
|
||||
CHECK_CWISE2_IF(internal::packet_traits<Scalar>::HasAdd, REF_ADD, internal::padd);
|
||||
}
|
||||
|
||||
// Ensure optimization barrier compiles and doesn't modify contents.
|
||||
// Only applies to raw types, so will not work for std::complex, Eigen::half
|
||||
// or Eigen::bfloat16. For those you would need to refer to an underlying
|
||||
// storage element.
|
||||
template<typename Packet, typename EnableIf = void>
|
||||
struct eigen_optimization_barrier_test {
|
||||
static void run() {}
|
||||
};
|
||||
|
||||
template<typename Packet>
|
||||
struct eigen_optimization_barrier_test<Packet, typename internal::enable_if<
|
||||
!NumTraits<Packet>::IsComplex &&
|
||||
!internal::is_same<Packet, Eigen::half>::value &&
|
||||
!internal::is_same<Packet, Eigen::bfloat16>::value
|
||||
>::type> {
|
||||
static void run() {
|
||||
typedef typename internal::unpacket_traits<Packet>::type Scalar;
|
||||
Scalar s = internal::random<Scalar>();
|
||||
Packet barrier = internal::pset1<Packet>(s);
|
||||
EIGEN_OPTIMIZATION_BARRIER(barrier);
|
||||
eigen_assert(s == internal::pfirst(barrier) && "EIGEN_OPTIMIZATION_BARRIER");
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Scalar, typename Packet>
|
||||
void packetmath() {
|
||||
@ -317,6 +340,10 @@ void packetmath() {
|
||||
EIGEN_ALIGN_MAX Scalar data3[size];
|
||||
EIGEN_ALIGN_MAX Scalar ref[size];
|
||||
RealScalar refvalue = RealScalar(0);
|
||||
|
||||
eigen_optimization_barrier_test<Packet>::run();
|
||||
eigen_optimization_barrier_test<Scalar>::run();
|
||||
|
||||
for (int i = 0; i < size; ++i) {
|
||||
data1[i] = internal::random<Scalar>() / RealScalar(PacketSize);
|
||||
data2[i] = internal::random<Scalar>() / RealScalar(PacketSize);
|
||||
@ -543,10 +570,10 @@ void packetmath_real() {
|
||||
CHECK_CWISE1_IF(PacketTraits::HasCos, std::cos, internal::pcos);
|
||||
CHECK_CWISE1_IF(PacketTraits::HasTan, std::tan, internal::ptan);
|
||||
|
||||
CHECK_CWISE1_IF(PacketTraits::HasRound, numext::round, internal::pround);
|
||||
CHECK_CWISE1_IF(PacketTraits::HasCeil, numext::ceil, internal::pceil);
|
||||
CHECK_CWISE1_IF(PacketTraits::HasFloor, numext::floor, internal::pfloor);
|
||||
CHECK_CWISE1_IF(PacketTraits::HasRint, numext::rint, internal::print);
|
||||
CHECK_CWISE1_EXACT_IF(PacketTraits::HasRound, numext::round, internal::pround);
|
||||
CHECK_CWISE1_EXACT_IF(PacketTraits::HasCeil, numext::ceil, internal::pceil);
|
||||
CHECK_CWISE1_EXACT_IF(PacketTraits::HasFloor, numext::floor, internal::pfloor);
|
||||
CHECK_CWISE1_EXACT_IF(PacketTraits::HasRint, numext::rint, internal::print);
|
||||
|
||||
// Rounding edge cases.
|
||||
if (PacketTraits::HasRound || PacketTraits::HasCeil || PacketTraits::HasFloor || PacketTraits::HasRint) {
|
||||
@ -583,10 +610,10 @@ void packetmath_real() {
|
||||
|
||||
for (size_t k=0; k<values.size(); ++k) {
|
||||
data1[0] = values[k];
|
||||
CHECK_CWISE1_IF(PacketTraits::HasRound, numext::round, internal::pround);
|
||||
CHECK_CWISE1_IF(PacketTraits::HasCeil, numext::ceil, internal::pceil);
|
||||
CHECK_CWISE1_IF(PacketTraits::HasFloor, numext::floor, internal::pfloor);
|
||||
CHECK_CWISE1_IF(PacketTraits::HasRint, numext::rint, internal::print);
|
||||
CHECK_CWISE1_EXACT_IF(PacketTraits::HasRound, numext::round, internal::pround);
|
||||
CHECK_CWISE1_EXACT_IF(PacketTraits::HasCeil, numext::ceil, internal::pceil);
|
||||
CHECK_CWISE1_EXACT_IF(PacketTraits::HasFloor, numext::floor, internal::pfloor);
|
||||
CHECK_CWISE1_EXACT_IF(PacketTraits::HasRint, numext::rint, internal::print);
|
||||
}
|
||||
}
|
||||
|
||||
@ -644,7 +671,7 @@ void packetmath_real() {
|
||||
if (PacketTraits::HasExp) {
|
||||
data1[0] = Scalar(-1);
|
||||
// underflow to zero
|
||||
data1[PacketSize] = Scalar(std::numeric_limits<Scalar>::min_exponent-10);
|
||||
data1[PacketSize] = Scalar(std::numeric_limits<Scalar>::min_exponent-55);
|
||||
CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
|
||||
// overflow to inf
|
||||
data1[PacketSize] = Scalar(std::numeric_limits<Scalar>::max_exponent+10);
|
||||
|
@ -78,13 +78,18 @@ bool isApproxAbs(const Scalar& a, const Scalar& b, const typename NumTraits<Scal
|
||||
return internal::isMuchSmallerThan(a-b, refvalue);
|
||||
}
|
||||
|
||||
template<typename Scalar>
|
||||
inline void print_mismatch(const Scalar* ref, const Scalar* vec, int size) {
|
||||
std::cout << "ref: [" << Map<const Matrix<Scalar,1,Dynamic> >(ref,size) << "]" << " != vec: [" << Map<const Matrix<Scalar,1,Dynamic> >(vec,size) << "]\n";
|
||||
}
|
||||
|
||||
template<typename Scalar> bool areApproxAbs(const Scalar* a, const Scalar* b, int size, const typename NumTraits<Scalar>::Real& refvalue)
|
||||
{
|
||||
for (int i=0; i<size; ++i)
|
||||
{
|
||||
if (!isApproxAbs(a[i],b[i],refvalue))
|
||||
{
|
||||
std::cout << "ref: [" << Map<const Matrix<Scalar,1,Dynamic> >(a,size) << "]" << " != vec: [" << Map<const Matrix<Scalar,1,Dynamic> >(b,size) << "]\n";
|
||||
print_mismatch(a, b, size);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@ -95,13 +100,23 @@ template<typename Scalar> bool areApprox(const Scalar* a, const Scalar* b, int s
|
||||
{
|
||||
for (int i=0; i<size; ++i)
|
||||
{
|
||||
if (a[i]!=b[i] && !internal::isApprox(a[i],b[i]))
|
||||
if ( a[i]!=b[i] && !internal::isApprox(a[i],b[i])
|
||||
&& !((numext::isnan)(a[i]) && (numext::isnan)(b[i])) )
|
||||
{
|
||||
if((numext::isnan)(a[i]) && (numext::isnan)(b[i]))
|
||||
{
|
||||
continue;
|
||||
print_mismatch(a, b, size);
|
||||
return false;
|
||||
}
|
||||
std::cout << "ref: [" << Map<const Matrix<Scalar,1,Dynamic> >(a,size) << "]" << " != vec: [" << Map<const Matrix<Scalar,1,Dynamic> >(b,size) << "]\n";
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template<typename Scalar> bool areEqual(const Scalar* a, const Scalar* b, int size)
|
||||
{
|
||||
for (int i=0; i<size; ++i)
|
||||
{
|
||||
if ( (a[i] != b[i]) && !((numext::isnan)(a[i]) && (numext::isnan)(b[i])) )
|
||||
{
|
||||
print_mismatch(a, b, size);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@ -178,6 +193,14 @@ struct packet_helper<false,Packet>
|
||||
VERIFY(test::areApprox(ref, data2, PacketSize) && #POP); \
|
||||
}
|
||||
|
||||
#define CHECK_CWISE1_EXACT_IF(COND, REFOP, POP) if(COND) { \
|
||||
test::packet_helper<COND,Packet> h; \
|
||||
for (int i=0; i<PacketSize; ++i) \
|
||||
ref[i] = Scalar(REFOP(data1[i])); \
|
||||
h.store(data2, POP(h.load(data1))); \
|
||||
VERIFY(test::areEqual(ref, data2, PacketSize) && #POP); \
|
||||
}
|
||||
|
||||
#define CHECK_CWISE2_IF(COND, REFOP, POP) if(COND) { \
|
||||
test::packet_helper<COND,Packet> h; \
|
||||
for (int i=0; i<PacketSize; ++i) \
|
||||
|
Loading…
x
Reference in New Issue
Block a user