mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-07-31 01:03:38 +08:00
Fix arm32 issues.
(cherry picked from commit a73970a8640330c4908d68ef9257fd31a4fdae93)
This commit is contained in:
parent
f23b8c0d78
commit
c23abcf25c
@ -642,10 +642,10 @@ Packet psincos_float(const Packet& _x)
|
|||||||
PacketI y_int = preinterpret<PacketI>(y_round); // last 23 digits represent integer (if abs(x)<2^24)
|
PacketI y_int = preinterpret<PacketI>(y_round); // last 23 digits represent integer (if abs(x)<2^24)
|
||||||
y = psub(y_round, cst_rounding_magic); // nearest integer to x*4/pi
|
y = psub(y_round, cst_rounding_magic); // nearest integer to x*4/pi
|
||||||
|
|
||||||
// Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4
|
// Subtract y * Pi/2 to reduce x to the interval -Pi/4 <= x <= +Pi/4
|
||||||
// using "Extended precision modular arithmetic"
|
// using "Extended precision modular arithmetic"
|
||||||
#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
|
#if defined(EIGEN_VECTORIZE_FMA)
|
||||||
// This version requires true FMA for high accuracy
|
// This version requires true FMA for high accuracy.
|
||||||
// It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08):
|
// It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08):
|
||||||
const float huge_th = ComputeSine ? 117435.992f : 71476.0625f;
|
const float huge_th = ComputeSine ? 117435.992f : 71476.0625f;
|
||||||
x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x);
|
x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x);
|
||||||
@ -915,7 +915,7 @@ void fast_twosum(const Packet& x, const Packet& y, Packet& s_hi, Packet& s_lo) {
|
|||||||
s_lo = psub(y, t);
|
s_lo = psub(y, t);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
|
#ifdef EIGEN_VECTORIZE_FMA
|
||||||
// This function implements the extended precision product of
|
// This function implements the extended precision product of
|
||||||
// a pair of floating point numbers. Given {x, y}, it computes the pair
|
// a pair of floating point numbers. Given {x, y}, it computes the pair
|
||||||
// {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and
|
// {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and
|
||||||
@ -966,7 +966,7 @@ void twoprod(const Packet& x, const Packet& y,
|
|||||||
p_lo = pmadd(x_lo, y_lo, p_lo);
|
p_lo = pmadd(x_lo, y_lo, p_lo);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // EIGEN_HAS_SINGLE_INSTRUCTION_MADD
|
#endif // EIGEN_VECTORIZE_FMA
|
||||||
|
|
||||||
|
|
||||||
// This function implements Dekker's algorithm for the addition
|
// This function implements Dekker's algorithm for the addition
|
||||||
|
@ -1089,12 +1089,15 @@ template<> EIGEN_STRONG_INLINE Packet2ul pdiv<Packet2ul>(const Packet2ul& /*a*/,
|
|||||||
return pset1<Packet2ul>(0ULL);
|
return pset1<Packet2ul>(0ULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef EIGEN_VECTORIZE_FMA
|
||||||
#ifdef __ARM_FEATURE_FMA
|
template <>
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
|
EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
|
||||||
{ return vfmaq_f32(c,a,b); }
|
return vfmaq_f32(c, a, b);
|
||||||
template<> EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c)
|
}
|
||||||
{ return vfma_f32(c,a,b); }
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
|
||||||
|
return vfma_f32(c, a, b);
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
|
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
|
||||||
{
|
{
|
||||||
@ -3782,7 +3785,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const
|
|||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); }
|
template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); }
|
||||||
|
|
||||||
#ifdef __ARM_FEATURE_FMA
|
#ifdef EIGEN_VECTORIZE_FMA
|
||||||
// See bug 936. See above comment about FMA for float.
|
// See bug 936. See above comment about FMA for float.
|
||||||
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c)
|
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c)
|
||||||
{ return vfmaq_f64(c,a,b); }
|
{ return vfmaq_f64(c,a,b); }
|
||||||
|
@ -367,6 +367,7 @@
|
|||||||
|
|
||||||
#define EIGEN_VECTORIZE
|
#define EIGEN_VECTORIZE
|
||||||
#define EIGEN_VECTORIZE_VSX 1
|
#define EIGEN_VECTORIZE_VSX 1
|
||||||
|
#define EIGEN_VECTORIZE_FMA
|
||||||
#include <altivec.h>
|
#include <altivec.h>
|
||||||
// We need to #undef all these ugly tokens defined in <altivec.h>
|
// We need to #undef all these ugly tokens defined in <altivec.h>
|
||||||
// => use __vector instead of vector
|
// => use __vector instead of vector
|
||||||
@ -378,6 +379,7 @@
|
|||||||
|
|
||||||
#define EIGEN_VECTORIZE
|
#define EIGEN_VECTORIZE
|
||||||
#define EIGEN_VECTORIZE_ALTIVEC
|
#define EIGEN_VECTORIZE_ALTIVEC
|
||||||
|
#define EIGEN_VECTORIZE_FMA
|
||||||
#include <altivec.h>
|
#include <altivec.h>
|
||||||
// We need to #undef all these ugly tokens defined in <altivec.h>
|
// We need to #undef all these ugly tokens defined in <altivec.h>
|
||||||
// => use __vector instead of vector
|
// => use __vector instead of vector
|
||||||
@ -438,7 +440,12 @@
|
|||||||
#include <arm_fp16.h>
|
#include <arm_fp16.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(__F16C__) && (!defined(EIGEN_GPUCC) && (!EIGEN_COMP_CLANG || EIGEN_COMP_CLANG>=380))
|
// Enable FMA for ARM.
|
||||||
|
#if defined(__ARM_FEATURE_FMA)
|
||||||
|
#define EIGEN_VECTORIZE_FMA
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__F16C__) && !defined(EIGEN_GPUCC) && (!EIGEN_COMP_CLANG_STRICT || EIGEN_COMP_CLANG>=380)
|
||||||
// We can use the optimized fp16 to float and float to fp16 conversion routines
|
// We can use the optimized fp16 to float and float to fp16 conversion routines
|
||||||
#define EIGEN_HAS_FP16_C
|
#define EIGEN_HAS_FP16_C
|
||||||
|
|
||||||
|
@ -72,7 +72,17 @@ void pow_test() {
|
|||||||
for (int j = 0; j < num_cases; ++j) {
|
for (int j = 0; j < num_cases; ++j) {
|
||||||
Scalar e = static_cast<Scalar>(std::pow(x(i,j), y(i,j)));
|
Scalar e = static_cast<Scalar>(std::pow(x(i,j), y(i,j)));
|
||||||
Scalar a = actual(i, j);
|
Scalar a = actual(i, j);
|
||||||
bool success = (a==e) || ((numext::isfinite)(e) && internal::isApprox(a, e, tol)) || ((numext::isnan)(a) && (numext::isnan)(e));
|
#if EIGEN_ARCH_ARM
|
||||||
|
// Work around NEON flush-to-zero mode
|
||||||
|
// if ref returns a subnormal value and Eigen returns 0, then skip the test
|
||||||
|
if (a == Scalar(0) &&
|
||||||
|
(e > -(std::numeric_limits<Scalar>::min)() && e < (std::numeric_limits<Scalar>::min)() &&
|
||||||
|
e >= -std::numeric_limits<Scalar>::denorm_min() && e <= std::numeric_limits<Scalar>::denorm_min())) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
bool success = (a == e) || ((numext::isfinite)(e) && internal::isApprox(a, e, tol)) ||
|
||||||
|
((numext::isnan)(a) && (numext::isnan)(e));
|
||||||
all_pass &= success;
|
all_pass &= success;
|
||||||
if (!success) {
|
if (!success) {
|
||||||
std::cout << "pow(" << x(i,j) << "," << y(i,j) << ") = " << a << " != " << e << std::endl;
|
std::cout << "pow(" << x(i,j) << "," << y(i,j) << ") = " << a << " != " << e << std::endl;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user