diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index f8388579b..fa43d8809 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -88,7 +88,7 @@ struct packet_traits : default_packet_traits { HasCeil = 1, HasRint = 1, HasBessel = 1, - HasNdtri = 1, + HasNdtri = 1 }; }; diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index f40093455..9a1feb0d9 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -793,6 +793,171 @@ Packet psqrt_complex(const Packet& a) { pselect(is_real_inf, real_inf_result,result)); } + +// This function implements the Veltkamp splitting. Given a floating point +// number x it returns the pair {x_hi, x_lo} such that x_hi + x_lo = x holds +// exactly and that half of the significant of x fits in x_hi. +// This code corresponds to Algorithms 3 and 4 in +// https://hal.inria.fr/hal-01774587v2/document +template +EIGEN_STRONG_INLINE +void veltkamp_splitting(const Packet& x, Packet& x_hi, Packet& x_lo) { + typedef typename unpacket_traits::type Scalar; + EIGEN_CONSTEXPR int shift = (NumTraits::digits() + 1) / 2; + EIGEN_CONSTEXPR Scalar shift_scale = Scalar(uint64_t(1) << shift); + Packet gamma = pmul(pset1(shift_scale + 1), x); +#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD + x_hi = pmadd(pset1(-shift_scale), x, gamma); +#else + Packet rho = psub(x, gamma); + x_hi = padd(rho, gamma); +#endif + x_lo = psub(x, x_hi); +} + +// This function splits x into the nearest integer n and fractional part r, +// such that x = n + r holds exactly. +template +EIGEN_STRONG_INLINE +void integer_split(const Packet& x, Packet& n, Packet& r) { + n = pround(x); + r = psub(x, n); +} + +// This function implements Dekker's algorithm for two products {x * y1, x * y2} with +// a shared factor. Given floating point numbers {x, y1, y2} computes the pairs +// {p1, r1} and {p2, r2} such that x * y1 = p1 + r1 holds exactly and +// p1 = fl(x * y1), and x * y2 = p2 + r2 holds exactly and p2 = fl(x * y2). +template +EIGEN_STRONG_INLINE +void double_dekker(const Packet& x, const Packet& y1, const Packet& y2, + Packet& p1, Packet& r1, Packet& p2, Packet& r2) { + Packet x_hi, x_lo, y1_hi, y1_lo, y2_hi, y2_lo; + veltkamp_splitting(x, x_hi, x_lo); + veltkamp_splitting(y1, y1_hi, y1_lo); + veltkamp_splitting(y2, y2_hi, y2_lo); + + p1 = pmul(x, y1); + r1 = pmadd(x_hi, y1_hi, pnegate(p1)); + r1 = pmadd(x_hi, y1_lo, r1); + r1 = pmadd(x_lo, y1_hi, r1); + r1 = pmadd(x_lo, y1_lo, r1); + + p2 = pmul(x, y2); + r2 = pmadd(x_hi, y2_hi, pnegate(p2)); + r2 = pmadd(x_hi, y2_lo, r2); + r2 = pmadd(x_lo, y2_hi, r2); + r2 = pmadd(x_lo, y2_lo, r2); +} + +// This function implements the non-trivial case of pow(x,y) where x is +// positive and y is (possibly) non-integer. +// Formally, pow(x,y) = 2**(y * log2(x)) +template +EIGEN_STRONG_INLINE +Packet generic_pow_impl(const Packet& x, const Packet& y) { + typedef typename unpacket_traits::type Scalar; + // Split x into exponent e_x and mantissa m_x. + Packet e_x; + Packet m_x = pfrexp(x, e_x); + + // Adjust m_x to lie in [0.75:1.5) to minimize absolute error in log2(m_x). + Packet m_x_scale_mask = pcmp_lt(m_x, pset1(Scalar(0.75))); + m_x = pselect(m_x_scale_mask, pmul(pset1(Scalar(2)), m_x), m_x); + e_x = pselect(m_x_scale_mask, psub(e_x, pset1(Scalar(1))), e_x); + + Packet r_x = plog2(m_x); + + // Compute the two terms {y * e_x, y * r_x} in f = y * log2(x) with doubled + // precision using Dekker's algorithm. + Packet f1_hi, f1_lo, f2_hi, f2_lo; + double_dekker(y, e_x, r_x, f1_hi, f1_lo, f2_hi, f2_lo); + + // Separate f into integer and fractional parts, keeping f1_hi, and f2_hi + // separate to avoid cancellation. + Packet n1, r1, n2, r2; + integer_split(f1_hi, n1, r1); + integer_split(f2_hi, n2, r2); + + // Add up integer parts and sum the remainders. + Packet n_z = padd(n1, n2); + // Notice: I experimented with using compensated (Kahan) summation here, + // but it does not seem to matter. + Packet rem = padd(padd(f1_lo, f2_lo), padd(r1, r2)); + + // Extract any additional integer part that may have accumulated in rem. + Packet nrem, r_z; + integer_split(rem, nrem, r_z); + n_z = padd(n_z, nrem); + + // We now have an accurate split of f = n_z + r_z and can compute + // x^y = 2**{n_z + r_z) = exp(ln(2) * r_z) * 2**{n_z}. + // The first factor we compute by calling pexp(), while multiplication + // by an integer power of 2 can be done exactly using pldexp(). + // Note: I experimented with using Dekker's algorithms for the + // multiplication by ln(2) here, but did not see any difference. + Packet e_r = pexp(pmul(pset1(Scalar(EIGEN_LN2)), r_z)); + return pldexp(e_r, n_z); +} + +// Generic implementation of pow(x,y). +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet generic_pow(const Packet& x, const Packet& y) { + typedef typename unpacket_traits::type Scalar; + const Packet cst_pos_inf = pset1(NumTraits::infinity()); + const Packet cst_zero = pset1(Scalar(0)); + const Packet cst_one = pset1(Scalar(1)); + const Packet cst_nan = pset1(NumTraits::quiet_NaN()); + + Packet abs_x = pabs(x); + // Predicates for sign and magnitude of x. + Packet x_is_zero = pcmp_eq(x, cst_zero); + Packet x_is_neg = pcmp_lt(x, cst_zero); + Packet abs_x_is_inf = pcmp_eq(abs_x, cst_pos_inf); + Packet abs_x_is_one = pcmp_eq(abs_x, cst_one); + Packet abs_x_is_gt_one = pcmp_lt(cst_one, abs_x); + Packet abs_x_is_lt_one = pcmp_lt(abs_x, cst_one); + Packet x_is_one = pandnot(abs_x_is_one, x_is_neg); + Packet x_is_neg_one = pand(abs_x_is_one, x_is_neg); + Packet x_is_nan = pandnot(ptrue(x), pcmp_eq(x, x)); + + // Predicates for sign and magnitude of y. + Packet y_is_zero = pcmp_eq(y, cst_zero); + Packet y_is_neg = pcmp_lt(y, cst_zero); + Packet y_is_pos = pandnot(ptrue(y), por(y_is_zero, y_is_neg)); + Packet y_is_nan = pandnot(ptrue(y), pcmp_eq(y, y)); + Packet abs_y_is_inf = pcmp_eq(pabs(y), cst_pos_inf); + + // Predicates for whether y is integer and/or even. + Packet y_is_int = pcmp_eq(pfloor(y), y); + Packet y_div_2 = pldexp(y, pset1(Scalar(-1))); + Packet y_is_even = pcmp_eq(pround(y_div_2), y_div_2); + + // Predicates encoding special cases for the value of pow(x,y) + Packet invalid_negative_x = pandnot(pandnot(pandnot(x_is_neg, abs_x_is_inf), y_is_int), abs_y_is_inf); + Packet pow_is_nan = por(invalid_negative_x, por(x_is_nan, y_is_nan)); + Packet pow_is_one = por(por(y_is_zero, x_is_one), pand(x_is_neg_one, abs_y_is_inf)); + Packet pow_is_zero = por(por(por(pand(x_is_zero, y_is_pos), pand(abs_x_is_inf, y_is_neg)), + pand(pand(abs_x_is_lt_one, abs_y_is_inf), y_is_pos)), + pand(pand(abs_x_is_gt_one, abs_y_is_inf), y_is_neg)); + Packet pow_is_inf = por(por(por(pand(x_is_zero, y_is_neg), pand(abs_x_is_inf, y_is_pos)), + pand(pand(abs_x_is_lt_one, abs_y_is_inf), y_is_neg)), + pand(pand(abs_x_is_gt_one, abs_y_is_inf), y_is_pos)); + + // General computation of pow(x,y) for positive x or negative x and integer y. + Packet negate_pow_abs = pandnot(x_is_neg, y_is_even); + Packet pow_abs = generic_pow_impl(abs_x, y); + + return pselect(pow_is_one, cst_one, + pselect(pow_is_nan, cst_nan, + pselect(pow_is_inf, cst_pos_inf, + pselect(pow_is_zero, cst_zero, + pselect(negate_pow_abs, pnegate(pow_abs), pow_abs))))); +} + + /* polevl (modified for Eigen) * * Evaluate polynomial diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 4c5b664e6..05d9f8edd 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -26,7 +26,7 @@ namespace internal { #ifdef EIGEN_VECTORIZE_FMA #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif #endif @@ -147,13 +147,13 @@ struct packet_traits : default_packet_traits { HasTanh = EIGEN_FAST_MATH, HasErf = EIGEN_FAST_MATH, HasBlend = 1, + HasCeil = 1, HasFloor = 1 #ifdef EIGEN_VECTORIZE_SSE4_1 , HasRint = 1, - HasRound = 1, - HasCeil = 1 + HasRound = 1 #endif }; }; @@ -173,14 +173,14 @@ struct packet_traits : default_packet_traits { HasExp = 1, HasSqrt = 1, HasRsqrt = 1, - HasBlend = 1 + HasBlend = 1, + HasFloor = 1, + HasCeil = 1 #ifdef EIGEN_VECTORIZE_SSE4_1 , HasRound = 1, - HasRint = 1, - HasFloor = 1, - HasCeil = 1 + HasRint = 1 #endif }; }; @@ -650,6 +650,30 @@ template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) mask = pand(mask, cst_1); return psub(tmp, mask); } + +template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) +{ + const Packet4f cst_1 = pset1(1.0f); + Packet4i emm0 = _mm_cvttps_epi32(a); + Packet4f tmp = _mm_cvtepi32_ps(emm0); + /* if greater, substract 1 */ + Packet4f mask = _mm_cmplt_ps(tmp, a); + mask = pand(mask, cst_1); + return padd(tmp, mask); +} + +// WARNING: this pfloor implementation makes sense for small inputs only, +// It is currently only used by pexp and not exposed through HasFloor. +template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) +{ + const Packet2d cst_1 = pset1(1.0); + Packet4i emm0 = _mm_cvttpd_epi32(a); + Packet2d tmp = _mm_cvtepi32_pd(emm0); + /* if greater, substract 1 */ + Packet2d mask = _mm_cmplt_pd(tmp, a); + mask = pand(mask, cst_1); + return padd(tmp, mask); +} #endif template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); } diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h index f3509c4b9..a3ec9e177 100644 --- a/Eigen/src/Core/functors/BinaryFunctors.h +++ b/Eigen/src/Core/functors/BinaryFunctors.h @@ -403,6 +403,7 @@ struct functor_traits > { /** \internal * \brief Template functor to compute the pow of two scalars + * See the specification of pow in https://en.cppreference.com/w/cpp/numeric/math/pow */ template struct scalar_pow_op : binary_op_base @@ -417,12 +418,25 @@ struct scalar_pow_op : binary_op_base EIGEN_SCALAR_BINARY_OP_PLUGIN } #endif + EIGEN_DEVICE_FUNC inline result_type operator() (const Scalar& a, const Exponent& b) const { return numext::pow(a, b); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + { + return generic_pow(a,b); + } }; + template struct functor_traits > { - enum { Cost = 5 * NumTraits::MulCost, PacketAccess = false }; + enum { + Cost = 5 * NumTraits::MulCost, + PacketAccess = (!NumTraits::IsComplex && !NumTraits::IsInteger && + packet_traits::HasExp && packet_traits::HasLog && + packet_traits::HasRound && packet_traits::HasCmp) + }; }; diff --git a/test/array_cwise.cpp b/test/array_cwise.cpp index 6910f0e1f..27702c19d 100644 --- a/test/array_cwise.cpp +++ b/test/array_cwise.cpp @@ -9,6 +9,62 @@ #include "main.h" + +// Test the corner cases of pow(x, y) for real types. +template +void pow_test() { + const Scalar zero = Scalar(0); + const Scalar one = Scalar(1); + const Scalar sqrt_half = Scalar(std::sqrt(0.5)); + const Scalar sqrt2 = Scalar(std::sqrt(2)); + const Scalar inf = std::numeric_limits::infinity(); + const Scalar nan = std::numeric_limits::quiet_NaN(); + const static Scalar abs_vals[] = {zero, sqrt_half, one, sqrt2, inf, nan}; + const int abs_cases = 6; + const int num_cases = 2*abs_cases * 2*abs_cases; + // Repeat the same value to make sure we hit the vectorized path. + const int num_repeats = 32; + Array x(num_repeats, num_cases); + Array y(num_repeats, num_cases); + Array expected(num_repeats, num_cases); + int count = 0; + for (int i = 0; i < abs_cases; ++i) { + const Scalar abs_x = abs_vals[i]; + for (int sign_x = 0; sign_x < 2; ++sign_x) { + Scalar x_case = sign_x == 0 ? -abs_x : abs_x; + for (int j = 0; j < abs_cases; ++j) { + const Scalar abs_y = abs_vals[j]; + for (int sign_y = 0; sign_y < 2; ++sign_y) { + Scalar y_case = sign_y == 0 ? -abs_y : abs_y; + for (int repeat = 0; repeat < num_repeats; ++repeat) { + x(repeat, count) = x_case; + y(repeat, count) = y_case; + expected(repeat, count) = numext::pow(x_case, y_case); + } + ++count; + } + } + } + } + + Array actual = x.pow(y); + const Scalar tol = test_precision(); + bool all_pass = true; + for (int i = 0; i < 1; ++i) { + for (int j = 0; j < num_cases; ++j) { + Scalar a = actual(i, j); + Scalar e = expected(i, j); + bool fail = !(a==e) && !internal::isApprox(a, e, tol) && !((std::isnan)(a) && (std::isnan)(e)); + all_pass &= !fail; + if (fail) { + std::cout << "pow(" << x(i,j) << "," << y(i,j) << ") = " << a << " != " << e << std::endl; + } + } + } + VERIFY(all_pass); +} + + template void array(const ArrayType& m) { typedef typename ArrayType::Scalar Scalar; @@ -371,6 +427,8 @@ template void array_real(const ArrayType& m) VERIFY_IS_APPROX(m3.pow(RealScalar(-0.5)), m3.rsqrt()); VERIFY_IS_APPROX(pow(m3,RealScalar(-0.5)), m3.rsqrt()); + VERIFY_IS_APPROX(m1.pow(RealScalar(-2)), m1.square().inverse()); + pow_test(); VERIFY_IS_APPROX(log10(m3), log(m3)/log(10)); VERIFY_IS_APPROX(log2(m3), log(m3)/log(2));