From ce62177b5b806f3fd7fba4dbe2f8cb7eab04c39f Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Tue, 21 Feb 2023 03:14:05 +0000 Subject: [PATCH] Vectorize atanh & add a missing definition and unit test for atan. --- Eigen/src/Core/GenericPacketMath.h | 12 ++- Eigen/src/Core/arch/AVX/MathFunctions.h | 6 ++ Eigen/src/Core/arch/AVX/PacketMath.h | 1 + Eigen/src/Core/arch/AVX512/MathFunctions.h | 6 ++ Eigen/src/Core/arch/AVX512/PacketMath.h | 1 + Eigen/src/Core/arch/AltiVec/MathFunctions.h | 6 ++ Eigen/src/Core/arch/AltiVec/PacketMath.h | 1 + .../arch/Default/GenericPacketMathFunctions.h | 28 ++++++ .../Default/GenericPacketMathFunctionsFwd.h | 5 + Eigen/src/Core/arch/Default/Half.h | 6 ++ Eigen/src/Core/arch/NEON/MathFunctions.h | 6 ++ Eigen/src/Core/arch/NEON/PacketMath.h | 1 + Eigen/src/Core/arch/SSE/MathFunctions.h | 16 ++- Eigen/src/Core/arch/SSE/PacketMath.h | 1 + Eigen/src/Core/functors/UnaryFunctors.h | 7 +- test/array_cwise.cpp | 98 ++++++++++++++++--- test/packetmath.cpp | 3 + 17 files changed, 179 insertions(+), 25 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 499a94b22..8b5f030bf 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -82,6 +82,7 @@ struct default_packet_traits HasASin = 0, HasACos = 0, HasATan = 0, + HasATanh = 0, HasSinh = 0, HasCosh = 0, HasTanh = 0, @@ -857,9 +858,6 @@ Packet pasin(const Packet& a) { EIGEN_USING_STD(asin); return asin(a); } template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pacos(const Packet& a) { EIGEN_USING_STD(acos); return acos(a); } -/** \internal \returns the arc tangent of \a a (coeff-wise) */ -template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet patan(const Packet& a) { EIGEN_USING_STD(atan); return atan(a); } /** \internal \returns the hyperbolic sine of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS @@ -869,10 +867,18 @@ Packet psinh(const Packet& a) { EIGEN_USING_STD(sinh); return sinh(a); } template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcosh(const Packet& a) { EIGEN_USING_STD(cosh); return cosh(a); } +/** \internal \returns the arc tangent of \a a (coeff-wise) */ +template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet patan(const Packet& a) { EIGEN_USING_STD(atan); return atan(a); } + /** \internal \returns the hyperbolic tan of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet ptanh(const Packet& a) { EIGEN_USING_STD(tanh); return tanh(a); } +/** \internal \returns the arc tangent of \a a (coeff-wise) */ +template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet patanh(const Packet& a) { EIGEN_USING_STD(atanh); return atanh(a); } + /** \internal \returns the exp of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp(const Packet& a) { EIGEN_USING_STD(exp); return exp(a); } diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h index cb7d7b82a..8ce181e30 100644 --- a/Eigen/src/Core/arch/AVX/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX/MathFunctions.h @@ -56,6 +56,12 @@ patan(const Packet4d& _x) { return patan_double(_x); } +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8f +patanh(const Packet8f& _x) { + return patanh_float(_x); +} + template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8f plog(const Packet8f& _x) { diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 60e1ff4d3..89963eb7a 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -76,6 +76,7 @@ template<> struct packet_traits : default_packet_traits HasACos = 1, HasASin = 1, HasATan = 1, + HasATanh = 1, HasLog = 1, HasLog1p = 1, HasExpm1 = 1, diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h index af47a85cf..be0e4dd94 100644 --- a/Eigen/src/Core/arch/AVX512/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h @@ -275,6 +275,12 @@ patan(const Packet16f& _x) { return patan_float(_x); } +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f +patanh(const Packet16f& _x) { + return patanh_float(_x); +} + template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d patan(const Packet8d& _x) { diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 916babf6a..4628b21f5 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -125,6 +125,7 @@ template<> struct packet_traits : default_packet_traits HasACos = 1, HasASin = 1, HasATan = 1, + HasATanh = 1, #if EIGEN_HAS_AVX512_MATH HasLog = 1, HasLog1p = 1, diff --git a/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/Eigen/src/Core/arch/AltiVec/MathFunctions.h index 45761e217..7648b404e 100644 --- a/Eigen/src/Core/arch/AltiVec/MathFunctions.h +++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h @@ -60,6 +60,12 @@ Packet4f patan(const Packet4f& _x) return patan_float(_x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet4f patanh(const Packet4f& _x) +{ + return patanh_float(_x); +} + #ifdef EIGEN_VECTORIZE_VSX template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f psqrt(const Packet4f& x) diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 4cebd8fcb..f1676afd7 100644 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -171,6 +171,7 @@ struct packet_traits : default_packet_traits { HasACos = 1, HasASin = 1, HasATan = 1, + HasATanh = 1, HasLog = 1, HasExp = 1, #ifdef EIGEN_VECTORIZE_VSX diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index a322bc40e..52d076ad7 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -967,6 +967,34 @@ Packet patan_double(const Packet& x_in) { return pxor(p, x_signmask); } +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet patanh_float(const Packet& x) { + typedef typename unpacket_traits::type Scalar; + static_assert(std::is_same::value, "Scalar type must be float"); + const Packet half = pset1(0.5f); + const Packet x_gt_half = pcmp_le(half, pabs(x)); + // For |x| in [0:0.5] we use a polynomial approximation of the form + // P(x) = x + x^3*(c3 + x^2 * (c5 + x^2 * (... x^2 * c11) ... )). + const Packet C3 = pset1(0.3333373963832855224609375f); + const Packet C5 = pset1(0.1997792422771453857421875f); + const Packet C7 = pset1(0.14672131836414337158203125f); + const Packet C9 = pset1(8.2311116158962249755859375e-2f); + const Packet C11 = pset1(0.1819281280040740966796875f); + const Packet x2 = pmul(x,x); + Packet p = pmadd(C11, x2, C9); + p = pmadd(x2, p, C7); + p = pmadd(x2, p, C5); + p = pmadd(x2, p, C3); + p = pmadd(pmul(x,x2), p, x); + + // For |x| in ]0.5:1.0] we use atanh = 0.5*ln((1+x)/(1-x)); + const Packet one = pset1(1.0f); + Packet r = pdiv(padd(one, x), psub(one, x)); + r = pmul(half, plog(r)); + return pselect(x_gt_half, r, p); +} + template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pdiv_complex(const Packet& x, const Packet& y) { diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h index 179c55cf3..d7335797d 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h @@ -109,6 +109,11 @@ template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_double(const Packet& x); +/** \internal \returns atan(x) for single precision float */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet patanh_float(const Packet& x); + /** \internal \returns sqrt(x) for complex types */ template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS diff --git a/Eigen/src/Core/arch/Default/Half.h b/Eigen/src/Core/arch/Default/Half.h index 6ff3e20c4..c8ca33a5b 100644 --- a/Eigen/src/Core/arch/Default/Half.h +++ b/Eigen/src/Core/arch/Default/Half.h @@ -779,6 +779,12 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half asin(const half& a) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half acos(const half& a) { return half(::acosf(float(a))); } +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atan(const half& a) { + return half(::atanf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atanh(const half& a) { + return half(::atanhf(float(a))); +} EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) { #if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \ defined(EIGEN_HIP_DEVICE_COMPILE) diff --git a/Eigen/src/Core/arch/NEON/MathFunctions.h b/Eigen/src/Core/arch/NEON/MathFunctions.h index aea514949..f4ea6f91a 100644 --- a/Eigen/src/Core/arch/NEON/MathFunctions.h +++ b/Eigen/src/Core/arch/NEON/MathFunctions.h @@ -55,6 +55,12 @@ template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2f ptanh EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f ptanh(const Packet4f& x) { return internal::generic_fast_tanh_float(x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2f patanh(const Packet2f& x) +{ return patanh_float(x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f patanh(const Packet4f& x) +{ return patanh_float(x); } + + #if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC template <> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index aa873e504..e31f717d1 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -215,6 +215,7 @@ struct packet_traits : default_packet_traits HasACos = 1, HasASin = 1, HasATan = 1, + HasATanh = 1, HasLog = 1, HasExp = 1, HasSqrt = 1, diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index f98fb7a3e..b0cd13aef 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -81,11 +81,6 @@ Packet4f pacos(const Packet4f& _x) return pacos_float(_x); } -template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet2d patan(const Packet2d& _x) { - return patan_double(_x); -} - template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f pasin(const Packet4f& _x) { @@ -98,6 +93,17 @@ Packet4f patan(const Packet4f& _x) return patan_float(_x); } +template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet2d patan(const Packet2d& _x) { + return patan_double(_x); +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet4f patanh(const Packet4f& _x) +{ + return patanh_float(_x); +} + // Notice that for newer processors, it is counterproductive to use Newton // iteration for square root. In particular, Skylake and Zen2 processors // have approximately doubled throughput of the _mm_sqrt_ps instruction diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 62646a164..7d608bbb4 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -142,6 +142,7 @@ struct packet_traits : default_packet_traits { HasACos = 1, HasASin = 1, HasATan = 1, + HasATanh = 1, HasLog = 1, HasLog1p = 1, HasExpm1 = 1, diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index e518f982d..1faed10bf 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -622,11 +622,16 @@ struct functor_traits > { template struct scalar_atanh_op { EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::atanh(a); } + template + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x) const { return patanh(x); } }; template struct functor_traits > { - enum { Cost = 5 * NumTraits::MulCost, PacketAccess = false }; + enum { + Cost = 5 * NumTraits::MulCost, + PacketAccess = packet_traits::HasATanh + }; }; /** \internal diff --git a/test/array_cwise.cpp b/test/array_cwise.cpp index 609057695..44a6a503e 100644 --- a/test/array_cwise.cpp +++ b/test/array_cwise.cpp @@ -93,27 +93,91 @@ void binary_op_test(std::string name, Fn fun, RefFn ref) { VERIFY(all_pass); } +#define BINARY_FUNCTOR_TEST_ARGS(fun) #fun, \ + [](const auto& x, const auto& y) { return (Eigen::fun)(x, y); }, \ + [](const auto& x, const auto& y) { return (std::fun)(x, y); } + + template void binary_ops_test() { - binary_op_test("pow", - [](const auto& x, const auto& y) { return Eigen::pow(x, y); }, - [](const auto& x, const auto& y) { return std::pow(x, y); }); - binary_op_test("atan2", - [](const auto& x, const auto& y) { return Eigen::atan2(x, y); }, - [](const auto& x, const auto& y) { - auto t = std::atan2(x, y); -#if EIGEN_COMP_MSVC - // Work around MSVC return value on underflow. - // |atan(y/x)| is bounded above by |y/x|, so on underflow return y/x according to POSIX spec. - // MSVC otherwise returns denorm_min. - if (EIGEN_PREDICT_FALSE(std::abs(t) == std::numeric_limits::denorm_min())) { - return x/y; - } + binary_op_test(BINARY_FUNCTOR_TEST_ARGS(pow)); +#ifndef EIGEN_COMP_MSVC + binary_op_test(BINARY_FUNCTOR_TEST_ARGS(atan2)); +#else + binary_op_test( + "atan2", [](const auto& x, const auto& y) { return Eigen::atan2(x, y); }, + [](Scalar x, Scalar y) { + auto t = Scalar(std::atan2(x, y)); + // Work around MSVC return value on underflow. + // |atan(y/x)| is bounded above by |y/x|, so on underflow return y/x according to POSIX spec. + // MSVC otherwise returns denorm_min. + if (EIGEN_PREDICT_FALSE(std::abs(t) == std::numeric_limits::denorm_min())) { + return x / y; + } + return t; + }); #endif - return t; - }); } + +template +void unary_op_test(std::string name, Fn fun, RefFn ref) { + const Scalar tol = test_precision(); + auto values = special_values(); + Map> x(values.data(), values.size()); + + Array actual = fun(x); + bool all_pass = true; + for (Index i = 0; i < x.size(); ++i) { + Scalar e = static_cast(ref(x(i))); + Scalar a = actual(i); + bool success = (a == e) || ((numext::isfinite)(e) && internal::isApprox(a, e, tol)) || + ((numext::isnan)(a) && (numext::isnan)(e)); + if ((a == a) && (e == e)) success &= (bool)numext::signbit(e) == (bool)numext::signbit(a); + all_pass &= success; + if (!success) { + std::cout << name << "(" << x(i) << ") = " << a << " != " << e << std::endl; + } + } + VERIFY(all_pass); +} + +#define UNARY_FUNCTOR_TEST_ARGS(fun) #fun, \ + [](const auto& x) { return (Eigen::fun)(x); }, \ + [](const auto& x) { return (std::fun)(x); } + +template +void unary_ops_test() { + unary_op_test(UNARY_FUNCTOR_TEST_ARGS(sqrt)); + unary_op_test(UNARY_FUNCTOR_TEST_ARGS(exp)); + unary_op_test(UNARY_FUNCTOR_TEST_ARGS(log)); + unary_op_test(UNARY_FUNCTOR_TEST_ARGS(sin)); + unary_op_test(UNARY_FUNCTOR_TEST_ARGS(cos)); + unary_op_test(UNARY_FUNCTOR_TEST_ARGS(tan)); + unary_op_test(UNARY_FUNCTOR_TEST_ARGS(asin)); + unary_op_test(UNARY_FUNCTOR_TEST_ARGS(acos)); + unary_op_test(UNARY_FUNCTOR_TEST_ARGS(atan)); + unary_op_test(UNARY_FUNCTOR_TEST_ARGS(sinh)); + unary_op_test(UNARY_FUNCTOR_TEST_ARGS(cosh)); + unary_op_test(UNARY_FUNCTOR_TEST_ARGS(tanh)); + unary_op_test(UNARY_FUNCTOR_TEST_ARGS(asinh)); + unary_op_test(UNARY_FUNCTOR_TEST_ARGS(acosh)); + unary_op_test(UNARY_FUNCTOR_TEST_ARGS(atanh)); + /* FIXME: Enable when the behavior of rsqrt on denormals for half and double is fixed. + unary_op_test("rsqrt", + [](const auto& x) { return Eigen::rsqrt(x); }, + [](Scalar x) { + if (x >= 0 && x < (std::numeric_limits::min)()) { + // rsqrt return +inf for positive subnormals. + return NumTraits::infinity(); + } else { + return Scalar(std::sqrt(Scalar(1)/x)); + } + }); + */ +} + + template void pow_scalar_exponent_test() { using Int_t = typename internal::make_integer::type; @@ -630,6 +694,7 @@ template void array_real(const ArrayType& m) VERIFY_IS_APPROX(m1.tanh().atanh(), atanh(tanh(m1))); VERIFY_IS_APPROX(m1.sinh().asinh(), asinh(sinh(m1))); VERIFY_IS_APPROX(m1.cosh().acosh(), acosh(cosh(m1))); + VERIFY_IS_APPROX(m1.tanh().atanh(), atanh(tanh(m1))); VERIFY_IS_APPROX(m1.logistic(), logistic(m1)); VERIFY_IS_APPROX(m1.arg(), arg(m1)); @@ -722,6 +787,7 @@ template void array_real(const ArrayType& m) VERIFY_IS_APPROX(m3.pow(RealScalar(-2)), m3.square().inverse()); // Test pow and atan2 on special IEEE values. + unary_ops_test(); binary_ops_test(); pow_scalar_exponent_test(); diff --git a/test/packetmath.cpp b/test/packetmath.cpp index f0829858b..76332d7f7 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -879,6 +879,9 @@ void packetmath_real() { } CHECK_CWISE1_IF(PacketTraits::HasASin, std::asin, internal::pasin); CHECK_CWISE1_IF(PacketTraits::HasACos, std::acos, internal::pacos); + CHECK_CWISE1_IF(PacketTraits::HasATan, std::atan, internal::patan); + CHECK_CWISE1_IF(PacketTraits::HasATanh, std::atanh, internal::patanh); + for (int i = 0; i < size; ++i) { data1[i] = Scalar(internal::random(-87, 88));