Vectorize atanh & add a missing definition and unit test for atan.

2025-07-06 05:05:12 +08:00 · 2023-02-21 03:14:05 +00:00 · 2023-02-21 03:14:05 +00:00 · ce62177b5b
commit ce62177b5b
parent 049a144798
17 changed files with 179 additions and 25 deletions
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@ -82,6 +82,7 @@ struct default_packet_traits
    HasASin   = 0,
    HasACos   = 0,
    HasATan   = 0,
+    HasATanh  = 0,
    HasSinh   = 0,
    HasCosh   = 0,
    HasTanh   = 0,
@ -857,9 +858,6 @@ Packet pasin(const Packet& a) { EIGEN_USING_STD(asin); return asin(a); }
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pacos(const Packet& a) { EIGEN_USING_STD(acos); return acos(a); }

-/** \internal \returns the arc tangent of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet patan(const Packet& a) { EIGEN_USING_STD(atan); return atan(a); }

 /** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
@ -869,10 +867,18 @@ Packet psinh(const Packet& a) { EIGEN_USING_STD(sinh); return sinh(a); }
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pcosh(const Packet& a) { EIGEN_USING_STD(cosh); return cosh(a); }

+/** \internal \returns the arc tangent of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet patan(const Packet& a) { EIGEN_USING_STD(atan); return atan(a); }
+
 /** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet ptanh(const Packet& a) { EIGEN_USING_STD(tanh); return tanh(a); }

+/** \internal \returns the arc tangent of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet patanh(const Packet& a) { EIGEN_USING_STD(atanh); return atanh(a); }
+
 /** \internal \returns the exp of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pexp(const Packet& a) { EIGEN_USING_STD(exp); return exp(a); }
--- a/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX/MathFunctions.h
@ -56,6 +56,12 @@ patan<Packet4d>(const Packet4d& _x) {
  return patan_double(_x);
 }

+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8f
+patanh<Packet8f>(const Packet8f& _x) {
+  return patanh_float(_x);
+}
+
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8f
 plog<Packet8f>(const Packet8f& _x) {
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@ -76,6 +76,7 @@ template<> struct packet_traits<float>  : default_packet_traits
    HasACos = 1,
    HasASin = 1,
    HasATan = 1,
+    HasATanh = 1,
    HasLog = 1,
    HasLog1p = 1,
    HasExpm1 = 1,
--- a/Eigen/src/Core/arch/AVX512/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h
@ -275,6 +275,12 @@ patan<Packet16f>(const Packet16f& _x) {
  return patan_float(_x);
 }

+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f
+patanh<Packet16f>(const Packet16f& _x) {
+  return patanh_float(_x);
+}
+
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d
 patan<Packet8d>(const Packet8d& _x) {
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@ -125,6 +125,7 @@ template<> struct packet_traits<float>  : default_packet_traits
    HasACos = 1,
    HasASin = 1,
    HasATan = 1,
+    HasATanh = 1,
 #if EIGEN_HAS_AVX512_MATH
    HasLog = 1,
    HasLog1p  = 1,
--- a/Eigen/src/Core/arch/AltiVec/MathFunctions.h
+++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
@ -60,6 +60,12 @@ Packet4f patan<Packet4f>(const Packet4f& _x)
  return patan_float(_x);
 }

+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet4f patanh<Packet4f>(const Packet4f& _x)
+{
+  return patanh_float(_x);
+}
+
 #ifdef EIGEN_VECTORIZE_VSX
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet4f psqrt<Packet4f>(const Packet4f& x)
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@ -171,6 +171,7 @@ struct packet_traits<float> : default_packet_traits {
    HasACos = 1,
    HasASin = 1,
    HasATan = 1,
+    HasATanh = 1,
    HasLog = 1,
    HasExp = 1,
 #ifdef EIGEN_VECTORIZE_VSX
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@ -967,6 +967,34 @@ Packet patan_double(const Packet& x_in) {
  return pxor(p, x_signmask);
 }

+template<typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet patanh_float(const Packet& x) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
+  const Packet half = pset1<Packet>(0.5f);
+  const Packet x_gt_half = pcmp_le(half, pabs(x));
+  // For |x| in [0:0.5] we use a polynomial approximation of the form
+  // P(x) = x + x^3*(c3 + x^2 * (c5 + x^2 * (... x^2 * c11) ... )).
+  const Packet C3 = pset1<Packet>(0.3333373963832855224609375f);
+  const Packet C5 = pset1<Packet>(0.1997792422771453857421875f);
+  const Packet C7 = pset1<Packet>(0.14672131836414337158203125f);
+  const Packet C9 = pset1<Packet>(8.2311116158962249755859375e-2f);
+  const Packet C11 = pset1<Packet>(0.1819281280040740966796875f);
+  const Packet x2 = pmul(x,x);
+  Packet p = pmadd(C11, x2, C9);
+  p = pmadd(x2, p, C7);
+  p = pmadd(x2, p, C5);
+  p = pmadd(x2, p, C3);
+  p = pmadd(pmul(x,x2), p, x);
+
+  // For |x| in ]0.5:1.0] we use atanh = 0.5*ln((1+x)/(1-x));
+  const Packet one = pset1<Packet>(1.0f);
+  Packet r = pdiv(padd(one, x), psub(one, x));
+  r = pmul(half, plog(r));
+  return pselect(x_gt_half, r, p);
+}
+
 template<typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pdiv_complex(const Packet& x, const Packet& y) {
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
@ -109,6 +109,11 @@ template<typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet patan_double(const Packet& x);

+/** \internal \returns atan(x) for single precision float */
+template<typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet patanh_float(const Packet& x);
+
 /** \internal \returns sqrt(x) for complex types */
 template<typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
--- a/Eigen/src/Core/arch/Default/Half.h
+++ b/Eigen/src/Core/arch/Default/Half.h
@ -779,6 +779,12 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half asin(const half& a) {
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half acos(const half& a) {
  return half(::acosf(float(a)));
 }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atan(const half& a) {
+  return half(::atanf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atanh(const half& a) {
+  return half(::atanhf(float(a)));
+}
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
 #if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
  defined(EIGEN_HIP_DEVICE_COMPILE)
--- a/Eigen/src/Core/arch/NEON/MathFunctions.h
+++ b/Eigen/src/Core/arch/NEON/MathFunctions.h
@ -55,6 +55,12 @@ template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2f ptanh<Pa
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f ptanh<Packet4f>(const Packet4f& x)
 { return internal::generic_fast_tanh_float(x); }

+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2f patanh<Packet2f>(const Packet2f& x)
+{ return patanh_float(x); }
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f patanh<Packet4f>(const Packet4f& x)
+{ return patanh_float(x); }
+
+
 #if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
 template <>
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@ -215,6 +215,7 @@ struct packet_traits<float> : default_packet_traits
    HasACos  = 1,
    HasASin  = 1,
    HasATan  = 1,
+    HasATanh = 1,
    HasLog  = 1,
    HasExp  = 1,
    HasSqrt = 1,
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@ -81,11 +81,6 @@ Packet4f pacos<Packet4f>(const Packet4f& _x)
  return pacos_float(_x);
 }

-template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet2d patan<Packet2d>(const Packet2d& _x) {
-  return patan_double(_x);
-}
-
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet4f pasin<Packet4f>(const Packet4f& _x)
 {
@ -98,6 +93,17 @@ Packet4f patan<Packet4f>(const Packet4f& _x)
  return patan_float(_x);
 }

+template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet2d patan<Packet2d>(const Packet2d& _x) {
+  return patan_double(_x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet4f patanh<Packet4f>(const Packet4f& _x)
+{
+  return patanh_float(_x);
+}
+
 // Notice that for newer processors, it is counterproductive to use Newton
 // iteration for square root. In particular, Skylake and Zen2 processors
 // have approximately doubled throughput of the _mm_sqrt_ps instruction
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@ -142,6 +142,7 @@ struct packet_traits<float> : default_packet_traits {
    HasACos = 1,
    HasASin = 1,
    HasATan = 1,
+    HasATanh = 1,
    HasLog = 1,
    HasLog1p = 1,
    HasExpm1 = 1,
--- a/Eigen/src/Core/functors/UnaryFunctors.h
+++ b/Eigen/src/Core/functors/UnaryFunctors.h
@ -622,11 +622,16 @@ struct functor_traits<scalar_tanh_op<Scalar> > {
 template <typename Scalar>
 struct scalar_atanh_op {
  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::atanh(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x) const { return patanh(x); }
 };

 template <typename Scalar>
 struct functor_traits<scalar_atanh_op<Scalar> > {
-  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasATanh
+  };
 };

 /** \internal
--- a/test/array_cwise.cpp
+++ b/test/array_cwise.cpp
@ -93,27 +93,91 @@ void binary_op_test(std::string name, Fn fun, RefFn ref) {
  VERIFY(all_pass);
 }

+#define BINARY_FUNCTOR_TEST_ARGS(fun) #fun, \
+      [](const auto& x, const auto& y) { return (Eigen::fun)(x, y); },    \
+      [](const auto& x, const auto& y) { return (std::fun)(x, y); }
+
+
 template <typename Scalar>
 void binary_ops_test() {
-  binary_op_test<Scalar>("pow",
-                         [](const auto& x, const auto& y) { return Eigen::pow(x, y); },
-                         [](const auto& x, const auto& y) { return std::pow(x, y); });
-  binary_op_test<Scalar>("atan2",
-                         [](const auto& x, const auto& y) { return Eigen::atan2(x, y); },
-                         [](const auto& x, const auto& y) {
-                            auto t = std::atan2(x, y);
-#if EIGEN_COMP_MSVC
-                            // Work around MSVC return value on underflow.
-                            // |atan(y/x)| is bounded above by |y/x|, so on underflow return y/x according to POSIX spec.
-                            // MSVC otherwise returns denorm_min.
-                            if (EIGEN_PREDICT_FALSE(std::abs(t) == std::numeric_limits<decltype(t)>::denorm_min())) {
-                              return x/y;
-                            }
+  binary_op_test<Scalar>(BINARY_FUNCTOR_TEST_ARGS(pow));
+#ifndef EIGEN_COMP_MSVC
+  binary_op_test<Scalar>(BINARY_FUNCTOR_TEST_ARGS(atan2));
+#else
+  binary_op_test<Scalar>(
+      "atan2", [](const auto& x, const auto& y) { return Eigen::atan2(x, y); },
+      [](Scalar x, Scalar y) {
+        auto t = Scalar(std::atan2(x, y));
+        // Work around MSVC return value on underflow.
+        // |atan(y/x)| is bounded above by |y/x|, so on underflow return y/x according to POSIX spec.
+        // MSVC otherwise returns denorm_min.
+        if (EIGEN_PREDICT_FALSE(std::abs(t) == std::numeric_limits<decltype(t)>::denorm_min())) {
+          return x / y;
+        }
+        return t;
+      });
 #endif
-                            return t;
-                          });
 }

+
+template <typename Scalar, typename Fn, typename RefFn>
+void unary_op_test(std::string name, Fn fun, RefFn ref) {
+  const Scalar tol = test_precision<Scalar>();
+  auto values = special_values<Scalar>();
+  Map<Array<Scalar, Dynamic, 1>> x(values.data(), values.size());
+
+  Array<Scalar, Dynamic, Dynamic> actual = fun(x);
+  bool all_pass = true;
+  for (Index i = 0; i < x.size(); ++i) {
+    Scalar e = static_cast<Scalar>(ref(x(i)));
+    Scalar a = actual(i);
+    bool success = (a == e) || ((numext::isfinite)(e) && internal::isApprox(a, e, tol)) ||
+                   ((numext::isnan)(a) && (numext::isnan)(e));
+    if ((a == a) && (e == e)) success &= (bool)numext::signbit(e) == (bool)numext::signbit(a);
+    all_pass &= success;
+    if (!success) {
+      std::cout << name << "(" << x(i) << ") = " << a << " !=  " << e << std::endl;
+    }
+  }
+  VERIFY(all_pass);
+}
+
+#define UNARY_FUNCTOR_TEST_ARGS(fun) #fun, \
+      [](const auto& x) { return (Eigen::fun)(x); },    \
+      [](const auto& x) { return (std::fun)(x); }
+
+template <typename Scalar>
+void unary_ops_test() {
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(sqrt));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(exp));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(log));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(sin));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(cos));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(tan));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(asin));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(acos));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(atan));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(sinh));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(cosh));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(tanh));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(asinh));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(acosh));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(atanh));
+  /* FIXME: Enable when the behavior of rsqrt on denormals for half and double is fixed. 
+  unary_op_test<Scalar>("rsqrt",
+                        [](const auto& x) { return Eigen::rsqrt(x); }, 
+                        [](Scalar x) {
+                          if (x >= 0 && x < (std::numeric_limits<Scalar>::min)()) {
+                            // rsqrt return +inf for positive subnormals.
+                            return NumTraits<Scalar>::infinity();
+                          } else {
+                            return  Scalar(std::sqrt(Scalar(1)/x));
+                          }
+                        });
+  */
+}
+
+
 template <typename Scalar>
 void pow_scalar_exponent_test() {
  using Int_t = typename internal::make_integer<Scalar>::type;
@ -630,6 +694,7 @@ template<typename ArrayType> void array_real(const ArrayType& m)
  VERIFY_IS_APPROX(m1.tanh().atanh(), atanh(tanh(m1)));
  VERIFY_IS_APPROX(m1.sinh().asinh(), asinh(sinh(m1)));
  VERIFY_IS_APPROX(m1.cosh().acosh(), acosh(cosh(m1)));
+  VERIFY_IS_APPROX(m1.tanh().atanh(), atanh(tanh(m1)));
  VERIFY_IS_APPROX(m1.logistic(), logistic(m1));

  VERIFY_IS_APPROX(m1.arg(), arg(m1));
@ -722,6 +787,7 @@ template<typename ArrayType> void array_real(const ArrayType& m)
  VERIFY_IS_APPROX(m3.pow(RealScalar(-2)), m3.square().inverse());

  // Test pow and atan2 on special IEEE values.
+  unary_ops_test<Scalar>();
  binary_ops_test<Scalar>();
  pow_scalar_exponent_test<Scalar>();

--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@ -879,6 +879,9 @@ void packetmath_real() {
  }
  CHECK_CWISE1_IF(PacketTraits::HasASin, std::asin, internal::pasin);
  CHECK_CWISE1_IF(PacketTraits::HasACos, std::acos, internal::pacos);
+  CHECK_CWISE1_IF(PacketTraits::HasATan, std::atan, internal::patan);
+  CHECK_CWISE1_IF(PacketTraits::HasATanh, std::atanh, internal::patanh);
+

  for (int i = 0; i < size; ++i) {
    data1[i] = Scalar(internal::random<double>(-87, 88));