Fix pre-POWER8_VECTOR bugs in pcmp_lt and pnegate and reactivate psqrt.

(cherry picked from commit 4a58f30aa083d2f5753465a36730ca7734b483be)
2025-09-17 20:03:17 +08:00 · 2023-01-31 19:40:24 +00:00 · 2023-01-31 19:40:24 +00:00 · e734787bb7
commit e734787bb7
parent 1217390db4
2 changed files with 63 additions and 31 deletions
--- a/Eigen/src/Core/arch/AltiVec/MathFunctions.h
+++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
@ -40,24 +40,9 @@ Packet4f pcos<Packet4f>(const Packet4f& _x)
  return pcos_float(_x);
 }

-#ifndef EIGEN_COMP_CLANG
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f prsqrt<Packet4f>(const Packet4f& x)
-{
-  return  vec_rsqrt(x);
-}
-#endif
-
 #ifdef __VSX__
-#ifndef EIGEN_COMP_CLANG
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet2d prsqrt<Packet2d>(const Packet2d& x)
-{
-  return  vec_rsqrt(x);
-}
-#endif

-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet4f psqrt<Packet4f>(const Packet4f& x)
 {
  return  vec_sqrt(x);
@ -69,12 +54,41 @@ Packet2d psqrt<Packet2d>(const Packet2d& x)
  return  vec_sqrt(x);
 }

-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet4f prsqrt<Packet4f>(const Packet4f& x)
+{
+  return pset1<Packet4f>(1.0f) / psqrt<Packet4f>(x);
+//  vec_rsqrt returns different results from the generic version
+//  return  vec_rsqrt(x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet2d prsqrt<Packet2d>(const Packet2d& x)
+{
+  return pset1<Packet2d>(1.0) / psqrt<Packet2d>(x);
+//  vec_rsqrt returns different results from the generic version
+//  return  vec_rsqrt(x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet2d pexp<Packet2d>(const Packet2d& _x)
 {
  return pexp_double(_x);
 }
-#endif
+
+template<> EIGEN_STRONG_INLINE Packet8bf psqrt<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(vec_sqrt, a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf prsqrt<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(prsqrt<Packet4f>, a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pexp<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a);
+}
+
+#endif  // __VSX__

 // Hyperbolic Tangent function.
 template <>
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@ -786,8 +786,22 @@ template<> EIGEN_STRONG_INLINE Packet8us  psub<Packet8us> (const Packet8us&  a,
 template<> EIGEN_STRONG_INLINE Packet16c  psub<Packet16c> (const Packet16c&  a, const Packet16c&  b) { return a - b; }
 template<> EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return a - b; }

-template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; }
-template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; }
+template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
+{
+#ifdef __POWER8_VECTOR__
+  return vec_neg(a);
+#else
+  return vec_xor(a, p4f_MZERO);
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
+{
+#ifdef __POWER8_VECTOR__
+  return vec_neg(a);
+#else
+  return p4i_ZERO - a;
+#endif
+}

 template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
 template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
@ -865,7 +879,10 @@ template<> EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, con
 template<> EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_max(a, b); }

 template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmple(a,b)); }
+// To fix bug with vec_cmplt on older versions
+#if defined(__POWER8_VECTOR__) || EIGEN_COMP_LLVM
 template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmplt(a,b)); }
+#endif
 template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmpeq(a,b)); }
 template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
  Packet4f c = reinterpret_cast<Packet4f>(vec_cmpge(a,b));
@ -1341,16 +1358,6 @@ template<> EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(const Packet8bf& a, con
  BF16_TO_F32_BINARY_OP_WRAPPER(psub<Packet4f>, a, b);
 }

-template<> EIGEN_STRONG_INLINE Packet8bf psqrt<Packet8bf> (const Packet8bf& a){
-  BF16_TO_F32_UNARY_OP_WRAPPER(vec_sqrt, a);
-}
-template<> EIGEN_STRONG_INLINE Packet8bf prsqrt<Packet8bf> (const Packet8bf& a){
-  BF16_TO_F32_UNARY_OP_WRAPPER(prsqrt<Packet4f>, a);
-}
-template<> EIGEN_STRONG_INLINE Packet8bf pexp<Packet8bf> (const Packet8bf& a){
-  BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a);
-}
-
 template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
  return pldexp_generic(a,exponent);
 }
@ -2304,7 +2311,11 @@ template<> struct packet_traits<double> : default_packet_traits
    HasLog  = 0,
    HasExp  = 1,
    HasSqrt = 1,
+#if !EIGEN_COMP_CLANG
    HasRsqrt = 1,
+#else
+    HasRsqrt = 0,
+#endif
    HasRound = 1,
    HasFloor = 1,
    HasCeil = 1,
@ -2393,7 +2404,14 @@ template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const

 template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return a - b; }

-template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return p2d_ZERO - a; }
+template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
+{
+#ifdef __POWER8_VECTOR__
+  return vec_neg(a);
+#else
+  return vec_xor(a, p2d_MZERO);
+#endif
+}

 template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }