Altivec fixes for Darwin: do not use unsupported VSX insns

(cherry picked from commit 4d05765345e7e4a984d600039f797e2fede924f3)
This commit is contained in:
Sergey Fedorov 2023-01-12 16:33:33 +00:00 committed by Antonio Sanchez
parent e67c494cba
commit 3adc78e39c
6 changed files with 2447 additions and 27 deletions

View File

@ -16,7 +16,7 @@ namespace Eigen {
namespace internal {
static Packet4ui p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
#ifdef __VSX__
#ifdef EIGEN_VECTORIZE_VSX
#if defined(_BIG_ENDIAN)
static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_MZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
@ -100,7 +100,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
HasAbs2 = 0,
HasMin = 0,
HasMax = 0,
#ifdef __VSX__
// Advertise blend support only when VSX vectorization is enabled: the
// Packet2cf pblend specialization in this file is compiled under the same
// EIGEN_VECTORIZE_VSX guard, so the trait and the implementation must agree.
#ifdef EIGEN_VECTORIZE_VSX
HasBlend = 1,
#endif
HasSetLinear = 0
@ -130,7 +130,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<f
EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex<float>& from0, const std::complex<float>& from1)
{
Packet4f res0, res1;
#ifdef __VSX__
#ifdef EIGEN_VECTORIZE_VSX
__asm__ ("lxsdx %x0,%y1" : "=wa" (res0) : "Z" (from0));
__asm__ ("lxsdx %x0,%y1" : "=wa" (res1) : "Z" (from1));
#ifdef _BIG_ENDIAN
@ -230,7 +230,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packe
return Packet2cf(vec_and(eq, vec_perm(eq, eq, p16uc_COMPLEX32_REV)));
}
#ifdef __VSX__
#ifdef EIGEN_VECTORIZE_VSX
template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
Packet2cf result;
result.v = reinterpret_cast<Packet4f>(pblend<Packet2d>(ifPacket, reinterpret_cast<Packet2d>(thenPacket.v), reinterpret_cast<Packet2d>(elsePacket.v)));
@ -244,7 +244,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a)
}
//---------- double ----------
#ifdef __VSX__
#ifdef EIGEN_VECTORIZE_VSX
struct Packet1cd
{
EIGEN_STRONG_INLINE Packet1cd() {}
@ -403,7 +403,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a)
return psqrt_complex<Packet1cd>(a);
}
#endif // __VSX__
#endif // EIGEN_VECTORIZE_VSX
} // end namespace internal
} // end namespace Eigen

View File

@ -40,7 +40,22 @@ Packet4f pcos<Packet4f>(const Packet4f& _x)
return pcos_float(_x);
}
#ifdef __VSX__
#ifndef EIGEN_COMP_CLANG
// Packet4f specialization of prsqrt: forwards directly to the vec_rsqrt
// intrinsic. Sits outside the EIGEN_VECTORIZE_VSX section that follows.
// NOTE(review): packet_traits<float> gates HasRsqrt with `#if !EIGEN_COMP_CLANG`
// (a value test), whereas this guard is `#ifndef` (a definedness test). If
// EIGEN_COMP_CLANG is always defined (e.g. to 0 for non-clang compilers), this
// specialization is never compiled -- confirm against the macro's definition.
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f prsqrt<Packet4f>(const Packet4f& x)
{
return vec_rsqrt(x);
}
#endif
#ifdef EIGEN_VECTORIZE_VSX
#ifndef EIGEN_COMP_CLANG
// Packet2d specialization of prsqrt: forwards directly to the vec_rsqrt
// intrinsic. Located inside the EIGEN_VECTORIZE_VSX section.
// NOTE(review): other clang checks in this change use `#if !EIGEN_COMP_CLANG`
// (a value test), whereas this guard is `#ifndef` (a definedness test). If
// EIGEN_COMP_CLANG is always defined (e.g. to 0 for non-clang compilers), this
// specialization is never compiled -- confirm against the macro's definition.
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet2d prsqrt<Packet2d>(const Packet2d& x)
{
return vec_rsqrt(x);
}
#endif
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet4f psqrt<Packet4f>(const Packet4f& x)
@ -88,7 +103,7 @@ template<> EIGEN_STRONG_INLINE Packet8bf pexp<Packet8bf> (const Packet8bf& a){
BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a);
}
#endif // __VSX__
#endif // EIGEN_VECTORIZE_VSX
// Hyperbolic Tangent function.
template <>

File diff suppressed because it is too large Load Diff

View File

@ -84,7 +84,7 @@ static _EIGEN_DECLARE_CONST_FAST_Packet4ui(PREV0DOT5, 0x3EFFFFFFu);
static _EIGEN_DECLARE_CONST_FAST_Packet8us(ONE,1); //{ 1, 1, 1, 1, 1, 1, 1, 1}
static _EIGEN_DECLARE_CONST_FAST_Packet16uc(ONE,1);
static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
#ifndef __VSX__
#ifndef EIGEN_VECTORIZE_VSX
static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
#endif
@ -114,7 +114,7 @@ static Packet16uc p16uc_QUADRUPLICATE16_HI = { 0,1,0,1,0,1,0,1, 2,3,2,3,2,3,2,3
// Define global static constants:
#ifdef _BIG_ENDIAN
static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
#ifdef __VSX__
#ifdef EIGEN_VECTORIZE_VSX
static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
#endif
static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
@ -168,7 +168,7 @@ struct packet_traits<float> : default_packet_traits {
HasCos = EIGEN_FAST_MATH,
HasLog = 1,
HasExp = 1,
#ifdef __VSX__
#ifdef EIGEN_VECTORIZE_VSX
HasSqrt = 1,
#if !EIGEN_COMP_CLANG
HasRsqrt = 1,
@ -210,7 +210,7 @@ struct packet_traits<bfloat16> : default_packet_traits {
HasCos = EIGEN_FAST_MATH,
HasLog = 1,
HasExp = 1,
#ifdef __VSX__
#ifdef EIGEN_VECTORIZE_VSX
HasSqrt = 1,
#if !EIGEN_COMP_CLANG
HasRsqrt = 1,
@ -432,7 +432,7 @@ EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet)* from)
// ignoring these warnings for now.
EIGEN_UNUSED_VARIABLE(from);
EIGEN_DEBUG_ALIGNED_LOAD
#ifdef __VSX__
#ifdef EIGEN_VECTORIZE_VSX
return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
#else
return vec_ld(0, from);
@ -481,7 +481,7 @@ EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet)* to, const Packet
// ignoring these warnings for now.
EIGEN_UNUSED_VARIABLE(to);
EIGEN_DEBUG_ALIGNED_STORE
#ifdef __VSX__
#ifdef EIGEN_VECTORIZE_VSX
vec_xst(from, 0, to);
#else
vec_st(from, 0, to);
@ -816,7 +816,7 @@ template<> EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a,
template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
{
#ifndef __VSX__ // VSX actually provides a div instruction
#ifndef EIGEN_VECTORIZE_VSX // VSX actually provides a div instruction
Packet4f t, y_0, y_1;
// Altivec does not offer a divide instruction, we have to do a reciprocal approximation
@ -845,7 +845,7 @@ template<> EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
{
#ifdef __VSX__
#ifdef EIGEN_VECTORIZE_VSX
// NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
Packet4f ret;
__asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
@ -863,7 +863,7 @@ template<> EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a,
template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
{
#ifdef __VSX__
#ifdef EIGEN_VECTORIZE_VSX
// NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
Packet4f ret;
__asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
@ -940,7 +940,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
Packet4f t = vec_add(reinterpret_cast<Packet4f>(vec_or(vec_and(reinterpret_cast<Packet4ui>(a), p4ui_SIGN), p4ui_PREV0DOT5)), a);
Packet4f res;
#ifdef __VSX__
#ifdef EIGEN_VECTORIZE_VSX
__asm__("xvrspiz %x0, %x1\n\t"
: "=&wa" (res)
: "wa" (t));
@ -2259,7 +2259,7 @@ template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4i>(const Pa
//---------- double ----------
#ifdef __VSX__
#ifdef EIGEN_VECTORIZE_VSX
typedef __vector double Packet2d;
typedef __vector unsigned long long Packet2ul;
typedef __vector long long Packet2l;
@ -2721,7 +2721,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, cons
}
#endif // __VSX__
#endif // EIGEN_VECTORIZE_VSX
} // end namespace internal
} // end namespace Eigen

View File

@ -363,10 +363,10 @@
#endif
} // end extern "C"
#elif defined __VSX__
#elif defined(__VSX__) && !defined(__APPLE__)
#define EIGEN_VECTORIZE
#define EIGEN_VECTORIZE_VSX
#define EIGEN_VECTORIZE_VSX 1
#include <altivec.h>
// We need to #undef all these ugly tokens defined in <altivec.h>
// => use __vector instead of vector

View File

@ -314,7 +314,7 @@
#endif
/// \internal EIGEN_ARCH_PPC set to 1 if the architecture is PowerPC
#if defined(__powerpc__) || defined(__ppc__) || defined(_M_PPC)
#if defined(__powerpc__) || defined(__ppc__) || defined(_M_PPC) || defined(__POWERPC__)
#define EIGEN_ARCH_PPC 1
#else
#define EIGEN_ARCH_PPC 0
@ -1135,11 +1135,16 @@ namespace Eigen {
// directly for std::complex<T>, Eigen::half, Eigen::bfloat16. For these,
// you will need to apply to the underlying POD type.
#if EIGEN_ARCH_PPC && EIGEN_COMP_GNUC_STRICT
// This seems to be broken on clang. Packet4f is loaded into a single
// register rather than a vector, zeroing out some entries. Integer
// This seems to be broken on clang. Packet4f is loaded into a single
// register rather than a vector, zeroing out some entries. Integer
// types also generate a compile error.
// General, Altivec, VSX.
#define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+r,v,wa" (X));
#if EIGEN_OS_MAC
// Apple: allow only the general ("r") and Altivec ("v") register constraints.
// The VSX constraint ("wa") is left out because VSX was only added in
// ISA v2.06.
#define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+r,v" (X));
#else
// Everywhere else: general ("r"), Altivec ("v") and VSX ("wa") constraints.
#define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+r,v,wa" (X));
#endif
#elif EIGEN_ARCH_ARM_OR_ARM64
// General, NEON.
// Clang doesn't like "r",