From be107e387b9b161d9a7f7875f0b8aaee7d407287 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 23 Jun 2016 10:19:05 -0300 Subject: [PATCH] fix compilation with clang 3.9, fix performance with pset1, use vector operators instead of intrinsics in some cases --- Eigen/src/Core/arch/AltiVec/Complex.h | 34 ++++---- Eigen/src/Core/arch/AltiVec/MathFunctions.h | 4 + Eigen/src/Core/arch/AltiVec/PacketMath.h | 88 +++++++++++---------- 3 files changed, 67 insertions(+), 59 deletions(-) diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h index 9feef7782..ca8eee3ca 100644 --- a/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/Eigen/src/Core/arch/AltiVec/Complex.h @@ -29,7 +29,7 @@ static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (P //---------- float ---------- struct Packet2cf { - EIGEN_STRONG_INLINE Packet2cf() {} + EIGEN_STRONG_INLINE explicit Packet2cf() : v(p4f_ZERO) {} EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {} Packet4f v; }; @@ -71,12 +71,12 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex EIGEN_STRONG_INLINE Packet2cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload((const float*)from)); } -template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu((const float*)from)); } +template<> EIGEN_STRONG_INLINE Packet2cf pload(const std::complex* from) { return Packet2cf(pload((const float *) from)); } +template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { return Packet2cf(ploadu((const float*) from)); } template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); } -template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } -template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { pstore((float*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { pstoreu((float*)to, from.v); } template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) { @@ -93,8 +93,8 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf to[1*stride] = af[1]; } -template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd(a.v, b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub(a.v, b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v + b.v); } +template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v - b.v); } template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor(a.v, reinterpret_cast(p4ui_CONJ_XOR))); } @@ -114,7 +114,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con // permute back to a proper order v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV); - return Packet2cf(vec_add(v1, v2)); + return Packet2cf(padd(v1, v2)); } template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand(a.v, b.v)); } @@ -300,10 +300,10 @@ template<> struct packet_traits > : default_packet_traits template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; -template<> EIGEN_STRONG_INLINE Packet1cd pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload((const double*)from)); } -template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu((const double*)from)); } -template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } -template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE Packet1cd pload (const std::complex* from) { return Packet1cd(pload((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { return Packet1cd(ploadu((const double*)from)); } +template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { pstore((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { pstoreu((double*)to, from.v); } template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) { /* here we really have to use unaligned loads :( */ return ploadu(&from); } @@ -323,8 +323,8 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1c to[1*stride] = af[1]; } -template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(padd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(psub(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); } +template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); } template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); } template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(pxor(a.v, reinterpret_cast(p2ul_CONJ_XOR2))); } @@ -343,7 +343,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con v2 = reinterpret_cast(vec_sld(reinterpret_cast(v2), reinterpret_cast(v2), 8)); v2 = pxor(v2, reinterpret_cast(p2ul_CONJ_XOR1)); - return Packet1cd(vec_add(v1, v2)); + return Packet1cd(padd(v1, v2)); } template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pand(a.v,b.v)); } @@ -434,8 +434,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, con { // TODO optimize it for AltiVec Packet1cd res = conj_helper().pmul(a,b); - Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_); - return Packet1cd(pdiv(res.v, vec_add(s,vec_perm(s, s, p16uc_REVERSE64)))); + Packet2d s = pmul(b.v, b.v); + return Packet1cd(pdiv(res.v, padd(s, vec_perm(s, s, p16uc_REVERSE64)))); } EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) diff --git a/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/Eigen/src/Core/arch/AltiVec/MathFunctions.h index 340f064e0..d92ec240e 100644 --- a/Eigen/src/Core/arch/AltiVec/MathFunctions.h +++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h @@ -194,18 +194,22 @@ Packet4f pexp(const Packet4f& _x) isnumber_mask); } +#ifndef EIGEN_COMP_CLANG template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f prsqrt(const Packet4f& x) { return vec_rsqrt(x); } +#endif #ifdef __VSX__ +#ifndef EIGEN_COMP_CLANG template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d prsqrt(const Packet2d& x) { return vec_rsqrt(x); } +#endif template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f psqrt(const Packet4f& x) diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 04f1421b9..3f4243995 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -112,8 +112,8 @@ static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc) vec_abs(p4i static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 }; static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 }; -static Packet16uc p16uc_TRANSPOSE64_HI = vec_add(p16uc_PSET64_HI, p16uc_HALF64_0_16); //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23}; -static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0_16); //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31}; +static Packet16uc p16uc_TRANSPOSE64_HI = p16uc_PSET64_HI + p16uc_HALF64_0_16; //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23}; +static Packet16uc p16uc_TRANSPOSE64_LO = p16uc_PSET64_LO + p16uc_HALF64_0_16; //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31}; static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 }; @@ -155,7 +155,11 @@ template<> struct packet_traits : default_packet_traits #else HasSqrt = 0, #endif +#if !EIGEN_COMP_CLANG HasRsqrt = 1, +#else + HasRsqrt = 0, +#endif HasRound = 1, HasFloor = 1, HasCeil = 1, @@ -272,17 +276,13 @@ template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& f } template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { - float EIGEN_ALIGN16 af[4]; - af[0] = from; - Packet4f vc = pload(af); - return vec_splat(vc, 0); + Packet4f v = {from, from, from, from}; + return v; } template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { - int EIGEN_ALIGN16 ai[4]; - ai[0] = from; - Packet4i vc = pload(ai); - return vec_splat(vc, 0); + Packet4i v = {from, from, from, from}; + return v; } template<> EIGEN_STRONG_INLINE void pbroadcast4(const float *a, @@ -342,22 +342,23 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const to[3*stride] = ai[3]; } -template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return vec_add(pset1(a), p4f_COUNTDOWN); } -template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return vec_add(pset1(a), p4i_COUNTDOWN); } +template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return pset1(a) + p4f_COUNTDOWN; } +template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return pset1(a) + p4i_COUNTDOWN; } -template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) { return vec_add(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { return vec_add(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) { return a + b; } +template<> EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { return a + b; } -template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { return vec_sub(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { return vec_sub(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { return a - b; } +template<> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { return a - b; } -template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return psub(p4f_ZERO, a); } -template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return psub(p4i_ZERO, a); } +template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; } +template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; } template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b,p4f_ZERO); } +template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_ZERO); } +template<> EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const Packet4i& b) { return a * b; } template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { @@ -383,8 +384,8 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& /*a*/, co } // for some weird raisons, it has to be overloaded for packet of integers -template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); } -template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); } +template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); } +template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; } template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); } template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); } @@ -514,8 +515,8 @@ template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_PPC_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_PPC_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; pstore((float *)x, a); return x[0]; } -template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; pstore((int *)x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; } +template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; } template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { @@ -532,9 +533,9 @@ template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) { Packet4f b, sum; b = vec_sld(a, a, 8); - sum = vec_add(a, b); + sum = a + b; b = vec_sld(sum, sum, 4); - sum = vec_add(sum, b); + sum += b; return pfirst(sum); } @@ -557,11 +558,11 @@ template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) // Now do the summation: // Lines 0+1 - sum[0] = vec_add(sum[0], sum[1]); + sum[0] = sum[0] + sum[1]; // Lines 2+3 - sum[1] = vec_add(sum[2], sum[3]); + sum[1] = sum[2] + sum[3]; // Add the results - sum[0] = vec_add(sum[0], sum[1]); + sum[0] = sum[0] + sum[1]; return sum[0]; } @@ -597,11 +598,11 @@ template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) // Now do the summation: // Lines 0+1 - sum[0] = vec_add(sum[0], sum[1]); + sum[0] = sum[0] + sum[1]; // Lines 2+3 - sum[1] = vec_add(sum[2], sum[3]); + sum[1] = sum[2] + sum[3]; // Add the results - sum[0] = vec_add(sum[0], sum[1]); + sum[0] = sum[0] + sum[1]; return sum[0]; } @@ -754,7 +755,11 @@ template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, cons typedef __vector double Packet2d; typedef __vector unsigned long long Packet2ul; typedef __vector long long Packet2l; +#if EIGEN_COMP_CLANG +typedef Packet2ul Packet2bl; +#else typedef __vector __bool long Packet2bl; +#endif static Packet2l p2l_ONE = { 1, 1 }; static Packet2l p2l_ZERO = reinterpret_cast(p4i_ZERO); @@ -857,11 +862,10 @@ template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& } template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { - double EIGEN_ALIGN16 af[2]; - af[0] = from; - Packet2d vc = pload(af); - return vec_splat_dbl<0>(vc); + Packet2d v = {from, from}; + return v; } + template<> EIGEN_STRONG_INLINE void pbroadcast4(const double *a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) @@ -889,13 +893,13 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, to[1*stride] = af[1]; } -template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) { return vec_add(pset1(a), p2d_COUNTDOWN); } +template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) { return pset1(a) + p2d_COUNTDOWN; } -template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { return vec_add(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { return a + b; } -template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return vec_sub(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return a - b; } -template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return psub(p2d_ZERO, a); } +template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return p2d_ZERO - a; } template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } @@ -955,15 +959,15 @@ template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { Packet2d b, sum; b = reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)); - sum = padd(a, b); + sum = a + b; return pfirst(sum); } template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) { Packet2d v[2], sum; - v[0] = padd(vecs[0], reinterpret_cast(vec_sld(reinterpret_cast(vecs[0]), reinterpret_cast(vecs[0]), 8))); - v[1] = padd(vecs[1], reinterpret_cast(vec_sld(reinterpret_cast(vecs[1]), reinterpret_cast(vecs[1]), 8))); + v[0] = vecs[0] + reinterpret_cast(vec_sld(reinterpret_cast(vecs[0]), reinterpret_cast(vecs[0]), 8)); + v[1] = vecs[1] + reinterpret_cast(vec_sld(reinterpret_cast(vecs[1]), reinterpret_cast(vecs[1]), 8)); #ifdef _BIG_ENDIAN sum = reinterpret_cast(vec_sld(reinterpret_cast(v[0]), reinterpret_cast(v[1]), 8));