Implement pcplflip, palign, predux and the likes from AVC/complexes

This commit is contained in:
Gael Guennebaud 2014-03-27 14:47:00 +01:00
parent a419cea4a0
commit 052aedd394
3 changed files with 37 additions and 55 deletions

View File

@ -303,9 +303,10 @@ using std::ptrdiff_t;
#if defined EIGEN_VECTORIZE_AVX #if defined EIGEN_VECTORIZE_AVX
// Use AVX for floats and doubles, SSE for integers // Use AVX for floats and doubles, SSE for integers
#include "src/Core/arch/SSE/PacketMath.h"
#include "src/Core/arch/SSE/Complex.h"
#include "src/Core/arch/AVX/PacketMath.h" #include "src/Core/arch/AVX/PacketMath.h"
#include "src/Core/arch/AVX/Complex.h" #include "src/Core/arch/AVX/Complex.h"
#include "src/Core/arch/SSE/PacketMath.h" // For integers
#elif defined EIGEN_VECTORIZE_SSE #elif defined EIGEN_VECTORIZE_SSE
#include "src/Core/arch/SSE/PacketMath.h" #include "src/Core/arch/SSE/PacketMath.h"
#include "src/Core/arch/SSE/MathFunctions.h" #include "src/Core/arch/SSE/MathFunctions.h"

View File

@ -99,8 +99,6 @@ template<> EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<
template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); } template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); } template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float>* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet4cf>(const Packet4cf& a) template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet4cf>(const Packet4cf& a)
{ {
@ -125,28 +123,29 @@ template<> EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) {
template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet4cf>(const Packet4cf& a) template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet4cf>(const Packet4cf& a)
{ {
return std::complex<float>(a.v[0]+a.v[2]+a.v[4]+a.v[6], a.v[1]+a.v[3]+a.v[5]+a.v[7]); return predux(padd(Packet2cf(_mm256_extractf128_ps(a.v,0)),
Packet2cf(_mm256_extractf128_ps(a.v,1))));
} }
template<> EIGEN_STRONG_INLINE Packet4cf preduxp<Packet4cf>(const Packet4cf* vecs) template<> EIGEN_STRONG_INLINE Packet4cf preduxp<Packet4cf>(const Packet4cf* vecs)
{ {
__m256 result = _mm256_setzero_ps(); Packet8f t0 = _mm256_shuffle_ps(vecs[0].v, vecs[0].v, _MM_SHUFFLE(3, 1, 2 ,0));
for (int i = 0; i < 4; ++i) { Packet8f t1 = _mm256_shuffle_ps(vecs[1].v, vecs[1].v, _MM_SHUFFLE(3, 1, 2 ,0));
for (int j = 0; j < 8; j+=2) { t0 = _mm256_hadd_ps(t0,t1);
result[2*i] += vecs[i].v[j]; Packet8f t2 = _mm256_shuffle_ps(vecs[2].v, vecs[2].v, _MM_SHUFFLE(3, 1, 2 ,0));
result[2*i+1] += vecs[i].v[j+1]; Packet8f t3 = _mm256_shuffle_ps(vecs[3].v, vecs[3].v, _MM_SHUFFLE(3, 1, 2 ,0));
} t2 = _mm256_hadd_ps(t2,t3);
}
return Packet4cf(result); t1 = _mm256_permute2f128_ps(t0,t2, 0 + (2<<4));
t3 = _mm256_permute2f128_ps(t0,t2, 1 + (3<<4));
return Packet4cf(_mm256_add_ps(t1,t3));
} }
template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet4cf>(const Packet4cf& a) template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet4cf>(const Packet4cf& a)
{ {
std::complex<float> result(a.v[0], a.v[1]); return predux_mul(pmul(Packet2cf(_mm256_extractf128_ps(a.v, 0)),
for (int i = 2; i < 8; i+=2) { Packet2cf(_mm256_extractf128_ps(a.v, 1))));
result *= std::complex<float>(a.v[i], a.v[i+1]);
}
return result;
} }
template<int Offset> template<int Offset>
@ -155,16 +154,7 @@ struct palign_impl<Offset,Packet4cf>
static EIGEN_STRONG_INLINE void run(Packet4cf& first, const Packet4cf& second) static EIGEN_STRONG_INLINE void run(Packet4cf& first, const Packet4cf& second)
{ {
if (Offset==0) return; if (Offset==0) return;
for (int i = 0; i < 4-Offset; ++i) palign_impl<Offset*2,Packet8f>::run(first.v, second.v);
{
first.v[2*i] = first.v[2*(i+Offset)];
first.v[2*i+1] = first.v[2*(i+Offset)+1];
}
for (int i = 4-Offset; i < 4; ++i)
{
first.v[2*i] = second.v[2*(i-4+Offset)];
first.v[2*i+1] = second.v[2*(i-4+Offset)+1];
}
} }
}; };
@ -230,12 +220,7 @@ template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, con
template<> EIGEN_STRONG_INLINE Packet4cf pcplxflip<Packet4cf>(const Packet4cf& x) template<> EIGEN_STRONG_INLINE Packet4cf pcplxflip<Packet4cf>(const Packet4cf& x)
{ {
Packet4cf res; return Packet4cf(_mm256_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0 ,1)));
for (int i = 0; i < 8; i+=2) {
res.v[i] = x.v[i+1];
res.v[i+1] = x.v[i];
}
return res;
} }
//---------- double ---------- //---------- double ----------
@ -312,8 +297,6 @@ template<> EIGEN_STRONG_INLINE Packet2cd ploaddup<Packet2cd>(const std::complex<
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet2cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet2cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet2cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet2cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet2cd>(const Packet2cd& a) template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet2cd>(const Packet2cd& a)
{ {
__m128d low = _mm256_extractf128_pd(a.v, 0); __m128d low = _mm256_extractf128_pd(a.v, 0);
@ -329,24 +312,22 @@ template<> EIGEN_STRONG_INLINE Packet2cd preverse(const Packet2cd& a) {
template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet2cd>(const Packet2cd& a) template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet2cd>(const Packet2cd& a)
{ {
return std::complex<double>(a.v[0]+a.v[2], a.v[1]+a.v[3]); return predux(padd(Packet1cd(_mm256_extractf128_pd(a.v,0)),
Packet1cd(_mm256_extractf128_pd(a.v,1))));
} }
template<> EIGEN_STRONG_INLINE Packet2cd preduxp<Packet2cd>(const Packet2cd* vecs) template<> EIGEN_STRONG_INLINE Packet2cd preduxp<Packet2cd>(const Packet2cd* vecs)
{ {
__m256d result = _mm256_setzero_pd(); Packet4d t0 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 0 + (2<<4));
for (int i = 0; i < 2; ++i) { Packet4d t1 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 1 + (3<<4));
for (int j = 0; j < 4; j+=2) {
result[2*i] += vecs[i].v[j]; return Packet2cd(_mm256_add_pd(t0,t1));
result[2*i+1] += vecs[i].v[j+1];
}
}
return Packet2cd(result);
} }
template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet2cd>(const Packet2cd& a) template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet2cd>(const Packet2cd& a)
{ {
return std::complex<double>(a.v[0], a.v[1]) * std::complex<double>(a.v[2], a.v[3]); return predux(pmul(Packet1cd(_mm256_extractf128_pd(a.v,0)),
Packet1cd(_mm256_extractf128_pd(a.v,1))));
} }
template<int Offset> template<int Offset>
@ -355,10 +336,7 @@ struct palign_impl<Offset,Packet2cd>
static EIGEN_STRONG_INLINE void run(Packet2cd& first, const Packet2cd& second) static EIGEN_STRONG_INLINE void run(Packet2cd& first, const Packet2cd& second)
{ {
if (Offset==0) return; if (Offset==0) return;
first.v[0] = first.v[2]; palign_impl<Offset*2,Packet4d>::run(first.v, second.v);
first.v[1] = first.v[3];
first.v[2] = second.v[0];
first.v[3] = second.v[1];
} }
}; };
@ -423,12 +401,7 @@ template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, con
template<> EIGEN_STRONG_INLINE Packet2cd pcplxflip<Packet2cd>(const Packet2cd& x) template<> EIGEN_STRONG_INLINE Packet2cd pcplxflip<Packet2cd>(const Packet2cd& x)
{ {
Packet2cd res; return Packet2cd(_mm256_shuffle_pd(x.v, x.v, 0x5));
for (int i = 0; i < 4; i+=2) {
res.v[i] = x.v[i+1];
res.v[i+1] = x.v[i];
}
return res;
} }
} // end namespace internal } // end namespace internal

View File

@ -22,6 +22,9 @@ struct Packet2cf
__m128 v; __m128 v;
}; };
// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
// to leverage AVX instructions.
#ifndef EIGEN_VECTORIZE_AVX
template<> struct packet_traits<std::complex<float> > : default_packet_traits template<> struct packet_traits<std::complex<float> > : default_packet_traits
{ {
typedef Packet2cf type; typedef Packet2cf type;
@ -42,6 +45,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
HasSetLinear = 0 HasSetLinear = 0
}; };
}; };
#endif
template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; }; template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
@ -248,6 +252,9 @@ struct Packet1cd
__m128d v; __m128d v;
}; };
// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
// to leverage AVX instructions.
#ifndef EIGEN_VECTORIZE_AVX
template<> struct packet_traits<std::complex<double> > : default_packet_traits template<> struct packet_traits<std::complex<double> > : default_packet_traits
{ {
typedef Packet1cd type; typedef Packet1cd type;
@ -268,6 +275,7 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
HasSetLinear = 0 HasSetLinear = 0
}; };
}; };
#endif
template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; }; template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; };