mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-04-19 16:19:37 +08:00
Remove unused packet op "palign".
Clean up a compiler warning in c++03 mode in AVX512/Complex.h.
This commit is contained in:
parent
74ec8e6618
commit
225ab040e0
@ -685,35 +685,6 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_ro(const typename unpacket_t
|
||||
return ploadt<Packet, LoadMode>(from);
|
||||
}
|
||||
|
||||
/** \internal Generic backend of palign(); partial specializations provide the per-packet-type implementations. */
template<int Offset,typename PacketType>
struct palign_impl
{
  // Fallback used when no specialization exists: both packets are left untouched.
  static inline void run(PacketType& /*first*/, const PacketType& /*second*/) {}
};
|
||||
|
||||
/** \internal update \a first using the concatenation of the packet_size minus \a Offset last elements
|
||||
* of \a first and \a Offset first elements of \a second.
|
||||
*
|
||||
* This function is currently only used to optimize matrix-vector products on unaligned matrices.
|
||||
* It takes 2 packets that represent a contiguous memory array, and returns a packet starting
|
||||
* at the position \a Offset. For instance, for packets of 4 elements, we have:
|
||||
* Input:
|
||||
* - first = {f0,f1,f2,f3}
|
||||
* - second = {s0,s1,s2,s3}
|
||||
* Output:
|
||||
* - if Offset==0 then {f0,f1,f2,f3}
|
||||
* - if Offset==1 then {f1,f2,f3,s0}
|
||||
* - if Offset==2 then {f2,f3,s0,s1}
|
||||
* - if Offset==3 then {f3,s0,s1,s2}
|
||||
*/
|
||||
template<int Offset,typename PacketType>
|
||||
inline void palign(PacketType& first, const PacketType& second)
|
||||
{
|
||||
palign_impl<Offset,PacketType>::run(first,second);
|
||||
}
|
||||
|
||||
/***************************************************************************
|
||||
* Fast complex products (GCC generates a function call which is very slow)
|
||||
***************************************************************************/
|
||||
|
@ -157,16 +157,6 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet4cf>(const P
|
||||
Packet2cf(_mm256_extractf128_ps(a.v, 1))));
|
||||
}
|
||||
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet4cf>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet4cf& first, const Packet4cf& second)
|
||||
{
|
||||
if (Offset==0) return;
|
||||
palign_impl<Offset*2,Packet8f>::run(first.v, second.v);
|
||||
}
|
||||
};
|
||||
|
||||
template<> struct conj_helper<Packet4cf, Packet4cf, false,true>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
|
||||
@ -339,16 +329,6 @@ template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet2cd>(const
|
||||
Packet1cd(_mm256_extractf128_pd(a.v,1))));
|
||||
}
|
||||
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet2cd>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet2cd& first, const Packet2cd& second)
|
||||
{
|
||||
if (Offset==0) return;
|
||||
palign_impl<Offset*2,Packet4d>::run(first.v, second.v);
|
||||
}
|
||||
};
|
||||
|
||||
template<> struct conj_helper<Packet2cd, Packet2cd, false,true>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
|
||||
|
@ -691,93 +691,6 @@ template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x)
|
||||
return _mm256_movemask_ps(x)!=0;
|
||||
}
|
||||
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet8f>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet8f& first, const Packet8f& second)
|
||||
{
|
||||
if (Offset==1)
|
||||
{
|
||||
first = _mm256_blend_ps(first, second, 1);
|
||||
Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
|
||||
Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
|
||||
first = _mm256_blend_ps(tmp1, tmp2, 0x88);
|
||||
}
|
||||
else if (Offset==2)
|
||||
{
|
||||
first = _mm256_blend_ps(first, second, 3);
|
||||
Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
|
||||
Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
|
||||
first = _mm256_blend_ps(tmp1, tmp2, 0xcc);
|
||||
}
|
||||
else if (Offset==3)
|
||||
{
|
||||
first = _mm256_blend_ps(first, second, 7);
|
||||
Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
|
||||
Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
|
||||
first = _mm256_blend_ps(tmp1, tmp2, 0xee);
|
||||
}
|
||||
else if (Offset==4)
|
||||
{
|
||||
first = _mm256_blend_ps(first, second, 15);
|
||||
Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0));
|
||||
Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
|
||||
first = _mm256_permute_ps(tmp2, _MM_SHUFFLE(3,2,1,0));
|
||||
}
|
||||
else if (Offset==5)
|
||||
{
|
||||
first = _mm256_blend_ps(first, second, 31);
|
||||
first = _mm256_permute2f128_ps(first, first, 1);
|
||||
Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
|
||||
first = _mm256_permute2f128_ps(tmp, tmp, 1);
|
||||
first = _mm256_blend_ps(tmp, first, 0x88);
|
||||
}
|
||||
else if (Offset==6)
|
||||
{
|
||||
first = _mm256_blend_ps(first, second, 63);
|
||||
first = _mm256_permute2f128_ps(first, first, 1);
|
||||
Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
|
||||
first = _mm256_permute2f128_ps(tmp, tmp, 1);
|
||||
first = _mm256_blend_ps(tmp, first, 0xcc);
|
||||
}
|
||||
else if (Offset==7)
|
||||
{
|
||||
first = _mm256_blend_ps(first, second, 127);
|
||||
first = _mm256_permute2f128_ps(first, first, 1);
|
||||
Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
|
||||
first = _mm256_permute2f128_ps(tmp, tmp, 1);
|
||||
first = _mm256_blend_ps(tmp, first, 0xee);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet4d>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet4d& first, const Packet4d& second)
|
||||
{
|
||||
if (Offset==1)
|
||||
{
|
||||
first = _mm256_blend_pd(first, second, 1);
|
||||
__m256d tmp = _mm256_permute_pd(first, 5);
|
||||
first = _mm256_permute2f128_pd(tmp, tmp, 1);
|
||||
first = _mm256_blend_pd(tmp, first, 0xA);
|
||||
}
|
||||
else if (Offset==2)
|
||||
{
|
||||
first = _mm256_blend_pd(first, second, 3);
|
||||
first = _mm256_permute2f128_pd(first, first, 1);
|
||||
}
|
||||
else if (Offset==3)
|
||||
{
|
||||
first = _mm256_blend_pd(first, second, 7);
|
||||
__m256d tmp = _mm256_permute_pd(first, 5);
|
||||
first = _mm256_permute2f128_pd(tmp, tmp, 1);
|
||||
first = _mm256_blend_pd(tmp, first, 5);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC inline void
|
||||
ptranspose(PacketBlock<Packet8f,8>& kernel) {
|
||||
__m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
|
||||
@ -1078,16 +991,6 @@ template<> EIGEN_STRONG_INLINE Packet8h pinsertlast(const Packet8h& a, Eigen::ha
|
||||
return _mm_insert_epi16(a,int(b.x),7);
|
||||
}
|
||||
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet8h>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet8h& first, const Packet8h& second)
|
||||
{
|
||||
if (Offset!=0)
|
||||
first = _mm_alignr_epi8(second,first, Offset*2);
|
||||
}
|
||||
};
|
||||
|
||||
EIGEN_STRONG_INLINE void
|
||||
ptranspose(PacketBlock<Packet8h,8>& kernel) {
|
||||
__m128i a = kernel.packet[0];
|
||||
|
@ -153,16 +153,6 @@ EIGEN_STRONG_INLINE Packet4cf predux_half_dowto4<Packet8cf>(const Packet8cf& a)
|
||||
return Packet4cf(res);
|
||||
}
|
||||
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet8cf>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet8cf& first, const Packet8cf& second)
|
||||
{
|
||||
if (Offset==0) return;
|
||||
palign_impl<Offset*2,Packet16f>::run(first.v, second.v);
|
||||
}
|
||||
};
|
||||
|
||||
template<> struct conj_helper<Packet8cf, Packet8cf, false,true>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Packet8cf pmadd(const Packet8cf& x, const Packet8cf& y, const Packet8cf& c) const
|
||||
@ -239,7 +229,7 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
|
||||
HasAbs2 = 0,
|
||||
HasMin = 0,
|
||||
HasMax = 0,
|
||||
HasSetLinear = 0,
|
||||
HasSetLinear = 0
|
||||
};
|
||||
};
|
||||
|
||||
@ -351,16 +341,6 @@ template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet4cd>(const
|
||||
Packet2cd(_mm512_extractf64x4_pd(a.v,1))));
|
||||
}
|
||||
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet4cd>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet4cd& first, const Packet4cd& second)
|
||||
{
|
||||
if (Offset==0) return;
|
||||
palign_impl<Offset*2,Packet8d>::run(first.v, second.v);
|
||||
}
|
||||
};
|
||||
|
||||
template<> struct conj_helper<Packet4cd, Packet4cd, false,true>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const
|
||||
|
@ -919,52 +919,6 @@ template<> EIGEN_STRONG_INLINE bool predux_any(const Packet16f& x)
|
||||
return !_mm512_kortestz(tmp,tmp);
|
||||
}
|
||||
|
||||
template <int Offset>
|
||||
struct palign_impl<Offset, Packet16f> {
|
||||
static EIGEN_STRONG_INLINE void run(Packet16f& first,
|
||||
const Packet16f& second) {
|
||||
if (Offset != 0) {
|
||||
__m512i first_idx = _mm512_set_epi32(
|
||||
Offset + 15, Offset + 14, Offset + 13, Offset + 12, Offset + 11,
|
||||
Offset + 10, Offset + 9, Offset + 8, Offset + 7, Offset + 6,
|
||||
Offset + 5, Offset + 4, Offset + 3, Offset + 2, Offset + 1, Offset);
|
||||
|
||||
__m512i second_idx =
|
||||
_mm512_set_epi32(Offset - 1, Offset - 2, Offset - 3, Offset - 4,
|
||||
Offset - 5, Offset - 6, Offset - 7, Offset - 8,
|
||||
Offset - 9, Offset - 10, Offset - 11, Offset - 12,
|
||||
Offset - 13, Offset - 14, Offset - 15, Offset - 16);
|
||||
|
||||
unsigned short mask = 0xFFFF;
|
||||
mask <<= (16 - Offset);
|
||||
|
||||
first = _mm512_permutexvar_ps(first_idx, first);
|
||||
Packet16f tmp = _mm512_permutexvar_ps(second_idx, second);
|
||||
first = _mm512_mask_blend_ps(mask, first, tmp);
|
||||
}
|
||||
}
|
||||
};
|
||||
template <int Offset>
|
||||
struct palign_impl<Offset, Packet8d> {
|
||||
static EIGEN_STRONG_INLINE void run(Packet8d& first, const Packet8d& second) {
|
||||
if (Offset != 0) {
|
||||
__m512i first_idx = _mm512_set_epi32(
|
||||
0, Offset + 7, 0, Offset + 6, 0, Offset + 5, 0, Offset + 4, 0,
|
||||
Offset + 3, 0, Offset + 2, 0, Offset + 1, 0, Offset);
|
||||
|
||||
__m512i second_idx = _mm512_set_epi32(
|
||||
0, Offset - 1, 0, Offset - 2, 0, Offset - 3, 0, Offset - 4, 0,
|
||||
Offset - 5, 0, Offset - 6, 0, Offset - 7, 0, Offset - 8);
|
||||
|
||||
unsigned char mask = 0xFF;
|
||||
mask <<= (8 - Offset);
|
||||
|
||||
first = _mm512_permutexvar_pd(first_idx, first);
|
||||
Packet8d tmp = _mm512_permutexvar_pd(second_idx, second);
|
||||
first = _mm512_mask_blend_pd(mask, first, tmp);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
#define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \
|
||||
|
@ -159,22 +159,6 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const P
|
||||
return pfirst<Packet2cf>(prod);
|
||||
}
|
||||
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet2cf>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second)
|
||||
{
|
||||
if (Offset==1)
|
||||
{
|
||||
#ifdef _BIG_ENDIAN
|
||||
first.v = vec_sld(first.v, second.v, 8);
|
||||
#else
|
||||
first.v = vec_sld(second.v, first.v, 8);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
|
||||
@ -346,16 +330,6 @@ template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Pack
|
||||
|
||||
template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
|
||||
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet1cd>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
|
||||
{
|
||||
// FIXME is it sure we never have to align a Packet1cd?
|
||||
// Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary...
|
||||
}
|
||||
};
|
||||
|
||||
template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
|
||||
|
@ -1524,176 +1524,6 @@ template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
|
||||
return vec_any_ne(x, pzero(x));
|
||||
}
|
||||
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet4f>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
|
||||
{
|
||||
#ifdef _BIG_ENDIAN
|
||||
switch (Offset % 4) {
|
||||
case 1:
|
||||
first = vec_sld(first, second, 4); break;
|
||||
case 2:
|
||||
first = vec_sld(first, second, 8); break;
|
||||
case 3:
|
||||
first = vec_sld(first, second, 12); break;
|
||||
}
|
||||
#else
|
||||
switch (Offset % 4) {
|
||||
case 1:
|
||||
first = vec_sld(second, first, 12); break;
|
||||
case 2:
|
||||
first = vec_sld(second, first, 8); break;
|
||||
case 3:
|
||||
first = vec_sld(second, first, 4); break;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet4i>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
|
||||
{
|
||||
#ifdef _BIG_ENDIAN
|
||||
switch (Offset % 4) {
|
||||
case 1:
|
||||
first = vec_sld(first, second, 4); break;
|
||||
case 2:
|
||||
first = vec_sld(first, second, 8); break;
|
||||
case 3:
|
||||
first = vec_sld(first, second, 12); break;
|
||||
}
|
||||
#else
|
||||
switch (Offset % 4) {
|
||||
case 1:
|
||||
first = vec_sld(second, first, 12); break;
|
||||
case 2:
|
||||
first = vec_sld(second, first, 8); break;
|
||||
case 3:
|
||||
first = vec_sld(second, first, 4); break;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet8s>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet8s& first, const Packet8s& second)
|
||||
{
|
||||
#ifdef _BIG_ENDIAN
|
||||
switch (Offset % 8) {
|
||||
case 1:
|
||||
first = vec_sld(first, second, 2); break;
|
||||
case 2:
|
||||
first = vec_sld(first, second, 4); break;
|
||||
case 3:
|
||||
first = vec_sld(first, second, 6); break;
|
||||
case 4:
|
||||
first = vec_sld(first, second, 8); break;
|
||||
case 5:
|
||||
first = vec_sld(first, second, 10); break;
|
||||
case 6:
|
||||
first = vec_sld(first, second, 12); break;
|
||||
case 7:
|
||||
first = vec_sld(first, second, 14); break;
|
||||
}
|
||||
#else
|
||||
switch (Offset % 8) {
|
||||
case 1:
|
||||
first = vec_sld(second, first, 14); break;
|
||||
case 2:
|
||||
first = vec_sld(second, first, 12); break;
|
||||
case 3:
|
||||
first = vec_sld(second, first, 10); break;
|
||||
case 4:
|
||||
first = vec_sld(second, first, 8); break;
|
||||
case 5:
|
||||
first = vec_sld(second, first, 6); break;
|
||||
case 6:
|
||||
first = vec_sld(second, first, 4); break;
|
||||
case 7:
|
||||
first = vec_sld(second, first, 2); break;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet8us>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet8us& first, const Packet8us& second)
|
||||
{
|
||||
#ifdef _BIG_ENDIAN
|
||||
switch (Offset % 8) {
|
||||
case 1:
|
||||
first = vec_sld(first, second, 2); break;
|
||||
case 2:
|
||||
first = vec_sld(first, second, 4); break;
|
||||
case 3:
|
||||
first = vec_sld(first, second, 6); break;
|
||||
case 4:
|
||||
first = vec_sld(first, second, 8); break;
|
||||
case 5:
|
||||
first = vec_sld(first, second, 10); break;
|
||||
case 6:
|
||||
first = vec_sld(first, second, 12); break;
|
||||
case 7:
|
||||
first = vec_sld(first, second, 14); break;
|
||||
}
|
||||
#else
|
||||
switch (Offset % 8) {
|
||||
case 1:
|
||||
first = vec_sld(second, first, 14); break;
|
||||
case 2:
|
||||
first = vec_sld(second, first, 12); break;
|
||||
case 3:
|
||||
first = vec_sld(second, first, 10); break;
|
||||
case 4:
|
||||
first = vec_sld(second, first, 8); break;
|
||||
case 5:
|
||||
first = vec_sld(second, first, 6); break;
|
||||
case 6:
|
||||
first = vec_sld(second, first, 4); break;
|
||||
case 7:
|
||||
first = vec_sld(second, first, 2); break;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet16c>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet16c& first, const Packet16c& second)
|
||||
{
|
||||
const int shift = Offset % 16;
|
||||
if ( shift == 0 ) return;
|
||||
#ifdef _BIG_ENDIAN
|
||||
first = vec_sld(first, second, shift);
|
||||
#else
|
||||
first = vec_sld(first, second, shift);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet16uc>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet16uc& first, const Packet16uc& second)
|
||||
{
|
||||
const int shift = Offset % 16;
|
||||
if ( shift == 0 ) return;
|
||||
#ifdef _BIG_ENDIAN
|
||||
first = vec_sld(first, second, shift);
|
||||
#else
|
||||
first = vec_sld(first, second, shift);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC inline void
|
||||
ptranspose(PacketBlock<Packet4f,4>& kernel) {
|
||||
Packet4f t0, t1, t2, t3;
|
||||
@ -2362,20 +2192,6 @@ template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
|
||||
return pfirst(pmax(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
|
||||
}
|
||||
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet2d>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
|
||||
{
|
||||
if (Offset == 1)
|
||||
#ifdef _BIG_ENDIAN
|
||||
first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(first), reinterpret_cast<Packet4ui>(second), 8));
|
||||
#else
|
||||
first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(second), reinterpret_cast<Packet4ui>(first), 8));
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC inline void
|
||||
ptranspose(PacketBlock<Packet2d,2>& kernel) {
|
||||
Packet2d t0, t1;
|
||||
|
@ -305,15 +305,6 @@ EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a
|
||||
(a.v[0] * a.v[3]) + (a.v[1] * a.v[2]));
|
||||
}
|
||||
|
||||
template <int Offset>
|
||||
struct palign_impl<Offset, Packet2cf> {
|
||||
EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second) {
|
||||
if (Offset == 1) {
|
||||
first.v = (Packet4f)__builtin_msa_sldi_b((v16i8)second.v, (v16i8)first.v, Offset * 8);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct conj_helper<Packet2cf, Packet2cf, false, true> {
|
||||
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y,
|
||||
@ -653,15 +644,6 @@ EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd&
|
||||
return pfirst(a);
|
||||
}
|
||||
|
||||
template <int Offset>
|
||||
struct palign_impl<Offset, Packet1cd> {
|
||||
static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) {
|
||||
// FIXME is it sure we never have to align a Packet1cd?
|
||||
// Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes
|
||||
// boundary...
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct conj_helper<Packet1cd, Packet1cd, false, true> {
|
||||
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y,
|
||||
|
@ -675,25 +675,6 @@ EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) {
|
||||
return m[0];
|
||||
}
|
||||
|
||||
#define PALIGN_MSA(Offset, Type, Command) \
|
||||
template <> \
|
||||
struct palign_impl<Offset, Type> { \
|
||||
EIGEN_STRONG_INLINE static void run(Type& first, const Type& second) { \
|
||||
if (Offset != 0) first = (Type)(Command((v16i8)second, (v16i8)first, Offset * 4)); \
|
||||
} \
|
||||
};
|
||||
|
||||
PALIGN_MSA(0, Packet4f, __builtin_msa_sldi_b)
|
||||
PALIGN_MSA(1, Packet4f, __builtin_msa_sldi_b)
|
||||
PALIGN_MSA(2, Packet4f, __builtin_msa_sldi_b)
|
||||
PALIGN_MSA(3, Packet4f, __builtin_msa_sldi_b)
|
||||
PALIGN_MSA(0, Packet4i, __builtin_msa_sldi_b)
|
||||
PALIGN_MSA(1, Packet4i, __builtin_msa_sldi_b)
|
||||
PALIGN_MSA(2, Packet4i, __builtin_msa_sldi_b)
|
||||
PALIGN_MSA(3, Packet4i, __builtin_msa_sldi_b)
|
||||
|
||||
#undef PALIGN_MSA
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet4f, 4>& value) {
|
||||
os << "[ " << value.packet[0] << "," << std::endl
|
||||
<< " " << value.packet[1] << "," << std::endl
|
||||
@ -1168,19 +1149,6 @@ EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {
|
||||
#endif
|
||||
}
|
||||
|
||||
#define PALIGN_MSA(Offset, Type, Command) \
|
||||
template <> \
|
||||
struct palign_impl<Offset, Type> { \
|
||||
EIGEN_STRONG_INLINE static void run(Type& first, const Type& second) { \
|
||||
if (Offset != 0) first = (Type)(Command((v16i8)second, (v16i8)first, Offset * 8)); \
|
||||
} \
|
||||
};
|
||||
|
||||
PALIGN_MSA(0, Packet2d, __builtin_msa_sldi_b)
|
||||
PALIGN_MSA(1, Packet2d, __builtin_msa_sldi_b)
|
||||
|
||||
#undef PALIGN_MSA
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet2d, 2>& value) {
|
||||
os << "[ " << value.packet[0] << "," << std::endl << " " << value.packet[1] << " ]";
|
||||
return os;
|
||||
|
@ -340,16 +340,6 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const P
|
||||
return s;
|
||||
}
|
||||
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet2cf>
|
||||
{
|
||||
EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second)
|
||||
{
|
||||
if (Offset == 1)
|
||||
first.v = vextq_f32(first.v, second.v, 2);
|
||||
}
|
||||
};
|
||||
|
||||
template<> struct conj_helper<Packet1cf,Packet1cf,false,true>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Packet1cf pmadd(const Packet1cf& x, const Packet1cf& y, const Packet1cf& c) const
|
||||
@ -602,16 +592,6 @@ template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Pack
|
||||
|
||||
template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
|
||||
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet1cd>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
|
||||
{
|
||||
// FIXME is it sure we never have to align a Packet1cd?
|
||||
// Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary...
|
||||
}
|
||||
};
|
||||
|
||||
template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
|
||||
|
@ -2708,147 +2708,6 @@ template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
|
||||
return vget_lane_u32(vpmax_u32(tmp, tmp), 0);
|
||||
}
|
||||
|
||||
// PALIGN_NEON emits one explicit specialization per (Offset, Type) pair. This works
// around a bug in LLVM Clang 3.0 that causes incorrect compilation errors,
// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
#define PALIGN_NEON(Offset,Type,Command) \
template<> \
struct palign_impl<Offset,Type> \
{ \
  EIGEN_STRONG_INLINE static void run(Type& first, const Type& second) \
  { \
    if (Offset==0) return; \
    first = Command(first, second, Offset); \
  } \
};
|
||||
|
||||
template<typename T>
|
||||
EIGEN_STRONG_INLINE T palign_4c(const T& first, const T &second, const int n)
|
||||
{
|
||||
return static_cast<T>((static_cast<uint32_t>(second) << (32 - n * 8)) | (static_cast<uint32_t>(first) >> (n * 8)));
|
||||
}
|
||||
|
||||
// Helpers that emit the PALIGN_NEON specializations for offsets 0..N-1 of one packet type.
#define PALIGN_NEON_X2(Type,Command) \
  PALIGN_NEON(0, Type, Command) \
  PALIGN_NEON(1, Type, Command)
#define PALIGN_NEON_X4(Type,Command) \
  PALIGN_NEON_X2(Type, Command) \
  PALIGN_NEON(2, Type, Command) \
  PALIGN_NEON(3, Type, Command)
#define PALIGN_NEON_X8(Type,Command) \
  PALIGN_NEON_X4(Type, Command) \
  PALIGN_NEON(4, Type, Command) \
  PALIGN_NEON(5, Type, Command) \
  PALIGN_NEON(6, Type, Command) \
  PALIGN_NEON(7, Type, Command)
#define PALIGN_NEON_X16(Type,Command) \
  PALIGN_NEON_X8(Type, Command) \
  PALIGN_NEON(8, Type, Command) \
  PALIGN_NEON(9, Type, Command) \
  PALIGN_NEON(10, Type, Command) \
  PALIGN_NEON(11, Type, Command) \
  PALIGN_NEON(12, Type, Command) \
  PALIGN_NEON(13, Type, Command) \
  PALIGN_NEON(14, Type, Command) \
  PALIGN_NEON(15, Type, Command)

PALIGN_NEON_X2(Packet2f, vext_f32)
PALIGN_NEON_X4(Packet4f, vextq_f32)

PALIGN_NEON_X4(Packet4c, palign_4c)
PALIGN_NEON_X8(Packet8c, vext_s8)
PALIGN_NEON_X16(Packet16c, vextq_s8)

PALIGN_NEON_X4(Packet4uc, palign_4c)
PALIGN_NEON_X8(Packet8uc, vext_u8)
PALIGN_NEON_X16(Packet16uc, vextq_u8)

PALIGN_NEON_X4(Packet4s, vext_s16)
PALIGN_NEON_X8(Packet8s, vextq_s16)

PALIGN_NEON_X4(Packet4us, vext_u16)
PALIGN_NEON_X8(Packet8us, vextq_u16)

PALIGN_NEON_X2(Packet2i, vext_s32)
PALIGN_NEON_X4(Packet4i, vextq_s32)

PALIGN_NEON_X2(Packet2ui, vext_u32)
PALIGN_NEON_X4(Packet4ui, vextq_u32)

PALIGN_NEON_X2(Packet2l, vextq_s64)
PALIGN_NEON_X2(Packet2ul, vextq_u64)

#undef PALIGN_NEON_X16
#undef PALIGN_NEON_X8
#undef PALIGN_NEON_X4
#undef PALIGN_NEON_X2
#undef PALIGN_NEON
|
||||
|
||||
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2f, 2>& kernel)
|
||||
{
|
||||
const float32x2x2_t z = vzip_f32(kernel.packet[0], kernel.packet[1]);
|
||||
@ -3563,22 +3422,6 @@ template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
|
||||
template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
|
||||
{ return vgetq_lane_f64(vpmaxq_f64(a,a), 0); }
|
||||
|
||||
// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors,
|
||||
// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
|
||||
#define PALIGN_NEON(Offset,Type,Command) \
|
||||
template<>\
|
||||
struct palign_impl<Offset,Type>\
|
||||
{\
|
||||
EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\
|
||||
{\
|
||||
if (Offset!=0)\
|
||||
first = Command(first, second, Offset);\
|
||||
}\
|
||||
};\
|
||||
|
||||
PALIGN_NEON(0, Packet2d, vextq_f64)
|
||||
PALIGN_NEON(1, Packet2d, vextq_f64)
|
||||
#undef PALIGN_NEON
|
||||
|
||||
EIGEN_DEVICE_FUNC inline void
|
||||
ptranspose(PacketBlock<Packet2d, 2>& kernel)
|
||||
|
@ -161,19 +161,6 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const P
|
||||
return pfirst(pmul(a, Packet2cf(_mm_movehl_ps(a.v,a.v))));
|
||||
}
|
||||
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet2cf>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second)
|
||||
{
|
||||
if (Offset==1)
|
||||
{
|
||||
first.v = _mm_movehl_ps(first.v, first.v);
|
||||
first.v = _mm_movelh_ps(first.v, second.v);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
|
||||
@ -346,16 +333,6 @@ template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const
|
||||
return pfirst(a);
|
||||
}
|
||||
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet1cd>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
|
||||
{
|
||||
// FIXME is it sure we never have to align a Packet1cd?
|
||||
// Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary...
|
||||
}
|
||||
};
|
||||
|
||||
template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
|
||||
|
@ -867,114 +867,6 @@ template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
|
||||
return _mm_movemask_ps(x) != 0x0;
|
||||
}
|
||||
|
||||
#if EIGEN_COMP_GNUC
|
||||
// template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
|
||||
// {
|
||||
// Packet4f res = b;
|
||||
// asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c));
|
||||
// return res;
|
||||
// }
|
||||
// EIGEN_STRONG_INLINE Packet4i _mm_alignr_epi8(const Packet4i& a, const Packet4i& b, const int i)
|
||||
// {
|
||||
// Packet4i res = a;
|
||||
// asm("palignr %[i], %[a], %[b] " : [b] "+x" (res) : [a] "x" (a), [i] "i" (i));
|
||||
// return res;
|
||||
// }
|
||||
#endif
|
||||
|
||||
#ifdef EIGEN_VECTORIZE_SSSE3
|
||||
// SSSE3 versions
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet4f>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
|
||||
{
|
||||
if (Offset!=0)
|
||||
first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(second), _mm_castps_si128(first), Offset*4));
|
||||
}
|
||||
};
|
||||
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet4i>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
|
||||
{
|
||||
if (Offset!=0)
|
||||
first = _mm_alignr_epi8(second,first, Offset*4);
|
||||
}
|
||||
};
|
||||
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet2d>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
|
||||
{
|
||||
if (Offset==1)
|
||||
first = _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(second), _mm_castpd_si128(first), 8));
|
||||
}
|
||||
};
|
||||
#else
|
||||
// SSE2 versions
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet4f>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
|
||||
{
|
||||
if (Offset==1)
|
||||
{
|
||||
first = _mm_move_ss(first,second);
|
||||
first = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(first),0x39));
|
||||
}
|
||||
else if (Offset==2)
|
||||
{
|
||||
first = _mm_movehl_ps(first,first);
|
||||
first = _mm_movelh_ps(first,second);
|
||||
}
|
||||
else if (Offset==3)
|
||||
{
|
||||
first = _mm_move_ss(first,second);
|
||||
first = _mm_shuffle_ps(first,second,0x93);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet4i>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
|
||||
{
|
||||
if (Offset==1)
|
||||
{
|
||||
first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
|
||||
first = _mm_shuffle_epi32(first,0x39);
|
||||
}
|
||||
else if (Offset==2)
|
||||
{
|
||||
first = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(first)));
|
||||
first = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
|
||||
}
|
||||
else if (Offset==3)
|
||||
{
|
||||
first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
|
||||
first = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second),0x93));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet2d>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
|
||||
{
|
||||
if (Offset==1)
|
||||
{
|
||||
first = _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(first),_mm_castpd_ps(first)));
|
||||
first = _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(first),_mm_castpd_ps(second)));
|
||||
}
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
EIGEN_DEVICE_FUNC inline void
|
||||
ptranspose(PacketBlock<Packet4f,4>& kernel) {
|
||||
_MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
|
||||
|
@ -160,16 +160,6 @@ template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const
|
||||
{
|
||||
return pfirst(a);
|
||||
}
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet1cd>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
|
||||
{
|
||||
// FIXME is it sure we never have to align a Packet1cd?
|
||||
// Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary...
|
||||
}
|
||||
};
|
||||
|
||||
template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
|
||||
@ -331,18 +321,6 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const P
|
||||
return res;
|
||||
}
|
||||
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet2cf>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second)
|
||||
{
|
||||
if (Offset == 1) {
|
||||
first.cd[0] = first.cd[1];
|
||||
first.cd[1] = second.cd[0];
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
|
||||
@ -457,18 +435,6 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const P
|
||||
return pfirst<Packet2cf>(prod);
|
||||
}
|
||||
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet2cf>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second)
|
||||
{
|
||||
if (Offset==1)
|
||||
{
|
||||
first.v = vec_sld(first.v, second.v, 8);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
|
||||
|
@ -298,33 +298,6 @@ inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet4i>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
|
||||
{
|
||||
switch (Offset % 4) {
|
||||
case 1:
|
||||
first = vec_sld(first, second, 4); break;
|
||||
case 2:
|
||||
first = vec_sld(first, second, 8); break;
|
||||
case 3:
|
||||
first = vec_sld(first, second, 12); break;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet2d>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
|
||||
{
|
||||
if (Offset == 1)
|
||||
first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(first), reinterpret_cast<Packet4i>(second), 8));
|
||||
}
|
||||
};
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from)
|
||||
{
|
||||
// FIXME: No intrinsic yet
|
||||
@ -636,30 +609,6 @@ template<int element> EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Pack
|
||||
return splat;
|
||||
}
|
||||
|
||||
/* This is a tricky one, we have to translate float alignment to vector elements of sizeof double
|
||||
*/
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet4f>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
|
||||
{
|
||||
switch (Offset % 4) {
|
||||
case 1:
|
||||
first.v4f[0] = vec_sld(first.v4f[0], first.v4f[1], 8);
|
||||
first.v4f[1] = vec_sld(first.v4f[1], second.v4f[0], 8);
|
||||
break;
|
||||
case 2:
|
||||
first.v4f[0] = first.v4f[1];
|
||||
first.v4f[1] = second.v4f[0];
|
||||
break;
|
||||
case 3:
|
||||
first.v4f[0] = vec_sld(first.v4f[1], second.v4f[0], 8);
|
||||
first.v4f[1] = vec_sld(second.v4f[0], second.v4f[1], 8);
|
||||
break;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
|
||||
{
|
||||
// FIXME: No intrinsic yet
|
||||
@ -942,22 +891,6 @@ template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, cons
|
||||
return result;
|
||||
}
|
||||
#else
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet4f>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
|
||||
{
|
||||
switch (Offset % 4) {
|
||||
case 1:
|
||||
first = vec_sld(first, second, 4); break;
|
||||
case 2:
|
||||
first = vec_sld(first, second, 8); break;
|
||||
case 3:
|
||||
first = vec_sld(first, second, 12); break;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
|
||||
{
|
||||
// FIXME: No intrinsic yet
|
||||
|
@ -103,7 +103,6 @@ template<typename Scalar,typename Packet> void packetmath()
|
||||
EIGEN_ALIGN_MAX Scalar data1[size];
|
||||
EIGEN_ALIGN_MAX Scalar data2[size];
|
||||
EIGEN_ALIGN_MAX Scalar data3[size];
|
||||
EIGEN_ALIGN_MAX Packet packets[PacketSize*2];
|
||||
EIGEN_ALIGN_MAX Scalar ref[size];
|
||||
RealScalar refvalue = RealScalar(0);
|
||||
for (int i=0; i<size; ++i)
|
||||
@ -163,38 +162,6 @@ template<typename Scalar,typename Packet> void packetmath()
|
||||
}
|
||||
}
|
||||
|
||||
for (int offset=0; offset<PacketSize; ++offset)
|
||||
{
|
||||
#define MIN(A,B) (A<B?A:B)
|
||||
packets[0] = internal::pload<Packet>(data1);
|
||||
packets[1] = internal::pload<Packet>(data1+PacketSize);
|
||||
if (offset==0) internal::palign<0>(packets[0], packets[1]);
|
||||
else if (offset==1) internal::palign<MIN(1,PacketSize-1)>(packets[0], packets[1]);
|
||||
else if (offset==2) internal::palign<MIN(2,PacketSize-1)>(packets[0], packets[1]);
|
||||
else if (offset==3) internal::palign<MIN(3,PacketSize-1)>(packets[0], packets[1]);
|
||||
else if (offset==4) internal::palign<MIN(4,PacketSize-1)>(packets[0], packets[1]);
|
||||
else if (offset==5) internal::palign<MIN(5,PacketSize-1)>(packets[0], packets[1]);
|
||||
else if (offset==6) internal::palign<MIN(6,PacketSize-1)>(packets[0], packets[1]);
|
||||
else if (offset==7) internal::palign<MIN(7,PacketSize-1)>(packets[0], packets[1]);
|
||||
else if (offset==8) internal::palign<MIN(8,PacketSize-1)>(packets[0], packets[1]);
|
||||
else if (offset==9) internal::palign<MIN(9,PacketSize-1)>(packets[0], packets[1]);
|
||||
else if (offset==10) internal::palign<MIN(10,PacketSize-1)>(packets[0], packets[1]);
|
||||
else if (offset==11) internal::palign<MIN(11,PacketSize-1)>(packets[0], packets[1]);
|
||||
else if (offset==12) internal::palign<MIN(12,PacketSize-1)>(packets[0], packets[1]);
|
||||
else if (offset==13) internal::palign<MIN(13,PacketSize-1)>(packets[0], packets[1]);
|
||||
else if (offset==14) internal::palign<MIN(14,PacketSize-1)>(packets[0], packets[1]);
|
||||
else if (offset==15) internal::palign<MIN(15,PacketSize-1)>(packets[0], packets[1]);
|
||||
internal::pstore(data2, packets[0]);
|
||||
|
||||
for (int i=0; i<PacketSize; ++i)
|
||||
ref[i] = data1[i+offset];
|
||||
|
||||
// palign is not used anymore, so let's just put a warning if it fails
|
||||
++g_test_level;
|
||||
VERIFY(test::areApprox(ref, data2, PacketSize) && "internal::palign");
|
||||
--g_test_level;
|
||||
}
|
||||
|
||||
VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasAdd);
|
||||
VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasSub);
|
||||
VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasMul);
|
||||
|
Loading…
x
Reference in New Issue
Block a user