mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-09-12 09:23:12 +08:00
wip: extend the gebp kernel to optimize complex and mixed products
This commit is contained in:
parent
45362f4eae
commit
cd0e5dca9b
@ -167,6 +167,10 @@ ei_pload(const typename ei_unpacket_traits<Packet>::type* from) { return *from;
|
|||||||
template<typename Packet> inline Packet
|
template<typename Packet> inline Packet
|
||||||
ei_ploadu(const typename ei_unpacket_traits<Packet>::type* from) { return *from; }
|
ei_ploadu(const typename ei_unpacket_traits<Packet>::type* from) { return *from; }
|
||||||
|
|
||||||
|
/** \internal \returns a packet with elements of \a *from duplicated, e.g.: (from[0],from[0],from[1],from[1]) */
|
||||||
|
template<typename Packet> inline Packet
|
||||||
|
ei_ploaddup(const typename ei_unpacket_traits<Packet>::type* from) { return *from; }
|
||||||
|
|
||||||
/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
|
/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
|
||||||
template<typename Packet> inline Packet
|
template<typename Packet> inline Packet
|
||||||
ei_pset1(const typename ei_unpacket_traits<Packet>::type& a) { return a; }
|
ei_pset1(const typename ei_unpacket_traits<Packet>::type& a) { return a; }
|
||||||
|
@ -73,9 +73,12 @@ template<> EIGEN_STRONG_INLINE Packet2cf ei_pmul<Packet2cf>(const Packet2cf& a,
|
|||||||
{
|
{
|
||||||
// TODO optimize it for SSE3 and 4
|
// TODO optimize it for SSE3 and 4
|
||||||
#ifdef EIGEN_VECTORIZE_SSE3
|
#ifdef EIGEN_VECTORIZE_SSE3
|
||||||
return Packet2cf(_mm_addsub_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
|
return Packet2cf(_mm_addsub_ps(_mm_mul_ps(_mm_moveldup_ps(a.v), b.v),
|
||||||
_mm_mul_ps(ei_vec4f_swizzle1(a.v, 1, 1, 3, 3),
|
_mm_mul_ps(_mm_movehdup_ps(a.v),
|
||||||
ei_vec4f_swizzle1(b.v, 1, 0, 3, 2))));
|
ei_vec4f_swizzle1(b.v, 1, 0, 3, 2))));
|
||||||
|
// return Packet2cf(_mm_addsub_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
|
||||||
|
// _mm_mul_ps(ei_vec4f_swizzle1(a.v, 1, 1, 3, 3),
|
||||||
|
// ei_vec4f_swizzle1(b.v, 1, 0, 3, 2))));
|
||||||
#else
|
#else
|
||||||
const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x00000000,0x80000000,0x00000000));
|
const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x00000000,0x80000000,0x00000000));
|
||||||
return Packet2cf(_mm_add_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
|
return Packet2cf(_mm_add_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
|
||||||
@ -224,6 +227,12 @@ template<> EIGEN_STRONG_INLINE Packet2cf ei_pdiv<Packet2cf>(const Packet2cf& a,
|
|||||||
return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(s), 0xb1)))));
|
return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(s), 0xb1)))));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** \internal \returns \a x with the two floats of each adjacent pair exchanged,
  * i.e. lanes (0,1,2,3) become (1,0,3,2) - presumably the real and imaginary part
  * of each of the two packed complex values. */
EIGEN_STRONG_INLINE Packet2cf ei_pcplxflip/*<Packet2cf>*/(const Packet2cf& x)
{
  const __m128 pairwiseSwapped = ei_vec4f_swizzle1(x.v, 1, 0, 3, 2);
  return Packet2cf(pairwiseSwapped);
}
|
||||||
|
|
||||||
|
|
||||||
//---------- double ----------
|
//---------- double ----------
|
||||||
struct Packet1cd
|
struct Packet1cd
|
||||||
{
|
{
|
||||||
@ -268,9 +277,13 @@ template<> EIGEN_STRONG_INLINE Packet1cd ei_pmul<Packet1cd>(const Packet1cd& a,
|
|||||||
{
|
{
|
||||||
// TODO optimize it for SSE3 and 4
|
// TODO optimize it for SSE3 and 4
|
||||||
#ifdef EIGEN_VECTORIZE_SSE3
|
#ifdef EIGEN_VECTORIZE_SSE3
|
||||||
return Packet1cd(_mm_addsub_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v),
|
// return Packet1cd(_mm_addsub_pd(_mm_mul_pd(a.v, b.v),
|
||||||
_mm_mul_pd(ei_vec2d_swizzle1(a.v, 1, 1),
|
// _mm_mul_pd(a.v, b.v/*ei_vec2d_swizzle1(b.v, 1, 0)*/)));
|
||||||
ei_vec2d_swizzle1(b.v, 1, 0))));
|
return Packet1cd(_mm_add_pd(_mm_mul_pd(a.v, b.v),
|
||||||
|
_mm_mul_pd(a.v, ei_vec2d_swizzle1(b.v, 1, 0))));
|
||||||
|
// return Packet1cd(_mm_addsub_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v),
|
||||||
|
// _mm_mul_pd(ei_vec2d_swizzle1(a.v, 1, 1),
|
||||||
|
// ei_vec2d_swizzle1(b.v, 1, 0))));
|
||||||
#else
|
#else
|
||||||
const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0));
|
const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0));
|
||||||
return Packet1cd(_mm_add_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v),
|
return Packet1cd(_mm_add_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v),
|
||||||
@ -286,14 +299,14 @@ template<> EIGEN_STRONG_INLINE Packet1cd ei_pandnot<Packet1cd>(const Packet1cd&
|
|||||||
|
|
||||||
// FIXME force unaligned load, this is a temporary fix
|
// FIXME this used to force an unaligned load as a temporary fix; ei_pload below now forwards to ei_pload<Packet2d>
|
||||||
template<> EIGEN_STRONG_INLINE Packet1cd ei_pload <Packet1cd>(const std::complex<double>* from)
|
template<> EIGEN_STRONG_INLINE Packet1cd ei_pload <Packet1cd>(const std::complex<double>* from)
|
||||||
{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(ei_ploadu<Packet2d>((const double*)from)); }
|
{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(ei_pload<Packet2d>((const double*)from)); }
|
||||||
template<> EIGEN_STRONG_INLINE Packet1cd ei_ploadu<Packet1cd>(const std::complex<double>* from)
|
template<> EIGEN_STRONG_INLINE Packet1cd ei_ploadu<Packet1cd>(const std::complex<double>* from)
|
||||||
{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ei_ploadu<Packet2d>((const double*)from)); }
|
{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ei_ploadu<Packet2d>((const double*)from)); }
|
||||||
template<> EIGEN_STRONG_INLINE Packet1cd ei_pset1<Packet1cd>(const std::complex<double>& from)
|
template<> EIGEN_STRONG_INLINE Packet1cd ei_pset1<Packet1cd>(const std::complex<double>& from)
|
||||||
{ /* here we really have to use unaligned loads :( */ return ei_ploadu<Packet1cd>(&from); }
|
{ /* here we really have to use unaligned loads :( */ return ei_ploadu<Packet1cd>(&from); }
|
||||||
|
|
||||||
// FIXME force unaligned store, this is a temporary fix
|
// FIXME this used to force an unaligned store as a temporary fix; ei_pstore below now forwards to ei_pstore on the underlying doubles
|
||||||
template<> EIGEN_STRONG_INLINE void ei_pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE ei_pstoreu((double*)to, from.v); }
|
template<> EIGEN_STRONG_INLINE void ei_pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE ei_pstore((double*)to, from.v); }
|
||||||
template<> EIGEN_STRONG_INLINE void ei_pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE ei_pstoreu((double*)to, from.v); }
|
template<> EIGEN_STRONG_INLINE void ei_pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE ei_pstoreu((double*)to, from.v); }
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE void ei_prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
|
template<> EIGEN_STRONG_INLINE void ei_prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
|
||||||
@ -415,4 +428,9 @@ template<> EIGEN_STRONG_INLINE Packet1cd ei_pdiv<Packet1cd>(const Packet1cd& a,
|
|||||||
return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1))));
|
return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1))));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** \internal \returns \a x with its two doubles in reversed order (via ei_preverse) -
  * presumably the real and imaginary part of the single packed complex value. */
EIGEN_STRONG_INLINE Packet1cd ei_pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
{
  const Packet2d reversed = ei_preverse(x.v);
  return Packet1cd(reversed);
}
|
||||||
|
|
||||||
#endif // EIGEN_COMPLEX_SSE_H
|
#endif // EIGEN_COMPLEX_SSE_H
|
||||||
|
@ -45,6 +45,8 @@ template<> struct ei_is_arithmetic<__m128d> { enum { ret = true }; };
|
|||||||
|
|
||||||
#define ei_vec2d_swizzle1(v,p,q) \
|
#define ei_vec2d_swizzle1(v,p,q) \
|
||||||
(_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2)))))
|
(_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2)))))
|
||||||
|
// alternative (disabled) definition: #define ei_vec2d_swizzle1(v,p,q)
|
||||||
|
//   (_mm_shuffle_pd(v,v, (q)<<1|(p) ))
|
||||||
|
|
||||||
#define ei_vec4f_swizzle2(a,b,p,q,r,s) \
|
#define ei_vec4f_swizzle2(a,b,p,q,r,s) \
|
||||||
(_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p))))
|
(_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p))))
|
||||||
@ -255,6 +257,21 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<Packet4i>(const int* from)
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/** \internal \returns the packet (from[0],from[0],from[1],from[1]).
  * Only from[0] and from[1] are read. */
template<> EIGEN_STRONG_INLINE Packet4f ei_ploaddup<Packet4f>(const float* from)
{
  // _mm_loadl_pi overwrites the two low lanes only and copies the two high lanes from
  // its first operand. That operand therefore has to be a defined value: passing a
  // default-constructed (uninitialized) packet reads an indeterminate value and
  // triggers "may be used uninitialized" diagnostics. A zeroed register is enough,
  // because the high lanes are discarded by the swizzle below.
  const Packet4f low = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(from));
  return ei_vec4f_swizzle1(low, 0, 0, 1, 1);
}
|
||||||
|
/** \internal Packet2d version of ei_ploaddup: only from[0] is read, and it is
  * broadcast to the whole packet through ei_pset1. */
template<> EIGEN_STRONG_INLINE Packet2d ei_ploaddup<Packet2d>(const double* from)
{
  return ei_pset1<Packet2d>(*from);
}
|
||||||
|
/** \internal \returns the packet (from[0],from[0],from[1],from[1]).
  * The two ints are fetched with one 64-bit load into the low half of a register,
  * then each one is duplicated by the swizzle. */
template<> EIGEN_STRONG_INLINE Packet4i ei_ploaddup<Packet4i>(const int* from)
{
  const Packet4i lowPair = _mm_loadl_epi64(reinterpret_cast<const Packet4i*>(from));
  return ei_vec4i_swizzle1(lowPair, 0, 0, 1, 1);
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE void ei_pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
|
template<> EIGEN_STRONG_INLINE void ei_pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
|
||||||
template<> EIGEN_STRONG_INLINE void ei_pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
|
template<> EIGEN_STRONG_INLINE void ei_pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
|
||||||
template<> EIGEN_STRONG_INLINE void ei_pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<Packet4i*>(to), from); }
|
template<> EIGEN_STRONG_INLINE void ei_pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<Packet4i*>(to), from); }
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -170,7 +170,7 @@ static void run(Index rows, Index cols, Index depth,
|
|||||||
// this is the sequential version!
|
// this is the sequential version!
|
||||||
std::size_t sizeA = kc*mc;
|
std::size_t sizeA = kc*mc;
|
||||||
std::size_t sizeB = kc*cols;
|
std::size_t sizeB = kc*cols;
|
||||||
std::size_t sizeW = kc*ei_packet_traits<RhsScalar>::size*Blocking::nr;
|
std::size_t sizeW = kc*ei_packet_traits<RhsScalar>::size*Blocking::nr*2;
|
||||||
LhsScalar *blockA = blocking.blockA()==0 ? ei_aligned_stack_new(LhsScalar, sizeA) : blocking.blockA();
|
LhsScalar *blockA = blocking.blockA()==0 ? ei_aligned_stack_new(LhsScalar, sizeA) : blocking.blockA();
|
||||||
RhsScalar *blockB = blocking.blockB()==0 ? ei_aligned_stack_new(RhsScalar, sizeB) : blocking.blockB();
|
RhsScalar *blockB = blocking.blockB()==0 ? ei_aligned_stack_new(RhsScalar, sizeB) : blocking.blockB();
|
||||||
RhsScalar *blockW = blocking.blockW()==0 ? ei_aligned_stack_new(RhsScalar, sizeW) : blocking.blockW();
|
RhsScalar *blockW = blocking.blockW()==0 ? ei_aligned_stack_new(RhsScalar, sizeW) : blocking.blockW();
|
||||||
|
@ -160,8 +160,8 @@ struct ei_product_blocking_traits
|
|||||||
enum {
|
enum {
|
||||||
Vectorizable = ei_packet_traits<LhsScalar>::Vectorizable
|
Vectorizable = ei_packet_traits<LhsScalar>::Vectorizable
|
||||||
&& ei_packet_traits<RhsScalar>::Vectorizable
|
&& ei_packet_traits<RhsScalar>::Vectorizable
|
||||||
&& (ei_is_same_type<LhsScalar,RhsScalar>::ret
|
/*&& (ei_is_same_type<LhsScalar,RhsScalar>::ret
|
||||||
|| (NumTraits<LhsScalar>::IsComplex && !NumTraits<RhsScalar>::IsComplex)),
|
|| (NumTraits<LhsScalar>::IsComplex && !NumTraits<RhsScalar>::IsComplex))*/,
|
||||||
LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1,
|
LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1,
|
||||||
NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
|
NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user