mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-09-12 09:23:12 +08:00
Optimize AVX pset1 for complexes and ploaddup
This commit is contained in:
parent
1dd015fea6
commit
9746396d1b
@ -78,11 +78,7 @@ template<> EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<fl
|
|||||||
|
|
||||||
// pset1<Packet4cf>: fills every (real, imag) slot of the packet with the complex value `from`.
// Old column: extracts the real and imaginary parts and builds the register with _mm256_set_ps.
// New column: reads the (real, imag) float pair of `from` as one 64-bit double and replicates it
// with _mm256_broadcast_sd. That intrinsic returns a __m256d, while Packet4cf is constructed from
// a __m256 (see the old column), so the result has to go through _mm256_castpd_ps (pd -> ps).
// _mm256_castps_pd converts in the opposite direction and mismatches both its argument and its
// result type here.
template<> EIGEN_STRONG_INLINE Packet4cf pset1<Packet4cf>(const std::complex<float>& from)
|
template<> EIGEN_STRONG_INLINE Packet4cf pset1<Packet4cf>(const std::complex<float>& from)
|
||||||
{
|
{
|
||||||
const float r = std::real(from);
|
return Packet4cf(_mm256_castpd_ps(_mm256_broadcast_sd((const double*)(const void*)&from)));
|
||||||
const float i = std::imag(from);
|
|
||||||
// Beware, _mm256_set_ps expects the scalar values in reverse order (i.e. 7 to 0)
|
|
||||||
const __m256 result = _mm256_set_ps(i, r, i, r, i, r, i, r);
|
|
||||||
return Packet4cf(result);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from)
|
template<> EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from)
|
||||||
@ -304,11 +300,9 @@ template<> EIGEN_STRONG_INLINE Packet2cd ploadu<Packet2cd>(const std::complex<do
|
|||||||
|
|
||||||
// pset1<Packet2cd>: fills both (real, imag) slots of the packet with the complex value `from`.
// Old column: extracts the real and imaginary parts and builds the register with _mm256_set_pd.
// New column: treats the address of `from` as a pointer to one 128-bit (real, imag) pair of doubles
// and replicates those 128 bits into both halves of the register with _mm256_broadcast_pd.
template<> EIGEN_STRONG_INLINE Packet2cd pset1<Packet2cd>(const std::complex<double>& from)
|
template<> EIGEN_STRONG_INLINE Packet2cd pset1<Packet2cd>(const std::complex<double>& from)
|
||||||
{
|
{
|
||||||
const double r = std::real(from);
|
// In case casting to a __m128d* is really not safe, we can still fall back to this version (much slower though):
|
||||||
const double i = std::imag(from);
|
// return Packet2cd(_mm256_loadu2_m128d((const double*)&from,(const double*)&from));
|
||||||
// Beware, _mm256_set_pd expects the scalar values in reverse order (i.e. 3 to 0)
|
return Packet2cd(_mm256_broadcast_pd((const __m128d*)(const void*)&from));
|
||||||
const __m256d result = _mm256_set_pd(i, r, i, r);
|
|
||||||
return Packet2cd(result);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// ploaddup<Packet2cd>: forwards to pset1<Packet2cd> with the single element that `from` points to.
// (Unchanged context line, shown identically in both columns of the diff.)
template<> EIGEN_STRONG_INLINE Packet2cd ploaddup<Packet2cd>(const std::complex<double>* from) { return pset1<Packet2cd>(*from); }
|
template<> EIGEN_STRONG_INLINE Packet2cd ploaddup<Packet2cd>(const std::complex<double>* from) { return pset1<Packet2cd>(*from); }
|
||||||
|
@ -183,18 +183,22 @@ template<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int* from) { EIGE
|
|||||||
// ploaddup<Packet8f>: reads four floats a0..a3 starting at `from` and returns
// {a0, a0, a1, a1, a2, a2, a3, a3}.
// Old column: unaligned 128-bit load -> {a0,a1,a2,a3}; _mm_movehl_ps yields {a2,a3,a2,a3}, which
// is inserted as the upper half; _mm256_unpacklo_ps(tmp,tmp) then duplicates the two low elements
// of each 128-bit half.
// New column (avoids _mm256_insertf128_ps):
//   1. _mm256_broadcast_ps        -> {a0,a1,a2,a3 | a0,a1,a2,a3}
//   2. blend (mask 15 = low four elements) with the low half permuted by _MM_SHUFFLE(1,0,1,0)
//                                 -> {a0,a1,a0,a1 | a0,a1,a2,a3}
//   3. _mm256_permute_ps with _MM_SHUFFLE(3,3,2,2) picks elements {2,2,3,3} of each half
//                                 -> {a0,a0,a1,a1 | a2,a2,a3,a3}
template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
|
template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
|
||||||
{
|
{
|
||||||
// TODO try to find a way to avoid the need of a temporary register
|
// TODO try to find a way to avoid the need of a temporary register
|
||||||
Packet8f tmp = _mm256_castps128_ps256(_mm_loadu_ps(from));
|
// Packet8f tmp = _mm256_castps128_ps256(_mm_loadu_ps(from));
|
||||||
tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1);
|
// tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1);
|
||||||
return _mm256_unpacklo_ps(tmp,tmp);
|
// return _mm256_unpacklo_ps(tmp,tmp);
|
||||||
|
|
||||||
|
// _mm256_insertf128_ps is very slow on Haswell, thus:
|
||||||
|
Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
|
||||||
|
// mimic an "inplace" permutation of the lower 128bits using a blend
|
||||||
|
tmp = _mm256_blend_ps(tmp,_mm256_castps128_ps256(_mm_permute_ps( _mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15);
|
||||||
|
// then we can perform a consistent permutation on the global register to get everything in shape:
|
||||||
|
return _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2));
|
||||||
}
|
}
|
||||||
// Loads 2 doubles from memory and returns the packet {a0, a0, a1, a1}
|
// Loads 2 doubles from memory and returns the packet {a0, a0, a1, a1}
|
||||||
// Old column: unaligned 128-bit load -> {a0,a1}; _mm_permute_pd(tmp0,3) gives {a1,a1} and
// _mm_permute_pd(tmp0,0) gives {a0,a0}; the {a1,a1} pair is then inserted as the upper half
// with _mm256_insertf128_pd.
// New column: _mm256_broadcast_pd -> {a0,a1 | a0,a1}; _mm256_permute_pd with control 3<<2
// (binary 1100) selects element 0 for both slots of the lower half and element 1 for both
// slots of the upper half, giving {a0,a0 | a1,a1} without _mm256_insertf128_pd.
template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from)
|
template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from)
|
||||||
{
|
{
|
||||||
// TODO try to find a way to avoid the need of a temporary register
|
Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from);
|
||||||
Packet2d tmp0 = _mm_loadu_pd(from);
|
return _mm256_permute_pd(tmp, 3<<2);
|
||||||
Packet2d tmp1 = _mm_permute_pd(tmp0,3);
|
|
||||||
tmp0 = _mm_permute_pd(tmp0,0);
|
|
||||||
return _mm256_insertf128_pd(_mm256_castpd128_pd256(tmp0), tmp1, 1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Loads 2 floats from memory and returns the packet {a0, a0, a0, a0, a1, a1, a1, a1}
|
// Loads 2 floats from memory and returns the packet {a0, a0, a0, a0, a1, a1, a1, a1}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user