Disable usage of SSE3 haddpd that is extremely slow.

This commit is contained in:
Gael Guennebaud 2016-11-22 16:58:31 +01:00
parent 6a84246a6a
commit f3fb0a1940

View File

@ -504,6 +504,7 @@ template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
{ {
return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3])); return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3]));
} }
template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs) template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
{ {
return _mm_hadd_pd(vecs[0], vecs[1]); return _mm_hadd_pd(vecs[0], vecs[1]);
@ -515,7 +516,6 @@ template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
return pfirst<Packet4f>(_mm_hadd_ps(tmp0, tmp0)); return pfirst<Packet4f>(_mm_hadd_ps(tmp0, tmp0));
} }
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return pfirst<Packet2d>(_mm_hadd_pd(a, a)); }
#else #else
// SSE2 versions // SSE2 versions
template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
@ -523,10 +523,6 @@ template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a)); Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
} }
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
{
return pfirst<Packet2d>(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
}
template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs) template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
{ {
@ -548,6 +544,16 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
} }
#endif // SSE3 #endif // SSE3
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
{
// Disable SSE3 _mm_hadd_pd that is extremely slow on all existing Intel's architectures
// (from Nehalem to Haswell)
// #ifdef EIGEN_VECTORIZE_SSE3
// return pfirst<Packet2d>(_mm_hadd_pd(a, a));
// #else
return pfirst<Packet2d>(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
// #endif
}
#ifdef EIGEN_VECTORIZE_SSSE3 #ifdef EIGEN_VECTORIZE_SSSE3
template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs) template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)