From f3fb0a1940c93c2eea2342b20506d652050ff48b Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 22 Nov 2016 16:58:31 +0100 Subject: [PATCH] Disable usage of SSE3 haddpd that is extremely slow. --- Eigen/src/Core/arch/SSE/PacketMath.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 6f31cf12b..3646abdb1 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -504,6 +504,7 @@ template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) { return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3])); } + template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) { return _mm_hadd_pd(vecs[0], vecs[1]); @@ -515,7 +516,6 @@ template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) return pfirst(_mm_hadd_ps(tmp0, tmp0)); } -template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { return pfirst(_mm_hadd_pd(a, a)); } #else // SSE2 versions template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) @@ -523,10 +523,6 @@ template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a)); return pfirst(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); } -template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) -{ - return pfirst(_mm_add_sd(a, _mm_unpackhi_pd(a,a))); -} template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) { @@ -548,6 +544,16 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) } #endif // SSE3 +template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) +{ + // Disable SSE3 _mm_hadd_pd, which is extremely slow on all existing Intel architectures + // (from Nehalem to Haswell) +// #ifdef EIGEN_VECTORIZE_SSE3 +// return pfirst(_mm_hadd_pd(a, a)); +// #else + return pfirst(_mm_add_sd(a, _mm_unpackhi_pd(a,a))); +// #endif +} #ifdef EIGEN_VECTORIZE_SSSE3 
template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs)