From 5c5231ab71ba51432a24fdfd7d36a835b44805ea Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 22 Apr 2014 16:03:19 +0200 Subject: [PATCH] Workaround gcc's default ABI not being able to distinghish between vector types of different sizes. --- CMakeLists.txt | 3 -- Eigen/src/Core/arch/SSE/Complex.h | 22 ++++---- Eigen/src/Core/arch/SSE/MathFunctions.h | 10 ++-- Eigen/src/Core/arch/SSE/PacketMath.h | 68 +++++++++++++++++-------- 4 files changed, 63 insertions(+), 40 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 838a41b79..fb13769f4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -199,9 +199,6 @@ if(NOT MSVC) option(EIGEN_TEST_AVX "Enable/Disable AVX in tests/examples" OFF) if(EIGEN_TEST_AVX) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx") - if(CMAKE_COMPILER_IS_GNUCXX) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fabi-version=6") - endif() message(STATUS "Enabling AVX in tests/examples") endif() diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 715e5a13c..d0c080c4f 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -110,8 +110,8 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); } -template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); } -template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); } +template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), Packet4f(from.v)); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), Packet4f(from.v)); } template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, int stride) @@ -145,7 +145,7 @@ template<> EIGEN_STRONG_INLINE std::complex pfirst(const Pack #endif } -template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(_mm_castps_pd(a.v)))); } +template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(Packet2d(_mm_castps_pd(a.v))))); } template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) { @@ -235,7 +235,7 @@ template<> struct conj_helper { return padd(c, pmul(x,y)); } EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const - { return Packet2cf(Eigen::internal::pmul(x, y.v)); } + { return Packet2cf(Eigen::internal::pmul(x, y.v)); } }; template<> struct conj_helper @@ -244,7 +244,7 @@ template<> struct conj_helper { return padd(c, pmul(x,y)); } EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const - { return Packet2cf(Eigen::internal::pmul(x.v, y)); } + { return Packet2cf(Eigen::internal::pmul(x.v, y)); } }; template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) @@ -300,7 +300,7 @@ template<> struct unpacket_traits { typedef std::complex type template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(a.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); } template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); @@ -338,8 +338,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) { return pset1(*from); } // FIXME force unaligned store, this is a temporary fix -template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } -template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); } template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } @@ -437,7 +437,7 @@ template<> struct conj_helper { return padd(c, pmul(x,y)); } EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const - { return Packet1cd(Eigen::internal::pmul(x, y.v)); } + { return Packet1cd(Eigen::internal::pmul(x, y.v)); } }; template<> struct conj_helper @@ -446,7 +446,7 @@ template<> struct conj_helper { return padd(c, pmul(x,y)); } EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const - { return Packet1cd(Eigen::internal::pmul(x.v, y)); } + { return Packet1cd(Eigen::internal::pmul(x.v, y)); } }; template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) @@ -459,7 +459,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, con EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) { - return Packet1cd(preverse(x.v)); + return Packet1cd(preverse(Packet2d(x.v))); } template<> EIGEN_DEVICE_FUNC inline void diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index 09f74c651..8f78b3a6c 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -63,7 +63,7 @@ Packet4f plog(const Packet4f& _x) x = _mm_or_ps(x, p4f_half); emm0 = _mm_sub_epi32(emm0, p4i_0x7f); - Packet4f e = padd(_mm_cvtepi32_ps(emm0), p4f_1); + Packet4f e = padd(Packet4f(_mm_cvtepi32_ps(emm0)), p4f_1); /* part2: if( x < SQRTHF ) { @@ -72,9 +72,9 @@ Packet4f plog(const Packet4f& _x) } else { x = x - 1.0; } */ Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF); - Packet4f tmp = _mm_and_ps(x, mask); + Packet4f tmp = pand(x, mask); x = psub(x, p4f_1); - e = psub(e, _mm_and_ps(p4f_1, mask)); + e = psub(e, pand(p4f_1, mask)); x = padd(x, tmp); Packet4f x2 = pmul(x,x); @@ -167,7 +167,7 @@ Packet4f pexp(const Packet4f& _x) emm0 = _mm_cvttps_epi32(fx); emm0 = _mm_add_epi32(emm0, p4i_0x7f); emm0 = _mm_slli_epi32(emm0, 23); - return pmul(y, _mm_castsi128_ps(emm0)); + return pmul(y, Packet4f(_mm_castsi128_ps(emm0))); } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d pexp(const Packet2d& _x) @@ -241,7 +241,7 @@ Packet2d pexp(const Packet2d& _x) emm0 = _mm_add_epi32(emm0, p4i_1023_0); emm0 = _mm_slli_epi32(emm0, 20); emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3)); - return pmul(x, _mm_castsi128_pd(emm0)); + return pmul(x, Packet2d(_mm_castsi128_pd(emm0))); } /* evaluation of 4 sines at onces, using SSE2 intrinsics. diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 89dfa6975..e7e035f4e 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -22,9 +22,35 @@ namespace internal { #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) #endif +#if defined EIGEN_VECTORIZE_AVX && defined __GNUC__ && !(defined __clang__ || defined __INTEL_COMPILER) +// With GCC's default ABI version, a __m128 or __m256 are the same types and therefore we cannot +// have overloads for both types without linking error. +// One solution is to increase ABI version using -fabi-version=4 (or greater). +// To workaround this inconvenince, we rather wrap 128bit types into the following helper +// structure: +// TODO disable this wrapper if abi-versio>=4, but to detect that without asking the user to define a macro? +template +struct eigen_packet_wrapper +{ + EIGEN_ALWAYS_INLINE operator T&() { return m_val; } + EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; } + EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {} + EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {} + EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) { + m_val = v; + return *this; + } + + T m_val; +}; +typedef eigen_packet_wrapper<__m128> Packet4f; +typedef eigen_packet_wrapper<__m128i> Packet4i; +typedef eigen_packet_wrapper<__m128d> Packet2d; +#else typedef __m128 Packet4f; typedef __m128i Packet4i; typedef __m128d Packet2d; +#endif template<> struct is_arithmetic<__m128> { enum { value = true }; }; template<> struct is_arithmetic<__m128i> { enum { value = true }; }; @@ -161,7 +187,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) } template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { - return psub(_mm_setr_epi32(0,0,0,0), a); + return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a); } template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } @@ -244,7 +270,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, con template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); } template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); } -template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast(from)); } +template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast(from)); } #if defined(_MSC_VER) template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { @@ -262,7 +288,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { E #endif } template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); } - template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast(from)); } + template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast(from)); } #else // Fast unaligned loads. Note that here we cannot directly use intrinsics: this would // require pointer casting to incompatible pointer types and leads to invalid code @@ -312,7 +338,7 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD #if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS - return _mm_loadu_si128(reinterpret_cast(from)); + return _mm_loadu_si128(reinterpret_cast(from)); #else __m128d res; res = _mm_load_sd((const double*)(from)) ; @@ -331,13 +357,13 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) { Packet4i tmp; - tmp = _mm_loadl_epi64(reinterpret_cast(from)); + tmp = _mm_loadl_epi64(reinterpret_cast(from)); return vec4i_swizzle1(tmp, 0, 0, 1, 1); } template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); } template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); } -template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast(to), from); } +template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); } template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE @@ -348,8 +374,8 @@ template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& _mm_storeh_pd((to+1), from); #endif } -template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast(to), _mm_castps_pd(from)); } -template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast(to), _mm_castsi128_pd(from)); } +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast(to), Packet2d(_mm_castps_pd(from))); } +template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast(to), Packet2d(_mm_castsi128_pd(from))); } template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, int stride) { @@ -388,13 +414,13 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const template<> EIGEN_STRONG_INLINE void pstore1(float* to, const float& a) { Packet4f pa = _mm_set_ss(a); - pstore(to, vec4f_swizzle1(pa,0,0,0,0)); + pstore(to, Packet4f(vec4f_swizzle1(pa,0,0,0,0))); } // some compilers might be tempted to perform multiple moves instead of using a vector path. template<> EIGEN_STRONG_INLINE void pstore1(double* to, const double& a) { Packet2d pa = _mm_set_sd(a); - pstore(to, vec2d_swizzle1(pa,0,0)); + pstore(to, Packet2d(vec2d_swizzle1(pa,0,0))); } #ifndef EIGEN_VECTORIZE_AVX @@ -507,10 +533,10 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) { Packet4f tmp0 = _mm_hadd_ps(a,a); - return pfirst(_mm_hadd_ps(tmp0, tmp0)); + return pfirst(_mm_hadd_ps(tmp0, tmp0)); } -template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { return pfirst(_mm_hadd_pd(a, a)); } +template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { return pfirst(_mm_hadd_pd(a, a)); } // SSSE3 version: // EIGEN_STRONG_INLINE float predux(const Packet4i& a) @@ -553,7 +579,7 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) { Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a)); - return pfirst(tmp) + pfirst(_mm_shuffle_epi32(tmp, 1)); + return pfirst(tmp) + pfirst(_mm_shuffle_epi32(tmp, 1)); } template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) @@ -576,11 +602,11 @@ template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) { Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a)); - return pfirst(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); + return pfirst(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); } template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { - return pfirst(_mm_mul_sd(a, _mm_unpackhi_pd(a,a))); + return pfirst(_mm_mul_sd(a, _mm_unpackhi_pd(a,a))); } template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) { @@ -596,17 +622,17 @@ template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) { Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a)); - return pfirst(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); + return pfirst(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); } template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) { - return pfirst(_mm_min_sd(a, _mm_unpackhi_pd(a,a))); + return pfirst(_mm_min_sd(a, _mm_unpackhi_pd(a,a))); } template<> EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) { #ifdef EIGEN_VECTORIZE_SSE4_1 Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2))); - return pfirst(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1))); + return pfirst(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1))); #else // after some experiments, it is seems this is the fastest way to implement it // for GCC (eg., it does not like using std::min after the pstore !!) @@ -622,17 +648,17 @@ template<> EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) { Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a)); - return pfirst(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); + return pfirst(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); } template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) { - return pfirst(_mm_max_sd(a, _mm_unpackhi_pd(a,a))); + return pfirst(_mm_max_sd(a, _mm_unpackhi_pd(a,a))); } template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) { #ifdef EIGEN_VECTORIZE_SSE4_1 Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2))); - return pfirst(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1))); + return pfirst(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1))); #else // after some experiments, it is seems this is the fastest way to implement it // for GCC (eg., it does not like using std::min after the pstore !!)