diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 5518058fa..751adfbfd 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -90,10 +90,10 @@ template<> EIGEN_STRONG_INLINE Packet2cf ei_pxor (const Packet2cf& template<> EIGEN_STRONG_INLINE Packet2cf ei_pandnot(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf ei_pload >(const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(_mm_load_ps((const float*)from)); } -template<> EIGEN_STRONG_INLINE Packet2cf ei_ploadu >(const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(ei_ploadu((const float*)from)); } +template<> EIGEN_STRONG_INLINE Packet2cf ei_ploadu >(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ei_ploadu((const float*)from)); } template<> EIGEN_STRONG_INLINE void ei_pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps((float*)to, from.v); } -template<> EIGEN_STRONG_INLINE void ei_pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE ei_pstoreu((float*)to, from.v); } +template<> EIGEN_STRONG_INLINE void ei_pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE ei_pstoreu((float*)to, from.v); } template<> EIGEN_STRONG_INLINE void ei_prefetch >(const std::complex * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } @@ -227,9 +227,9 @@ template<> EIGEN_STRONG_INLINE Packet1cd ei_pmul(const Packet1cd& a, { // TODO optimize it for SSE3 and 4 const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0)); - return Packet1cd(_mm_add_pd(_mm_mul_pd(_mm_unpacklo_pd(a.v, a.v), b.v), - _mm_xor_pd(_mm_mul_pd(_mm_unpackhi_pd(a.v, a.v), - _mm_shuffle_pd(b.v, b.v, 0x1)), mask))); + return Packet1cd(_mm_add_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v), + _mm_xor_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 1, 1), + ei_vec2d_swizzle1(b.v, 1, 0)), mask))); } template<> EIGEN_STRONG_INLINE Packet1cd ei_pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); } @@ -238,18 +238,18 @@ template<> EIGEN_STRONG_INLINE Packet1cd ei_pxor (const Packet1cd& template<> EIGEN_STRONG_INLINE Packet1cd ei_pandnot(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd ei_pload >(const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(_mm_load_pd((const double*)from)); } -template<> EIGEN_STRONG_INLINE Packet1cd ei_ploadu >(const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(ei_ploadu((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd ei_ploadu >(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ei_ploadu((const double*)from)); } template<> EIGEN_STRONG_INLINE Packet1cd ei_pset1 >(const std::complex& from) -{ /* FIXME here it seems we have to use unaligned loads */ return ei_ploadu(&from); } +{ /* here we really have to use unaligned loads :( */ return ei_ploadu(&from); } template<> EIGEN_STRONG_INLINE void ei_pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd((double*)to, from.v); } -template<> EIGEN_STRONG_INLINE void ei_pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE ei_pstoreu((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void ei_pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE ei_pstoreu((double*)to, from.v); } template<> EIGEN_STRONG_INLINE void ei_prefetch >(const std::complex * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE std::complex ei_pfirst(const Packet1cd& a) { - std::complex res; + EIGEN_ALIGN16 std::complex res; _mm_store_pd((double*)&res, a.v); return res; } @@ -289,9 +289,9 @@ template<> struct ei_conj_helper EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const { const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); - return Packet1cd(_mm_add_pd(_mm_xor_pd(_mm_mul_pd(_mm_unpacklo_pd(a.v, a.v), b.v), mask), - _mm_mul_pd(_mm_unpackhi_pd(a.v, a.v), - _mm_shuffle_pd(b.v, b.v, 0x1)))); + return Packet1cd(_mm_add_pd(_mm_xor_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v), mask), + _mm_mul_pd(ei_vec2d_swizzle1(a.v, 1, 1), + ei_vec2d_swizzle1(b.v, 1, 0)))); } }; @@ -303,9 +303,9 @@ template<> struct ei_conj_helper EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const { const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); - return Packet1cd(_mm_add_pd(_mm_mul_pd(_mm_unpacklo_pd(a.v, a.v), b.v), - _mm_xor_pd(_mm_mul_pd(_mm_unpackhi_pd(a.v, a.v), - _mm_shuffle_pd(b.v, b.v, 0x1)), mask))); + return Packet1cd(_mm_add_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v), + _mm_xor_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 1, 1), + ei_vec2d_swizzle1(b.v, 1, 0)), mask))); } }; @@ -317,9 +317,9 @@ template<> struct ei_conj_helper EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const { const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); - return Packet1cd(_mm_sub_pd(_mm_xor_pd(_mm_mul_pd(_mm_unpacklo_pd(a.v, a.v), b.v), mask), - _mm_mul_pd(_mm_unpackhi_pd(a.v, a.v), - _mm_shuffle_pd(b.v, b.v, 0x1)))); + return Packet1cd(_mm_sub_pd(_mm_xor_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v), mask), + _mm_mul_pd(ei_vec2d_swizzle1(a.v, 1, 1), + ei_vec2d_swizzle1(b.v, 1, 0)))); } }; diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 05c101652..9382fbde5 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -43,6 +43,9 @@ template<> struct ei_is_arithmetic<__m128d> { enum { ret = true }; }; #define ei_vec4i_swizzle1(v,p,q,r,s) \ (_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p)))) +#define ei_vec2d_swizzle1(v,p,q) \ + (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2))))) + #define ei_vec4f_swizzle2(a,b,p,q,r,s) \ (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p)))) @@ -65,7 +68,7 @@ template<> struct ei_packet_traits : ei_default_packet_traits enum { Vectorizable = 1, size=4, - + HasDiv = 1, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, @@ -103,11 +106,11 @@ template<> struct ei_unpacket_traits { typedef int type; enum {size // that is inefficient :( (e.g., see ei_gemm_pack_rhs) template<> EIGEN_STRONG_INLINE Packet4f ei_pset1(const float& from) { Packet4f res = _mm_set_ss(from); - return _mm_shuffle_ps(res,res,0); + return ei_vec4f_swizzle1(res,0,0,0,0); } template<> EIGEN_STRONG_INLINE Packet2d ei_pset1(const double& from) { Packet2d res = _mm_set_sd(from); - return _mm_unpacklo_pd(res,res); + return ei_vec2d_swizzle1(res, 0, 0); } #else template<> EIGEN_STRONG_INLINE Packet4f ei_pset1(const float& from) { return _mm_set1_ps(from); }