diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index a3b4b67fd..c3ab0588f 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -29,6 +29,18 @@ #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16 #endif +#define ei_vec4f_swizzle1(v,p,q,r,s) \ + (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p))))) + +#define ei_vec4i_swizzle1(v,p,q,r,s) \ + (_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p)))) + +#define ei_vec4f_swizzle2(a,b,p,q,r,s) \ + (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p)))) + +#define ei_vec4i_swizzle2(a,b,p,q,r,s) \ + (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p)))))) + template<> struct ei_packet_traits<float> { typedef __m128 type; enum {size=4}; }; template<> struct ei_packet_traits<double> { typedef __m128d type; enum {size=2}; }; template<> struct ei_packet_traits<int> { typedef __m128i type; enum {size=4}; }; @@ -54,14 +66,13 @@ template<> EIGEN_STRONG_INLINE __m128d ei_pmul<__m128d>(const __m128d& a, const template<> EIGEN_STRONG_INLINE __m128i ei_pmul<__m128i>(const __m128i& a, const __m128i& b) { // this version is very slightly faster than 4 scalar products - return _mm_or_si128( - _mm_and_si128( - _mm_mul_epu32(a,b), - _mm_setr_epi32(0xffffffff,0,0xffffffff,0)), - _mm_slli_si128( - _mm_and_si128( - _mm_mul_epu32(_mm_srli_si128(a,4),_mm_srli_si128(b,4)), - _mm_setr_epi32(0xffffffff,0,0xffffffff,0)), 4)); + return ei_vec4i_swizzle1( + ei_vec4i_swizzle2( + _mm_mul_epu32(a,b), + _mm_mul_epu32(ei_vec4i_swizzle1(a,1,0,3,2), + ei_vec4i_swizzle1(b,1,0,3,2)), + 0,2,0,2), + 0,2,1,3); } template<> EIGEN_STRONG_INLINE __m128 ei_pdiv<__m128>(const __m128& a, const __m128& b) { return _mm_div_ps(a,b); } diff --git a/Eigen/src/Geometry/arch/Geometry_SSE.h b/Eigen/src/Geometry/arch/Geometry_SSE.h index 6c2933898..9be5ecd6f 100644 --- a/Eigen/src/Geometry/arch/Geometry_SSE.h +++ 
b/Eigen/src/Geometry/arch/Geometry_SSE.h @@ -26,9 +26,6 @@ #ifndef EIGEN_GEOMETRY_SSE_H #define EIGEN_GEOMETRY_SSE_H -#define vec4f_swizzle(v,p,q,r,s) (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), \ - ((s)<<6|(r)<<4|(q)<<2|(p))))) - template<> inline Quaternion<float> ei_quaternion_product(const Quaternion<float>& _a, const Quaternion<float>& _b) { @@ -36,14 +33,14 @@ ei_quaternion_product(const Quaternion<float>& _a, const Quate Quaternion<float> res; __m128 a = _a.coeffs().packet(0); __m128 b = _b.coeffs().packet(0); - __m128 flip1 = _mm_xor_ps(_mm_mul_ps(vec4f_swizzle(a,1,2,0,2), - vec4f_swizzle(b,2,0,1,2)),mask); - __m128 flip2 = _mm_xor_ps(_mm_mul_ps(vec4f_swizzle(a,3,3,3,1), - vec4f_swizzle(b,0,1,2,1)),mask); + __m128 flip1 = _mm_xor_ps(_mm_mul_ps(ei_vec4f_swizzle1(a,1,2,0,2), + ei_vec4f_swizzle1(b,2,0,1,2)),mask); + __m128 flip2 = _mm_xor_ps(_mm_mul_ps(ei_vec4f_swizzle1(a,3,3,3,1), + ei_vec4f_swizzle1(b,0,1,2,1)),mask); ei_pstore(&res.x(), - _mm_add_ps(_mm_sub_ps(_mm_mul_ps(a,vec4f_swizzle(b,3,3,3,3)), - _mm_mul_ps(vec4f_swizzle(a,2,0,1,0), - vec4f_swizzle(b,1,2,0,0))), + _mm_add_ps(_mm_sub_ps(_mm_mul_ps(a,ei_vec4f_swizzle1(b,3,3,3,3)), - _mm_mul_ps(ei_vec4f_swizzle1(a,2,0,1,0), - ei_vec4f_swizzle1(b,1,2,0,0))), _mm_add_ps(flip1,flip2))); return res; }