From bfa606d16fcb475025f55b226617c969a623877a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 6 Jul 2010 23:36:00 +0200 Subject: [PATCH] * add a IsVectorized mechanism (instead of packet-size>1...) * vectorize complex --- Eigen/src/Core/Functors.h | 2 +- Eigen/src/Core/GenericPacketMath.h | 10 +- Eigen/src/Core/Product.h | 2 + Eigen/src/Core/arch/AltiVec/PacketMath.h | 15 +- Eigen/src/Core/arch/NEON/PacketMath.h | 15 +- Eigen/src/Core/arch/SSE/Complex.h | 177 ++++++++++++++++-- Eigen/src/Core/arch/SSE/PacketMath.h | 19 +- Eigen/src/Core/products/GeneralMatrixVector.h | 20 +- Eigen/src/Core/util/XprHelper.h | 2 +- 9 files changed, 230 insertions(+), 32 deletions(-) diff --git a/Eigen/src/Core/Functors.h b/Eigen/src/Core/Functors.h index cbe20d50e..103833447 100644 --- a/Eigen/src/Core/Functors.h +++ b/Eigen/src/Core/Functors.h @@ -470,7 +470,7 @@ struct ei_scalar_constant_op { template struct ei_functor_traits > // FIXME replace this packet test by a safe one -{ enum { Cost = 1, PacketAccess = ei_packet_traits::size>1, IsRepeatable = true }; }; +{ enum { Cost = 1, PacketAccess = ei_packet_traits::IsVectorized, IsRepeatable = true }; }; template struct ei_scalar_identity_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_identity_op) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 6cd288c55..6fb3eea19 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -82,15 +82,21 @@ struct ei_default_packet_traits template struct ei_packet_traits : ei_default_packet_traits { typedef T type; - enum {size=1}; + enum { + IsVectorized = 0, + size = 1 + }; enum { HasAdd = 0, HasSub = 0, HasMul = 0, HasNegate = 0, HasAbs = 0, + HasAbs2 = 0, HasMin = 0, - HasMax = 0 + HasMax = 0, + HasConj = 0, + HasSetLinear = 0 }; }; diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index 707b6ec85..6a205eda9 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -376,6 +376,8 @@ template<> struct ei_gemv_selector * RhsBlasTraits::extractScalarFactor(prod.rhs()); enum { + // FIXME I think here we really have to check for ei_packet_traits::size==1 + // because in this case it is fine to have an inner stride DirectlyUseRhs = ((ei_packet_traits::size==1) || (_ActualRhsType::Flags&ActualPacketAccessBit)) && (!(_ActualRhsType::Flags & RowMajorBit)) }; diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index e3454b554..403624167 100644 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -85,8 +85,12 @@ static Packet4f ei_p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)ei_p4i_MINUS1, (Pack template<> struct ei_packet_traits : ei_default_packet_traits { - typedef Packet4f type; enum {size=4}; + typedef Packet4f type; enum { + IsVectorized = 1, + size=4, + + // FIXME check the Has* HasSin = 0, HasCos = 0, HasLog = 0, @@ -95,7 +99,14 @@ template<> struct ei_packet_traits : ei_default_packet_traits }; }; template<> struct ei_packet_traits : ei_default_packet_traits -{ typedef Packet4i type; enum {size=4}; }; +{ + typedef Packet4i type; + enum { + // FIXME check the Has* + IsVectorized = 1, + size=4 + }; +}; template<> struct ei_unpacket_traits { typedef float type; enum {size=4}; }; template<> struct ei_unpacket_traits { typedef int type; enum {size=4}; }; diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 2f0efbcc9..2d6a71723 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -59,8 +59,12 @@ typedef int32x4_t Packet4i; template<> struct ei_packet_traits : ei_default_packet_traits { - typedef Packet4f type; enum {size=4}; + typedef Packet4f type; enum { + IsVectorized = 1, + size = 4, + + // FIXME check the Has* HasSin = 0, HasCos = 0, HasLog = 0, @@ -69,7 +73,14 @@ template<> struct ei_packet_traits : ei_default_packet_traits }; }; template<> struct ei_packet_traits : ei_default_packet_traits -{ typedef Packet4i type; enum {size=4}; }; +{ + typedef Packet4i type; + enum { + IsVectorized = 1, + size=4 + // FIXME check the Has* + }; +}; template<> struct ei_unpacket_traits { typedef float type; enum {size=4}; }; template<> struct ei_unpacket_traits { typedef int type; enum {size=4}; }; diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index fcc5219f5..5e9351f70 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -25,6 +25,7 @@ #ifndef EIGEN_COMPLEX_SSE_H #define EIGEN_COMPLEX_SSE_H +//---------- float ---------- struct Packet2cf { EIGEN_STRONG_INLINE Packet2cf() {} @@ -32,12 +33,13 @@ struct Packet2cf __m128 v; }; -typedef __m128d Packet1cd; - template<> struct ei_packet_traits > : ei_default_packet_traits { - typedef Packet2cf type; enum {size=2}; + typedef Packet2cf type; enum { + IsVectorized = 1, + size = 2, + HasAdd = 1, HasSub = 1, HasMul = 1, @@ -94,17 +96,6 @@ template<> EIGEN_STRONG_INLINE Packet2cf ei_pmul(const Packet2cf& a, // _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b.v), 0xb1 ))), mask)))); // } -template<> EIGEN_STRONG_INLINE Packet2cf ei_pdiv(const Packet2cf& a, const Packet2cf& b) -{ - // TODO optimize it for SSE3 and 4 - const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); - Packet2cf res(_mm_add_ps(_mm_mul_ps(a.v, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b.v), 0xa0))), - _mm_xor_ps(_mm_mul_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a.v), 0xb1)), - _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b.v), 0xf5 ))), mask))); - __m128 s = _mm_mul_ps(b.v,b.v); - return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(s), 0xb1))))); -} - template<> EIGEN_STRONG_INLINE Packet2cf ei_pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf ei_por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf ei_pxor (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); } @@ -199,4 +190,162 @@ template<> struct ei_conj_helper } }; +template<> EIGEN_STRONG_INLINE Packet2cf ei_pdiv(const Packet2cf& a, const Packet2cf& b) +{ + // TODO optimize it for SSE3 and 4 + Packet2cf res = ei_conj_helper().pmul(a,b); + __m128 s = _mm_mul_ps(b.v,b.v); + return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(s), 0xb1))))); +} + +//---------- double ---------- +struct Packet1cd +{ + EIGEN_STRONG_INLINE Packet1cd() {} + EIGEN_STRONG_INLINE explicit Packet1cd(const __m128d& a) : v(a) {} + __m128d v; +}; + +template<> struct ei_packet_traits > : ei_default_packet_traits +{ + typedef Packet1cd type; + enum { + IsVectorized = 1, + size = 1, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; + +template<> struct ei_unpacket_traits { typedef std::complex type; enum {size=1}; }; + +// template<> EIGEN_STRONG_INLINE Packet4f ei_plset >(const std::complex & a) { } + +template<> EIGEN_STRONG_INLINE Packet1cd ei_padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd ei_psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd ei_pnegate(const Packet1cd& a) { return Packet1cd(ei_pnegate(a.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd ei_pconj(const Packet1cd& a) +{ + const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); + return Packet1cd(_mm_xor_pd(a.v,mask)); +} + +template<> EIGEN_STRONG_INLINE Packet1cd ei_pmul(const Packet1cd& a, const Packet1cd& b) +{ + // TODO optimize it for SSE3 and 4 + const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0)); + return Packet1cd(_mm_add_pd(_mm_mul_pd(_mm_unpacklo_pd(a.v, a.v), b.v), + _mm_xor_pd(_mm_mul_pd(_mm_unpackhi_pd(a.v, a.v), + _mm_shuffle_pd(b.v, b.v, 0x1)), mask))); +} + +template<> EIGEN_STRONG_INLINE Packet1cd ei_pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd ei_por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd ei_pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd ei_pandnot(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); } + +template<> EIGEN_STRONG_INLINE Packet1cd ei_pload >(const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(_mm_load_pd((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd ei_ploadu >(const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(ei_ploadu((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd ei_pset1 >(const std::complex& from) +{ /* FIXME here it seems we have to use unaligned loads */ return ei_ploadu(&from); } + +template<> EIGEN_STRONG_INLINE void ei_pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void ei_pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE ei_pstoreu((double*)to, from.v); } + +template<> EIGEN_STRONG_INLINE void ei_prefetch >(const std::complex * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } + +template<> EIGEN_STRONG_INLINE std::complex ei_pfirst(const Packet1cd& a) +{ + std::complex res; + _mm_store_pd((double*)&res, a.v); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet1cd ei_preverse(const Packet1cd& a) { return a; } + +// template<> EIGEN_STRONG_INLINE Packet2cf ei_pabs(const Packet2cf& a) {} + +template<> EIGEN_STRONG_INLINE std::complex ei_predux(const Packet1cd& a) +{ + return ei_pfirst(a); +} + +template<> EIGEN_STRONG_INLINE Packet1cd ei_preduxp(const Packet1cd* vecs) +{ + return vecs[0]; +} + +template<> EIGEN_STRONG_INLINE std::complex ei_predux_mul(const Packet1cd& a) +{ + return ei_pfirst(a); +} + +template +struct ei_palign_impl +{ + EIGEN_STRONG_INLINE static void run(Packet1cd& first, const Packet1cd& second) + { + } +}; + +template<> struct ei_conj_helper +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return ei_padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { +// return ei_pmul(a,ei_pconj(b)); + const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); + return Packet1cd(_mm_add_pd(_mm_xor_pd(_mm_mul_pd(_mm_unpacklo_pd(a.v, a.v), b.v), mask), + _mm_mul_pd(_mm_unpackhi_pd(a.v, a.v), + _mm_shuffle_pd(b.v, b.v, 0x1)))); + } +}; + +template<> struct ei_conj_helper +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return ei_padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); + return Packet1cd(_mm_add_pd(_mm_mul_pd(_mm_unpacklo_pd(a.v, a.v), b.v), + _mm_xor_pd(_mm_mul_pd(_mm_unpackhi_pd(a.v, a.v), + _mm_shuffle_pd(b.v, b.v, 0x1)), mask))); + } +}; + +template<> struct ei_conj_helper +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return ei_padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); + return Packet1cd(_mm_sub_pd(_mm_xor_pd(_mm_mul_pd(_mm_unpacklo_pd(a.v, a.v), b.v), mask), + _mm_mul_pd(_mm_unpackhi_pd(a.v, a.v), + _mm_shuffle_pd(b.v, b.v, 0x1)))); + } +}; + +template<> EIGEN_STRONG_INLINE Packet1cd ei_pdiv(const Packet1cd& a, const Packet1cd& b) +{ + // TODO optimize it for SSE3 and 4 + Packet1cd res = ei_conj_helper().pmul(a,b); + __m128d s = _mm_mul_pd(b.v,b.v); + return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1)))); +} + #endif // EIGEN_COMPLEX_SSE_H diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 141754d98..43b8ef488 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -61,8 +61,11 @@ template<> struct ei_is_arithmetic<__m128d> { enum { ret = true }; }; template<> struct ei_packet_traits : ei_default_packet_traits { - typedef Packet4f type; enum {size=4}; + typedef Packet4f type; enum { + IsVectorized = 1, + size=4, + HasDiv = 1, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, @@ -73,13 +76,23 @@ template<> struct ei_packet_traits : ei_default_packet_traits }; template<> struct ei_packet_traits : ei_default_packet_traits { - typedef Packet2d type; enum {size=2}; + typedef Packet2d type; enum { + IsVectorized = 1, + size=2, + HasDiv = 1 }; }; template<> struct ei_packet_traits : ei_default_packet_traits -{ typedef Packet4i type; enum {size=4}; }; +{ + typedef Packet4i type; + enum { + // FIXME check the Has* + IsVectorized = 1, + size=4 + }; +}; template<> struct ei_unpacket_traits { typedef float type; enum {size=4}; }; template<> struct ei_unpacket_traits { typedef double type; enum {size=2}; }; diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index 4cf843422..4df13f3be 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -55,7 +55,10 @@ void ei_cache_friendly_product_colmajor_times_vector( typedef typename NumTraits::Real RealScalar; typedef typename ei_packet_traits::type Packet; - const Index PacketSize = sizeof(Packet)/sizeof(Scalar); + enum { + PacketSize = sizeof(Packet)/sizeof(Scalar), + IsVectorized = ei_packet_traits::IsVectorized + }; ei_conj_helper cj; ei_conj_helper pcj; @@ -128,7 +131,7 @@ void ei_cache_friendly_product_colmajor_times_vector( const Scalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; - if (PacketSize>1) + if (IsVectorized) { /* explicit vectorization */ // process initial unaligned coeffs @@ -216,7 +219,7 @@ void ei_cache_friendly_product_colmajor_times_vector( Packet ptmp0 = ei_pset1(alpha*rhs[i]); const Scalar* lhs0 = lhs + i*lhsStride; - if (PacketSize>1) + if (IsVectorized) { /* explicit vectorization */ // process first unaligned result's coeffs @@ -244,7 +247,7 @@ void ei_cache_friendly_product_colmajor_times_vector( } else break; - } while(PacketSize>1); + } while(IsVectorized); #undef _EIGEN_ACCUMULATE_PACKETS } @@ -269,7 +272,10 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( typedef typename NumTraits::Real RealScalar; typedef typename ei_packet_traits::type Packet; - const Index PacketSize = sizeof(Packet)/sizeof(Scalar); + enum { + PacketSize = sizeof(Packet)/sizeof(Scalar), + IsVectorized = ei_packet_traits::IsVectorized + }; ei_conj_helper cj; ei_conj_helper pcj; @@ -341,7 +347,7 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( const Scalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; - if (PacketSize>1) + if (IsVectorized) { /* explicit vectorization */ Packet ptmp0 = ei_pset1(Scalar(0)), ptmp1 = ei_pset1(Scalar(0)), ptmp2 = ei_pset1(Scalar(0)), ptmp3 = ei_pset1(Scalar(0)); @@ -470,7 +476,7 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( } else break; - } while(PacketSize>1); + } while(IsVectorized); #undef _EIGEN_ACCUMULATE_PACKETS } diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index f33b576ea..a5386979c 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -151,7 +151,7 @@ class ei_compute_matrix_flags ) ) ? AlignedBit : 0, - packet_access_bit = ei_packet_traits::size > 1 && aligned_bit ? PacketAccessBit : 0 + packet_access_bit = ei_packet_traits::IsVectorized && aligned_bit ? PacketAccessBit : 0 }; public: