From a6655dd91aea66a7e617031e87ca7f34dce2a639 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Sat, 3 May 2008 00:45:08 +0000
Subject: [PATCH] added packet mul-add function (ei_pmadd) and updated Product
 to use it. This changes nothing for the current SSE architecture but might
 be helpful for AltiVec/Cell and upcoming AMD processors.

---
 Eigen/src/Core/PacketMath.h | 18 ++++++++++++++++--
 Eigen/src/Core/Product.h    | 32 ++++++++++++--------------------
 2 files changed, 28 insertions(+), 22 deletions(-)

diff --git a/Eigen/src/Core/PacketMath.h b/Eigen/src/Core/PacketMath.h
index 3534452ff..5a5f15dad 100644
--- a/Eigen/src/Core/PacketMath.h
+++ b/Eigen/src/Core/PacketMath.h
@@ -29,15 +29,26 @@
 // In practice these functions are provided to make easier the writting
 // of generic vectorized code. However, at runtime, they should never be
 // called, TODO so sould we raise an assertion or not ?
+/** \internal \returns a + b (coeff-wise) */
 template<typename Scalar> inline Scalar ei_padd(const Scalar& a, const Scalar& b) { return a + b; }
+/** \internal \returns a - b (coeff-wise) */
 template<typename Scalar> inline Scalar ei_psub(const Scalar& a, const Scalar& b) { return a - b; }
+/** \internal \returns a * b (coeff-wise) */
 template<typename Scalar> inline Scalar ei_pmul(const Scalar& a, const Scalar& b) { return a * b; }
+/** \internal \returns a * b + c (coeff-wise) */
+template<typename Scalar> inline Scalar ei_pmadd(const Scalar& a, const Scalar& b, const Scalar& c)
+{ return ei_padd(ei_pmul(a, b),c); }
+/** \internal \returns the min of \a a and \a b (coeff-wise) */
 template<typename Scalar> inline Scalar ei_pmin(const Scalar& a, const Scalar& b) { return std::min(a,b); }
+/** \internal \returns the max of \a a and \a b (coeff-wise) */
 template<typename Scalar> inline Scalar ei_pmax(const Scalar& a, const Scalar& b) { return std::max(a,b); }
+/** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
 template<typename Scalar> inline Scalar ei_pload(const Scalar* from) { return *from; }
-template<typename Scalar> inline Scalar ei_pload1(const Scalar* from) { return *from; }
-template<typename Scalar> inline Scalar ei_pset1(const Scalar& from) { return from; }
+/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
+template<typename Scalar> inline Scalar ei_pset1(const Scalar& a) { return a; }
+/** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
 template<typename Scalar> inline void ei_pstore(Scalar* to, const Scalar& from) { (*to) = from; }
+/** \internal \returns the first element of a packet */
 template<typename Scalar> inline Scalar ei_pfirst(const Scalar& a) { return a; }
 
 #ifdef EIGEN_VECTORIZE_SSE
@@ -68,6 +79,9 @@ inline __m128i ei_pmul(const __m128i& a, const __m128i& b)
         _mm_setr_epi32(0xffffffff,0,0xffffffff,0)), 4));
 }
 
+// for some weird reasons, it has to be overloaded for integer packets
+inline __m128i ei_pmadd(const __m128i& a, const __m128i& b, const __m128i& c) { return ei_padd(ei_pmul(a,b), c); }
+
 inline __m128 ei_pmin(const __m128& a, const __m128& b) { return _mm_min_ps(a,b); }
 inline __m128d ei_pmin(const __m128d& a, const __m128d& b) { return _mm_min_pd(a,b); }
 // FIXME this vectorized min operator is likely to be slower than the standard one
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index b1892e1df..04b6fb9c0 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -69,7 +69,7 @@ struct ei_packet_product_unroller
   static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
   {
     ei_packet_product_unroller::run(row, col, lhs, rhs, res);
-    res = ei_padd(res, ei_pmul(ei_pset1(lhs.coeff(row, Index)), rhs.packetCoeff(Index, col)));
+    res = ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.packetCoeff(Index, col), res);
   }
 };
 
@@ -79,7 +79,7 @@ struct ei_packet_product_unroller
   static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
   {
     ei_packet_product_unroller::run(row, col, lhs, rhs, res);
-    res = ei_padd(res, ei_pmul(lhs.packetCoeff(row, Index), ei_pset1(rhs.coeff(Index, col))));
+    res = ei_pmadd(lhs.packetCoeff(row, Index), ei_pset1(rhs.coeff(Index, col)), res);
   }
 };
 
@@ -249,7 +249,7 @@ template class Product : ei_no_assignm
     PacketScalar res;
     res = ei_pmul(ei_pset1(m_lhs.coeff(row, 0)),m_rhs.packetCoeff(0, col));
     for(int i = 1; i < m_lhs.cols(); i++)
-      res = ei_padd(res, ei_pmul(ei_pset1(m_lhs.coeff(row, i)), m_rhs.packetCoeff(i, col)));
+      res = ei_pmadd(ei_pset1(m_lhs.coeff(row, i)), m_rhs.packetCoeff(i, col), res);
     return res;
   }
 
@@ -258,7 +258,7 @@ template class Product : ei_no_assignm
     PacketScalar res;
     res = ei_pmul(m_lhs.packetCoeff(row, 0), ei_pset1(m_rhs.coeff(0, col)));
     for(int i = 1; i < m_lhs.cols(); i++)
-      res = ei_padd(res, ei_pmul(m_lhs.packetCoeff(row, i), ei_pset1(m_rhs.coeff(i, col))));
+      res = ei_pmadd(m_lhs.packetCoeff(row, i), ei_pset1(m_rhs.coeff(i, col)), res);
     return res;
   }
 
@@ -398,17 +398,13 @@ void Product::_cacheOptimalEval(DestDerived& res, ei_meta_true
       const typename ei_packet_traits<Scalar>::type tmp3 = ei_pset1(m_lhs.coeff(k,j+3));
       for (int i=0; i<this->cols(); i+=ei_packet_traits<Scalar>::size)
       {
+        // FIXME the following could be implemented using only mul-add, check if this is still OK for SSE
         res.writePacketCoeff(k,i,
           ei_padd(
            res.packetCoeff(k,i),
            ei_padd(
-             ei_padd(
-               ei_pmul(tmp0, m_rhs.packetCoeff(j+0,i)),
-               ei_pmul(tmp1, m_rhs.packetCoeff(j+1,i))),
-             ei_padd(
-               ei_pmul(tmp2, m_rhs.packetCoeff(j+2,i)),
-               ei_pmul(tmp3, m_rhs.packetCoeff(j+3,i))
-             )
+             ei_pmadd(tmp0, m_rhs.packetCoeff(j+0,i), ei_pmul(tmp1, m_rhs.packetCoeff(j+1,i))),
+             ei_pmadd(tmp2, m_rhs.packetCoeff(j+2,i), ei_pmul(tmp3, m_rhs.packetCoeff(j+3,i)))
            )
          )
        );
@@ -421,7 +417,7 @@ void Product::_cacheOptimalEval(DestDerived& res, ei_meta_true
     {
       const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_lhs.coeff(k,j));
      for (int i=0; i<this->cols(); i+=ei_packet_traits<Scalar>::size)
-        res.writePacketCoeff(k,i, ei_pmul(tmp, m_rhs.packetCoeff(j,i)));
+        res.writePacketCoeff(k,i, ei_pmadd(tmp, m_rhs.packetCoeff(j,i), res.packetCoeff(k,i)));
     }
   }
 }
@@ -443,13 +439,9 @@ void Product::_cacheOptimalEval(DestDerived& res, ei_meta_true
          ei_padd(
            res.packetCoeff(i,k),
            ei_padd(
-             ei_padd(
-               ei_pmul(tmp0, m_lhs.packetCoeff(i,j)),
-               ei_pmul(tmp1, m_lhs.packetCoeff(i,j+1))),
-             ei_padd(
-               ei_pmul(tmp2, m_lhs.packetCoeff(i,j+2)),
-               ei_pmul(tmp3, m_lhs.packetCoeff(i,j+3))
-             )
+             ei_pmadd(tmp0, m_lhs.packetCoeff(i,j), ei_pmul(tmp1, m_lhs.packetCoeff(i,j+1))),
+             ei_pmadd(tmp2, m_lhs.packetCoeff(i,j+2),ei_pmul(tmp3, m_lhs.packetCoeff(i,j+3)))
+
            )
          )
        );
@@ -462,7 +454,7 @@ void Product::_cacheOptimalEval(DestDerived& res, ei_meta_true
     {
       const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_rhs.coeff(j,k));
      for (int i=0; i<this->rows(); i+=ei_packet_traits<Scalar>::size)
-        res.writePacketCoeff(i,k, ei_pmul(tmp, m_lhs.packetCoeff(i,j)));
+        res.writePacketCoeff(i,k, ei_pmadd(tmp, m_lhs.packetCoeff(i,j), res.packetCoeff(i,k)));
     }
   }
 }
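
Note (illustration, not part of the patch): the generic fallback added above defines
ei_pmadd(a, b, c) as a * b + c, and the Product code now folds each new term of a dot
product into the accumulator with a single mul-add call. The standalone sketch below
mimics that accumulation pattern with plain float scalars standing in for SSE packets;
the main() driver, the sample data, and the direct "a * b + c" body are invented for
the example.

  #include <iostream>

  // Generic fallback in the spirit of PacketMath.h: a * b + c, coefficient-wise.
  template<typename Scalar>
  inline Scalar ei_pmadd(const Scalar& a, const Scalar& b, const Scalar& c)
  { return a * b + c; }

  int main()
  {
    // Illustrative inner product, as in the rewritten Product loops:
    // start from the first term, then fold every further term in with one
    // mul-add instead of a separate mul followed by an add.
    const float lhs[4] = {1.f, 2.f, 3.f, 4.f};
    const float rhs[4] = {5.f, 6.f, 7.f, 8.f};
    float res = lhs[0] * rhs[0];
    for (int i = 1; i < 4; ++i)
      res = ei_pmadd(lhs[i], rhs[i], res);
    std::cout << res << std::endl; // prints 70
  }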