added packet mul-add function (ei_pmadd) and updated Product to use it.

This changes nothing for the current SSE architecture, but it might be helpful
for AltiVec/Cell and upcoming AMD processors.
This commit is contained in:
Gael Guennebaud 2008-05-03 00:45:08 +00:00
parent 102e029dad
commit a6655dd91a
2 changed files with 28 additions and 22 deletions
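For context (not part of this commit): the generic ei_pmadd fallback added in the first file below simply composes ei_pmul and ei_padd, so SSE code generation is unchanged, while an architecture with a fused multiply-add instruction can specialize it to a single instruction. The following is a hedged sketch of what such a specialization could look like for AltiVec using the vec_madd intrinsic; the guard macro name and the use of __vector float are illustrative assumptions, not code from this commit.

// Hypothetical AltiVec/Cell specialization: one fused multiply-add
// instruction replaces the separate multiply and add of the generic
// fallback. EIGEN_VECTORIZE_ALTIVEC is an assumed guard macro.
#ifdef EIGEN_VECTORIZE_ALTIVEC
inline __vector float ei_pmadd(const __vector float& a,
                               const __vector float& b,
                               const __vector float& c)
{ return vec_madd(a, b, c); }  // computes a * b + c element-wise
#endif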

View File

@ -29,15 +29,26 @@
// In practice these functions are provided to make it easier to write
// generic vectorized code. However, at runtime, they should never be
// called. TODO: should we raise an assertion here or not?
/** \internal \returns a + b (coeff-wise) */
template <typename Scalar> inline Scalar ei_padd(const Scalar& a, const Scalar& b) { return a + b; }
/** \internal \returns a - b (coeff-wise) */
template <typename Scalar> inline Scalar ei_psub(const Scalar& a, const Scalar& b) { return a - b; }
/** \internal \returns a * b (coeff-wise) */
template <typename Scalar> inline Scalar ei_pmul(const Scalar& a, const Scalar& b) { return a * b; }
/** \internal \returns a * b + c (coeff-wise) */
template <typename Scalar> inline Scalar ei_pmadd(const Scalar& a, const Scalar& b, const Scalar& c)
{ return ei_padd(ei_pmul(a, b),c); }
/** \internal \returns the min of \a a and \a b (coeff-wise) */
template <typename Scalar> inline Scalar ei_pmin(const Scalar& a, const Scalar& b) { return std::min(a,b); }
/** \internal \returns the max of \a a and \a b (coeff-wise) */
template <typename Scalar> inline Scalar ei_pmax(const Scalar& a, const Scalar& b) { return std::max(a,b); }
/** \internal \returns a packet version of \a *from, \a from must be 16 bytes aligned */
template <typename Scalar> inline Scalar ei_pload(const Scalar* from) { return *from; }
template <typename Scalar> inline Scalar ei_pload1(const Scalar* from) { return *from; }
template <typename Scalar> inline Scalar ei_pset1(const Scalar& from) { return from; }
/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
template <typename Scalar> inline Scalar ei_pset1(const Scalar& a) { return a; }
/** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
template <typename Scalar> inline void ei_pstore(Scalar* to, const Scalar& from) { (*to) = from; }
/** \internal \returns the first element of a packet */
template <typename Scalar> inline Scalar ei_pfirst(const Scalar& a) { return a; }
#ifdef EIGEN_VECTORIZE_SSE
@ -68,6 +79,9 @@ inline __m128i ei_pmul(const __m128i& a, const __m128i& b)
_mm_setr_epi32(0xffffffff,0,0xffffffff,0)), 4));
}
// for some weird reasons, it has to be overloaded for packets of integers
inline __m128i ei_pmadd(const __m128i& a, const __m128i& b, const __m128i& c) { return ei_padd(ei_pmul(a,b), c); }
inline __m128 ei_pmin(const __m128& a, const __m128& b) { return _mm_min_ps(a,b); }
inline __m128d ei_pmin(const __m128d& a, const __m128d& b) { return _mm_min_pd(a,b); }
// FIXME this vectorized min operator is likely to be slower than the standard one
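As the comment in the hunk above says, these primitives exist so that generic vectorized code can be written once against them. Below is a minimal sketch of such a kernel, assuming the data is 16-byte aligned and n is a multiple of the packet size; axpy_kernel is a hypothetical helper, not part of the library.

// Hypothetical kernel written only in terms of the generic packet
// primitives: instantiated for a plain scalar it runs the fallback
// versions above, while with SSE enabled the same source uses packet
// loads, stores and (here) the new ei_pmadd.
template <typename Scalar>
void axpy_kernel(int n, const Scalar& alpha, const Scalar* x, Scalar* y)
{
  typedef typename ei_packet_traits<Scalar>::type Packet;
  const Packet palpha = ei_pset1(alpha);                  // broadcast alpha
  for (int i = 0; i < n; i += ei_packet_traits<Scalar>::size)
  {
    Packet px = ei_pload(x + i);                          // aligned loads
    Packet py = ei_pload(y + i);
    ei_pstore(y + i, ei_pmadd(palpha, px, py));           // y[i..] += alpha * x[i..]
  }
}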

View File

@ -69,7 +69,7 @@ struct ei_packet_product_unroller<true, Index, Size, Lhs, Rhs, PacketScalar>
static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
{
ei_packet_product_unroller<true, Index-1, Size, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, res);
res = ei_padd(res, ei_pmul(ei_pset1(lhs.coeff(row, Index)), rhs.packetCoeff(Index, col)));
res = ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.packetCoeff(Index, col), res);
}
};
@ -79,7 +79,7 @@ struct ei_packet_product_unroller<false, Index, Size, Lhs, Rhs, PacketScalar>
static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
{
ei_packet_product_unroller<false, Index-1, Size, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, res);
res = ei_padd(res, ei_pmul(lhs.packetCoeff(row, Index), ei_pset1(rhs.coeff(Index, col))));
res = ei_pmadd(lhs.packetCoeff(row, Index), ei_pset1(rhs.coeff(Index, col)), res);
}
};
@ -249,7 +249,7 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm
PacketScalar res;
res = ei_pmul(ei_pset1(m_lhs.coeff(row, 0)), m_rhs.packetCoeff(0, col));
for(int i = 1; i < m_lhs.cols(); i++)
res = ei_padd(res, ei_pmul(ei_pset1(m_lhs.coeff(row, i)), m_rhs.packetCoeff(i, col)));
res = ei_pmadd(ei_pset1(m_lhs.coeff(row, i)), m_rhs.packetCoeff(i, col), res);
return res;
}
@ -258,7 +258,7 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm
PacketScalar res;
res = ei_pmul(m_lhs.packetCoeff(row, 0), ei_pset1(m_rhs.coeff(0, col)));
for(int i = 1; i < m_lhs.cols(); i++)
res = ei_padd(res, ei_pmul(m_lhs.packetCoeff(row, i), ei_pset1(m_rhs.coeff(i, col))));
res = ei_pmadd(m_lhs.packetCoeff(row, i), ei_pset1(m_rhs.coeff(i, col)), res);
return res;
}
@ -398,17 +398,13 @@ void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res, ei_meta_true
const typename ei_packet_traits<Scalar>::type tmp3 = ei_pset1(m_lhs.coeff(k,j+3));
for (int i=0; i<this->cols(); i+=ei_packet_traits<Scalar>::size)
{
// FIXME the following could be implemented using only mul-add, check if this is still OK for SSE
res.writePacketCoeff(k,i,
ei_padd(
res.packetCoeff(k,i),
ei_padd(
ei_padd(
ei_pmul(tmp0, m_rhs.packetCoeff(j+0,i)),
ei_pmul(tmp1, m_rhs.packetCoeff(j+1,i))),
ei_padd(
ei_pmul(tmp2, m_rhs.packetCoeff(j+2,i)),
ei_pmul(tmp3, m_rhs.packetCoeff(j+3,i))
)
ei_pmadd(tmp0, m_rhs.packetCoeff(j+0,i), ei_pmul(tmp1, m_rhs.packetCoeff(j+1,i))),
ei_pmadd(tmp2, m_rhs.packetCoeff(j+2,i), ei_pmul(tmp3, m_rhs.packetCoeff(j+3,i)))
)
)
);
@ -421,7 +417,7 @@ void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res, ei_meta_true
{
const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_lhs.coeff(k,j));
for (int i=0; i<this->cols(); i+=ei_packet_traits<Scalar>::size)
res.writePacketCoeff(k,i, ei_pmul(tmp, m_rhs.packetCoeff(j,i)));
res.writePacketCoeff(k,i, ei_pmadd(tmp, m_rhs.packetCoeff(j,i), res.packetCoeff(k,i)));
}
}
}
@ -443,13 +439,9 @@ void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res, ei_meta_true
ei_padd(
res.packetCoeff(i,k),
ei_padd(
ei_padd(
ei_pmul(tmp0, m_lhs.packetCoeff(i,j)),
ei_pmul(tmp1, m_lhs.packetCoeff(i,j+1))),
ei_padd(
ei_pmul(tmp2, m_lhs.packetCoeff(i,j+2)),
ei_pmul(tmp3, m_lhs.packetCoeff(i,j+3))
)
ei_pmadd(tmp0, m_lhs.packetCoeff(i,j), ei_pmul(tmp1, m_lhs.packetCoeff(i,j+1))),
ei_pmadd(tmp2, m_lhs.packetCoeff(i,j+2), ei_pmul(tmp3, m_lhs.packetCoeff(i,j+3)))
)
)
);
@ -462,7 +454,7 @@ void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res, ei_meta_true
{
const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_rhs.coeff(j,k));
for (int i=0; i<this->rows(); i+=ei_packet_traits<Scalar>::size)
res.writePacketCoeff(i,k, ei_pmul(tmp, m_lhs.packetCoeff(i,j)));
res.writePacketCoeff(i,k, ei_pmadd(tmp, m_lhs.packetCoeff(i,j), res.packetCoeff(i,k)));
}
}
}
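The Product changes all follow the same rewrite: an accumulation of the form res = ei_padd(res, ei_pmul(a, b)) becomes res = ei_pmadd(a, b, res). With the generic fallback this still expands to a multiply followed by an add (hence no change for current SSE), but it lets an FMA-capable backend collapse each step into one instruction. The sketch below mirrors the accumulation pattern used in the Product hunks above; accumulate_mul_add is a hypothetical helper and assumes aligned, packet-sized data.

// Hypothetical loop mirroring the pattern adopted in Product: one
// broadcast operand, one packet operand, accumulated with ei_pmadd.
template <typename Scalar>
typename ei_packet_traits<Scalar>::type
accumulate_mul_add(int n, const Scalar* coeffs, const Scalar* packets)
{
  typedef typename ei_packet_traits<Scalar>::type Packet;
  const int size = ei_packet_traits<Scalar>::size;
  Packet res = ei_pmul(ei_pset1(coeffs[0]), ei_pload(packets));
  for (int i = 1; i < n; ++i)
    res = ei_pmadd(ei_pset1(coeffs[i]), ei_pload(packets + i * size), res);
  return res;
}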