mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-07-22 21:04:28 +08:00
added packet mul-add function (ei_pmad) and updated Product to use it.
this change nothing for current SSE architecture but might be helpful for altivec/cell and up comming AMD processors.
This commit is contained in:
parent
102e029dad
commit
a6655dd91a
@ -29,15 +29,26 @@
|
||||
// In practice these functions are provided to make easier the writting
|
||||
// of generic vectorized code. However, at runtime, they should never be
|
||||
// called, TODO so sould we raise an assertion or not ?
|
||||
/** \internal \returns a + b (coeff-wise) */
|
||||
template <typename Scalar> inline Scalar ei_padd(const Scalar& a, const Scalar& b) { return a + b; }
|
||||
/** \internal \returns a - b (coeff-wise) */
|
||||
template <typename Scalar> inline Scalar ei_psub(const Scalar& a, const Scalar& b) { return a - b; }
|
||||
/** \internal \returns a * b (coeff-wise) */
|
||||
template <typename Scalar> inline Scalar ei_pmul(const Scalar& a, const Scalar& b) { return a * b; }
|
||||
/** \internal \returns a * b - c (coeff-wise) */
|
||||
template <typename Scalar> inline Scalar ei_pmadd(const Scalar& a, const Scalar& b, const Scalar& c)
|
||||
{ return ei_padd(ei_pmul(a, b),c); }
|
||||
/** \internal \returns the min of \a a and \a b (coeff-wise) */
|
||||
template <typename Scalar> inline Scalar ei_pmin(const Scalar& a, const Scalar& b) { return std::min(a,b); }
|
||||
/** \internal \returns the max of \a a and \a b (coeff-wise) */
|
||||
template <typename Scalar> inline Scalar ei_pmax(const Scalar& a, const Scalar& b) { return std::max(a,b); }
|
||||
/** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
|
||||
template <typename Scalar> inline Scalar ei_pload(const Scalar* from) { return *from; }
|
||||
template <typename Scalar> inline Scalar ei_pload1(const Scalar* from) { return *from; }
|
||||
template <typename Scalar> inline Scalar ei_pset1(const Scalar& from) { return from; }
|
||||
/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
|
||||
template <typename Scalar> inline Scalar ei_pset1(const Scalar& a) { return a; }
|
||||
/** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
|
||||
template <typename Scalar> inline void ei_pstore(Scalar* to, const Scalar& from) { (*to) = from; }
|
||||
/** \internal \returns the first element of a packet */
|
||||
template <typename Scalar> inline Scalar ei_pfirst(const Scalar& a) { return a; }
|
||||
|
||||
#ifdef EIGEN_VECTORIZE_SSE
|
||||
@ -68,6 +79,9 @@ inline __m128i ei_pmul(const __m128i& a, const __m128i& b)
|
||||
_mm_setr_epi32(0xffffffff,0,0xffffffff,0)), 4));
|
||||
}
|
||||
|
||||
// for some weird raisons, it has to be overloaded for packet integer
|
||||
inline __m128i ei_pmadd(const __m128i& a, const __m128i& b, const __m128i& c) { return ei_padd(ei_pmul(a,b), c); }
|
||||
|
||||
inline __m128 ei_pmin(const __m128& a, const __m128& b) { return _mm_min_ps(a,b); }
|
||||
inline __m128d ei_pmin(const __m128d& a, const __m128d& b) { return _mm_min_pd(a,b); }
|
||||
// FIXME this vectorized min operator is likely to be slower than the standard one
|
||||
|
@ -69,7 +69,7 @@ struct ei_packet_product_unroller<true, Index, Size, Lhs, Rhs, PacketScalar>
|
||||
static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
|
||||
{
|
||||
ei_packet_product_unroller<true, Index-1, Size, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, res);
|
||||
res = ei_padd(res, ei_pmul(ei_pset1(lhs.coeff(row, Index)), rhs.packetCoeff(Index, col)));
|
||||
res = ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.packetCoeff(Index, col), res);
|
||||
}
|
||||
};
|
||||
|
||||
@ -79,7 +79,7 @@ struct ei_packet_product_unroller<false, Index, Size, Lhs, Rhs, PacketScalar>
|
||||
static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
|
||||
{
|
||||
ei_packet_product_unroller<false, Index-1, Size, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, res);
|
||||
res = ei_padd(res, ei_pmul(lhs.packetCoeff(row, Index), ei_pset1(rhs.coeff(Index, col))));
|
||||
res = ei_pmadd(lhs.packetCoeff(row, Index), ei_pset1(rhs.coeff(Index, col)), res);
|
||||
}
|
||||
};
|
||||
|
||||
@ -249,7 +249,7 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm
|
||||
PacketScalar res;
|
||||
res = ei_pmul(ei_pset1(m_lhs.coeff(row, 0)),m_rhs.packetCoeff(0, col));
|
||||
for(int i = 1; i < m_lhs.cols(); i++)
|
||||
res = ei_padd(res, ei_pmul(ei_pset1(m_lhs.coeff(row, i)), m_rhs.packetCoeff(i, col)));
|
||||
res = ei_pmadd(ei_pset1(m_lhs.coeff(row, i)), m_rhs.packetCoeff(i, col), res);
|
||||
return res;
|
||||
}
|
||||
|
||||
@ -258,7 +258,7 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm
|
||||
PacketScalar res;
|
||||
res = ei_pmul(m_lhs.packetCoeff(row, 0), ei_pset1(m_rhs.coeff(0, col)));
|
||||
for(int i = 1; i < m_lhs.cols(); i++)
|
||||
res = ei_padd(res, ei_pmul(m_lhs.packetCoeff(row, i), ei_pset1(m_rhs.coeff(i, col))));
|
||||
res = ei_pmadd(m_lhs.packetCoeff(row, i), ei_pset1(m_rhs.coeff(i, col)), res);
|
||||
return res;
|
||||
}
|
||||
|
||||
@ -398,17 +398,13 @@ void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res, ei_meta_true
|
||||
const typename ei_packet_traits<Scalar>::type tmp3 = ei_pset1(m_lhs.coeff(k,j+3));
|
||||
for (int i=0; i<this->cols(); i+=ei_packet_traits<Scalar>::size)
|
||||
{
|
||||
// FIXME the following could be implemented using only mul-add, check if this is still OK for SSE
|
||||
res.writePacketCoeff(k,i,
|
||||
ei_padd(
|
||||
res.packetCoeff(k,i),
|
||||
ei_padd(
|
||||
ei_padd(
|
||||
ei_pmul(tmp0, m_rhs.packetCoeff(j+0,i)),
|
||||
ei_pmul(tmp1, m_rhs.packetCoeff(j+1,i))),
|
||||
ei_padd(
|
||||
ei_pmul(tmp2, m_rhs.packetCoeff(j+2,i)),
|
||||
ei_pmul(tmp3, m_rhs.packetCoeff(j+3,i))
|
||||
)
|
||||
ei_pmadd(tmp0, m_rhs.packetCoeff(j+0,i), ei_pmul(tmp1, m_rhs.packetCoeff(j+1,i))),
|
||||
ei_pmadd(tmp2, m_rhs.packetCoeff(j+2,i), ei_pmul(tmp3, m_rhs.packetCoeff(j+3,i)))
|
||||
)
|
||||
)
|
||||
);
|
||||
@ -421,7 +417,7 @@ void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res, ei_meta_true
|
||||
{
|
||||
const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_lhs.coeff(k,j));
|
||||
for (int i=0; i<this->cols(); i+=ei_packet_traits<Scalar>::size)
|
||||
res.writePacketCoeff(k,i, ei_pmul(tmp, m_rhs.packetCoeff(j,i)));
|
||||
res.writePacketCoeff(k,i, ei_pmadd(tmp, m_rhs.packetCoeff(j,i), res.packetCoeff(k,i)));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -443,13 +439,9 @@ void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res, ei_meta_true
|
||||
ei_padd(
|
||||
res.packetCoeff(i,k),
|
||||
ei_padd(
|
||||
ei_padd(
|
||||
ei_pmul(tmp0, m_lhs.packetCoeff(i,j)),
|
||||
ei_pmul(tmp1, m_lhs.packetCoeff(i,j+1))),
|
||||
ei_padd(
|
||||
ei_pmul(tmp2, m_lhs.packetCoeff(i,j+2)),
|
||||
ei_pmul(tmp3, m_lhs.packetCoeff(i,j+3))
|
||||
)
|
||||
ei_pmadd(tmp0, m_lhs.packetCoeff(i,j), ei_pmul(tmp1, m_lhs.packetCoeff(i,j+1))),
|
||||
ei_pmadd(tmp2, m_lhs.packetCoeff(i,j+2),ei_pmul(tmp3, m_lhs.packetCoeff(i,j+3)))
|
||||
|
||||
)
|
||||
)
|
||||
);
|
||||
@ -462,7 +454,7 @@ void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res, ei_meta_true
|
||||
{
|
||||
const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_rhs.coeff(j,k));
|
||||
for (int i=0; i<this->rows(); i+=ei_packet_traits<Scalar>::size)
|
||||
res.writePacketCoeff(i,k, ei_pmul(tmp, m_lhs.packetCoeff(i,j)));
|
||||
res.writePacketCoeff(i,k, ei_pmadd(tmp, m_lhs.packetCoeff(i,j), res.packetCoeff(i,k)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user