Added support for FMA instructions

This commit is contained in:
Benoit Steiner 2014-02-24 13:45:32 -08:00
parent 7ed9441ea4
commit db7d49efbb
4 changed files with 28 additions and 0 deletions

View File

@ -117,6 +117,9 @@
#define EIGEN_VECTORIZE_SSE4_1
#define EIGEN_VECTORIZE_SSE4_2
#endif
#ifdef __FMA__
#define EIGEN_VECTORIZE_FMA
#endif
// include files
// This extern "C" works around a MINGW-w64 compilation issue

View File

@ -120,6 +120,11 @@ template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, co
return pset1<Packet8i>(0);
}
#ifdef EIGEN_VECTORIZE_FMA
template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) { return _mm256_fmadd_ps(a,b,c); }
template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) { return _mm256_fmadd_pd(a,b,c); }
#endif
template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); }

View File

@ -179,6 +179,10 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co
// for some weird raisons, it has to be overloaded for packet of integers
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
#ifdef EIGEN_VECTORIZE_FMA
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); }
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); }
#endif
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); }

View File

@ -205,7 +205,15 @@ public:
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, AccPacket& tmp) const
{
// It would be a lot cleaner to call pmadd all the time. Unfortunately if we
// let gcc allocate the register in which to store the result of the pmul
// (in the case where there is no FMA) gcc fails to figure out how to avoid
// spilling register.
#ifdef EIGEN_VECTORIZE_FMA
c = pmadd(a,b,c);
#else
tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
#endif
}
EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
@ -281,7 +289,11 @@ public:
EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
{
#ifdef EIGEN_VECTORIZE_FMA
c.v = pmadd(a.v,b,c.v);
#else
tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
#endif
}
EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
@ -486,7 +498,11 @@ public:
EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
{
#ifdef EIGEN_VECTORIZE_FMA
c = pmadd(a,b,c);
#else
tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
#endif
}
EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const