added packet mul-add function (ei_pmadd) and updated Product to use it.

This changes nothing for the current SSE architecture, but it might be helpful
for AltiVec/Cell and upcoming AMD processors.
This commit is contained in:
Gael Guennebaud 2008-05-03 00:45:08 +00:00
parent 102e029dad
commit a6655dd91a
2 changed files with 28 additions and 22 deletions
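For context (not part of this commit): the generic ei_pmadd fallback added in the first file below simply composes ei_pmul and ei_padd, so SSE code generation is unchanged, while an architecture with a fused multiply-add instruction can specialize it to a single instruction. The following is a hedged sketch of what such a specialization could look like for AltiVec using the vec_madd intrinsic; the guard macro name and the use of __vector float are illustrative assumptions, not code from this commit.

// Hypothetical AltiVec/Cell specialization: one fused multiply-add
// instruction replaces the separate multiply and add of the generic
// fallback. EIGEN_VECTORIZE_ALTIVEC is an assumed guard macro.
#ifdef EIGEN_VECTORIZE_ALTIVEC
inline __vector float ei_pmadd(const __vector float& a,
                               const __vector float& b,
                               const __vector float& c)
{ return vec_madd(a, b, c); }  // computes a * b + c element-wise
#endif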

View File

@ -29,15 +29,26 @@
// In practice these functions are provided to make it easier to write
// generic vectorized code. However, at runtime, they should never be
// called. TODO: should we raise an assertion here or not?
/** \internal \returns a + b (coeff-wise) */
template <typename Scalar> inline Scalar ei_padd(const Scalar& a, const Scalar& b) { return a + b; }
/** \internal \returns a - b (coeff-wise) */
template <typename Scalar> inline Scalar ei_psub(const Scalar& a, const Scalar& b) { return a - b; }
/** \internal \returns a * b (coeff-wise) */
template <typename Scalar> inline Scalar ei_pmul(const Scalar& a, const Scalar& b) { return a * b; }
/** \internal \returns a * b + c (coeff-wise) */
template <typename Scalar> inline Scalar ei_pmadd(const Scalar& a, const Scalar& b, const Scalar& c)
{ return ei_padd(ei_pmul(a, b),c); }
/** \internal \returns the min of \a a and \a b (coeff-wise) */
template <typename Scalar> inline Scalar ei_pmin(const Scalar& a, const Scalar& b) { return std::min(a,b); }
/** \internal \returns the max of \a a and \a b (coeff-wise) */
template <typename Scalar> inline Scalar ei_pmax(const Scalar& a, const Scalar& b) { return std::max(a,b); }
/** \internal \returns a packet version of \a *from, \a from must be 16 bytes aligned */
template <typename Scalar> inline Scalar ei_pload(const Scalar* from) { return *from; }
template <typename Scalar> inline Scalar ei_pload1(const Scalar* from) { return *from; }
template <typename Scalar> inline Scalar ei_pset1(const Scalar& from) { return from; }
/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
template <typename Scalar> inline Scalar ei_pset1(const Scalar& a) { return a; }
/** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
template <typename Scalar> inline void ei_pstore(Scalar* to, const Scalar& from) { (*to) = from; }
/** \internal \returns the first element of a packet */
template <typename Scalar> inline Scalar ei_pfirst(const Scalar& a) { return a; }
#ifdef EIGEN_VECTORIZE_SSE
@ -68,6 +79,9 @@ inline __m128i ei_pmul(const __m128i& a, const __m128i& b)
_mm_setr_epi32(0xffffffff,0,0xffffffff,0)), 4));
}
// for some weird reasons, it has to be overloaded for packets of integers
inline __m128i ei_pmadd(const __m128i& a, const __m128i& b, const __m128i& c) { return ei_padd(ei_pmul(a,b), c); }
inline __m128 ei_pmin(const __m128& a, const __m128& b) { return _mm_min_ps(a,b); }
inline __m128d ei_pmin(const __m128d& a, const __m128d& b) { return _mm_min_pd(a,b); }
// FIXME this vectorized min operator is likely to be slower than the standard one
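As the comment in the hunk above says, these primitives exist so that generic vectorized code can be written once against them. Below is a minimal sketch of such a kernel, assuming the data is 16-byte aligned and n is a multiple of the packet size; axpy_kernel is a hypothetical helper, not part of the library.

// Hypothetical kernel written only in terms of the generic packet
// primitives: instantiated for a plain scalar it runs the fallback
// versions above, while with SSE enabled the same source uses packet
// loads, stores and (here) the new ei_pmadd.
template <typename Scalar>
void axpy_kernel(int n, const Scalar& alpha, const Scalar* x, Scalar* y)
{
  typedef typename ei_packet_traits<Scalar>::type Packet;
  const Packet palpha = ei_pset1(alpha);                  // broadcast alpha
  for (int i = 0; i < n; i += ei_packet_traits<Scalar>::size)
  {
    Packet px = ei_pload(x + i);                          // aligned loads
    Packet py = ei_pload(y + i);
    ei_pstore(y + i, ei_pmadd(palpha, px, py));           // y[i..] += alpha * x[i..]
  }
}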

View File

@ -69,7 +69,7 @@ struct ei_packet_product_unroller<true, Index, Size, Lhs, Rhs, PacketScalar>
static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
{
ei_packet_product_unroller<true, Index-1, Size, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, res);
res = ei_padd(res, ei_pmul(ei_pset1(lhs.coeff(row, Index)), rhs.packetCoeff(Index, col)));
res = ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.packetCoeff(Index, col), res);
}
};
@ -79,7 +79,7 @@ struct ei_packet_product_unroller<false, Index, Size, Lhs, Rhs, PacketScalar>
static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
{
ei_packet_product_unroller<false, Index-1, Size, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, res);
res = ei_padd(res, ei_pmul(lhs.packetCoeff(row, Index), ei_pset1(rhs.coeff(Index, col))));
res = ei_pmadd(lhs.packetCoeff(row, Index), ei_pset1(rhs.coeff(Index, col)), res);
}
};
@ -249,7 +249,7 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm
PacketScalar res;
res = ei_pmul(ei_pset1(m_lhs.coeff(row, 0)), m_rhs.packetCoeff(0, col));
for(int i = 1; i < m_lhs.cols(); i++)
res = ei_padd(res, ei_pmul(ei_pset1(m_lhs.coeff(row, i)), m_rhs.packetCoeff(i, col)));
res = ei_pmadd(ei_pset1(m_lhs.coeff(row, i)), m_rhs.packetCoeff(i, col), res);
return res;
}
@ -258,7 +258,7 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm
PacketScalar res;
res = ei_pmul(m_lhs.packetCoeff(row, 0), ei_pset1(m_rhs.coeff(0, col)));
for(int i = 1; i < m_lhs.cols(); i++)
res = ei_padd(res, ei_pmul(m_lhs.packetCoeff(row, i), ei_pset1(m_rhs.coeff(i, col))));
res = ei_pmadd(m_lhs.packetCoeff(row, i), ei_pset1(m_rhs.coeff(i, col)), res);
return res;
}
@ -398,17 +398,13 @@ void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res, ei_meta_true
const typename ei_packet_traits<Scalar>::type tmp3 = ei_pset1(m_lhs.coeff(k,j+3));
for (int i=0; i<this->cols(); i+=ei_packet_traits<Scalar>::size)
{
// FIXME the following could be implemented using only mul-add, check if this is still OK for SSE
res.writePacketCoeff(k,i,
ei_padd(
res.packetCoeff(k,i),
ei_padd(
ei_padd(
ei_pmul(tmp0, m_rhs.packetCoeff(j+0,i)),
ei_pmul(tmp1, m_rhs.packetCoeff(j+1,i))),
ei_padd(
ei_pmul(tmp2, m_rhs.packetCoeff(j+2,i)),
ei_pmul(tmp3, m_rhs.packetCoeff(j+3,i))
)
ei_pmadd(tmp0, m_rhs.packetCoeff(j+0,i), ei_pmul(tmp1, m_rhs.packetCoeff(j+1,i))),
ei_pmadd(tmp2, m_rhs.packetCoeff(j+2,i), ei_pmul(tmp3, m_rhs.packetCoeff(j+3,i)))
)
)
);
@ -421,7 +417,7 @@ void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res, ei_meta_true
{
const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_lhs.coeff(k,j));
for (int i=0; i<this->cols(); i+=ei_packet_traits<Scalar>::size)
res.writePacketCoeff(k,i, ei_pmul(tmp, m_rhs.packetCoeff(j,i)));
res.writePacketCoeff(k,i, ei_pmadd(tmp, m_rhs.packetCoeff(j,i), res.packetCoeff(k,i)));
}
}
}
@ -443,13 +439,9 @@ void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res, ei_meta_true
ei_padd(
res.packetCoeff(i,k),
ei_padd(
ei_padd(
ei_pmul(tmp0, m_lhs.packetCoeff(i,j)),
ei_pmul(tmp1, m_lhs.packetCoeff(i,j+1))),
ei_padd(
ei_pmul(tmp2, m_lhs.packetCoeff(i,j+2)),
ei_pmul(tmp3, m_lhs.packetCoeff(i,j+3))
)
ei_pmadd(tmp0, m_lhs.packetCoeff(i,j), ei_pmul(tmp1, m_lhs.packetCoeff(i,j+1))),
ei_pmadd(tmp2, m_lhs.packetCoeff(i,j+2), ei_pmul(tmp3, m_lhs.packetCoeff(i,j+3)))
)
)
);
@ -462,7 +454,7 @@ void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res, ei_meta_true
{
const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_rhs.coeff(j,k));
for (int i=0; i<this->rows(); i+=ei_packet_traits<Scalar>::size)
res.writePacketCoeff(i,k, ei_pmul(tmp, m_lhs.packetCoeff(i,j)));
res.writePacketCoeff(i,k, ei_pmadd(tmp, m_lhs.packetCoeff(i,j), res.packetCoeff(i,k)));
}
}
}
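The Product changes all follow the same rewrite: an accumulation of the form res = ei_padd(res, ei_pmul(a, b)) becomes res = ei_pmadd(a, b, res). With the generic fallback this still expands to a multiply followed by an add (hence no change for current SSE), but it lets an FMA-capable backend collapse each step into one instruction. The sketch below mirrors the accumulation pattern used in the Product hunks above; accumulate_mul_add is a hypothetical helper and assumes aligned, packet-sized data.

// Hypothetical loop mirroring the pattern adopted in Product: one
// broadcast operand, one packet operand, accumulated with ei_pmadd.
template <typename Scalar>
typename ei_packet_traits<Scalar>::type
accumulate_mul_add(int n, const Scalar* coeffs, const Scalar* packets)
{
  typedef typename ei_packet_traits<Scalar>::type Packet;
  const int size = ei_packet_traits<Scalar>::size;
  Packet res = ei_pmul(ei_pset1(coeffs[0]), ei_pload(packets));
  for (int i = 1; i < n; ++i)
    res = ei_pmadd(ei_pset1(coeffs[i]), ei_pload(packets + i * size), res);
  return res;
}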