From a6655dd91aea66a7e617031e87ca7f34dce2a639 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Sat, 3 May 2008 00:45:08 +0000
Subject: [PATCH] added packet mul-add function (ei_pmadd) and updated Product
 to use it. This changes nothing for the current SSE architecture but might
 be helpful for AltiVec/Cell and upcoming AMD processors.

---
 Eigen/src/Core/PacketMath.h | 18 ++++++++++++++++--
 Eigen/src/Core/Product.h    | 32 ++++++++++++--------------------
 2 files changed, 28 insertions(+), 22 deletions(-)

diff --git a/Eigen/src/Core/PacketMath.h b/Eigen/src/Core/PacketMath.h
index 3534452ff..5a5f15dad 100644
--- a/Eigen/src/Core/PacketMath.h
+++ b/Eigen/src/Core/PacketMath.h
@@ -29,15 +29,26 @@
 // In practice these functions are provided to make easier the writting
 // of generic vectorized code. However, at runtime, they should never be
 // called, TODO so sould we raise an assertion or not ?
+/** \internal \returns a + b (coeff-wise) */
 template<typename Scalar> inline Scalar ei_padd(const Scalar& a, const Scalar& b) { return a + b; }
+/** \internal \returns a - b (coeff-wise) */
 template<typename Scalar> inline Scalar ei_psub(const Scalar& a, const Scalar& b) { return a - b; }
+/** \internal \returns a * b (coeff-wise) */
 template<typename Scalar> inline Scalar ei_pmul(const Scalar& a, const Scalar& b) { return a * b; }
+/** \internal \returns a * b + c (coeff-wise) */
+template<typename Scalar> inline Scalar ei_pmadd(const Scalar& a, const Scalar& b, const Scalar& c)
+{ return ei_padd(ei_pmul(a, b),c); }
+/** \internal \returns the min of \a a and \a b (coeff-wise) */
 template<typename Scalar> inline Scalar ei_pmin(const Scalar& a, const Scalar& b) { return std::min(a,b); }
+/** \internal \returns the max of \a a and \a b (coeff-wise) */
 template<typename Scalar> inline Scalar ei_pmax(const Scalar& a, const Scalar& b) { return std::max(a,b); }
+/** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
 template<typename Scalar> inline Scalar ei_pload(const Scalar* from) { return *from; }
-template<typename Scalar> inline Scalar ei_pload1(const Scalar* from) { return *from; }
-template<typename Scalar> inline Scalar ei_pset1(const Scalar& from) { return from; }
+/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
+template<typename Scalar> inline Scalar ei_pset1(const Scalar& a) { return a; }
+/** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
 template<typename Scalar> inline void ei_pstore(Scalar* to, const Scalar& from) { (*to) = from; }
+/** \internal \returns the first element of a packet */
 template<typename Scalar> inline Scalar ei_pfirst(const Scalar& a) { return a; }
 
 #ifdef EIGEN_VECTORIZE_SSE
@@ -68,6 +79,9 @@ inline __m128i ei_pmul(const __m128i& a, const __m128i& b)
         _mm_setr_epi32(0xffffffff,0,0xffffffff,0)), 4));
 }
 
+// for some weird reasons, it has to be overloaded for integer packets
+inline __m128i ei_pmadd(const __m128i& a, const __m128i& b, const __m128i& c) { return ei_padd(ei_pmul(a,b), c); }
+
 inline __m128 ei_pmin(const __m128& a, const __m128& b) { return _mm_min_ps(a,b); }
 inline __m128d ei_pmin(const __m128d& a, const __m128d& b) { return _mm_min_pd(a,b); }
 // FIXME this vectorized min operator is likely to be slower than the standard one
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index b1892e1df..04b6fb9c0 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -69,7 +69,7 @@ struct ei_packet_product_unroller
   static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
   {
     ei_packet_product_unroller::run(row, col, lhs, rhs, res);
-    res = ei_padd(res, ei_pmul(ei_pset1(lhs.coeff(row, Index)), rhs.packetCoeff(Index, col)));
+    res = ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.packetCoeff(Index, col), res);
   }
 };
 
@@ -79,7 +79,7 @@ struct ei_packet_product_unroller
   static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
   {
     ei_packet_product_unroller::run(row, col, lhs, rhs, res);
-    res = ei_padd(res, ei_pmul(lhs.packetCoeff(row, Index), ei_pset1(rhs.coeff(Index, col))));
+    res = ei_pmadd(lhs.packetCoeff(row, Index), ei_pset1(rhs.coeff(Index, col)), res);
   }
 };
 
@@ -249,7 +249,7 @@ template class Product : ei_no_assignm
     PacketScalar res;
     res = ei_pmul(ei_pset1(m_lhs.coeff(row, 0)),m_rhs.packetCoeff(0, col));
     for(int i = 1; i < m_lhs.cols(); i++)
-      res = ei_padd(res, ei_pmul(ei_pset1(m_lhs.coeff(row, i)), m_rhs.packetCoeff(i, col)));
+      res = ei_pmadd(ei_pset1(m_lhs.coeff(row, i)), m_rhs.packetCoeff(i, col), res);
     return res;
   }
 
@@ -258,7 +258,7 @@ template class Product : ei_no_assignm
     PacketScalar res;
     res = ei_pmul(m_lhs.packetCoeff(row, 0), ei_pset1(m_rhs.coeff(0, col)));
     for(int i = 1; i < m_lhs.cols(); i++)
-      res = ei_padd(res, ei_pmul(m_lhs.packetCoeff(row, i), ei_pset1(m_rhs.coeff(i, col))));
+      res = ei_pmadd(m_lhs.packetCoeff(row, i), ei_pset1(m_rhs.coeff(i, col)), res);
     return res;
   }
 
@@ -398,17 +398,13 @@ void Product::_cacheOptimalEval(DestDerived& res, ei_meta_true
       const typename ei_packet_traits<Scalar>::type tmp3 = ei_pset1(m_lhs.coeff(k,j+3));
       for (int i=0; i<this->cols(); i+=ei_packet_traits<Scalar>::size)
       {
+        // FIXME the following could be implemented using only mul-add, check if this is still OK for SSE
         res.writePacketCoeff(k,i,
           ei_padd(
            res.packetCoeff(k,i),
            ei_padd(
-             ei_padd(
-               ei_pmul(tmp0, m_rhs.packetCoeff(j+0,i)),
-               ei_pmul(tmp1, m_rhs.packetCoeff(j+1,i))),
-             ei_padd(
-               ei_pmul(tmp2, m_rhs.packetCoeff(j+2,i)),
-               ei_pmul(tmp3, m_rhs.packetCoeff(j+3,i))
-             )
+             ei_pmadd(tmp0, m_rhs.packetCoeff(j+0,i), ei_pmul(tmp1, m_rhs.packetCoeff(j+1,i))),
+             ei_pmadd(tmp2, m_rhs.packetCoeff(j+2,i), ei_pmul(tmp3, m_rhs.packetCoeff(j+3,i)))
            )
          )
        );
@@ -421,7 +417,7 @@ void Product::_cacheOptimalEval(DestDerived& res, ei_meta_true
     {
       const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_lhs.coeff(k,j));
      for (int i=0; i<this->cols(); i+=ei_packet_traits<Scalar>::size)
-        res.writePacketCoeff(k,i, ei_pmul(tmp, m_rhs.packetCoeff(j,i)));
+        res.writePacketCoeff(k,i, ei_pmadd(tmp, m_rhs.packetCoeff(j,i), res.packetCoeff(k,i)));
     }
   }
 }
@@ -443,13 +439,9 @@ void Product::_cacheOptimalEval(DestDerived& res, ei_meta_true
          ei_padd(
            res.packetCoeff(i,k),
            ei_padd(
-             ei_padd(
-               ei_pmul(tmp0, m_lhs.packetCoeff(i,j)),
-               ei_pmul(tmp1, m_lhs.packetCoeff(i,j+1))),
-             ei_padd(
-               ei_pmul(tmp2, m_lhs.packetCoeff(i,j+2)),
-               ei_pmul(tmp3, m_lhs.packetCoeff(i,j+3))
-             )
+             ei_pmadd(tmp0, m_lhs.packetCoeff(i,j), ei_pmul(tmp1, m_lhs.packetCoeff(i,j+1))),
+             ei_pmadd(tmp2, m_lhs.packetCoeff(i,j+2),ei_pmul(tmp3, m_lhs.packetCoeff(i,j+3)))
+
            )
          )
        );
@@ -462,7 +454,7 @@ void Product::_cacheOptimalEval(DestDerived& res, ei_meta_true
     {
       const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_rhs.coeff(j,k));
      for (int i=0; i<this->rows(); i+=ei_packet_traits<Scalar>::size)
-        res.writePacketCoeff(i,k, ei_pmul(tmp, m_lhs.packetCoeff(i,j)));
+        res.writePacketCoeff(i,k, ei_pmadd(tmp, m_lhs.packetCoeff(i,j), res.packetCoeff(i,k)));
     }
   }
 }
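
Note (illustration, not part of the patch): the generic fallback added above defines
ei_pmadd(a, b, c) as a * b + c, and the Product code now folds each new term of a dot
product into the accumulator with a single mul-add call. The standalone sketch below
mimics that accumulation pattern with plain float scalars standing in for SSE packets;
the main() driver, the sample data, and the direct "a * b + c" body are invented for
the example.

  #include <iostream>

  // Generic fallback in the spirit of PacketMath.h: a * b + c, coefficient-wise.
  template<typename Scalar>
  inline Scalar ei_pmadd(const Scalar& a, const Scalar& b, const Scalar& c)
  { return a * b + c; }

  int main()
  {
    // Illustrative inner product, as in the rewritten Product loops:
    // start from the first term, then fold every further term in with one
    // mul-add instead of a separate mul followed by an add.
    const float lhs[4] = {1.f, 2.f, 3.f, 4.f};
    const float rhs[4] = {5.f, 6.f, 7.f, 8.f};
    float res = lhs[0] * rhs[0];
    for (int i = 1; i < 4; ++i)
      res = ei_pmadd(lhs[i], rhs[i], res);
    std::cout << res << std::endl; // prints 70
  }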