From c9b046d5d5eba6e3f454ec2a125d74a21c61d988 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Wed, 9 Jul 2008 22:30:18 +0000
Subject: [PATCH] * added optimized paths for matrix-vector and vector-matrix
 products   (using either a cache friendly strategy or re-using dot-product  
 vectorized implementation) * add LinearAccessBit to Transpose

---
 Eigen/Core                  |   2 +-
 Eigen/src/Core/Block.h      |  13 ++--
 Eigen/src/Core/MatrixBase.h |   2 +-
 Eigen/src/Core/Product.h    | 125 +++++++++++++++++++++++++++++++-----
 Eigen/src/Core/Transpose.h  |  24 ++++++-
 5 files changed, 143 insertions(+), 23 deletions(-)
diff --git a/Eigen/Core b/Eigen/Core
index ecfd141b2..333e80a6d 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -41,12 +41,12 @@ namespace Eigen {
 #include "src/Core/CwiseUnaryOp.h"
 #include "src/Core/CwiseNullaryOp.h"
 #include "src/Core/InverseProduct.h"
+#include "src/Core/Dot.h"
 #include "src/Core/Product.h"
 #include "src/Core/DiagonalProduct.h"
 #include "src/Core/Block.h"
 #include "src/Core/Minor.h"
 #include "src/Core/Transpose.h"
-#include "src/Core/Dot.h"
 #include "src/Core/DiagonalMatrix.h"
 #include "src/Core/DiagonalCoeffs.h"
 #include "src/Core/Sum.h"
diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index d954e1a2e..245357511 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -155,7 +155,7 @@ template<typename MatrixType, int BlockRows, int BlockCols> class Block
       return m_matrix.const_cast_derived()
              .coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
                        m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
-      
+
     }
 
     inline const Scalar coeff(int index) const
@@ -168,20 +168,23 @@ template<typename MatrixType, int BlockRows, int BlockCols> class Block
     template<int LoadMode>
     inline PacketScalar packet(int row, int col) const
     {
-      return m_matrix.template packet<Unaligned>(row + m_startRow.value(), col + m_startCol.value());
+      return m_matrix.template packet<Unaligned>
+              (row + m_startRow.value(), col + m_startCol.value());
     }
 
     template<int LoadMode>
     inline void writePacket(int row, int col, const PacketScalar& x)
     {
-      m_matrix.const_cast_derived().template writePacket<Unaligned>(row + m_startRow.value(), col + m_startCol.value(), x);
+      m_matrix.const_cast_derived().template writePacket<Unaligned>
+              (row + m_startRow.value(), col + m_startCol.value(), x);
     }
 
     template<int LoadMode>
     inline PacketScalar packet(int index) const
     {
-      return m_matrix.template packet<Unaligned>(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
-                                                 m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+      return m_matrix.template packet<Unaligned>
+              (m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+               m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
     }
 
     template<int LoadMode>
diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index aba4817ac..e8b317a2a 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -219,7 +219,7 @@ template<typename Derived> class MatrixBase
     /** Overloaded for cache friendly product evaluation */
     template<typename OtherDerived>
     Derived& lazyAssign(const Flagged<OtherDerived, 0, EvalBeforeNestingBit | EvalBeforeAssigningBit>& other)
-    { lazyAssign(other._expression()); }
+    { return lazyAssign(other._expression()); }
 
     /** Overloaded for sparse product evaluation */
     template<typename Derived1, typename Derived2>
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index 0a301f983..6089c1be5 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -89,9 +89,11 @@ template<typename Lhs, typename Rhs> struct ei_product_mode
               ? DiagonalProduct
               : (Rhs::Flags & Lhs::Flags & SparseBit)
               ? SparseProduct
-              :    Lhs::MaxRowsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
-                && Rhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
-                && Lhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+              :    Lhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+                && ( Lhs::MaxRowsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+                  || ((Rhs::Flags&RowMajorBit) && Lhs::IsVectorAtCompileTime))
+                && ( Rhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+                  || ((!(Lhs::Flags&RowMajorBit)) && Rhs::IsVectorAtCompileTime))
                 ? CacheFriendlyProduct : NormalProduct };
 };
 
@@ -161,7 +163,7 @@ struct ei_traits<Product<LhsNested, RhsNested, ProductMode> >
      * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI.
      */
     CanVectorizeInner = LhsRowMajor && (!RhsRowMajor) && (LhsFlags & RhsFlags & ActualPacketAccessBit)
-                      && (InnerSize!=Dynamic) && (InnerSize % ei_packet_traits<Scalar>::size == 0)
+                      && (InnerSize % ei_packet_traits<Scalar>::size == 0)
   };
 };
 
@@ -181,7 +183,7 @@ template<typename LhsNested, typename RhsNested, int ProductMode> class Product
       PacketSize = ei_packet_traits<Scalar>::size,
       InnerSize  = ei_traits<Product>::InnerSize,
       Unroll = CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
-      CanVectorizeInner = ei_traits<Product>::CanVectorizeInner && Unroll
+      CanVectorizeInner = ei_traits<Product>::CanVectorizeInner
     };
 
     typedef ei_product_coeff_impl<CanVectorizeInner ? InnerVectorization : NoVectorization,
@@ -206,10 +208,13 @@ template<typename LhsNested, typename RhsNested, int ProductMode> class Product
     /** \internal
       * \returns whether it is worth it to use the cache friendly product.
       */
-    inline bool _useCacheFriendlyProduct() const {
-      return   rows()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
-            && cols()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
-            && m_lhs.cols()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD;
+    inline bool _useCacheFriendlyProduct() const
+    {
+      return   m_lhs.cols()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+            && (rows()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+                || ((_RhsNested::Flags&RowMajorBit) && _LhsNested::IsVectorAtCompileTime))
+            && (cols()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+                || ((!(_LhsNested::Flags&RowMajorBit)) && _RhsNested::IsVectorAtCompileTime));
     }
 
     inline int rows() const { return m_lhs.rows(); }
@@ -245,6 +250,9 @@ template<typename LhsNested, typename RhsNested, int ProductMode> class Product
       return res;
     }
 
+    const _LhsNested& lhs() const { return m_lhs; }
+    const _RhsNested& rhs() const { return m_rhs; }
+
   protected:
     const LhsNested m_lhs;
     const RhsNested m_rhs;
@@ -324,18 +332,18 @@ struct ei_product_coeff_impl<NoVectorization, -1, Lhs, Rhs>
 *******************************************/
 
 template<int Index, typename Lhs, typename Rhs, typename PacketScalar>
-struct ei_product_coeff_vectorized_impl
+struct ei_product_coeff_vectorized_unroller
 {
   enum { PacketSize = ei_packet_traits<typename Lhs::Scalar>::size };
   inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
   {
-    ei_product_coeff_vectorized_impl<Index-PacketSize, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, pres);
+    ei_product_coeff_vectorized_unroller<Index-PacketSize, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, pres);
     pres = ei_padd(pres, ei_pmul( lhs.template packet<Aligned>(row, Index) , rhs.template packet<Aligned>(Index, col) ));
   }
 };
 
 template<typename Lhs, typename Rhs, typename PacketScalar>
-struct ei_product_coeff_vectorized_impl<0, Lhs, Rhs, PacketScalar>
+struct ei_product_coeff_vectorized_unroller<0, Lhs, Rhs, PacketScalar>
 {
   inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
   {
@@ -351,12 +359,59 @@ struct ei_product_coeff_impl<InnerVectorization, Index, Lhs, Rhs>
   inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
   {
     PacketScalar pres;
-    ei_product_coeff_vectorized_impl<Index+1-PacketSize, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, pres);
+    ei_product_coeff_vectorized_unroller<Index+1-PacketSize, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, pres);
     ei_product_coeff_impl<NoVectorization,Index,Lhs,Rhs>::run(row, col, lhs, rhs, res);
     res = ei_predux(pres);
   }
 };
 
+// FIXME the following is a hack to get very high perf with matrix-vector product,
+// however, it would be preferable to switch for more general dynamic alignment queries
+template<typename Lhs, typename Rhs, int LhsRows = Lhs::RowsAtCompileTime, int RhsCols = Rhs::ColsAtCompileTime>
+struct ei_product_coeff_vectorized_dyn_selector
+{
+  inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
+  {
+    res = ei_dot_impl<
+      Block<Lhs, 1, ei_traits<Lhs>::ColsAtCompileTime>,
+      Block<Rhs, ei_traits<Rhs>::RowsAtCompileTime, 1>,
+      LinearVectorization, NoUnrolling>::run(lhs.row(row), rhs.col(col));
+  }
+};
+
+template<typename Lhs, typename Rhs, int RhsCols>
+struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,RhsCols>
+{
+  inline static void run(int /*row*/, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
+  {
+    res = ei_dot_impl<
+      Lhs,
+      Block<Rhs, ei_traits<Rhs>::RowsAtCompileTime, 1>,
+      LinearVectorization, NoUnrolling>::run(lhs, rhs.col(col));
+  }
+};
+
+template<typename Lhs, typename Rhs, int LhsRows>
+struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,LhsRows,1>
+{
+  inline static void run(int row, int /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
+  {
+    res = ei_dot_impl<
+      Block<Lhs, 1, ei_traits<Lhs>::ColsAtCompileTime>,
+      Rhs,
+      LinearVectorization, NoUnrolling>::run(lhs.row(row), rhs);
+  }
+};
+
+template<typename Lhs, typename Rhs>
+struct ei_product_coeff_impl<InnerVectorization, Dynamic, Lhs, Rhs>
+{
+  inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
+  {
+    ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs>::run(row, col, lhs, rhs, res);
+  }
+};
+
 /*******************
 *** Packet path  ***
 *******************/
@@ -425,6 +480,46 @@ struct ei_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, PacketScalar, LoadMod
 * Cache friendly product callers and specific nested evaluation strategies
 ***************************************************************************/
 
+template<typename ProductType,
+  int LhsRows  = ei_traits<ProductType>::RowsAtCompileTime,
+  int LhsOrder = int(ei_traits<ProductType>::LhsFlags)&RowMajorBit ? RowMajor : ColMajor,
+  int RhsCols  = ei_traits<ProductType>::ColsAtCompileTime,
+  int RhsOrder = int(ei_traits<ProductType>::RhsFlags)&RowMajorBit ? RowMajor : ColMajor>
+struct ei_cache_friendly_product_selector
+{
+  template<typename DestDerived>
+  inline static void run(DestDerived& res, const ProductType& product)
+  {
+    product._cacheFriendlyEvalAndAdd(res);
+  }
+};
+
+// optimized colmajor * vector path
+template<typename ProductType, int LhsRows, int RhsOrder>
+struct ei_cache_friendly_product_selector<ProductType,LhsRows,ColMajor,1,RhsOrder>
+{
+  template<typename DestDerived>
+  inline static void run(DestDerived& res, const ProductType& product)
+  {
+    const int rows = product.rhs().rows();
+    for (int j=0; j<rows; ++j)
+      res += product.rhs().coeff(j) * product.lhs().col(j);
+  }
+};
+
+// optimized vector * rowmajor path
+template<typename ProductType, int LhsOrder, int RhsCols>
+struct ei_cache_friendly_product_selector<ProductType,1,LhsOrder,RhsCols,RowMajor>
+{
+  template<typename DestDerived>
+  inline static void run(DestDerived& res, const ProductType& product)
+  {
+    const int cols = product.lhs().cols();
+    for (int j=0; j<cols; ++j)
+      res += product.lhs().coeff(j) * product.rhs().row(j);
+  }
+};
+
 /** \internal */
 template<typename Derived>
 template<typename Lhs,typename Rhs>
@@ -432,7 +527,7 @@ inline Derived&
 MatrixBase<Derived>::operator+=(const Flagged<Product<Lhs,Rhs,CacheFriendlyProduct>, 0, EvalBeforeNestingBit | EvalBeforeAssigningBit>& other)
 {
   if (other._expression()._useCacheFriendlyProduct())
-    other._expression()._cacheFriendlyEvalAndAdd(const_cast_derived());
+    ei_cache_friendly_product_selector<Product<Lhs,Rhs,CacheFriendlyProduct> >::run(const_cast_derived(), other._expression());
   else
     lazyAssign(derived() + other._expression());
   return derived();
@@ -445,7 +540,7 @@ inline Derived& MatrixBase<Derived>::lazyAssign(const Product<Lhs,Rhs,CacheFrien
   if (product._useCacheFriendlyProduct())
   {
     setZero();
-    product._cacheFriendlyEvalAndAdd(derived());
+    ei_cache_friendly_product_selector<Product<Lhs,Rhs,CacheFriendlyProduct> >::run(const_cast_derived(), product);
   }
   else
   {
diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h
index aa2ac44d5..dba19f025 100644
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@@ -49,7 +49,7 @@ struct ei_traits<Transpose<MatrixType> >
     MaxRowsAtCompileTime = MatrixType::MaxColsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     Flags = ((int(_MatrixTypeNested::Flags) ^ RowMajorBit)
-          & ~( LinearAccessBit | LowerTriangularBit | UpperTriangularBit))
+          & ~(LowerTriangularBit | UpperTriangularBit))
           | (int(_MatrixTypeNested::Flags)&UpperTriangularBit ? LowerTriangularBit : 0)
           | (int(_MatrixTypeNested::Flags)&LowerTriangularBit ? UpperTriangularBit : 0),
     CoeffReadCost = _MatrixTypeNested::CoeffReadCost
@@ -84,6 +84,16 @@ template<typename MatrixType> class Transpose
       return m_matrix.coeff(col, row);
     }
 
+    inline const Scalar coeff(int index) const
+    {
+      return m_matrix.coeff(index);
+    }
+
+    inline Scalar& coeffRef(int index)
+    {
+      return m_matrix.const_cast_derived().coeffRef(index);
+    }
+
     template<int LoadMode>
     inline const PacketScalar packet(int row, int col) const
     {
@@ -96,6 +106,18 @@ template<typename MatrixType> class Transpose
       m_matrix.const_cast_derived().template writePacket<LoadMode>(col, row, x);
     }
 
+    template<int LoadMode>
+    inline const PacketScalar packet(int index) const
+    {
+      return m_matrix.template packet<LoadMode>(index);
+    }
+
+    template<int LoadMode>
+    inline void writePacket(int index, const PacketScalar& x)
+    {
+      m_matrix.const_cast_derived().template writePacket<LoadMode>(index, x);
+    }
+
   protected:
     const typename MatrixType::Nested m_matrix;
 };