From 861d18d5532546ddb0cd2bff8795eda080ce0c85 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Sat, 12 Jul 2008 22:59:34 +0000
Subject: [PATCH] * Optimization: added a specialization of Block for xpr with
 DirectAccessBit * some simplifications and fixes in cache friendly products

---
 Eigen/src/Core/Block.h                    | 139 +++++++++++++++++++++-
 Eigen/src/Core/CacheFriendlyProduct.h     |  99 +++++++--------
 Eigen/src/Core/Product.h                  |  24 +---
 Eigen/src/Core/util/Constants.h           |   5 +
 Eigen/src/Core/util/ForwardDeclarations.h |   3 +-
 bench/benchBlasGemm.cpp                   |   8 +-
 6 files changed, 198 insertions(+), 80 deletions(-)
diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index ae4e83c9e..e7eb87a26 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -56,8 +56,8 @@
   *
   * \sa MatrixBase::block(int,int,int,int), MatrixBase::block(int,int), class VectorBlock
   */
-template<typename MatrixType, int BlockRows, int BlockCols>
-struct ei_traits<Block<MatrixType, BlockRows, BlockCols> >
+template<typename MatrixType, int BlockRows, int BlockCols, int DirectAccesStatus>
+struct ei_traits<Block<MatrixType, BlockRows, BlockCols, DirectAccesStatus> >
 {
   typedef typename MatrixType::Scalar Scalar;
   enum{
@@ -83,8 +83,8 @@ struct ei_traits<Block<MatrixType, BlockRows, BlockCols> >
   };
 };
 
-template<typename MatrixType, int BlockRows, int BlockCols> class Block
-  : public MatrixBase<Block<MatrixType, BlockRows, BlockCols> >
+template<typename MatrixType, int BlockRows, int BlockCols, int DirectAccesStatus> class Block
+  : public MatrixBase<Block<MatrixType, BlockRows, BlockCols, DirectAccesStatus> >
 {
   public:
 
@@ -203,6 +203,137 @@ template<typename MatrixType, int BlockRows, int BlockCols> class Block
     const ei_int_if_dynamic<ColsAtCompileTime> m_blockCols;
 };
 
+/** \internal */
+template<typename MatrixType, int BlockRows, int BlockCols> class Block<MatrixType,BlockRows,BlockCols,HasDirectAccess>
+  : public MatrixBase<Block<MatrixType, BlockRows, BlockCols,HasDirectAccess> >
+{
+    enum {
+      IsRowMajor = int(ei_traits<MatrixType>::Flags)&RowMajorBit ? 1 : 0
+    };
+
+  public:
+
+    EIGEN_GENERIC_PUBLIC_INTERFACE(Block)
+
+    /** Column or Row constructor
+      */
+    inline Block(const MatrixType& matrix, int i)
+      : m_matrix(matrix),
+        m_data_ptr(&matrix.const_cast_derived().coeffRef(
+          (BlockRows==1) && (BlockCols==MatrixType::ColsAtCompileTime) ? i : 0,
+          (BlockRows==MatrixType::RowsAtCompileTime) && (BlockCols==1) ? i : 0)),
+        m_blockRows(matrix.rows()),
+        m_blockCols(matrix.cols())
+    {
+      ei_assert( (i>=0) && (
+          ((BlockRows==1) && (BlockCols==MatrixType::ColsAtCompileTime) && i<matrix.rows())
+        ||((BlockRows==MatrixType::RowsAtCompileTime) && (BlockCols==1) && i<matrix.cols())));
+    }
+
+    /** Fixed-size constructor
+      */
+    inline Block(const MatrixType& matrix, int startRow, int startCol)
+      : m_matrix(matrix), m_data_ptr(&matrix.const_cast_derived().coeffRef(startRow,startCol))
+    {
+      ei_assert(RowsAtCompileTime!=Dynamic && RowsAtCompileTime!=Dynamic);
+      ei_assert(startRow >= 0 && BlockRows >= 1 && startRow + BlockRows <= matrix.rows()
+          && startCol >= 0 && BlockCols >= 1 && startCol + BlockCols <= matrix.cols());
+    }
+
+    /** Dynamic-size constructor
+      */
+    inline Block(const MatrixType& matrix,
+          int startRow, int startCol,
+          int blockRows, int blockCols)
+      : m_matrix(matrix), m_data_ptr(&matrix.const_cast_derived().coeffRef(startRow,startCol)),
+        m_blockRows(blockRows), m_blockCols(blockCols)
+    {
+      ei_assert((RowsAtCompileTime==Dynamic || RowsAtCompileTime==blockRows)
+          && (ColsAtCompileTime==Dynamic || ColsAtCompileTime==blockCols));
+      ei_assert(startRow >= 0 && blockRows >= 1 && startRow + blockRows <= matrix.rows()
+          && startCol >= 0 && blockCols >= 1 && startCol + blockCols <= matrix.cols());
+    }
+
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Block)
+
+    inline int rows() const { return m_blockRows.value(); }
+    inline int cols() const { return m_blockCols.value(); }
+
+    inline int stride(void) const { return m_matrix.stride(); }
+
+    inline Scalar& coeffRef(int row, int col)
+    {
+      if (IsRowMajor)
+        return m_data_ptr[col + row * stride()];
+      else
+        return m_data_ptr[row + col * stride()];
+    }
+
+    inline const Scalar coeff(int row, int col) const
+    {
+//       std::cerr << "coeff(int row, int col)\n";
+      if (IsRowMajor)
+        return m_data_ptr[col + row * stride()];
+      else
+        return m_data_ptr[row + col * stride()];
+    }
+
+    inline Scalar& coeffRef(int index)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Block);
+      return m_data_ptr[index];
+    }
+
+    inline const Scalar coeff(int index) const
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Block);
+      if ( (RowsAtCompileTime == 1) == IsRowMajor )
+        return m_data_ptr[index];
+      else
+        return m_data_ptr[index*stride()];
+    }
+
+    template<int LoadMode>
+    inline PacketScalar packet(int row, int col) const
+    {
+      if (IsRowMajor)
+        return ei_ploadu(&m_data_ptr[col + row * stride()]);
+      else
+        return ei_ploadu(&m_data_ptr[row + col * stride()]);
+    }
+
+    template<int LoadMode>
+    inline void writePacket(int row, int col, const PacketScalar& x)
+    {
+      if (IsRowMajor)
+        ei_pstoreu(&m_data_ptr[col + row * stride()], x);
+      else
+        ei_pstoreu(&m_data_ptr[row + col * stride()], x);
+    }
+
+    template<int LoadMode>
+    inline PacketScalar packet(int index) const
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Block);
+      return ei_ploadu(&m_data_ptr[index]);
+    }
+
+    template<int LoadMode>
+    inline void writePacket(int index, const PacketScalar& x)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Block);
+      ei_pstoreu(&m_data_ptr[index], x);
+    }
+
+  protected:
+
+    const typename MatrixType::Nested m_matrix;
+    Scalar* m_data_ptr;
+    const ei_int_if_dynamic<RowsAtCompileTime> m_blockRows;
+    const ei_int_if_dynamic<ColsAtCompileTime> m_blockCols;
+};
+
+
 /** \returns a dynamic-size expression of a block in *this.
   *
   * \param startRow the first row in the block
diff --git a/Eigen/src/Core/CacheFriendlyProduct.h b/Eigen/src/Core/CacheFriendlyProduct.h
index a710d44d4..06b3f5876 100644
--- a/Eigen/src/Core/CacheFriendlyProduct.h
+++ b/Eigen/src/Core/CacheFriendlyProduct.h
@@ -367,7 +367,7 @@ static void ei_cache_friendly_product(
  * TODO: since rhs gets evaluated only once, no need to evaluate it
  */
 template<typename Scalar, typename RhsType>
-EIGEN_DONT_INLINE static void ei_cache_friendly_product(
+EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
   int size,
   const Scalar* lhs, int lhsStride,
   const RhsType& rhs,
@@ -408,54 +408,34 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product(
                        : alignmentStep==2 ? EvenAligned
                        : FirstAligned;
 
-  // find how many column do we have to skip to be aligned with the result (if possible)
+  // find how many columns do we have to skip to be aligned with the result (if possible)
   int skipColumns=0;
-  for (; skipColumns<PacketSize; ++skipColumns)
-  {
-    if (alignedStart == alignmentStep*skipColumns)
-      break;
-  }
+  for (; skipColumns<PacketSize && alignedStart != alignmentStep*skipColumns; ++skipColumns)
+  {}
   if (skipColumns==PacketSize)
+  {
+    // nothing can be aligned, no need to skip any column
     alignmentPattern = NoneAligned;
-  skipColumns = std::min(skipColumns,rhs.size());
-  if (alignmentPattern!=NoneAligned)
-    for (int i=0; i<skipColumns; i++)
-    {
-      Scalar tmp0 = rhs[i];
-      Packet ptmp0 = ei_pset1(tmp0);
-      int iN0 = i*lhsStride;
-      // process first unaligned result's coeffs
-      for (int j=0; j<alignedStart; j++)
-        res[j] += tmp0 * lhs[j+iN0];
-      // process aligned result's coeffs (we know the lhs columns are not aligned)
-      for (int j = alignedStart;j<alignedSize;j+=PacketSize)
-        ei_pstore(&res[j], ei_padd(ei_pmul(ptmp0,ei_ploadu(&lhs[j+iN0])),ei_pload(&res[j])));
-      // process remaining result's coeffs
-      for (int j=alignedSize; j<size; j++)
-        res[j] += tmp0 * lhs[j+iN0];
-    }
+    skipColumns = 0;
+  }
+  else
+  {
+    skipColumns = std::min(skipColumns,rhs.size());
+    // note that the skiped columns are processed later.
+  }
 
   int columnBound = (rhs.size()/columnsAtOnce)*columnsAtOnce;
-  for (int i=0; i<columnBound; i+=columnsAtOnce)
+  for (int i=skipColumns; i<columnBound; i+=columnsAtOnce)
   {
-    Scalar tmp0 = rhs[i];
-    Packet ptmp0 = ei_pset1(tmp0);
-    Scalar tmp1 = rhs[i+1];
-    Packet ptmp1 = ei_pset1(tmp1);
-    Scalar tmp2 = rhs[i+2];
-    Packet ptmp2 = ei_pset1(tmp2);
-    Scalar tmp3 = rhs[i+3];
-    Packet ptmp3 = ei_pset1(tmp3);
-    int iN0 = i*lhsStride;
-    int iN1 = (i+1)*lhsStride;
-    int iN2 = (i+2)*lhsStride;
-    int iN3 = (i+3)*lhsStride;
+    Scalar tmp0  = rhs[i], tmp1 = rhs[i+1], tmp2 = rhs[i+2], tmp3 = rhs[i+3];
+    Packet ptmp0 = ei_pset1(tmp0), ptmp1 = ei_pset1(tmp1), ptmp2 = ei_pset1(tmp2), ptmp3 = ei_pset1(tmp3);
+    int iN0 = i*lhsStride, iN1 = (i+1)*lhsStride, iN2 = (i+2)*lhsStride, iN3 = (i+3)*lhsStride;
 
     // process initial unaligned coeffs
     for (int j=0; j<alignedStart; j++)
       res[j] += tmp0 * lhs[j+iN0] + tmp1 * lhs[j+iN1] + tmp2 * lhs[j+iN2] + tmp3 * lhs[j+iN3];
 
-    if (alignedSize>0)
+    if (alignedSize>alignedStart)
     {
       switch(alignmentPattern)
       {
@@ -475,10 +455,6 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product(
               _EIGEN_ACCUMULATE_PACKETS(,u,u,+PacketSize);
               if (peels>2) _EIGEN_ACCUMULATE_PACKETS(,u,u,+2*PacketSize);
               if (peels>3) _EIGEN_ACCUMULATE_PACKETS(,u,u,+3*PacketSize);
-              if (peels>4) _EIGEN_ACCUMULATE_PACKETS(,u,u,+4*PacketSize);
-              if (peels>5) _EIGEN_ACCUMULATE_PACKETS(,u,u,+5*PacketSize);
-              if (peels>6) _EIGEN_ACCUMULATE_PACKETS(,u,u,+6*PacketSize);
-              if (peels>7) _EIGEN_ACCUMULATE_PACKETS(,u,u,+7*PacketSize);
             }
           for (int j = peeledSize; j<alignedSize; j+=PacketSize)
             _EIGEN_ACCUMULATE_PACKETS(,u,u,);
@@ -494,25 +470,42 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product(
     for (int j=alignedSize; j<size; j++)
       res[j] += tmp0 * lhs[j+iN0] + tmp1 * lhs[j+iN1] + tmp2 * lhs[j+iN2] + tmp3 * lhs[j+iN3];
   }
-  for (int i=columnBound; i<rhs.size(); i++)
+
+  // process remaining first and last columns (at most columnsAtOnce-1)
+  int end = rhs.size();
+  int start = columnBound;
+  do
   {
-    Scalar tmp0 = rhs[i];
-    Packet ptmp0 = ei_pset1(tmp0);
-    int iN0 = i*lhsStride;
-    if (alignedSize>0)
+    for (int i=columnBound; i<end; i++)
     {
-      bool aligned0 = (iN0 % PacketSize) == 0;
-      if (aligned0)
+      Scalar tmp0 = rhs[i];
+      Packet ptmp0 = ei_pset1(tmp0);
+      int iN0 = i*lhsStride;
+      // process first unaligned result's coeffs
+      for (int j=0; j<alignedStart; j++)
+        res[j] += tmp0 * lhs[j+iN0];
+
+      // process aligned result's coeffs
+      if ((iN0 % PacketSize) == 0)
         for (int j = 0;j<alignedSize;j+=PacketSize)
           ei_pstore(&res[j], ei_padd(ei_pmul(ptmp0,ei_pload(&lhs[j+iN0])),ei_pload(&res[j])));
       else
         for (int j = 0;j<alignedSize;j+=PacketSize)
           ei_pstore(&res[j], ei_padd(ei_pmul(ptmp0,ei_ploadu(&lhs[j+iN0])),ei_pload(&res[j])));
+
+      // process remaining scalars
+      for (int j=alignedSize; j<size; j++)
+        res[j] += tmp0 * lhs[j+iN0];
     }
-    // process remaining scalars
-    for (int j=alignedSize; j<size; j++)
-      res[j] += tmp0 * lhs[j+iN0];
-  }
+    if (skipColumns)
+    {
+      start = 0;
+      end = skipColumns;
+      skipColumns = 0;
+    }
+    else
+      break;
+  } while(true);
   asm("#end matrix_vector_product");
 
   #undef _EIGEN_ACCUMULATE_PACKETS
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index c2f0c07a8..31677e6cb 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -365,8 +365,7 @@ struct ei_product_coeff_impl<InnerVectorization, Index, Lhs, Rhs>
   }
 };
 
-// FIXME the following is a hack to get very high perf with matrix-vector product,
-// however, it would be preferable to switch for more general dynamic alignment queries
+// NOTE the following specializations are because taking .col(0) on a vector is a bit slower
 template<typename Lhs, typename Rhs, int LhsRows = Lhs::RowsAtCompileTime, int RhsCols = Rhs::ColsAtCompileTime>
 struct ei_product_coeff_vectorized_dyn_selector
 {
@@ -481,14 +480,9 @@ struct ei_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, PacketScalar, LoadMod
 ***************************************************************************/
 
 template<typename Scalar, typename RhsType>
-static void ei_cache_friendly_product(
+static void ei_cache_friendly_product_colmajor_times_vector(
   int size, const Scalar* lhs, int lhsStride, const RhsType& rhs, Scalar* res);
 
-enum {
-  HasDirectAccess,
-  NoDirectAccess
-};
-
 template<typename ProductType,
   int LhsRows  = ei_traits<ProductType>::RowsAtCompileTime,
   int LhsOrder = int(ei_traits<ProductType>::LhsFlags)&RowMajorBit ? RowMajor : ColMajor,
@@ -507,19 +501,13 @@ struct ei_cache_friendly_product_selector
 
 // optimized colmajor * vector path
 template<typename ProductType, int LhsRows, int RhsOrder, int RhsAccess>
-struct ei_cache_friendly_product_selector<ProductType,LhsRows,NoDirectAccess,ColMajor,1,RhsOrder,RhsAccess>
+struct ei_cache_friendly_product_selector<ProductType,LhsRows,ColMajor,NoDirectAccess,1,RhsOrder,RhsAccess>
 {
-  typedef typename ei_traits<ProductType>::_LhsNested Lhs;
   template<typename DestDerived>
   inline static void run(DestDerived& res, const ProductType& product)
   {
-    ei_scalar_sum_op<typename ProductType::Scalar> _sum;
     const int size = product.rhs().rows();
     for (int k=0; k<size; ++k)
-      if (Lhs::Flags&DirectAccessBit)
-        // TODO to properly hanlde this workaround let's specialize Block for type having the DirectAccessBit
-        res += product.rhs().coeff(k) * Map<DestDerived>(&product.lhs().const_cast_derived().coeffRef(0,k),product.lhs().rows());
-      else
         res += product.rhs().coeff(k) * product.lhs().col(k);
   }
 };
@@ -527,7 +515,7 @@ struct ei_cache_friendly_product_selector<ProductType,LhsRows,NoDirectAccess,Col
 // optimized cache friendly colmajor * vector path for matrix with direct access flag
 // NOTE this path coul also be enabled for expressions if we add runtime align queries
 template<typename ProductType, int LhsRows, int RhsOrder, int RhsAccess>
-struct ei_cache_friendly_product_selector<ProductType,LhsRows,HasDirectAccess,ColMajor,1,RhsOrder,RhsAccess>
+struct ei_cache_friendly_product_selector<ProductType,LhsRows,ColMajor,HasDirectAccess,1,RhsOrder,RhsAccess>
 {
   typedef typename ProductType::Scalar Scalar;
 
@@ -545,7 +533,7 @@ struct ei_cache_friendly_product_selector<ProductType,LhsRows,HasDirectAccess,Co
       _res = (Scalar*)alloca(sizeof(Scalar)*res.size());
       Map<Matrix<Scalar,DestDerived::RowsAtCompileTime,1> >(_res, res.size()) = res;
     }
-    ei_cache_friendly_product(res.size(),
+    ei_cache_friendly_product_colmajor_times_vector(res.size(),
       &product.lhs().const_cast_derived().coeffRef(0,0), product.lhs().stride(),
       product.rhs(), _res);
 
@@ -588,7 +576,7 @@ struct ei_cache_friendly_product_selector<ProductType,1,LhsOrder,LhsAccess,RhsCo
       _res = (Scalar*)alloca(sizeof(Scalar)*res.size());
       Map<Matrix<Scalar,DestDerived::RowsAtCompileTime,1> >(_res, res.size()) = res;
     }
-    ei_cache_friendly_product(res.size(),
+    ei_cache_friendly_product_colmajor_times_vector(res.size(),
       &product.rhs().const_cast_derived().coeffRef(0,0), product.rhs().stride(),
       product.lhs().transpose(), _res);
 
diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h
index c51aa282b..7e2d37dff 100644
--- a/Eigen/src/Core/util/Constants.h
+++ b/Eigen/src/Core/util/Constants.h
@@ -206,6 +206,11 @@ enum {
   RowMajor = RowMajorBit
 };
 
+enum {
+  NoDirectAccess = 0,
+  HasDirectAccess = DirectAccessBit
+};
+
 const int FullyCoherentAccessPattern  = 0x1;
 const int InnerCoherentAccessPattern  = 0x2 | FullyCoherentAccessPattern;
 const int OuterCoherentAccessPattern  = 0x4 | InnerCoherentAccessPattern;
diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h
index 922a3716f..f96d57747 100644
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@@ -42,7 +42,8 @@ class Matrix;
 template<typename ExpressionType, unsigned int Added, unsigned int Removed> class Flagged;
 template<typename ExpressionType> class NestByValue;
 template<typename MatrixType> class Minor;
-template<typename MatrixType, int BlockRows=Dynamic, int BlockCols=Dynamic> class Block;
+template<typename MatrixType, int BlockRows=Dynamic, int BlockCols=Dynamic,
+         int DirectAccessStatus = ei_traits<MatrixType>::Flags&DirectAccessBit> class Block;
 template<typename MatrixType> class Transpose;
 template<typename MatrixType> class Conjugate;
 template<typename NullaryOp, typename MatrixType>         class CwiseNullaryOp;
diff --git a/bench/benchBlasGemm.cpp b/bench/benchBlasGemm.cpp
index 02f067e1a..5455b6ed5 100644
--- a/bench/benchBlasGemm.cpp
+++ b/bench/benchBlasGemm.cpp
@@ -82,12 +82,12 @@ int main(int argc, char *argv[])
     std::cout << "Usage: " << argv[0] << " size nbloops nbtries\n";
     std::cout << "Usage: " << argv[0] << " M N K nbloops nbtries\n";
     std::cout << "Usage: " << argv[0] << " check\n";
-    std::cout << "Options:\n"
+    std::cout << "Options:\n";
     std::cout << "    size       unique size of the 2 matrices (integer)\n";
     std::cout << "    auto       automatically set the number of repetitions and tries\n";
-    std::cout << "    nbloops    number of times the GEMM routines is executed\n"
-    std::cout << "    nbtries    number of times the loop is benched (return the best try)\n"
-    std::cout << "    M N K      sizes of the matrices: MxN  =  MxK * KxN (integers)\n"
+    std::cout << "    nbloops    number of times the GEMM routines is executed\n";
+    std::cout << "    nbtries    number of times the loop is benched (return the best try)\n";
+    std::cout << "    M N K      sizes of the matrices: MxN  =  MxK * KxN (integers)\n";
     std::cout << "    check      check eigen product using cblas as a reference\n";
     exit(1);
   }