From fb4a1519829eabef0699b297fa493c2c495631e5 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Thu, 19 Jun 2008 23:00:51 +0000
Subject: [PATCH] * more cleaning in Product * make Matrix2f (and similar)
 vectorized using linear path * fix a couple of warnings and compilation
 issues with ICC and gcc 3.3/3.4   (cannot get Transform to compile with gcc
 3.3/3.4, see the FIXME)

---
 Eigen/src/Core/Assign.h         |  28 +++++----
 Eigen/src/Core/DiagonalCoeffs.h |   4 +-
 Eigen/src/Core/Product.h        | 101 +++++++++++++-------------------
 Eigen/src/Core/Redux.h          |  16 ++---
 Eigen/src/Core/util/Meta.h      |   4 +-
 Eigen/src/Geometry/Cross.h      |   4 +-
 Eigen/src/Geometry/Transform.h  |  91 ++++++++++++++--------------
 7 files changed, 112 insertions(+), 136 deletions(-)
diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h
index 681b3d4ef..b0e885dfe 100644
--- a/Eigen/src/Core/Assign.h
+++ b/Eigen/src/Core/Assign.h
@@ -238,7 +238,7 @@ template<typename Derived1, typename Derived2>
 struct ei_assign_impl<Derived1, Derived2, NoVectorization, InnerUnrolling>
 {
   static void run(Derived1 &dst, const Derived2 &src)
-  {  
+  {
     const bool rowMajor = int(Derived1::Flags)&RowMajorBit;
     const int innerSize = rowMajor ? Derived1::ColsAtCompileTime : Derived1::RowsAtCompileTime;
     const int outerSize = rowMajor ? dst.rows() : dst.cols();
@@ -268,7 +268,7 @@ struct ei_assign_impl<Derived1, Derived2, InnerVectorization, NoUnrolling>
         const int row = rowMajor ? j : i;
         const int col = rowMajor ? i : j;
         dst.template writePacket<Aligned>(row, col, src.template packet<Aligned>(row, col));
-      }  
+      }
     }
   }
 };
@@ -351,23 +351,25 @@ struct ei_assign_impl<Derived1, Derived2, LinearVectorization, CompleteUnrolling
 {
   inline static void run(Derived1 &dst, const Derived2 &src)
   {
-    const int size = Derived1::SizeAtCompileTime;
-    const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
-    const int alignedSize = (size/packetSize)*packetSize;
-    const bool rowMajor = Derived1::Flags&RowMajorBit;
-    const int innerSize = rowMajor ? Derived1::ColsAtCompileTime : Derived1::RowsAtCompileTime;
-    const int outerSize = rowMajor ? Derived1::RowsAtCompileTime : Derived1::ColsAtCompileTime;
-    int index = 0;
+    enum {
+      size = Derived1::SizeAtCompileTime,
+      packetSize = ei_packet_traits<typename Derived1::Scalar>::size,
+      alignedSize = (int(size)/int(packetSize))*int(packetSize),
+      rowMajor = int(Derived1::Flags)&RowMajorBit,
+      innerSize = int(rowMajor) ? int(Derived1::ColsAtCompileTime) : int(Derived1::RowsAtCompileTime),
+      outerSize = int(rowMajor) ? int(Derived1::RowsAtCompileTime) : int(Derived1::ColsAtCompileTime)
+    };
 
     // do the vectorizable part of the assignment
     ei_assign_innervec_CompleteUnrolling<Derived1, Derived2, 0, alignedSize>::run(dst, src);
 
     // now we must do the rest without vectorization.
-    const int k = alignedSize/innerSize;
-    const int i = alignedSize%innerSize;
-
+    enum {
+      k = int(alignedSize)/int(innerSize),
+      i = int(alignedSize)%int(innerSize)
+    };
     // do the remainder of the current row or col
-    ei_assign_novec_InnerUnrolling<Derived1, Derived2, i, innerSize>::run(dst, src, k);
+    ei_assign_novec_InnerUnrolling<Derived1, Derived2, i, int(k)<int(outerSize) ? int(innerSize) : 0>::run(dst, src, k);
 
     // do the remaining rows or cols
     for(int j = k+1; j < outerSize; j++)
diff --git a/Eigen/src/Core/DiagonalCoeffs.h b/Eigen/src/Core/DiagonalCoeffs.h
index 75469b4bf..b7d3ef475 100644
--- a/Eigen/src/Core/DiagonalCoeffs.h
+++ b/Eigen/src/Core/DiagonalCoeffs.h
@@ -101,8 +101,8 @@ template<typename MatrixType> class DiagonalCoeffs
   *
   * \sa class DiagonalCoeffs */
 template<typename Derived>
-DiagonalCoeffs<Derived>
-inline MatrixBase<Derived>::diagonal()
+inline DiagonalCoeffs<Derived>
+MatrixBase<Derived>::diagonal()
 {
   return DiagonalCoeffs<Derived>(derived());
 }
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index f03ea4e8e..1f387af32 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -201,14 +201,21 @@ template<typename LhsNested, typename RhsNested, int ProductMode> class Product
       ei_assert(lhs.cols() == rhs.rows());
     }
 
-    /** \internal */
-    template<typename DestDerived>
-    void _cacheFriendlyEval(DestDerived& res) const;
-
-    /** \internal */
+    /** \internal
+      * compute \a res += \c *this using the cache friendly product.
+      */
     template<typename DestDerived>
     void _cacheFriendlyEvalAndAdd(DestDerived& res) const;
 
+    /** \internal
+      * \returns whether it is worth it to use the cache friendly product.
+      */
+    inline bool _useCacheFriendlyProduct() const {
+      return   _rows()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+            && _cols()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+            && m_lhs.cols()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD;
+    }
+
   private:
 
     inline int _rows() const { return m_lhs.rows(); }
@@ -229,7 +236,7 @@ template<typename LhsNested, typename RhsNested, int ProductMode> class Product
       return res;
     }
 
-    template<typename Lhs_, typename Rhs_, int EvalMode_, typename DestDerived_, bool DirectAccess_>
+    template<typename Lhs_, typename Rhs_, int ProductMode_, typename DestDerived_, bool DirectAccess_>
     friend struct ei_cache_friendly_selector;
 
   protected:
@@ -419,7 +426,10 @@ template<typename Lhs,typename Rhs>
 inline Derived&
 MatrixBase<Derived>::operator+=(const Flagged<Product<Lhs,Rhs,CacheFriendlyProduct>, 0, EvalBeforeNestingBit | EvalBeforeAssigningBit>& other)
 {
-  other._expression()._cacheFriendlyEvalAndAdd(const_cast_derived());
+  if (other._expression()._useCacheFriendlyProduct())
+    other._expression()._cacheFriendlyEvalAndAdd(const_cast_derived());
+  else
+    lazyAssign(derived() + other._expression());
   return derived();
 }
 
@@ -427,7 +437,15 @@ template<typename Derived>
 template<typename Lhs, typename Rhs>
 inline Derived& MatrixBase<Derived>::lazyAssign(const Product<Lhs,Rhs,CacheFriendlyProduct>& product)
 {
-  product._cacheFriendlyEval(derived());
+  if (product._useCacheFriendlyProduct())
+  {
+    setZero();
+    product._cacheFriendlyEvalAndAdd(derived());
+  }
+  else
+  {
+    lazyAssign<Product<Lhs,Rhs,CacheFriendlyProduct> >(product);
+  }
   return derived();
 }
 
@@ -472,61 +490,22 @@ template<typename T> struct ei_product_copy_lhs
     >::ret type;
 };
 
-template<typename Lhs, typename Rhs, int EvalMode>
+template<typename Lhs, typename Rhs, int ProductMode>
 template<typename DestDerived>
-inline void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEval(DestDerived& res) const
+inline void Product<Lhs,Rhs,ProductMode>::_cacheFriendlyEvalAndAdd(DestDerived& res) const
 {
-    if ( _rows()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
-      && _cols()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
-      && m_lhs.cols()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
-    )
-    {
-      res.setZero();
-      typedef typename ei_product_copy_lhs<_LhsNested>::type LhsCopy;
-      typedef typename ei_unref<LhsCopy>::type _LhsCopy;
-      typedef typename ei_product_copy_rhs<_RhsNested>::type RhsCopy;
-      typedef typename ei_unref<RhsCopy>::type _RhsCopy;
-      LhsCopy lhs(m_lhs);
-      RhsCopy rhs(m_rhs);
-      ei_cache_friendly_product<Scalar>(
-        _rows(), _cols(), lhs.cols(),
-        _LhsCopy::Flags&RowMajorBit, &(lhs.const_cast_derived().coeffRef(0,0)), lhs.stride(),
-        _RhsCopy::Flags&RowMajorBit, &(rhs.const_cast_derived().coeffRef(0,0)), rhs.stride(),
-        Flags&RowMajorBit, &(res.coeffRef(0,0)), res.stride()
-      );
-    }
-    else
-    {
-      res = Product<_LhsNested,_RhsNested,NormalProduct>(m_lhs, m_rhs).lazy();
-    }
-}
-
-template<typename Lhs, typename Rhs, int EvalMode>
-template<typename DestDerived>
-inline void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEvalAndAdd(DestDerived& res) const
-{
-    if ( _rows()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
-      && _cols()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
-      && m_lhs.cols()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
-    )
-    {
-      typedef typename ei_product_copy_lhs<_LhsNested>::type LhsCopy;
-      typedef typename ei_unref<LhsCopy>::type _LhsCopy;
-      typedef typename ei_product_copy_rhs<_RhsNested>::type RhsCopy;
-      typedef typename ei_unref<RhsCopy>::type _RhsCopy;
-      LhsCopy lhs(m_lhs);
-      RhsCopy rhs(m_rhs);
-      ei_cache_friendly_product<Scalar>(
-        _rows(), _cols(), lhs.cols(),
-        _LhsCopy::Flags&RowMajorBit, &(lhs.const_cast_derived().coeffRef(0,0)), lhs.stride(),
-        _RhsCopy::Flags&RowMajorBit, &(rhs.const_cast_derived().coeffRef(0,0)), rhs.stride(),
-        Flags&RowMajorBit, &(res.coeffRef(0,0)), res.stride()
-      );
-    }
-    else
-    {
-      res += Product<_LhsNested,_RhsNested,NormalProduct>(m_lhs, m_rhs).lazy();
-    }
+  typedef typename ei_product_copy_lhs<_LhsNested>::type LhsCopy;
+  typedef typename ei_unref<LhsCopy>::type _LhsCopy;
+  typedef typename ei_product_copy_rhs<_RhsNested>::type RhsCopy;
+  typedef typename ei_unref<RhsCopy>::type _RhsCopy;
+  LhsCopy lhs(m_lhs);
+  RhsCopy rhs(m_rhs);
+  ei_cache_friendly_product<Scalar>(
+    _rows(), _cols(), lhs.cols(),
+    _LhsCopy::Flags&RowMajorBit, &(lhs.const_cast_derived().coeffRef(0,0)), lhs.stride(),
+    _RhsCopy::Flags&RowMajorBit, &(rhs.const_cast_derived().coeffRef(0,0)), rhs.stride(),
+    Flags&RowMajorBit, &(res.coeffRef(0,0)), res.stride()
+  );
 }
 
 #endif // EIGEN_PRODUCT_H
diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h
index e7db140c5..4b93e20fd 100644
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -101,8 +101,8 @@ MatrixBase<Derived>::redux(const BinaryOp& func) const
   * \sa trace()
   */
 template<typename Derived>
-typename ei_traits<Derived>::Scalar
-inline MatrixBase<Derived>::sum() const
+inline typename ei_traits<Derived>::Scalar
+MatrixBase<Derived>::sum() const
 {
   return this->redux(Eigen::ei_scalar_sum_op<Scalar>());
 }
@@ -114,8 +114,8 @@ inline MatrixBase<Derived>::sum() const
   * \sa diagonal(), sum()
   */
 template<typename Derived>
-typename ei_traits<Derived>::Scalar
-inline MatrixBase<Derived>::trace() const
+inline typename ei_traits<Derived>::Scalar
+MatrixBase<Derived>::trace() const
 {
   return diagonal().sum();
 }
@@ -123,8 +123,8 @@ inline MatrixBase<Derived>::trace() const
 /** \returns the minimum of all coefficients of *this
   */
 template<typename Derived>
-typename ei_traits<Derived>::Scalar
-inline MatrixBase<Derived>::minCoeff() const
+inline typename ei_traits<Derived>::Scalar
+MatrixBase<Derived>::minCoeff() const
 {
   return this->redux(Eigen::ei_scalar_min_op<Scalar>());
 }
@@ -132,8 +132,8 @@ inline MatrixBase<Derived>::minCoeff() const
 /** \returns the maximum of all coefficients of *this
   */
 template<typename Derived>
-typename ei_traits<Derived>::Scalar
-inline MatrixBase<Derived>::maxCoeff() const
+inline typename ei_traits<Derived>::Scalar
+MatrixBase<Derived>::maxCoeff() const
 {
   return this->redux(Eigen::ei_scalar_max_op<Scalar>());
 }
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index 509b72cc0..078beb681 100644
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -156,10 +156,10 @@ class ei_corrected_matrix_flags
                          ? SuggestedFlags&RowMajorBit
                          : Cols > 1 ? RowMajorBit : 0,
            is_big = MaxRows == Dynamic || MaxCols == Dynamic,
-           inner_size = row_major_bit ? Cols : Rows,
+           linear_size = Cols * Rows,
            packet_access_bit
             = ei_packet_traits<Scalar>::size > 1
-              && (is_big || inner_size%ei_packet_traits<Scalar>::size==0)
+              && (is_big || linear_size%ei_packet_traits<Scalar>::size==0)
               ? PacketAccessBit : 0
     };
 
diff --git a/Eigen/src/Geometry/Cross.h b/Eigen/src/Geometry/Cross.h
index 1ee9a007c..61b630c2a 100644
--- a/Eigen/src/Geometry/Cross.h
+++ b/Eigen/src/Geometry/Cross.h
@@ -28,8 +28,8 @@
 /** \returns the cross product of \c *this and \a other */
 template<typename Derived>
 template<typename OtherDerived>
-typename ei_eval<Derived>::type
-inline MatrixBase<Derived>::cross(const MatrixBase<OtherDerived>& other) const
+inline typename ei_eval<Derived>::type
+MatrixBase<Derived>::cross(const MatrixBase<OtherDerived>& other) const
 {
   // Note that there is no need for an expression here since the compiler
   // optimize such a small temporary very well (even within a complex expression)
diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h
index b5c5b3a0d..0b5b2a3a0 100644
--- a/Eigen/src/Geometry/Transform.h
+++ b/Eigen/src/Geometry/Transform.h
@@ -62,6 +62,47 @@ protected:
   int OtherCols=Other::ColsAtCompileTime>
   struct ei_transform_product_impl;
 
+  // FIXME these specializations of ei_transform_product_impl do not work with gcc 3.3 and 3.4 because
+  // Dim depends on a template parameter. Replacing Dim by 3 (for the 3D case) works.
+
+  // note that these specializations have to be defined here,
+  // otherwise some compilers (at least ICC and NVCC) complain about
+  // the use of Dim in the specialization parameters.
+  template<typename Other>
+  struct ei_transform_product_impl<Other,Dim+1,Dim+1>
+  {
+    typedef typename Transform<Scalar,Dim>::MatrixType MatrixType;
+    typedef typename ProductReturnType<MatrixType,Other>::Type ResultType;
+    static ResultType run(const Transform<Scalar,Dim>& tr, const Other& other)
+    { return tr.matrix() * other; }
+  };
+
+  template<typename Other>
+  struct ei_transform_product_impl<Other,Dim+1,1>
+  {
+    typedef typename Transform<Scalar,Dim>::MatrixType MatrixType;
+    typedef typename ProductReturnType<MatrixType,Other>::Type ResultType;
+    static ResultType run(const Transform<Scalar,Dim>& tr, const Other& other)
+    { return tr.matrix() * other; }
+  };
+
+  template<typename Other>
+  struct ei_transform_product_impl<Other,Dim,1>
+  {
+    typedef typename Transform<Scalar,Dim>::AffineMatrixRef MatrixType;
+    typedef const CwiseUnaryOp<
+        ei_scalar_multiple_op<Scalar>,
+        NestByValue<CwiseBinaryOp<
+          ei_scalar_sum_op<Scalar>,
+          NestByValue<typename ProductReturnType<NestByValue<MatrixType>,Other>::Type >,
+          NestByValue<typename Transform<Scalar,Dim>::VectorRef> > >
+        > ResultType;
+    // FIXME shall we offer an optimized version when the last row is known to be 0,0...,0,1 ?
+    static ResultType run(const Transform<Scalar,Dim>& tr, const Other& other)
+    { return ((tr.affine().nestByValue() * other).nestByValue() + tr.translation().nestByValue()).nestByValue()
+            * (Scalar(1) / ( (tr.matrix().template block<1,Dim>(Dim,0) * other).coeff(0) + tr.matrix().coeff(Dim,Dim))); }
+  };
+
 public:
 
   /** Default constructor without initialization of the coefficients. */
@@ -103,13 +144,7 @@ public:
   inline VectorRef translation() { return m_matrix.template block<Dim,1>(0,Dim); }
 
   template<typename OtherDerived>
-  struct TransformProductReturnType
-  {
-    typedef typename ei_transform_product_impl<OtherDerived>::ResultType Type;
-  };
-
-  template<typename OtherDerived>
-  const typename TransformProductReturnType<OtherDerived>::Type
+  const typename ei_transform_product_impl<OtherDerived>::ResultType
   operator * (const MatrixBase<OtherDerived> &other) const;
 
   /** Contatenates two transformations */
@@ -192,7 +227,7 @@ QMatrix Transform<Scalar,Dim>::toQMatrix(void) const
 
 template<typename Scalar, int Dim>
 template<typename OtherDerived>
-const typename Transform<Scalar,Dim>::template TransformProductReturnType<OtherDerived>::Type
+const typename Transform<Scalar,Dim>::template ei_transform_product_impl<OtherDerived>::ResultType
 Transform<Scalar,Dim>::operator*(const MatrixBase<OtherDerived> &other) const
 {
   return ei_transform_product_impl<OtherDerived>::run(*this,other.derived());
@@ -373,44 +408,4 @@ Transform<Scalar,Dim>::fromPositionOrientationScale(const MatrixBase<PositionDer
   return *this;
 }
 
-//----------
-
-template<typename Scalar, int Dim>
-template<typename Other>
-struct Transform<Scalar,Dim>::ei_transform_product_impl<Other,Dim+1,Dim+1>
-{
-  typedef typename Transform<Scalar,Dim>::MatrixType MatrixType;
-  typedef typename ProductReturnType<MatrixType,Other>::Type ResultType;
-  static ResultType run(const Transform<Scalar,Dim>& tr, const Other& other)
-  { return tr.matrix() * other; }
-};
-
-template<typename Scalar, int Dim>
-template<typename Other>
-struct Transform<Scalar,Dim>::ei_transform_product_impl<Other,Dim+1,1>
-{
-  typedef typename Transform<Scalar,Dim>::MatrixType MatrixType;
-  typedef typename ProductReturnType<MatrixType,Other>::Type ResultType;
-  static ResultType run(const Transform<Scalar,Dim>& tr, const Other& other)
-  { return tr.matrix() * other; }
-};
-
-template<typename Scalar, int Dim>
-template<typename Other>
-struct Transform<Scalar,Dim>::ei_transform_product_impl<Other,Dim,1>
-{
-  typedef typename Transform<Scalar,Dim>::AffineMatrixRef MatrixType;
-  typedef const CwiseUnaryOp<
-      ei_scalar_multiple_op<Scalar>,
-      NestByValue<CwiseBinaryOp<
-        ei_scalar_sum_op<Scalar>,
-        NestByValue<typename ProductReturnType<NestByValue<MatrixType>,Other>::Type >,
-        NestByValue<typename Transform<Scalar,Dim>::VectorRef> > >
-      > ResultType;
-  // FIXME shall we offer an optimized version when the last row is know to be 0,0...,0,1 ?
-  static ResultType run(const Transform<Scalar,Dim>& tr, const Other& other)
-  { return ((tr.affine().nestByValue() * other).nestByValue() + tr.translation().nestByValue()).nestByValue()
-          * (Scalar(1) / ( (tr.matrix().template block<1,Dim>(Dim,0) * other).coeff(0) + tr.matrix().coeff(Dim,Dim))); }
-};
-
 #endif // EIGEN_TRANSFORM_H