From a451835bce179a999cddedc3c9dab49e421968eb Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Fri, 25 Apr 2008 15:46:18 +0000
Subject: [PATCH] Make the explicit vectorization much more flexible:  -
 support dynamic sizes  - support arbitrary matrix size when the matrix can be
 seen as a 1D array    (except for fixed size matrices where the size in Bytes
 must be a multiple of 16,   this is to allow compact storage of a vector of
 matrices) Note that the explicit vectorization is still experimental and far
 from being completely tested.

---
 Eigen/Core                      |   2 +-
 Eigen/src/Core/Assign.h         |  62 ++++++++++++++++--
 Eigen/src/Core/CwiseNullaryOp.h |  95 +++++++++++++++++++++++----
 Eigen/src/Core/Lazy.h           |   5 ++
 Eigen/src/Core/Matrix.h         |   2 +-
 Eigen/src/Core/MatrixBase.h     |  14 ++--
 Eigen/src/Core/MatrixStorage.h  |  40 +++++++++---
 Eigen/src/Core/Product.h        | 110 +++++++++++++++++++-------------
 Eigen/src/Core/Temporary.h      |   5 ++
 Eigen/src/Core/util/Meta.h      |  29 +++++----
 10 files changed, 264 insertions(+), 100 deletions(-)
diff --git a/Eigen/Core b/Eigen/Core
index 950328aaa..3007899d1 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -2,7 +2,7 @@
 #define EIGEN_CORE_H
 
 #ifndef EIGEN_DONT_VECTORIZE
-#ifdef __SSE2__
+#if ((defined __SSE2__) && ( (!defined __GNUC__) || (__GNUC__>4) || (__GNUC__==4 && __GNUC_MINOR__>=2) )) // SSE2 and, for GCC, version >= 4.2 (majors above 4 accepted regardless of minor)
 #define EIGEN_VECTORIZE
 #define EIGEN_VECTORIZE_SSE
 #include <emmintrin.h>
diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h
index d0f126689..c9e2b6b4b 100644
--- a/Eigen/src/Core/Assign.h
+++ b/Eigen/src/Core/Assign.h
@@ -99,7 +99,11 @@ struct ei_matrix_assignment_packet_unroller<Derived1, Derived2, Dynamic>
 
 template <typename Derived, typename OtherDerived,
 bool Vectorize = (Derived::Flags & OtherDerived::Flags & VectorizableBit)
-              && ((Derived::Flags&RowMajorBit)==(OtherDerived::Flags&RowMajorBit))>
+              && ((Derived::Flags&RowMajorBit)==(OtherDerived::Flags&RowMajorBit))
+              && (  (Derived::Flags & OtherDerived::Flags & Like1DArrayBit)
+                  ||((Derived::Flags&RowMajorBit)
+                    ? Derived::ColsAtCompileTime!=Dynamic && (Derived::ColsAtCompileTime%ei_packet_traits<typename Derived::Scalar>::size==0)
+                    : Derived::RowsAtCompileTime!=Dynamic && (Derived::RowsAtCompileTime%ei_packet_traits<typename Derived::Scalar>::size==0)) )>
 struct ei_assignment_impl;
 
 template<typename Derived>
@@ -107,6 +111,7 @@ template<typename OtherDerived>
 Derived& MatrixBase<Derived>
   ::lazyAssign(const MatrixBase<OtherDerived>& other)
 {
+//   std::cout << "lazyAssign = " << Derived::Flags << " " << OtherDerived::Flags << "\n";
   ei_assignment_impl<Derived,OtherDerived>::execute(derived(),other.derived());
   return derived();
 }
@@ -178,6 +183,7 @@ struct ei_assignment_impl<Derived, OtherDerived, true>
     ei_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
     if(unroll)
     {
+//       std::cout << "vectorized unrolled\n";
       ei_matrix_assignment_packet_unroller
         <Derived, OtherDerived,
           unroll && int(Derived::SizeAtCompileTime)>=ei_packet_traits<typename Derived::Scalar>::size
@@ -188,15 +194,61 @@ struct ei_assignment_impl<Derived, OtherDerived, true>
     {
       if(OtherDerived::Flags&RowMajorBit)
       {
-        for(int i = 0; i < dst.rows(); i++)
-          for(int j = 0; j < dst.cols(); j+=ei_packet_traits<typename Derived::Scalar>::size)
+        if ( (Derived::Flags & OtherDerived::Flags & Like1DArrayBit)
+          &&  (Derived::ColsAtCompileTime==Dynamic
+            || Derived::ColsAtCompileTime%ei_packet_traits<typename Derived::Scalar>::size!=0))
+        {
+//           std::cout << "vectorized linear row major\n";
+          const int size = dst.rows() * dst.cols(); // total number of coefficients
+          const int alignedSize = (size/ei_packet_traits<typename Derived::Scalar>::size)*ei_packet_traits<typename Derived::Scalar>::size; // size rounded down to a whole number of packets
+          int index = 0; // linear position, shared by the packet loop and the scalar tail
+          for ( ; index<alignedSize ; index+=ei_packet_traits<typename Derived::Scalar>::size)
+          {
+            // FIXME the following is not really efficient
+            int i = index/dst.cols(); // row-major: linear index = row*cols + col
+            int j = index%dst.cols();
             dst.writePacketCoeff(i, j, src.packetCoeff(i, j));
+          }
+          for ( ; index<size ; index++) // scalar tail; index == alignedSize on entry, never runs (nor divides) when size is 0
+            dst.coeffRef(index/dst.cols(), index%dst.cols())
+              = src.coeff(index/dst.cols(), index%dst.cols());
+        }
+        else
+        {
+//           std::cout << "vectorized normal row major\n";
+          for(int i = 0; i < dst.rows(); i++)
+            for(int j = 0; j < dst.cols(); j+=ei_packet_traits<typename Derived::Scalar>::size)
+              dst.writePacketCoeff(i, j, src.packetCoeff(i, j));
+        }
       }
       else
       {
-        for(int j = 0; j < dst.cols(); j++)
-          for(int i = 0; i < dst.rows(); i+=ei_packet_traits<typename Derived::Scalar>::size)
+        if ((Derived::Flags & OtherDerived::Flags & Like1DArrayBit)
+          && ( Derived::RowsAtCompileTime==Dynamic
+            || Derived::RowsAtCompileTime%ei_packet_traits<typename Derived::Scalar>::size!=0))
+        {
+//           std::cout << "vectorized linear col major\n";
+          const int size = dst.rows() * dst.cols(); // total number of coefficients
+          const int alignedSize = (size/ei_packet_traits<typename Derived::Scalar>::size)*ei_packet_traits<typename Derived::Scalar>::size; // size rounded down to a whole number of packets
+          int index = 0; // linear position, shared by the packet loop and the scalar tail
+          for ( ; index<alignedSize ; index+=ei_packet_traits<typename Derived::Scalar>::size)
+          {
+            // FIXME the following is not really efficient
+            int i = index%dst.rows(); // column-major: linear index = col*rows + row
+            int j = index/dst.rows();
             dst.writePacketCoeff(i, j, src.packetCoeff(i, j));
+          }
+          for ( ; index<size ; index++) // scalar tail; index == alignedSize on entry, never runs (nor divides) when size is 0
+            dst.coeffRef(index%dst.rows(), index/dst.rows())
+              = src.coeff(index%dst.rows(), index/dst.rows());
+        }
+        else
+        {
+//           std::cout << "vectorized normal col major\n";
+          for(int j = 0; j < dst.cols(); j++)
+            for(int i = 0; i < dst.rows(); i+=ei_packet_traits<typename Derived::Scalar>::size)
+              dst.writePacketCoeff(i, j, src.packetCoeff(i, j));
+        }
       }
     }
   }
diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h
index d3bce41d8..4f09bd8a9 100644
--- a/Eigen/src/Core/CwiseNullaryOp.h
+++ b/Eigen/src/Core/CwiseNullaryOp.h
@@ -31,8 +31,8 @@
   *
   * \param NullaryOp template functor implementing the operator
   *
-  * This class represents an expression of a generic zeroary operator.
-  * It is the return type of the ones(), zero(), constant() and random() functions,
+  * This class represents an expression of a generic nullary operator.
+  * It is the return type of the ones(), zero(), constant(), identity() and random() functions,
   * and most of the time this is the only way it is used.
   *
   * However, if you want to write a function returning such an expression, you
@@ -94,12 +94,18 @@ class CwiseNullaryOp : ei_no_assignment_operator,
 };
 
 
-/* \returns an expression of a custom coefficient-wise operator \a func of *this and \a other
+/** \returns an expression of a matrix defined by a custom functor \a func
   *
-  * The template parameter \a CustomNullaryOp is the type of the functor
-  * of the custom operator (see class CwiseNullaryOp for an example)
+  * The parameters \a rows and \a cols are the number of rows and of columns of
+  * the returned matrix. Must be compatible with this MatrixBase type.
   *
-  * \sa class CwiseNullaryOp, MatrixBase::operator+, MatrixBase::operator-, MatrixBase::cwiseProduct, MatrixBase::cwiseQuotient
+  * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
+  * it is redundant to pass \a rows and \a cols as arguments, so cwiseCreate(const CustomNullaryOp&)
+  * should be used instead.
+  *
+  * The template parameter \a CustomNullaryOp is the type of the functor.
+  *
+  * \sa class CwiseNullaryOp
   */
 template<typename Derived>
 template<typename CustomNullaryOp>
@@ -109,6 +115,21 @@ MatrixBase<Derived>::cwiseCreate(int rows, int cols, const CustomNullaryOp& func
   return CwiseNullaryOp<CustomNullaryOp, Derived>(rows, cols, func);
 }
 
+/** \returns an expression of a matrix defined by a custom functor \a func
+  *
+  * The parameter \a size is the size of the returned vector.
+  * Must be compatible with this MatrixBase type.
+  *
+  * \only_for_vectors
+  *
+  * This variant is meant to be used for dynamic-size vector types. For fixed-size types,
+  * it is redundant to pass \a size as argument, so cwiseCreate(const CustomNullaryOp&)
+  * should be used instead.
+  *
+  * The template parameter \a CustomNullaryOp is the type of the functor.
+  *
+  * \sa class CwiseNullaryOp
+  */
 template<typename Derived>
 template<typename CustomNullaryOp>
 const CwiseNullaryOp<CustomNullaryOp, Derived>
@@ -119,6 +140,15 @@ MatrixBase<Derived>::cwiseCreate(int size, const CustomNullaryOp& func)
   else return CwiseNullaryOp<CustomNullaryOp, Derived>(size, 1, func);
 }
 
+/** \returns an expression of a matrix defined by a custom functor \a func
+  *
+  * This variant is only for fixed-size MatrixBase types. For dynamic-size types, you
+  * need to use the variants taking size arguments.
+  *
+  * The template parameter \a CustomNullaryOp is the type of the functor.
+  *
+  * \sa class CwiseNullaryOp
+  */
 template<typename Derived>
 template<typename CustomNullaryOp>
 const CwiseNullaryOp<CustomNullaryOp, Derived>
@@ -127,7 +157,16 @@ MatrixBase<Derived>::cwiseCreate(const CustomNullaryOp& func)
   return CwiseNullaryOp<CustomNullaryOp, Derived>(rows(), cols(), func);
 }
 
-/* \returns an expression of the coefficient-wise \< operator of *this and \a other
+/** \returns an expression of a constant matrix of value \a value
+  *
+  * The parameters \a rows and \a cols are the number of rows and of columns of
+  * the returned matrix. Must be compatible with this MatrixBase type.
+  *
+  * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
+  * it is redundant to pass \a rows and \a cols as arguments, so constant(const Scalar&)
+  * should be used instead.
+  *
+  * The coefficients are produced by the functor ei_scalar_constant_op<Scalar>.
   *
   * \sa class CwiseNullaryOp
   */
@@ -138,6 +177,21 @@ MatrixBase<Derived>::constant(int rows, int cols, const Scalar& value)
   return cwiseCreate(rows, cols, ei_scalar_constant_op<Scalar>(value));
 }
 
+/** \returns an expression of a constant matrix of value \a value
+  *
+  * The parameter \a size is the size of the returned vector.
+  * Must be compatible with this MatrixBase type.
+  *
+  * \only_for_vectors
+  *
+  * This variant is meant to be used for dynamic-size vector types. For fixed-size types,
+  * it is redundant to pass \a size as argument, so constant(const Scalar&)
+  * should be used instead.
+  *
+  * The coefficients are produced by the functor ei_scalar_constant_op<Scalar>.
+  *
+  * \sa class CwiseNullaryOp
+  */
 template<typename Derived>
 const CwiseNullaryOp<ei_scalar_constant_op<typename ei_traits<Derived>::Scalar>, Derived>
 MatrixBase<Derived>::constant(int size, const Scalar& value)
@@ -145,6 +199,15 @@ MatrixBase<Derived>::constant(int size, const Scalar& value)
   return cwiseCreate(size, ei_scalar_constant_op<Scalar>(value));
 }
 
+/** \returns an expression of a constant matrix of value \a value
+  *
+  * This variant is only for fixed-size MatrixBase types. For dynamic-size types, you
+  * need to use the variants taking size arguments.
+  *
+  * The coefficients are produced by the functor ei_scalar_constant_op<Scalar>.
+  *
+  * \sa class CwiseNullaryOp
+  */
 template<typename Derived>
 const CwiseNullaryOp<ei_scalar_constant_op<typename ei_traits<Derived>::Scalar>, Derived>
 MatrixBase<Derived>::constant(const Scalar& value)
@@ -163,6 +226,10 @@ bool MatrixBase<Derived>::isEqualToConstant
   return true;
 }
 
+/** Sets all coefficients in this expression to \a value.
+  *
+  * \sa class CwiseNullaryOp, zero(), ones()
+  */
 template<typename Derived>
 Derived& MatrixBase<Derived>::setConstant(const Scalar& value)
 {
@@ -238,7 +305,7 @@ MatrixBase<Derived>::zero()
   * Example: \include MatrixBase_isZero.cpp
   * Output: \verbinclude MatrixBase_isZero.out
   *
-  * \sa class Zero, zero()
+  * \sa class CwiseNullaryOp, zero()
   */
 template<typename Derived>
 bool MatrixBase<Derived>::isZero
@@ -256,7 +323,7 @@ bool MatrixBase<Derived>::isZero
   * Example: \include MatrixBase_setZero.cpp
   * Output: \verbinclude MatrixBase_setZero.out
   *
-  * \sa class Zero, zero()
+  * \sa class CwiseNullaryOp, zero()
   */
 template<typename Derived>
 Derived& MatrixBase<Derived>::setZero()
@@ -333,7 +400,7 @@ MatrixBase<Derived>::ones()
   * Example: \include MatrixBase_isOnes.cpp
   * Output: \verbinclude MatrixBase_isOnes.out
   *
-  * \sa class Ones, ones()
+  * \sa class CwiseNullaryOp, ones()
   */
 template<typename Derived>
 bool MatrixBase<Derived>::isOnes
@@ -347,7 +414,7 @@ bool MatrixBase<Derived>::isOnes
   * Example: \include MatrixBase_setOnes.cpp
   * Output: \verbinclude MatrixBase_setOnes.out
   *
-  * \sa class Ones, ones()
+  * \sa class CwiseNullaryOp, ones()
   */
 template<typename Derived>
 Derived& MatrixBase<Derived>::setOnes()
@@ -424,7 +491,7 @@ MatrixBase<Derived>::random()
   * Example: \include MatrixBase_setRandom.cpp
   * Output: \verbinclude MatrixBase_setRandom.out
   *
-  * \sa class Random, ei_random()
+  * \sa class CwiseNullaryOp, ei_random()
   */
 template<typename Derived>
 Derived& MatrixBase<Derived>::setRandom()
@@ -479,7 +546,7 @@ MatrixBase<Derived>::identity()
   * Example: \include MatrixBase_isIdentity.cpp
   * Output: \verbinclude MatrixBase_isIdentity.out
   *
-  * \sa class Identity, identity(), identity(int,int), setIdentity()
+  * \sa class CwiseNullaryOp, identity(), identity(int,int), setIdentity()
   */
 template<typename Derived>
 bool MatrixBase<Derived>::isIdentity
@@ -509,7 +576,7 @@ bool MatrixBase<Derived>::isIdentity
   * Example: \include MatrixBase_setIdentity.cpp
   * Output: \verbinclude MatrixBase_setIdentity.out
   *
-  * \sa class Identity, identity(), identity(int,int), isIdentity()
+  * \sa class CwiseNullaryOp, identity(), identity(int,int), isIdentity()
   */
 template<typename Derived>
 Derived& MatrixBase<Derived>::setIdentity()
diff --git a/Eigen/src/Core/Lazy.h b/Eigen/src/Core/Lazy.h
index 0c65cdeba..3e25acb19 100644
--- a/Eigen/src/Core/Lazy.h
+++ b/Eigen/src/Core/Lazy.h
@@ -72,6 +72,11 @@ template<typename ExpressionType> class Lazy
       return m_expression.coeff(row, col);
     }
 
+    PacketScalar _packetCoeff(int row, int col) const // packet read at (row, col)
+    {
+      return m_expression.packetCoeff(row, col); // forwarded unchanged to the wrapped expression
+    }
+
   protected:
     const typename ExpressionType::Nested m_expression;
 };
diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h
index 92f726011..dd1235aa3 100644
--- a/Eigen/src/Core/Matrix.h
+++ b/Eigen/src/Core/Matrix.h
@@ -79,7 +79,7 @@ struct ei_traits<Matrix<_Scalar, _Rows, _Cols, _SuggestedFlags, _MaxRows, _MaxCo
     ColsAtCompileTime = _Cols,
     MaxRowsAtCompileTime = _MaxRows,
     MaxColsAtCompileTime = _MaxCols,
-    Flags = ei_corrected_matrix_flags<_Scalar, _Rows, _Cols, _SuggestedFlags>::ret,
+    Flags = ei_corrected_matrix_flags<_Scalar, ei_size_at_compile_time<_MaxRows,_MaxCols>::ret, _SuggestedFlags>::ret,
     CoeffReadCost = NumTraits<Scalar>::ReadCost
   };
 };
diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index 3247ec4bf..b6a161bdd 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -75,11 +75,8 @@ template<typename Derived> class MatrixBase
           * it is set to the \a Dynamic constant.
           * \sa MatrixBase::rows(), MatrixBase::cols(), RowsAtCompileTime, SizeAtCompileTime */
 
-      SizeAtCompileTime
-        = ei_traits<Derived>::RowsAtCompileTime == Dynamic
-        || ei_traits<Derived>::ColsAtCompileTime == Dynamic
-        ? Dynamic
-        : ei_traits<Derived>::RowsAtCompileTime * ei_traits<Derived>::ColsAtCompileTime,
+      SizeAtCompileTime = ei_size_at_compile_time<ei_traits<Derived>::RowsAtCompileTime,
+                                                  ei_traits<Derived>::ColsAtCompileTime>::ret,
         /**< This is equal to the number of coefficients, i.e. the number of
           * rows times the number of columns, or to \a Dynamic if this is not
           * known at compile-time. \sa RowsAtCompileTime, ColsAtCompileTime */
@@ -106,11 +103,8 @@ template<typename Derived> class MatrixBase
           * \sa ColsAtCompileTime, MaxRowsAtCompileTime, MaxSizeAtCompileTime
           */
 
-      MaxSizeAtCompileTime
-        = ei_traits<Derived>::MaxRowsAtCompileTime == Dynamic
-        || ei_traits<Derived>::MaxColsAtCompileTime == Dynamic
-        ? Dynamic
-        : ei_traits<Derived>::MaxRowsAtCompileTime * ei_traits<Derived>::MaxColsAtCompileTime,
+      MaxSizeAtCompileTime = ei_size_at_compile_time<ei_traits<Derived>::MaxRowsAtCompileTime,
+                                                     ei_traits<Derived>::MaxColsAtCompileTime>::ret,
         /**< This value is equal to the maximum possible number of coefficients that this expression
           * might have. If this expression might have an arbitrarily high number of coefficients,
           * this value is set to \a Dynamic.
diff --git a/Eigen/src/Core/MatrixStorage.h b/Eigen/src/Core/MatrixStorage.h
index cca4414d3..c8ee7a62c 100644
--- a/Eigen/src/Core/MatrixStorage.h
+++ b/Eigen/src/Core/MatrixStorage.h
@@ -49,6 +49,28 @@ template <typename T, int Size> struct ei_aligned_array<T,Size,false>
   T array[Size];
 };
 
+template<typename T> // T: element type of the buffer
+T* ei_aligned_malloc(size_t size) // size is a number of T elements (scaled by sizeof(T) below), not a byte count
+{
+  #ifdef EIGEN_VECTORIZE
+  if (ei_packet_traits<T>::size>1) // T has a packet type wider than one scalar
+    return static_cast<T*>(_mm_malloc(sizeof(T)*size, 16)); // 16-byte aligned raw block; no T constructor is run
+  else
+  #endif
+    return new T[size]; // plain array allocation when T is not vectorized (or vectorization is disabled)
+}
+
+template<typename T> // T: element type of the buffer
+void ei_aligned_free(T* ptr) // releases a buffer obtained from ei_aligned_malloc<T>
+{
+  #ifdef EIGEN_VECTORIZE
+  if (ei_packet_traits<T>::size>1) // same test as in ei_aligned_malloc, so the deallocator matches the allocator
+    _mm_free(ptr); // counterpart of _mm_malloc
+  else
+  #endif
+    delete[] ptr; // counterpart of new T[size]
+}
+
 // purely fixed-size matrix
 template<typename T, int Size, int _Rows, int _Cols> class ei_matrix_storage
 {
@@ -127,7 +149,7 @@ template<typename T> class ei_matrix_storage<T, Dynamic, Dynamic, Dynamic>
     int m_cols;
   public:
     ei_matrix_storage(int size, int rows, int cols)
-      : m_data(new T[size]), m_rows(rows), m_cols(cols) {}
-    ~ei_matrix_storage() { delete[] m_data; }
+      : m_data(ei_aligned_malloc<T>(size)), m_rows(rows), m_cols(cols) {}
+    ~ei_matrix_storage() { ei_aligned_free(m_data); } // must pair with ei_aligned_malloc, which may use _mm_malloc
     int rows(void) const {return m_rows;}
     int cols(void) const {return m_cols;}
@@ -135,8 +157,8 @@ template<typename T> class ei_matrix_storage<T, Dynamic, Dynamic, Dynamic>
     {
       if(size != m_rows*m_cols)
       {
-        delete[] m_data;
-        m_data = new T[size];
+        ei_aligned_free(m_data);
+        m_data = ei_aligned_malloc<T>(size);
       }
       m_rows = rows;
       m_cols = cols;
@@ -151,7 +173,7 @@ template<typename T, int _Rows> class ei_matrix_storage<T, Dynamic, _Rows, Dynam
     T *m_data;
     int m_cols;
   public:
-    ei_matrix_storage(int size, int, int cols) : m_data(new T[size]), m_cols(cols) {}
-    ~ei_matrix_storage() { delete[] m_data; }
+    ei_matrix_storage(int size, int, int cols) : m_data(ei_aligned_malloc<T>(size)), m_cols(cols) {}
+    ~ei_matrix_storage() { ei_aligned_free(m_data); } // must pair with ei_aligned_malloc, which may use _mm_malloc
     static int rows(void) {return _Rows;}
     int cols(void) const {return m_cols;}
@@ -159,8 +181,8 @@ template<typename T, int _Rows> class ei_matrix_storage<T, Dynamic, _Rows, Dynam
     {
       if(size != _Rows*m_cols)
       {
-        delete[] m_data;
-        m_data = new T[size];
+        ei_aligned_free(m_data);
+        m_data = ei_aligned_malloc<T>(size);
       }
       m_cols = cols;
     }
@@ -174,7 +196,7 @@ template<typename T, int _Cols> class ei_matrix_storage<T, Dynamic, Dynamic, _Co
     T *m_data;
     int m_rows;
   public:
-    ei_matrix_storage(int size, int rows, int) : m_data(new T[size]), m_rows(rows) {}
-    ~ei_matrix_storage() { delete[] m_data; }
+    ei_matrix_storage(int size, int rows, int) : m_data(ei_aligned_malloc<T>(size)), m_rows(rows) {}
+    ~ei_matrix_storage() { ei_aligned_free(m_data); } // must pair with ei_aligned_malloc, which may use _mm_malloc
     int rows(void) const {return m_rows;}
     static int cols(void) {return _Cols;}
@@ -182,8 +204,8 @@ template<typename T, int _Cols> class ei_matrix_storage<T, Dynamic, Dynamic, _Co
     {
       if(size != m_rows*_Cols)
       {
-        delete[] m_data;
-        m_data = new T[size];
+        ei_aligned_free(m_data);
+        m_data = ei_aligned_malloc<T>(size);
       }
       m_rows = rows;
     }
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index 590e03599..895e19e0e 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -135,7 +135,7 @@ struct ei_traits<Product<Lhs, Rhs, EvalMode> >
           | EvalBeforeAssigningBit
           | (ei_product_eval_mode<Lhs, Rhs>::value == (int)CacheOptimalProduct ? EvalBeforeNestingBit : 0))
           & (
-              ~(RowMajorBit | VectorizableBit)
+              ~(RowMajorBit | VectorizableBit | Like1DArrayBit)
               | (
                   (
                     !(Lhs::Flags & RowMajorBit) && (Lhs::Flags & VectorizableBit)
@@ -178,7 +178,11 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm
 
     /** \internal */
     template<typename DestDerived>
-    void _cacheOptimalEval(DestDerived& res) const;
+    void _cacheOptimalEval(DestDerived& res, ei_meta_false) const;
+    #ifdef EIGEN_VECTORIZE
+    template<typename DestDerived>
+    void _cacheOptimalEval(DestDerived& res, ei_meta_true) const;
+    #endif
 
   private:
 
@@ -267,59 +271,29 @@ MatrixBase<Derived>::operator*=(const MatrixBase<OtherDerived> &other)
 }
 
 template<typename Derived>
-template<typename Derived1, typename Derived2>
-Derived& MatrixBase<Derived>::lazyAssign(const Product<Derived1,Derived2,CacheOptimalProduct>& product)
+template<typename Lhs, typename Rhs>
+Derived& MatrixBase<Derived>::lazyAssign(const Product<Lhs,Rhs,CacheOptimalProduct>& product)
 {
-  product._cacheOptimalEval(*this);
+  product._cacheOptimalEval(*this, // second argument is a tag object selecting the overload
+    #ifdef EIGEN_VECTORIZE
+    typename ei_meta_if<(Flags & VectorizableBit)
+      && (!(Lhs::Flags & RowMajorBit)
+      && (Lhs::RowsAtCompileTime!=Dynamic)
+      && (Lhs::RowsAtCompileTime%ei_packet_traits<Scalar>::size==0) ),
+      ei_meta_true,ei_meta_false>::ret() // ei_meta_true only for a column-major lhs with a fixed row count that is a multiple of the packet size
+    #else
+    ei_meta_false() // a tag *object* is required here; a bare type name is not an expression
+    #endif
+    );
   return derived();
 }
 
 template<typename Lhs, typename Rhs, int EvalMode>
 template<typename DestDerived>
-void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res) const
+void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res, ei_meta_false) const
 {
   res.setZero();
   const int cols4 = m_lhs.cols() & 0xfffffffC;
-  #ifdef EIGEN_VECTORIZE
-  if( (Flags & VectorizableBit) && (!(Lhs::Flags & RowMajorBit)) )
-  {
-    for(int k=0; k<this->cols(); k++)
-    {
-      int j=0;
-      for(; j<cols4; j+=4)
-      {
-        const typename ei_packet_traits<Scalar>::type tmp0 = ei_pset1(m_rhs.coeff(j+0,k));
-        const typename ei_packet_traits<Scalar>::type tmp1 = ei_pset1(m_rhs.coeff(j+1,k));
-        const typename ei_packet_traits<Scalar>::type tmp2 = ei_pset1(m_rhs.coeff(j+2,k));
-        const typename ei_packet_traits<Scalar>::type tmp3 = ei_pset1(m_rhs.coeff(j+3,k));
-        for (int i=0; i<this->rows(); i+=ei_packet_traits<Scalar>::size)
-        {
-          res.writePacketCoeff(i,k,\
-            ei_padd(
-              res.packetCoeff(i,k),
-              ei_padd(
-                ei_padd(
-                  ei_pmul(tmp0, m_lhs.packetCoeff(i,j)),
-                  ei_pmul(tmp1, m_lhs.packetCoeff(i,j+1))),
-                ei_padd(
-                  ei_pmul(tmp2, m_lhs.packetCoeff(i,j+2)),
-                  ei_pmul(tmp3, m_lhs.packetCoeff(i,j+3))
-                )
-              )
-            )
-          );
-        }
-      }
-      for(; j<m_lhs.cols(); ++j)
-      {
-        const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_rhs.coeff(j,k));
-        for (int i=0; i<this->rows(); ++i)
-          res.writePacketCoeff(i,k,ei_pmul(tmp, m_lhs.packetCoeff(i,j)));
-      }
-    }
-  }
-  else
-  #endif // EIGEN_VECTORIZE
   {
     for(int k=0; k<this->cols(); ++k)
     {
@@ -344,4 +318,48 @@ void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res) const
   }
 }
 
+#ifdef EIGEN_VECTORIZE
+template<typename Lhs, typename Rhs, int EvalMode>
+template<typename DestDerived>
+void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res, ei_meta_true) const // packet-based overload, selected by the ei_meta_true tag
+{
+  res.setZero(); // every update below accumulates into res
+  const int cols4 = m_lhs.cols() & 0xfffffffC; // m_lhs.cols() rounded down to a multiple of 4
+  for(int k=0; k<this->cols(); k++)
+  {
+    int j=0;
+    for(; j<cols4; j+=4) // four lhs columns per pass
+    {
+      const typename ei_packet_traits<Scalar>::type tmp0 = ei_pset1(m_rhs.coeff(j+0,k));
+      const typename ei_packet_traits<Scalar>::type tmp1 = ei_pset1(m_rhs.coeff(j+1,k));
+      const typename ei_packet_traits<Scalar>::type tmp2 = ei_pset1(m_rhs.coeff(j+2,k));
+      const typename ei_packet_traits<Scalar>::type tmp3 = ei_pset1(m_rhs.coeff(j+3,k));
+      for (int i=0; i<this->rows(); i+=ei_packet_traits<Scalar>::size)
+      {
+        res.writePacketCoeff(i,k,\
+          ei_padd(
+            res.packetCoeff(i,k),
+            ei_padd(
+              ei_padd(
+                ei_pmul(tmp0, m_lhs.packetCoeff(i,j)),
+                ei_pmul(tmp1, m_lhs.packetCoeff(i,j+1))),
+              ei_padd(
+                ei_pmul(tmp2, m_lhs.packetCoeff(i,j+2)),
+                ei_pmul(tmp3, m_lhs.packetCoeff(i,j+3))
+              )
+            )
+          )
+        );
+      }
+    }
+    for(; j<m_lhs.cols(); ++j) // leftover lhs columns (fewer than 4)
+    {
+      const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_rhs.coeff(j,k));
+      for (int i=0; i<this->rows(); i+=ei_packet_traits<Scalar>::size) // one whole packet per step, as in the main loop
+        res.writePacketCoeff(i,k,ei_padd(res.packetCoeff(i,k), ei_pmul(tmp, m_lhs.packetCoeff(i,j)))); // accumulate; do not overwrite the partial sum
+    }
+  }
+}
+#endif // EIGEN_VECTORIZE
+
 #endif // EIGEN_PRODUCT_H
diff --git a/Eigen/src/Core/Temporary.h b/Eigen/src/Core/Temporary.h
index 981a0c218..9157b10e4 100644
--- a/Eigen/src/Core/Temporary.h
+++ b/Eigen/src/Core/Temporary.h
@@ -71,6 +71,11 @@ template<typename ExpressionType> class Temporary
       return m_expression.coeff(row, col);
     }
 
+    PacketScalar _packetCoeff(int row, int col) const // packet read at (row, col)
+    {
+      return m_expression.packetCoeff(row, col); // forwarded unchanged to the wrapped expression
+    }
+
   protected:
     const ExpressionType m_expression;
 };
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index 3c8f9ad9a..19768c1ca 100644
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -70,6 +70,9 @@ struct ei_meta_if <false, Then, Else> { typedef Else ret; };
 template<typename T, typename U> struct ei_is_same_type { enum { ret = 0 }; };
 template<typename T> struct ei_is_same_type<T,T> { enum { ret = 1 }; };
 
+struct ei_meta_true {};  // empty tag type; passed by value to pick an overload at compile time
+struct ei_meta_false {}; // empty tag type; the "false" counterpart of ei_meta_true
+
 
 /** \internal
   * Convenient struct to get the result type of a unary or binary functor.
@@ -145,19 +148,12 @@ template<typename T> struct ei_packet_traits
   enum {size=1};
 };
 
-template<typename Scalar, int Rows, int Cols, unsigned int SuggestedFlags>
+template<typename Scalar, int Size, unsigned int SuggestedFlags>
 class ei_corrected_matrix_flags
 {
     enum { is_vectorizable
             = ei_packet_traits<Scalar>::size > 1
-              && Rows!=Dynamic
-              && Cols!=Dynamic
-              &&
-              (
-                SuggestedFlags&RowMajorBit
-                  ? Cols%ei_packet_traits<Scalar>::size==0
-                  : Rows%ei_packet_traits<Scalar>::size==0
-              ),
+              && (Size%ei_packet_traits<Scalar>::size==0),
           _flags1 = (SuggestedFlags & ~(EvalBeforeNestingBit | EvalBeforeAssigningBit)) | Like1DArrayBit
     };
 
@@ -168,19 +164,24 @@ class ei_corrected_matrix_flags
     };
 };
 
+template<int _Rows, int _Cols> struct ei_size_at_compile_time // compile-time product of the two dimensions
+{
+  enum { ret = (_Rows==Dynamic || _Cols==Dynamic) ? Dynamic : _Rows * _Cols }; // Dynamic as soon as either dimension is Dynamic
+};
+
 template<typename T> class ei_eval
 {
     typedef typename ei_traits<T>::Scalar _Scalar;
-    enum { _Rows = ei_traits<T>::RowsAtCompileTime,
-          _Cols = ei_traits<T>::ColsAtCompileTime,
+    enum {_MaxRows = ei_traits<T>::MaxRowsAtCompileTime,
+          _MaxCols = ei_traits<T>::MaxColsAtCompileTime,
           _Flags = ei_traits<T>::Flags
     };
 
   public:
     typedef Matrix<_Scalar,
-                  _Rows,
-                  _Cols,
-                  ei_corrected_matrix_flags<_Scalar, _Rows, _Cols, _Flags>::ret,
+                  ei_traits<T>::RowsAtCompileTime,
+                  ei_traits<T>::ColsAtCompileTime,
+                  ei_corrected_matrix_flags<_Scalar, ei_size_at_compile_time<_MaxRows,_MaxCols>::ret, _Flags>::ret,
                   ei_traits<T>::MaxRowsAtCompileTime,
                   ei_traits<T>::MaxColsAtCompileTime> type;
 };