* add bench/benchVecAdd.cpp by Gael, fix crash (ei_pload on non-aligned)

* introduce packet(int), make use of it in linear vectorized paths
  --> completely fixes the slowdown noticed in benchVecAdd.
* generalize coeff(int) to linear-access xprs
* clarify the access flag bits
* rework api dox in Coeffs.h and util/Constants.h
* improve certain expressions' flags, allowing more vectorization
* fix bug in Block: start(int) and end(int) returned dyn*dyn size
* fix bug in Block: just because the Eval type has packet access
  doesn't imply the block xpr should have it too.
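
A minimal standalone sketch of the pattern the new linear vectorized path follows (plain SSE intrinsics on an assumed x86 target, not the actual Eigen kernel; 16-byte-aligned pointers are assumed): the bulk of the coefficients is handled one packet at a time through a single flat index, and the tail falls back to scalar index-based access.

#include <xmmintrin.h>

// Sketch: analogous to dst.writePacket<Aligned>(index, src.packet<Aligned>(index))
// for the aligned part, and dst.coeffRef(index) = src.coeff(index) for the rest.
// Both pointers are assumed 16-byte aligned.
void copy_linear(float* dst, const float* src, int size)
{
    const int packetSize  = 4;  // floats per SSE packet
    const int alignedSize = (size / packetSize) * packetSize;

    for (int index = 0; index < alignedSize; index += packetSize)
        _mm_store_ps(dst + index, _mm_load_ps(src + index));

    for (int index = alignedSize; index < size; ++index)
        dst[index] = src[index];
}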
Benoit Jacob 2008-06-26 16:06:41 +00:00
parent 5b0da4b778
commit 25ba9f377c
23 changed files with 558 additions and 264 deletions


@ -301,45 +301,14 @@ struct ei_assign_impl<Derived1, Derived2, LinearVectorization, NoUnrolling>
const int size = dst.size();
const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
const int alignedSize = (size/packetSize)*packetSize;
const bool rowMajor = Derived1::Flags&RowMajorBit;
const int innerSize = rowMajor ? dst.cols() : dst.rows();
const int outerSize = rowMajor ? dst.rows() : dst.cols();
int index = 0;
// do the vectorizable part of the assignment
int row = 0;
int col = 0;
while (index<alignedSize)
for(int index = 0; index < alignedSize; index += packetSize)
{
int start = rowMajor ? col : row;
int end = std::min(innerSize, start + alignedSize-index);
for ( ; (rowMajor ? col : row)<end; (rowMajor ? col : row)+=packetSize)
dst.template writePacket<Aligned>(row, col, src.template packet<Aligned>(row, col));
index += (rowMajor ? col : row) - start;
row = rowMajor ? index/innerSize : index%innerSize;
col = rowMajor ? index%innerSize : index/innerSize;
dst.template writePacket<Aligned>(index, src.template packet<Aligned>(index));
}
// now we must do the rest without vectorization.
if(alignedSize == size) return;
const int k = alignedSize/innerSize;
// do the remainder of the current row or col
for(int i = alignedSize%innerSize; i < innerSize; i++)
{
const int row = rowMajor ? k : i;
const int col = rowMajor ? i : k;
dst.coeffRef(row, col) = src.coeff(row, col);
}
// do the remaining rows or cols
for(int j = k+1; j < outerSize; j++)
for(int i = 0; i < innerSize; i++)
{
const int row = rowMajor ? i : j;
const int col = rowMajor ? j : i;
dst.coeffRef(row, col) = src.coeff(row, col);
}
for(int index = alignedSize; index < size; index++)
dst.coeffRef(index) = src.coeff(index);
}
};
@ -351,23 +320,9 @@ struct ei_assign_impl<Derived1, Derived2, LinearVectorization, CompleteUnrolling
const int size = Derived1::SizeAtCompileTime;
const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
const int alignedSize = (size/packetSize)*packetSize;
const bool rowMajor = int(Derived1::Flags)&RowMajorBit;
const int innerSize = rowMajor ? int(Derived1::ColsAtCompileTime) : int(Derived1::RowsAtCompileTime);
const int outerSize = rowMajor ? int(Derived1::RowsAtCompileTime) : int(Derived1::ColsAtCompileTime);
// do the vectorizable part of the assignment
ei_assign_innervec_CompleteUnrolling<Derived1, Derived2, 0, alignedSize>::run(dst, src);
// now we must do the rest without vectorization.
const int k = alignedSize/innerSize;
const int i = alignedSize%innerSize;
// do the remainder of the current row or col
ei_assign_novec_InnerUnrolling<Derived1, Derived2, i, k<outerSize ? innerSize : 0>::run(dst, src, k);
// do the remaining rows or cols
for(int j = k+1; j < outerSize; j++)
ei_assign_novec_InnerUnrolling<Derived1, Derived2, 0, innerSize>::run(dst, src, j);
ei_assign_novec_CompleteUnrolling<Derived1, Derived2, alignedSize, size>::run(dst, src);
}
};
@ -432,8 +387,8 @@ template<typename Derived, typename OtherDerived,
struct ei_assign_selector;
template<typename Derived, typename OtherDerived>
struct ei_assign_selector<Derived,OtherDerived,true,true> {
static Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.transpose().eval()); }
struct ei_assign_selector<Derived,OtherDerived,false,false> {
static Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.derived()); }
};
template<typename Derived, typename OtherDerived>
struct ei_assign_selector<Derived,OtherDerived,true,false> {
@ -444,8 +399,8 @@ struct ei_assign_selector<Derived,OtherDerived,false,true> {
static Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.transpose()); }
};
template<typename Derived, typename OtherDerived>
struct ei_assign_selector<Derived,OtherDerived,false,false> {
static Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.derived()); }
struct ei_assign_selector<Derived,OtherDerived,true,true> {
static Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.transpose().eval()); }
};
template<typename Derived>


@ -71,13 +71,12 @@ struct ei_traits<Block<MatrixType, BlockRows, BlockCols> >
|| (ColsAtCompileTime != Dynamic && MatrixType::ColsAtCompileTime == Dynamic))
? ~LargeBit
: ~(unsigned int)0,
MaskPacketAccessBit = ei_corrected_matrix_flags<
Scalar, RowsAtCompileTime, ColsAtCompileTime,
MaxRowsAtCompileTime, MaxColsAtCompileTime, MatrixType::Flags
>::ret & PacketAccessBit,
FlagsLinearAccessBit = MatrixType::Flags & RowMajorBit
? (RowsAtCompileTime == 1 ? LinearAccessBit : 0)
: (ColsAtCompileTime == 1 ? LinearAccessBit : 0),
RowMajor = int(MatrixType::Flags)&RowMajorBit,
InnerSize = RowMajor ? ColsAtCompileTime : RowsAtCompileTime,
InnerMaxSize = RowMajor ? MaxColsAtCompileTime : MaxRowsAtCompileTime,
MaskPacketAccessBit = (InnerMaxSize == Dynamic || (InnerSize % ei_packet_traits<Scalar>::size) == 0)
? PacketAccessBit : 0,
FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? LinearAccessBit : 0,
Flags = (MatrixType::Flags & (HereditaryBits | MaskPacketAccessBit | DirectAccessBit) & MaskLargeBit)
| FlagsLinearAccessBit,
CoeffReadCost = MatrixType::CoeffReadCost
@ -153,6 +152,21 @@ template<typename MatrixType, int BlockRows, int BlockCols> class Block
return m_matrix.coeff(row + m_startRow.value(), col + m_startCol.value());
}
inline Scalar& _coeffRef(int index)
{
return m_matrix.const_cast_derived()
.coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
}
inline const Scalar _coeff(int index) const
{
return m_matrix
.coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
}
template<int LoadMode>
inline PacketScalar _packet(int row, int col) const
{
@ -165,6 +179,21 @@ template<typename MatrixType, int BlockRows, int BlockCols> class Block
m_matrix.const_cast_derived().template writePacket<UnAligned>(row + m_startRow.value(), col + m_startCol.value(), x);
}
template<int LoadMode>
inline PacketScalar _packet(int index) const
{
return m_matrix.template packet<UnAligned>(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
}
template<int LoadMode>
inline void _writePacket(int index, const PacketScalar& x)
{
m_matrix.const_cast_derived().template writePacket<UnAligned>
(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0), x);
}
protected:
const typename MatrixType::Nested m_matrix;
@ -260,22 +289,30 @@ inline const Block<Derived> MatrixBase<Derived>
* \sa class Block, block(int,int)
*/
template<typename Derived>
inline Block<Derived> MatrixBase<Derived>::start(int size)
inline typename MatrixBase<Derived>::template SubVectorReturnType<Dynamic>::Type
MatrixBase<Derived>::start(int size)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
return Block<Derived>(derived(), 0, 0,
RowsAtCompileTime == 1 ? 1 : size,
ColsAtCompileTime == 1 ? 1 : size);
return Block<Derived,
RowsAtCompileTime == 1 ? 1 : Dynamic,
ColsAtCompileTime == 1 ? 1 : Dynamic>
(derived(), 0, 0,
RowsAtCompileTime == 1 ? 1 : size,
ColsAtCompileTime == 1 ? 1 : size);
}
/** This is the const version of start(int).*/
template<typename Derived>
inline const Block<Derived> MatrixBase<Derived>::start(int size) const
inline const typename MatrixBase<Derived>::template SubVectorReturnType<Dynamic>::Type
MatrixBase<Derived>::start(int size) const
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
return Block<Derived>(derived(), 0, 0,
RowsAtCompileTime == 1 ? 1 : size,
ColsAtCompileTime == 1 ? 1 : size);
return Block<Derived,
RowsAtCompileTime == 1 ? 1 : Dynamic,
ColsAtCompileTime == 1 ? 1 : Dynamic>
(derived(), 0, 0,
RowsAtCompileTime == 1 ? 1 : size,
ColsAtCompileTime == 1 ? 1 : size);
}
/** \returns a dynamic-size expression of the last coefficients of *this.
@ -294,26 +331,34 @@ inline const Block<Derived> MatrixBase<Derived>::start(int size) const
* \sa class Block, block(int,int)
*/
template<typename Derived>
inline Block<Derived> MatrixBase<Derived>::end(int size)
inline typename MatrixBase<Derived>::template SubVectorReturnType<Dynamic>::Type
MatrixBase<Derived>::end(int size)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
return Block<Derived>(derived(),
RowsAtCompileTime == 1 ? 0 : rows() - size,
ColsAtCompileTime == 1 ? 0 : cols() - size,
RowsAtCompileTime == 1 ? 1 : size,
ColsAtCompileTime == 1 ? 1 : size);
return Block<Derived,
RowsAtCompileTime == 1 ? 1 : Dynamic,
ColsAtCompileTime == 1 ? 1 : Dynamic>
(derived(),
RowsAtCompileTime == 1 ? 0 : rows() - size,
ColsAtCompileTime == 1 ? 0 : cols() - size,
RowsAtCompileTime == 1 ? 1 : size,
ColsAtCompileTime == 1 ? 1 : size);
}
/** This is the const version of end(int).*/
template<typename Derived>
inline const Block<Derived> MatrixBase<Derived>::end(int size) const
inline const typename MatrixBase<Derived>::template SubVectorReturnType<Dynamic>::Type
MatrixBase<Derived>::end(int size) const
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
return Block<Derived>(derived(),
RowsAtCompileTime == 1 ? 0 : rows() - size,
ColsAtCompileTime == 1 ? 0 : cols() - size,
RowsAtCompileTime == 1 ? 1 : size,
ColsAtCompileTime == 1 ? 1 : size);
return Block<Derived,
RowsAtCompileTime == 1 ? 1 : Dynamic,
ColsAtCompileTime == 1 ? 1 : Dynamic>
(derived(),
RowsAtCompileTime == 1 ? 0 : rows() - size,
ColsAtCompileTime == 1 ? 0 : cols() - size,
RowsAtCompileTime == 1 ? 1 : size,
ColsAtCompileTime == 1 ? 1 : size);
}
/** \returns a fixed-size expression of the first coefficients of *this.


@ -104,7 +104,7 @@ inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
* \link operator[](int) const \endlink, but without the assertion.
* Use this for limiting the performance cost of debugging code when doing
* repeated coefficient access. Only use this when it is guaranteed that the
* parameters \a row and \a col are in range.
* parameter \a index is in range.
*
* If EIGEN_INTERNAL_DEBUGGING is defined, an assertion will be made, making this
* function equivalent to \link operator[](int) const \endlink.
@ -115,22 +115,13 @@ template<typename Derived>
inline const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
::coeff(int index) const
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
if(RowsAtCompileTime == 1)
{
ei_internal_assert(index >= 0 && index < cols());
return coeff(0, index);
}
else
{
ei_internal_assert(index >= 0 && index < rows());
return coeff(index, 0);
}
ei_internal_assert(index >= 0 && index < size());
return derived()._coeff(index);
}
/** \returns the coefficient at given index.
*
* \only_for_vectors
* This method is allowed only for vector expressions, and for matrix expressions having the LinearAccessBit.
*
* \sa operator[](int), operator()(int,int) const, x() const, y() const,
* z() const, w() const
@ -139,17 +130,8 @@ template<typename Derived>
inline const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
::operator[](int index) const
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
if(RowsAtCompileTime == 1)
{
ei_assert(index >= 0 && index < cols());
return coeff(0, index);
}
else
{
ei_assert(index >= 0 && index < rows());
return coeff(index, 0);
}
ei_assert(index >= 0 && index < size());
return derived()._coeff(index);
}
/** Short version: don't use this function, use
@ -170,22 +152,13 @@ template<typename Derived>
inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
::coeffRef(int index)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
if(RowsAtCompileTime == 1)
{
ei_internal_assert(index >= 0 && index < cols());
return coeffRef(0, index);
}
else
{
ei_internal_assert(index >= 0 && index < rows());
return coeffRef(index, 0);
}
ei_internal_assert(index >= 0 && index < size());
return derived()._coeffRef(index);
}
/** \returns a reference to the coefficient at given index.
*
* \only_for_vectors
* This method is allowed only for vector expressions, and for matrix expressions having the LinearAccessBit.
*
* \sa operator[](int) const, operator()(int,int), x(), y(), z(), w()
*/
@ -193,70 +166,119 @@ template<typename Derived>
inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
::operator[](int index)
{
ei_assert(IsVectorAtCompileTime);
if(RowsAtCompileTime == 1)
{
ei_assert(index >= 0 && index < cols());
return coeffRef(0, index);
}
else
{
ei_assert(index >= 0 && index < rows());
return coeffRef(index, 0);
}
ei_assert(index >= 0 && index < size());
return derived()._coeffRef(index);
}
/** equivalent to operator[](0). \only_for_vectors */
/** equivalent to operator[](0). */
template<typename Derived>
inline const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
::x() const { return (*this)[0]; }
/** equivalent to operator[](1). \only_for_vectors */
/** equivalent to operator[](1). */
template<typename Derived>
inline const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
::y() const { return (*this)[1]; }
/** equivalent to operator[](2). \only_for_vectors */
/** equivalent to operator[](2). */
template<typename Derived>
inline const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
::z() const { return (*this)[2]; }
/** equivalent to operator[](3). \only_for_vectors */
/** equivalent to operator[](3). */
template<typename Derived>
inline const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
::w() const { return (*this)[3]; }
/** equivalent to operator[](0). \only_for_vectors */
/** equivalent to operator[](0). */
template<typename Derived>
inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
::x() { return (*this)[0]; }
/** equivalent to operator[](1). \only_for_vectors */
/** equivalent to operator[](1). */
template<typename Derived>
inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
::y() { return (*this)[1]; }
/** equivalent to operator[](2). \only_for_vectors */
/** equivalent to operator[](2). */
template<typename Derived>
inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
::z() { return (*this)[2]; }
/** equivalent to operator[](3). \only_for_vectors */
/** equivalent to operator[](3). */
template<typename Derived>
inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
::w() { return (*this)[3]; }
/** \returns the packet of coefficients starting at the given row and column. It is your responsibility
* to ensure that a packet really starts there. This method is only available on expressions having the
* PacketAccessBit.
*
* The \a LoadMode parameter may have the value \a Aligned or \a UnAligned. Its effect is to select
* the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
* starting at an address which is a multiple of the packet size.
*/
template<typename Derived>
template<int LoadMode>
inline typename ei_packet_traits<typename ei_traits<Derived>::Scalar>::type
MatrixBase<Derived>::packet(int row, int col) const
{ return derived().template _packet<LoadMode>(row,col); }
{
ei_internal_assert(row >= 0 && row < rows()
&& col >= 0 && col < cols());
return derived().template _packet<LoadMode>(row,col);
}
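// Illustration (a sketch, not Eigen code): assuming an SSE target, the
// Aligned / UnAligned LoadMode distinction boils down to the following choice
// of intrinsic. An aligned load requires a 16-byte-aligned address; the
// unaligned variant accepts any address at some speed cost.
#include <xmmintrin.h>
inline __m128 example_load_packet(const float* p, bool aligned)
{
    return aligned ? _mm_load_ps(p)   // aligned load: address must be 16-byte aligned
                   : _mm_loadu_ps(p); // unaligned load: any address, slower
}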
/** Stores the given packet of coefficients, at the given row and column of this expression. It is your responsibility
* to ensure that a packet really starts there. This method is only available on expressions having the
* PacketAccessBit.
*
* The \a LoadMode parameter may have the value \a Aligned or \a UnAligned. Its effect is to select
* the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
* starting at an address which is a multiple of the packet size.
*/
template<typename Derived>
template<int StoreMode>
inline void MatrixBase<Derived>::writePacket
(int row, int col, const typename ei_packet_traits<typename ei_traits<Derived>::Scalar>::type& x)
{ derived().template _writePacket<StoreMode>(row,col,x); }
{
ei_internal_assert(row >= 0 && row < rows()
&& col >= 0 && col < cols());
derived().template _writePacket<StoreMode>(row,col,x);
}
/** \returns the packet of coefficients starting at the given index. It is your responsibility
* to ensure that a packet really starts there. This method is only available on expressions having the
* PacketAccessBit and the LinearAccessBit.
*
* The \a LoadMode parameter may have the value \a Aligned or \a UnAligned. Its effect is to select
* the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
* starting at an address which is a multiple of the packet size.
*/
template<typename Derived>
template<int LoadMode>
inline typename ei_packet_traits<typename ei_traits<Derived>::Scalar>::type
MatrixBase<Derived>::packet(int index) const
{
ei_internal_assert(index >= 0 && index < size());
return derived().template _packet<LoadMode>(index);
}
/** Stores the given packet of coefficients, at the given index in this expression. It is your responsibility
* to ensure that a packet really starts there. This method is only available on expressions having the
* PacketAccessBit and the LinearAccessBit.
*
* The \a LoadMode parameter may have the value \a Aligned or \a UnAligned. Its effect is to select
* the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
* starting at an address which is a multiple of the packet size.
*/
template<typename Derived>
template<int StoreMode>
inline void MatrixBase<Derived>::writePacket
(int index, const typename ei_packet_traits<typename ei_traits<Derived>::Scalar>::type& x)
{
ei_internal_assert(index >= 0 && index < size());
derived().template _writePacket<StoreMode>(index,x);
}
#endif // EIGEN_COEFFS_H


@ -69,7 +69,7 @@ struct ei_traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
HereditaryBits
| (int(LhsFlags) & int(RhsFlags) & LinearAccessBit)
| (ei_functor_traits<BinaryOp>::PacketAccess && ((int(LhsFlags) & RowMajorBit)==(int(RhsFlags) & RowMajorBit))
? int(LhsFlags) & int(RhsFlags) & PacketAccessBit : 0)),
? (int(LhsFlags) & int(RhsFlags) & PacketAccessBit) : 0)),
CoeffReadCost = LhsCoeffReadCost + RhsCoeffReadCost + ei_functor_traits<BinaryOp>::Cost
};
};
@ -108,6 +108,17 @@ class CwiseBinaryOp : ei_no_assignment_operator,
return m_functor.packetOp(m_lhs.template packet<LoadMode>(row, col), m_rhs.template packet<LoadMode>(row, col));
}
inline const Scalar _coeff(int index) const
{
return m_functor(m_lhs.coeff(index), m_rhs.coeff(index));
}
template<int LoadMode>
inline PacketScalar _packet(int index) const
{
return m_functor.packetOp(m_lhs.template packet<LoadMode>(index), m_rhs.template packet<LoadMode>(index));
}
protected:
const LhsNested m_lhs;
const RhsNested m_rhs;


@ -50,7 +50,9 @@ struct ei_traits<CwiseNullaryOp<NullaryOp, MatrixType> >
MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
Flags = (MatrixType::Flags
& (HereditaryBits | LinearAccessBit | (ei_functor_traits<NullaryOp>::PacketAccess ? PacketAccessBit : 0)))
& ( HereditaryBits
| (ei_functor_has_linear_access<NullaryOp>::ret ? LinearAccessBit : 0)
| (ei_functor_traits<NullaryOp>::PacketAccess ? PacketAccessBit : 0)))
| (ei_functor_traits<NullaryOp>::IsRepeatable ? 0 : EvalBeforeNestingBit),
CoeffReadCost = ei_functor_traits<NullaryOp>::Cost
};
@ -89,6 +91,17 @@ class CwiseNullaryOp : ei_no_assignment_operator,
return m_functor.packetOp();
}
const Scalar _coeff(int index) const
{
return m_functor(index);
}
template<int LoadMode>
PacketScalar _packet(int) const
{
return m_functor.packetOp();
}
protected:
const ei_int_if_dynamic<RowsAtCompileTime> m_rows;
const ei_int_if_dynamic<ColsAtCompileTime> m_cols;


@ -90,6 +90,17 @@ class CwiseUnaryOp : ei_no_assignment_operator,
return m_functor.packetOp(m_matrix.template packet<LoadMode>(row, col));
}
inline const Scalar _coeff(int index) const
{
return m_functor(m_matrix.coeff(index));
}
template<int LoadMode>
inline PacketScalar _packet(int index) const
{
return m_functor.packetOp(m_matrix.template packet<LoadMode>(index));
}
protected:
const typename MatrixType::Nested m_matrix;
const UnaryOp m_functor;


@ -56,7 +56,8 @@ struct ei_traits<DiagonalCoeffs<MatrixType> >
MaxColsAtCompileTime = 1,
Flags = (RowsAtCompileTime == Dynamic && ColsAtCompileTime == Dynamic
? (unsigned int)_MatrixTypeNested::Flags
: (unsigned int)_MatrixTypeNested::Flags &~ LargeBit) & HereditaryBits,
: (unsigned int)_MatrixTypeNested::Flags &~ LargeBit)
& (HereditaryBits | LinearAccessBit),
CoeffReadCost = _MatrixTypeNested::CoeffReadCost
};
};
@ -87,6 +88,16 @@ template<typename MatrixType> class DiagonalCoeffs
return m_matrix.coeff(row, row);
}
inline Scalar& _coeffRef(int index)
{
return m_matrix.const_cast_derived().coeffRef(index, index);
}
inline const Scalar _coeff(int index) const
{
return m_matrix.coeff(index, index);
}
protected:
const typename MatrixType::Nested m_matrix;


@ -52,8 +52,7 @@ struct ei_traits<Product<LhsNested, RhsNested, DiagonalProduct> >
&& (RowsAtCompileTime % ei_packet_traits<Scalar>::size == 0),
RemovedBits = ~(((RhsFlags & RowMajorBit) && (!CanVectorizeLhs) ? 0 : RowMajorBit)
| ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit))
| LinearAccessBit,
| ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit)),
Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & RemovedBits)
| (CanVectorizeLhs || CanVectorizeRhs ? PacketAccessBit : 0),


@ -175,27 +175,20 @@ struct ei_dot_impl<Derived1, Derived2, LinearVectorization, NoUnrolling>
const int size = v1.size();
const int packetSize = ei_packet_traits<Scalar>::size;
const int alignedSize = (size/packetSize)*packetSize;
const bool rowVector1 = Derived1::RowsAtCompileTime == 1;
const bool rowVector2 = Derived2::RowsAtCompileTime == 1;
Scalar res;
// do the vectorizable part of the sum
if(size >= packetSize)
{
PacketScalar packet_res;
packet_res = ei_pmul(
v1.template packet<Aligned>(0, 0),
v2.template packet<Aligned>(0, 0)
);
PacketScalar packet_res = ei_pmul(
v1.template packet<Aligned>(0),
v2.template packet<Aligned>(0)
);
for(int index = packetSize; index<alignedSize; index += packetSize)
{
const int row1 = rowVector1 ? 0 : index;
const int col1 = rowVector1 ? index : 0;
const int row2 = rowVector2 ? 0 : index;
const int col2 = rowVector2 ? index : 0;
packet_res = ei_pmadd(
v1.template packet<Aligned>(row1, col1),
v2.template packet<Aligned>(row2, col2),
v1.template packet<Aligned>(index),
v2.template packet<Aligned>(index),
packet_res
);
}
@ -213,11 +206,7 @@ struct ei_dot_impl<Derived1, Derived2, LinearVectorization, NoUnrolling>
// do the remainder of the vector
for(int index = alignedSize; index < size; index++)
{
const int row1 = rowVector1 ? 0 : index;
const int col1 = rowVector1 ? index : 0;
const int row2 = rowVector2 ? 0 : index;
const int col2 = rowVector2 ? index : 0;
res += v1.coeff(row1, col1) * v2.coeff(row2, col2);
res += v1.coeff(index) * v2.coeff(index);
}
return res;


@ -84,6 +84,16 @@ template<typename ExpressionType, unsigned int Added, unsigned int Removed> clas
return m_matrix.const_cast_derived().coeffRef(row, col);
}
inline const Scalar _coeff(int index) const
{
return m_matrix.coeff(index);
}
inline Scalar& _coeffRef(int index)
{
return m_matrix.const_cast_derived().coeffRef(index);
}
template<int LoadMode>
inline const PacketScalar _packet(int row, int col) const
{
@ -96,6 +106,18 @@ template<typename ExpressionType, unsigned int Added, unsigned int Removed> clas
m_matrix.const_cast_derived().template writePacket<LoadMode>(row, col, x);
}
template<int LoadMode>
inline const PacketScalar _packet(int index) const
{
return m_matrix.template packet<LoadMode>(index);
}
template<int LoadMode>
inline void _writePacket(int index, const PacketScalar& x)
{
m_matrix.const_cast_derived().template writePacket<LoadMode>(index, x);
}
protected:
ExpressionTypeNested m_matrix;
};


@ -302,13 +302,13 @@ struct ei_scalar_quotient1_op : ei_scalar_quotient1_impl<Scalar, NumTraits<Scala
// nullary functors
template<typename Scalar, bool PacketAccess = (int(ei_packet_traits<Scalar>::size)>1?true:false) > struct ei_scalar_constant_op;
template<typename Scalar, bool PacketAccess = (int(ei_packet_traits<Scalar>::size)>1) > struct ei_scalar_constant_op;
template<typename Scalar>
struct ei_scalar_constant_op<Scalar,true> {
typedef typename ei_packet_traits<Scalar>::type PacketScalar;
inline ei_scalar_constant_op(const Scalar& other) : m_other(ei_pset1(other)) { }
inline const Scalar operator() (int, int) const { return ei_pfirst(m_other); }
inline const Scalar operator() (int, int = 0) const { return ei_pfirst(m_other); }
inline const PacketScalar packetOp() const
{ return m_other; }
const PacketScalar m_other;
@ -316,7 +316,7 @@ struct ei_scalar_constant_op<Scalar,true> {
template<typename Scalar>
struct ei_scalar_constant_op<Scalar,false> {
inline ei_scalar_constant_op(const Scalar& other) : m_other(other) { }
inline const Scalar operator() (int, int) const { return m_other; }
inline const Scalar operator() (int, int = 0) const { return m_other; }
const Scalar m_other;
};
template<typename Scalar>
@ -331,4 +331,11 @@ template<typename Scalar>
struct ei_functor_traits<ei_scalar_identity_op<Scalar> >
{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = false, IsRepeatable = true }; };
// NOTE quick hack:
// all functors allow linear access, except ei_scalar_identity_op. So we define here a quick meta-helper
// indicating whether a functor allows linear access, always answering 'yes' except for
// ei_scalar_identity_op.
template<typename Functor> struct ei_functor_has_linear_access { enum { ret = 1 }; };
template<typename Scalar> struct ei_functor_has_linear_access<ei_scalar_identity_op<Scalar> > { enum { ret = 0 }; };
#endif // EIGEN_FUNCTORS_H


@ -31,14 +31,14 @@
*/
template<typename Derived>
std::ostream & operator <<
( std::ostream & s,
const MatrixBase<Derived> & m )
(std::ostream & s,
const MatrixBase<Derived> & m)
{
for( int i = 0; i < m.rows(); i++ )
for(int i = 0; i < m.rows(); i++)
{
s << m( i, 0 );
for (int j = 1; j < m.cols(); j++ )
s << " " << m( i, j );
s << m.coeff(i, 0);
for(int j = 1; j < m.cols(); j++)
s << " " << m.coeff(i, j);
if( i < m.rows() - 1)
s << "\n";
}


@ -55,33 +55,33 @@ typename OtherDerived::Eval MatrixBase<Derived>::inverseProduct(const MatrixBase
{
// forward substitution
if(Flags & UnitDiagBit)
res.coeffRef(0,c) = other.coeff(0,c);
else
res.coeffRef(0,c) = other.coeff(0,c)/coeff(0, 0);
for(int i=1; i<rows(); ++i)
{
Scalar tmp = other.coeff(i,c) - ((this->row(i).start(i)) * res.col(c).start(i)).coeff(0,0);
if (Flags & UnitDiagBit)
res.coeffRef(i,c) = tmp;
else
res.coeffRef(i,c) = tmp/coeff(i,i);
}
}
else
{
// backward substitution
if(Flags & UnitDiagBit)
res.coeffRef(cols()-1,c) = other.coeff(cols()-1,c);
else
res.coeffRef(cols()-1,c) = other.coeff(cols()-1, c)/coeff(rows()-1, cols()-1);
for(int i=rows()-2 ; i>=0 ; --i)
{
Scalar tmp = other.coeff(i,c)
- ((this->row(i).end(cols()-i-1)) * res.col(c).end(cols()-i-1)).coeff(0,0);
if (Flags & UnitDiagBit)
res.coeffRef(i,c) = tmp;
else
res.coeffRef(i,c) = tmp/coeff(i,i);
}
}
}


@ -80,6 +80,16 @@ template<typename MatrixType> class Map
return const_cast<Scalar*>(m_data)[row + col * m_rows];
}
inline const Scalar& _coeff(int index) const
{
return m_data[index];
}
inline Scalar& _coeffRef(int index)
{
return m_data[index];
}
public:
inline Map(const Scalar* data, int rows, int cols) : m_data(data), m_rows(rows), m_cols(cols)
{


@ -128,6 +128,11 @@ class Matrix : public MatrixBase<Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCol
return m_storage.data()[row + col * m_storage.rows()];
}
inline const Scalar& _coeff(int index) const
{
return m_storage.data()[index];
}
inline Scalar& _coeffRef(int row, int col)
{
if(Flags & RowMajorBit)
@ -136,20 +141,33 @@ class Matrix : public MatrixBase<Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCol
return m_storage.data()[row + col * m_storage.rows()];
}
inline Scalar& _coeffRef(int index)
{
return m_storage.data()[index];
}
template<int LoadMode>
inline PacketScalar _packet(int row, int col) const
{
ei_internal_assert(Flags & PacketAccessBit);
if(Flags & RowMajorBit)
if (LoadMode==Aligned)
return ei_pload(&m_storage.data()[col + row * m_storage.cols()]);
return ei_pload(m_storage.data() + col + row * m_storage.cols());
else
return ei_ploadu(&m_storage.data()[col + row * m_storage.cols()]);
return ei_ploadu(m_storage.data() + col + row * m_storage.cols());
else
if (LoadMode==Aligned)
return ei_pload(&m_storage.data()[row + col * m_storage.rows()]);
return ei_pload(m_storage.data() + row + col * m_storage.rows());
else
return ei_ploadu(&m_storage.data()[row + col * m_storage.rows()]);
return ei_ploadu(m_storage.data() + row + col * m_storage.rows());
}
template<int LoadMode>
inline PacketScalar _packet(int index) const
{
if (LoadMode==Aligned)
return ei_pload(m_storage.data() + index);
else
return ei_ploadu(m_storage.data() + index);
}
template<int StoreMode>
@ -158,14 +176,23 @@ class Matrix : public MatrixBase<Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCol
ei_internal_assert(Flags & PacketAccessBit);
if(Flags & RowMajorBit)
if (StoreMode==Aligned)
ei_pstore(&m_storage.data()[col + row * m_storage.cols()], x);
ei_pstore(m_storage.data() + col + row * m_storage.cols(), x);
else
ei_pstoreu(&m_storage.data()[col + row * m_storage.cols()], x);
ei_pstoreu(m_storage.data() + col + row * m_storage.cols(), x);
else
if (StoreMode==Aligned)
ei_pstore(&m_storage.data()[row + col * m_storage.rows()], x);
ei_pstore(m_storage.data() + row + col * m_storage.rows(), x);
else
ei_pstoreu(&m_storage.data()[row + col * m_storage.rows()], x);
ei_pstoreu(m_storage.data() + row + col * m_storage.rows(), x);
}
template<int StoreMode>
inline void _writePacket(int index, const PacketScalar& x)
{
if (StoreMode==Aligned)
ei_pstore(m_storage.data() + index, x);
else
ei_pstoreu(m_storage.data() + index, x);
}
public:


@ -228,6 +228,11 @@ template<typename Derived> class MatrixBase
template<int StoreMode>
void writePacket(int row, int col, const PacketScalar& x);
template<int LoadMode>
PacketScalar packet(int index) const;
template<int StoreMode>
void writePacket(int index, const PacketScalar& x);
const Scalar x() const;
const Scalar y() const;
const Scalar z() const;
@ -307,11 +312,11 @@ template<typename Derived> class MatrixBase
Block<Derived> block(int start, int size);
const Block<Derived> block(int start, int size) const;
Block<Derived> start(int size);
const Block<Derived> start(int size) const;
typename SubVectorReturnType<Dynamic>::Type start(int size);
const typename SubVectorReturnType<Dynamic>::Type start(int size) const;
Block<Derived> end(int size);
const Block<Derived> end(int size) const;
typename SubVectorReturnType<Dynamic>::Type end(int size);
const typename SubVectorReturnType<Dynamic>::Type end(int size) const;
Block<Derived> corner(CornerType type, int cRows, int cCols);
const Block<Derived> corner(CornerType type, int cRows, int cCols) const;


@ -76,6 +76,16 @@ template<typename ExpressionType> class NestByValue
return m_expression.const_cast_derived().coeffRef(row, col);
}
inline const Scalar _coeff(int index) const
{
return m_expression.coeff(index);
}
inline Scalar& _coeffRef(int index)
{
return m_expression.const_cast_derived().coeffRef(index);
}
template<int LoadMode>
inline const PacketScalar _packet(int row, int col) const
{
@ -88,6 +98,18 @@ template<typename ExpressionType> class NestByValue
m_expression.const_cast_derived().template writePacket<LoadMode>(row, col, x);
}
template<int LoadMode>
inline const PacketScalar _packet(int index) const
{
return m_expression.template packet<LoadMode>(index);
}
template<int LoadMode>
inline void _writePacket(int index, const PacketScalar& x)
{
m_expression.const_cast_derived().template writePacket<LoadMode>(index, x);
}
protected:
const ExpressionType m_expression;
};


@ -151,8 +151,7 @@ struct ei_traits<Product<LhsNested, RhsNested, ProductMode> >
&& (ProductMode==(int)CacheFriendlyProduct ? (int)LhsFlags & RowMajorBit : (!CanVectorizeLhs)),
RemovedBits = ~((EvalToRowMajor ? 0 : RowMajorBit)
| ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit)
| LinearAccessBit),
| ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit)),
Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & RemovedBits)
| EvalBeforeAssigningBit
@ -224,6 +223,18 @@ template<typename LhsNested, typename RhsNested, int ProductMode> class Product
return res;
}
/* Allow index-based non-packet access. It is impossible though to allow index-based packet access,
* which is why we don't set the LinearAccessBit.
*/
const Scalar _coeff(int index) const
{
Scalar res;
const int row = RowsAtCompileTime == 1 ? 0 : index;
const int col = RowsAtCompileTime == 1 ? index : 0;
ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
return res;
}
template<int LoadMode>
const PacketScalar _packet(int row, int col) const
{
@ -235,9 +246,6 @@ template<typename LhsNested, typename RhsNested, int ProductMode> class Product
return res;
}
template<typename Lhs_, typename Rhs_, int ProductMode_, typename DestDerived_, bool DirectAccess_>
friend struct ei_cache_friendly_selector;
protected:
const LhsNested m_lhs;
const RhsNested m_rhs;


@ -186,34 +186,15 @@ struct ei_sum_impl<Derived, LinearVectorization, NoUnrolling>
const int size = mat.size();
const int packetSize = ei_packet_traits<Scalar>::size;
const int alignedSize = (size/packetSize)*packetSize;
const bool rowMajor = Derived::Flags&RowMajorBit;
const int innerSize = rowMajor ? mat.cols() : mat.rows();
const int outerSize = rowMajor ? mat.rows() : mat.cols();
Scalar res;
// do the vectorizable part of the sum
if(size >= packetSize)
{
PacketScalar packet_res;
packet_res = mat.template packet<Aligned>(0, 0);
int row = 0;
int col = 0;
int index = packetSize;
while (index<alignedSize)
{
row = rowMajor ? index/innerSize : index%innerSize;
col = rowMajor ? index%innerSize : index/innerSize;
int start = rowMajor ? col : row;
int end = std::min(innerSize, start+alignedSize-index);
if (end<start) getchar();
for ( ; (rowMajor ? col : row)<end; (rowMajor ? col : row)+=packetSize)
packet_res = ei_padd(packet_res, mat.template packet<Aligned>(row, col));
index += (rowMajor ? col : row) - start;
}
res = ei_predux(packet_res);
PacketScalar packet_res = mat.template packet<Aligned>(0, 0);
for(int index = packetSize; index < alignedSize; index += packetSize)
packet_res = ei_padd(packet_res, mat.template packet<Aligned>(index));
// now we must do the rest without vectorization.
if(alignedSize == size) return res;
res = ei_predux(packet_res);
}
else // too small to vectorize anything.
// since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
@ -221,25 +202,11 @@ struct ei_sum_impl<Derived, LinearVectorization, NoUnrolling>
res = Scalar(0);
}
const int k = alignedSize/innerSize;
// do the remainder of the current row or col
for(int i = alignedSize%innerSize; i < innerSize; i++)
for(int index = alignedSize; index < size; index++)
{
const int row = rowMajor ? k : i;
const int col = rowMajor ? i : k;
res += mat.coeff(row, col);
res += mat.coeff(index);
}
// do the remaining rows or cols
for(int j = k+1; j < outerSize; j++)
for(int i = 0; i < innerSize; i++)
{
const int row = rowMajor ? i : j;
const int col = rowMajor ? j : i;
res += mat.coeff(row, col);
}
return res;
}
};


@ -56,26 +56,60 @@ const unsigned int EvalBeforeNestingBit = 0x2;
* means the expression should be evaluated before any assignment */
const unsigned int EvalBeforeAssigningBit = 0x4;
/** \ingroup flags
*
* currently unused. Means the matrix probably has a very big size.
* Could eventually be used as a hint to determine which algorithms
* to use. */
const unsigned int LargeBit = 0x8;
#ifdef EIGEN_VECTORIZE
/** \ingroup flags
*
* means the expression might be vectorized */
const unsigned int PacketAccessBit = 0x10;
* Short version: means the expression might be vectorized
*
* Long version: means that the coefficients can be handled by packets
* and start at a memory location whose alignment meets the requirements
* of the present CPU architecture for optimized packet access. In the fixed-size
* case, there is the additional condition that the total size of the coefficients
* array is a multiple of the packet size, so that it is possible to access all the
* coefficients by packets. In the dynamic-size case, there is no such condition
* on the total size, so it might not be possible to access the last few coefficients
* by packets.
*
* \note If vectorization is not enabled (EIGEN_VECTORIZE is not defined) this constant
* is set to the value 0.
*/
const unsigned int PacketAccessBit = 0x8;
#else
const unsigned int PacketAccessBit = 0x0;
#endif
/** \ingroup flags
*
* means the expression can be seen as 1D vector (used for explicit vectorization) */
const unsigned int LinearAccessBit = 0x20;
* Short version: means the expression can be seen as a 1D vector.
*
* Long version: means that one can access the coefficients
* of this expression by coeff(int), and by coeffRef(int) in the case of an lvalue expression. These
* index-based access methods are guaranteed not to perform any runtime computation of a
* (row, col) pair from the index, so whenever index-based access is available it is at least
* as fast as (row,col)-based access. Expressions for which that isn't possible don't have the LinearAccessBit.
*
* If both PacketAccessBit and LinearAccessBit are set, then the
* packets of this expression can be accessed by packet(int), and by writePacket(int) in the case of an
* lvalue expression.
*
* Typically, all vector expressions have the LinearAccessBit, but there is one exception:
* Product expressions don't have it, because it would be troublesome for vectorization, even when the
* Product is a vector expression. Thus, vector Product expressions allow index-based coefficient access but
* not index-based packet access, so they don't have the LinearAccessBit.
*/
const unsigned int LinearAccessBit = 0x10;
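// A small sketch (hypothetical helper, not part of Eigen): index-based packet
// access via packet(int) / writePacket(int) is possible only when an
// expression's Flags word has both of the bits defined above.
inline bool example_can_use_index_based_packets(unsigned int flags)
{
    return (flags & PacketAccessBit) && (flags & LinearAccessBit);
}
// e.g. a plain dense vector sets both bits, while a vector-shaped Product
// sets at most PacketAccessBit (see the note above), so it does not qualify.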
/** \ingroup flags
*
* Means that the underlying array of coefficients can be directly accessed. This means two things.
* First, references to the coefficients must be available through coeffRef(int, int). This rules out read-only
* expressions whose coefficients are computed on demand by coeff(int, int). Second, the memory layout of the
* array of coefficients must be exactly the natural one suggested by rows(), cols(), stride(), and the RowMajorBit.
* This rules out expressions such as DiagonalCoeffs, whose coefficients, though referenceable, do not have
* such a regular memory layout.
*/
const unsigned int DirectAccessBit = 0x20;
/** \ingroup flags
*
@ -102,19 +136,19 @@ const unsigned int UpperTriangularBit = 0x200;
* means the strictly upper triangular part is 0 */
const unsigned int LowerTriangularBit = 0x400;
/** \ingroup flags
*
* means the underlying matrix data can be directly accessed (contrary to certain
* expressions where the matrix coefficients need to be computed rather than just read from
* memory) */
const unsigned int DirectAccessBit = 0x800;
/** \ingroup flags
*
* means the object is just an array of scalars, and operations on it are regarded as operations
* on every of these scalars taken separately.
*/
const unsigned int ArrayBit = 0x1000;
const unsigned int ArrayBit = 0x800;
/** \ingroup flags
*
* currently unused. Means the matrix probably has a very big size.
* Could eventually be used as a hint to determine which algorithms
* to use. */
const unsigned int LargeBit = 0x1000;
// list of flags that are inherited by default
const unsigned int HereditaryBits = RowMajorBit
@ -135,7 +169,6 @@ const unsigned int UnitUpper = UpperTriangularBit | UnitDiagBit;
const unsigned int UnitLower = LowerTriangularBit | UnitDiagBit;
const unsigned int Diagonal = Upper | Lower;
enum { Aligned=0, UnAligned=1 };
enum { ConditionalJumpCost = 5 };
enum CornerType { TopLeft, TopRight, BottomLeft, BottomRight };


@ -155,7 +155,8 @@ class ei_corrected_matrix_flags
// so let us strictly honor the user's choice.
? SuggestedFlags&RowMajorBit
: Cols > 1 ? RowMajorBit : 0,
is_big = MaxRows == Dynamic || MaxCols == Dynamic,
inner_max_size = row_major_bit ? MaxCols : MaxRows,
is_big = inner_max_size == Dynamic,
linear_size = Cols * Rows,
packet_access_bit
= ei_packet_traits<Scalar>::size > 1


@ -75,7 +75,9 @@
// static assertion failing if the type \a TYPE is not a vector type
#define EIGEN_STATIC_ASSERT_VECTOR_ONLY(TYPE) EIGEN_STATIC_ASSERT(TYPE::IsVectorAtCompileTime,you_tried_calling_a_vector_method_on_a_matrix)
#define EIGEN_STATIC_ASSERT_VECTOR_ONLY(TYPE) \
EIGEN_STATIC_ASSERT(TYPE::IsVectorAtCompileTime, \
you_tried_calling_a_vector_method_on_a_matrix)
// static assertion failing if the two vector expression types are not compatible (same fixed-size or dynamic size)
#define EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(TYPE0,TYPE1) \

bench/benchVecAdd.cpp (new file, 134 lines)

@ -0,0 +1,134 @@
#include <Eigen/Core>
#include <bench/BenchTimer.h>
using namespace Eigen;
#ifndef SIZE
#define SIZE 50
#endif
#ifndef REPEAT
#define REPEAT 10000
#endif
typedef float Scalar;
__attribute__ ((noinline)) void benchVec(Scalar* a, Scalar* b, Scalar* c, int size);
__attribute__ ((noinline)) void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c);
__attribute__ ((noinline)) void benchVec(VectorXf& a, VectorXf& b, VectorXf& c);
int main(int argc, char* argv[])
{
int size = SIZE * 8;
int size2 = size * size;
Scalar* a = ei_aligned_malloc<Scalar>(size2);
Scalar* b = ei_aligned_malloc<Scalar>(size2);
Scalar* c = ei_aligned_malloc<Scalar>(size2);
for (int i=0; i<size2; ++i)
{
a[i] = b[i] = c[i] = 0;
}
BenchTimer timer;
timer.reset();
for (int k=0; k<3; ++k)
{
timer.start();
benchVec(a, b, c, size2);
timer.stop();
}
std::cout << timer.value() << "s " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";
for (int innersize = size; innersize>2 ; --innersize)
{
if (size2%innersize==0)
{
int outersize = size2/innersize;
MatrixXf ma = MatrixXf::map(a, innersize, outersize );
MatrixXf mb = MatrixXf::map(b, innersize, outersize );
MatrixXf mc = MatrixXf::map(c, innersize, outersize );
timer.reset();
for (int k=0; k<3; ++k)
{
timer.start();
benchVec(ma, mb, mc);
timer.stop();
}
std::cout << innersize << " x " << outersize << " " << timer.value() << "s " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";
}
}
VectorXf va = VectorXf::map(a, size2);
VectorXf vb = VectorXf::map(b, size2);
VectorXf vc = VectorXf::map(c, size2);
timer.reset();
for (int k=0; k<3; ++k)
{
timer.start();
benchVec(va, vb, vc);
timer.stop();
}
std::cout << timer.value() << "s " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";
return 0;
}
void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c)
{
for (int k=0; k<REPEAT; ++k)
a = a + b;
}
void benchVec(VectorXf& a, VectorXf& b, VectorXf& c)
{
for (int k=0; k<REPEAT; ++k)
a = a + b;
}
void benchVec(Scalar* a, Scalar* b, Scalar* c, int size)
{
typedef ei_packet_traits<Scalar>::type PacketScalar;
const int PacketSize = ei_packet_traits<Scalar>::size;
PacketScalar a0, a1, a2, a3, b0, b1, b2, b3;
for (int k=0; k<REPEAT; ++k)
for (int i=0; i<size; i+=PacketSize*8)
{
a0 = ei_pload(&a[i]);
b0 = ei_pload(&b[i]);
a1 = ei_pload(&a[i+1*PacketSize]);
b1 = ei_pload(&b[i+1*PacketSize]);
a2 = ei_pload(&a[i+2*PacketSize]);
b2 = ei_pload(&b[i+2*PacketSize]);
a3 = ei_pload(&a[i+3*PacketSize]);
b3 = ei_pload(&b[i+3*PacketSize]);
ei_pstore(&a[i], ei_padd(a0, b0));
a0 = ei_pload(&a[i+4*PacketSize]);
b0 = ei_pload(&b[i+4*PacketSize]);
ei_pstore(&a[i+1*PacketSize], ei_padd(a1, b1));
a1 = ei_pload(&a[i+5*PacketSize]);
b1 = ei_pload(&b[i+5*PacketSize]);
ei_pstore(&a[i+2*PacketSize], ei_padd(a2, b2));
a2 = ei_pload(&a[i+6*PacketSize]);
b2 = ei_pload(&b[i+6*PacketSize]);
ei_pstore(&a[i+3*PacketSize], ei_padd(a3, b3));
a3 = ei_pload(&a[i+7*PacketSize]);
b3 = ei_pload(&b[i+7*PacketSize]);
ei_pstore(&a[i+4*PacketSize], ei_padd(a0, b0));
ei_pstore(&a[i+5*PacketSize], ei_padd(a1, b1));
ei_pstore(&a[i+6*PacketSize], ei_padd(a2, b2));
ei_pstore(&a[i+7*PacketSize], ei_padd(a3, b3));
// ei_pstore(&a[i+2*PacketSize], ei_padd(ei_pload(&a[i+2*PacketSize]), ei_pload(&b[i+2*PacketSize])));
// ei_pstore(&a[i+3*PacketSize], ei_padd(ei_pload(&a[i+3*PacketSize]), ei_pload(&b[i+3*PacketSize])));
// ei_pstore(&a[i+4*PacketSize], ei_padd(ei_pload(&a[i+4*PacketSize]), ei_pload(&b[i+4*PacketSize])));
// ei_pstore(&a[i+5*PacketSize], ei_padd(ei_pload(&a[i+5*PacketSize]), ei_pload(&b[i+5*PacketSize])));
// ei_pstore(&a[i+6*PacketSize], ei_padd(ei_pload(&a[i+6*PacketSize]), ei_pload(&b[i+6*PacketSize])));
// ei_pstore(&a[i+7*PacketSize], ei_padd(ei_pload(&a[i+7*PacketSize]), ei_pload(&b[i+7*PacketSize])));
}
}