From 1985fb0551837fd5017858d6d7e82fd110294cfa Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Wed, 9 Apr 2008 12:31:55 +0000
Subject: [PATCH] Added initial experimental support for explicit
 vectorization. Currently only the following platform/operations are
 supported:  - SSE2 compatible architecture  - compiler compatible with
 intel's SSE2 intrinsics  - float, double and int data types  - fixed size
 matrices with a storage major dimension multiple of 4 (or 2 for double)  -
 scalar-matrix product, component wise: +,-,*,min,max  - matrix-matrix product
 only if the left matrix is vectorizable and column major    or the right
 matrix is vectorizable and row major, e.g.:    a.transpose() * b is not
 vectorized with the default column major storage. To use it you must define
 EIGEN_VECTORIZE and EIGEN_INTEL_PLATFORM.

---
 Eigen/Core                           |  14 ++-
 Eigen/src/Core/Block.h               |   4 +-
 Eigen/src/Core/CwiseBinaryOp.h       |   9 +-
 Eigen/src/Core/CwiseUnaryOp.h        |   8 +-
 Eigen/src/Core/DiagonalCoeffs.h      |   4 +-
 Eigen/src/Core/DiagonalMatrix.h      |   2 +-
 Eigen/src/Core/ForwardDeclarations.h |   7 +-
 Eigen/src/Core/Functors.h            |  44 +++++--
 Eigen/src/Core/Identity.h            |   2 +-
 Eigen/src/Core/Map.h                 |   2 +-
 Eigen/src/Core/Matrix.h              |  22 +++-
 Eigen/src/Core/MatrixBase.h          |  10 ++
 Eigen/src/Core/MatrixStorage.h       |  22 +++-
 Eigen/src/Core/Minor.h               |   2 +-
 Eigen/src/Core/NumTraits.h           |   7 --
 Eigen/src/Core/Ones.h                |   2 +-
 Eigen/src/Core/OperatorEquals.h      | 174 ++++++++++++++++++++-------
 Eigen/src/Core/PacketMath.h          |  85 +++++++++++++
 Eigen/src/Core/Product.h             |  69 ++++++++++-
 Eigen/src/Core/Random.h              |   2 +-
 Eigen/src/Core/Redux.h               |   4 +-
 Eigen/src/Core/Transpose.h           |  10 ++
 Eigen/src/Core/Util.h                |  14 +++
 Eigen/src/Core/Zero.h                |   2 +-
 test/main.h                          |   8 +-
 25 files changed, 436 insertions(+), 93 deletions(-)
 create mode 100644 Eigen/src/Core/PacketMath.h
diff --git a/Eigen/Core b/Eigen/Core
index 22a2ed6f2..dc28951f0 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -4,17 +4,26 @@
 #include <cstdlib>
 #include <cmath>
 #include <complex>
-#ifndef EIGEN_USE_CUSTOM_ASSERT
 #include <cassert>
-#endif
 #include <iostream>
 
+#ifdef EIGEN_VECTORIZE
+#ifdef EIGEN_INTEL_PLATFORM
+#include <emmintrin.h>
+#include <xmmintrin.h>
+#else
+#undef EIGEN_VECTORIZE
+#endif
+#endif
+
 namespace Eigen {
 
 #include "src/Core/Util.h"
 #include "src/Core/ForwardDeclarations.h"
 #include "src/Core/NumTraits.h"
 #include "src/Core/MathFunctions.h"
+#include "src/Core/PacketMath.h"
+#include "src/Core/Functors.h"
 #include "src/Core/MatrixBase.h"
 #include "src/Core/Coeffs.h"
 #include "src/Core/OperatorEquals.h"
@@ -42,7 +51,6 @@ namespace Eigen {
 #include "src/Core/IO.h"
 #include "src/Core/Swap.h"
 #include "src/Core/CommaInitializer.h"
-#include "src/Core/Functors.h"
 
 } // namespace Eigen
 
diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index 34f98030e..f0c1d11c0 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -67,9 +67,9 @@ struct ei_traits<Block<MatrixType, BlockRows, BlockCols> >
       : (BlockRows==Dynamic ? MatrixType::MaxRowsAtCompileTime : BlockRows),
     MaxColsAtCompileTime = ColsAtCompileTime == 1 ? 1
       : (BlockCols==Dynamic ? MatrixType::MaxColsAtCompileTime : BlockCols),
-    Flags = RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic
+    Flags = (RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic
             ? (unsigned int)MatrixType::Flags
-            : (unsigned int)MatrixType::Flags &~ LargeBit,
+            : (unsigned int)MatrixType::Flags &~ LargeBit) & ~VectorizableBit,
     CoeffReadCost = MatrixType::CoeffReadCost
   };
 };
diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h
index f7764e9b4..0ca6d3922 100644
--- a/Eigen/src/Core/CwiseBinaryOp.h
+++ b/Eigen/src/Core/CwiseBinaryOp.h
@@ -60,7 +60,9 @@ struct ei_traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
     ColsAtCompileTime = Lhs::ColsAtCompileTime,
     MaxRowsAtCompileTime = Lhs::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = Lhs::MaxColsAtCompileTime,
-    Flags = Lhs::Flags | Rhs::Flags,
+    Flags = ((Lhs::Flags | Rhs::Flags) & ~VectorizableBit)
+      | (ei_functor_traits<BinaryOp>::IsVectorizable && ((Lhs::Flags&RowMajorBit)==(Rhs::Flags&RowMajorBit))
+        ? (Lhs::Flags & Rhs::Flags & VectorizableBit) : 0),
     CoeffReadCost = Lhs::CoeffReadCost + Rhs::CoeffReadCost + ei_functor_traits<BinaryOp>::Cost
   };
 };
@@ -89,6 +91,11 @@ class CwiseBinaryOp : ei_no_assignment_operator,
       return m_functor(m_lhs.coeff(row, col), m_rhs.coeff(row, col));
     }
 
+    PacketScalar _packetCoeff(int row, int col) const
+    {
+      return m_functor.packetOp(m_lhs.packetCoeff(row, col), m_rhs.packetCoeff(row, col));
+    }
+
   protected:
     const typename Lhs::XprCopy m_lhs;
     const typename Rhs::XprCopy m_rhs;
diff --git a/Eigen/src/Core/CwiseUnaryOp.h b/Eigen/src/Core/CwiseUnaryOp.h
index b26b55be8..5c2ba1b07 100644
--- a/Eigen/src/Core/CwiseUnaryOp.h
+++ b/Eigen/src/Core/CwiseUnaryOp.h
@@ -50,7 +50,8 @@ struct ei_traits<CwiseUnaryOp<UnaryOp, MatrixType> >
     ColsAtCompileTime = MatrixType::ColsAtCompileTime,
     MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-    Flags = MatrixType::Flags,
+    Flags = (MatrixType::Flags & ~VectorizableBit)
+      | (ei_functor_traits<UnaryOp>::IsVectorizable ? MatrixType::Flags & VectorizableBit : 0),
     CoeffReadCost = MatrixType::CoeffReadCost + ei_functor_traits<UnaryOp>::Cost
   };
 };
@@ -76,6 +77,11 @@ class CwiseUnaryOp : ei_no_assignment_operator,
       return m_functor(m_matrix.coeff(row, col));
     }
 
+    PacketScalar _packetCoeff(int row, int col) const
+    {
+      return m_functor.packetOp(m_matrix.packetCoeff(row, col));
+    }
+
   protected:
     const typename MatrixType::XprCopy m_matrix;
     const UnaryOp m_functor;
diff --git a/Eigen/src/Core/DiagonalCoeffs.h b/Eigen/src/Core/DiagonalCoeffs.h
index 030de5cf0..7f8fea162 100644
--- a/Eigen/src/Core/DiagonalCoeffs.h
+++ b/Eigen/src/Core/DiagonalCoeffs.h
@@ -52,9 +52,9 @@ struct ei_traits<DiagonalCoeffs<MatrixType> >
                             : EIGEN_ENUM_MIN(MatrixType::MaxRowsAtCompileTime,
                                              MatrixType::MaxColsAtCompileTime),
     MaxColsAtCompileTime = 1,
-    Flags = RowsAtCompileTime == Dynamic && ColsAtCompileTime == Dynamic
+    Flags = (RowsAtCompileTime == Dynamic && ColsAtCompileTime == Dynamic
             ? (unsigned int)MatrixType::Flags
-            : (unsigned int)MatrixType::Flags &~ LargeBit,
+            : (unsigned int)MatrixType::Flags &~ LargeBit) & ~VectorizableBit,
     CoeffReadCost = MatrixType::CoeffReadCost
   };
 };
diff --git a/Eigen/src/Core/DiagonalMatrix.h b/Eigen/src/Core/DiagonalMatrix.h
index b7fffab72..6a243a402 100644
--- a/Eigen/src/Core/DiagonalMatrix.h
+++ b/Eigen/src/Core/DiagonalMatrix.h
@@ -47,7 +47,7 @@ struct ei_traits<DiagonalMatrix<CoeffsVectorType> >
     ColsAtCompileTime = CoeffsVectorType::SizeAtCompileTime,
     MaxRowsAtCompileTime = CoeffsVectorType::MaxSizeAtCompileTime,
     MaxColsAtCompileTime = CoeffsVectorType::MaxSizeAtCompileTime,
-    Flags = CoeffsVectorType::Flags,
+    Flags = CoeffsVectorType::Flags & ~VectorizableBit,
     CoeffReadCost = CoeffsVectorType::CoeffReadCost
   };
 };
diff --git a/Eigen/src/Core/ForwardDeclarations.h b/Eigen/src/Core/ForwardDeclarations.h
index 32be8cd68..d9699301d 100644
--- a/Eigen/src/Core/ForwardDeclarations.h
+++ b/Eigen/src/Core/ForwardDeclarations.h
@@ -65,7 +65,7 @@ template<typename Scalar> struct ei_scalar_cos_op;
 template<typename Scalar> struct ei_scalar_sin_op;
 template<typename Scalar> struct ei_scalar_pow_op;
 template<typename Scalar, typename NewType> struct ei_scalar_cast_op;
-template<typename Scalar> struct ei_scalar_multiple_op;
+template<typename Scalar, bool IsVectorizable> struct ei_scalar_multiple_op;
 template<typename Scalar> struct ei_scalar_quotient1_op;
 template<typename Scalar> struct ei_scalar_min_op;
 template<typename Scalar> struct ei_scalar_max_op;
@@ -116,5 +116,10 @@ template<typename T> struct ei_functor_traits
   };
 };
 
+template<typename T> struct ei_packet_traits
+{
+  typedef T type;
+  enum {size=1};
+};
 
 #endif // EIGEN_FORWARDDECLARATIONS_H
diff --git a/Eigen/src/Core/Functors.h b/Eigen/src/Core/Functors.h
index 44f982d11..d0f5151bc 100644
--- a/Eigen/src/Core/Functors.h
+++ b/Eigen/src/Core/Functors.h
@@ -34,12 +34,15 @@
   */
 template<typename Scalar> struct ei_scalar_sum_op EIGEN_EMPTY_STRUCT {
   const Scalar operator() (const Scalar& a, const Scalar& b) const { return a + b; }
+  template<typename PacketScalar>
+  PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const
+  { return ei_padd(a,b); }
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_sum_op<Scalar> > {
   enum {
     Cost = NumTraits<Scalar>::AddCost,
-    IsVectorizable = NumTraits<Scalar>::PacketSize>0
+    IsVectorizable = ei_packet_traits<Scalar>::size>1
   };
 };
 
@@ -50,12 +53,15 @@ struct ei_functor_traits<ei_scalar_sum_op<Scalar> > {
   */
 template<typename Scalar> struct ei_scalar_product_op EIGEN_EMPTY_STRUCT {
   const Scalar operator() (const Scalar& a, const Scalar& b) const { return a * b; }
+  template<typename PacketScalar>
+  PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const
+  { return ei_pmul(a,b); }
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_product_op<Scalar> > {
   enum {
     Cost = NumTraits<Scalar>::MulCost,
-    IsVectorizable = NumTraits<Scalar>::PacketSize>0
+    IsVectorizable = ei_packet_traits<Scalar>::size>1
   };
 };
 
@@ -66,12 +72,15 @@ struct ei_functor_traits<ei_scalar_product_op<Scalar> > {
   */
 template<typename Scalar> struct ei_scalar_min_op EIGEN_EMPTY_STRUCT {
   const Scalar operator() (const Scalar& a, const Scalar& b) const { return std::min(a, b); }
+  template<typename PacketScalar>
+  PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const
+  { return ei_pmin(a,b); }
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_min_op<Scalar> > {
   enum {
     Cost = NumTraits<Scalar>::AddCost,
-    IsVectorizable = NumTraits<Scalar>::PacketSize>0
+    IsVectorizable = ei_packet_traits<Scalar>::size>1
   };
 };
 
@@ -82,12 +91,15 @@ struct ei_functor_traits<ei_scalar_min_op<Scalar> > {
   */
 template<typename Scalar> struct ei_scalar_max_op EIGEN_EMPTY_STRUCT {
   const Scalar operator() (const Scalar& a, const Scalar& b) const { return std::max(a, b); }
+  template<typename PacketScalar>
+  PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const
+  { return ei_pmax(a,b); }
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_max_op<Scalar> > {
   enum {
     Cost = NumTraits<Scalar>::AddCost,
-    IsVectorizable = NumTraits<Scalar>::PacketSize>0
+    IsVectorizable = ei_packet_traits<Scalar>::size>1
   };
 };
 
@@ -100,13 +112,16 @@ struct ei_functor_traits<ei_scalar_max_op<Scalar> > {
   * \sa class CwiseBinaryOp, MatrixBase::operator-
   */
 template<typename Scalar> struct ei_scalar_difference_op EIGEN_EMPTY_STRUCT {
-    const Scalar operator() (const Scalar& a, const Scalar& b) const { return a - b; }
+  const Scalar operator() (const Scalar& a, const Scalar& b) const { return a - b; }
+  template<typename PacketScalar>
+  PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const
+  { return ei_psub(a,b); }
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_difference_op<Scalar> > {
   enum {
     Cost = NumTraits<Scalar>::AddCost,
-    IsVectorizable = NumTraits<Scalar>::PacketSize>0
+    IsVectorizable = ei_packet_traits<Scalar>::size>1
   };
 };
 
@@ -194,15 +209,26 @@ struct ei_functor_traits<ei_scalar_cast_op<Scalar,NewType> >
   *
   * \sa class CwiseUnaryOp, MatrixBase::operator*, MatrixBase::operator/
   */
+template<typename Scalar, bool IsVectorizable = (int(ei_packet_traits<Scalar>::size)>1?true:false) > struct ei_scalar_multiple_op;
+
 template<typename Scalar>
-struct ei_scalar_multiple_op {
-  ei_scalar_multiple_op(const Scalar& other) : m_other(other) {}
+struct ei_scalar_multiple_op<Scalar,true> {
+  typedef typename ei_packet_traits<Scalar>::type PacketScalar;
+  ei_scalar_multiple_op(const Scalar& other) : m_other(ei_pset1(other)) { }
+  Scalar operator() (const Scalar& a) const { return a * ei_pfirst(m_other); }
+  PacketScalar packetOp(const PacketScalar& a) const
+  { return ei_pmul(a, m_other); }
+  const PacketScalar m_other;
+};
+template<typename Scalar>
+struct ei_scalar_multiple_op<Scalar,false> {
+  ei_scalar_multiple_op(const Scalar& other) : m_other(other) { }
   Scalar operator() (const Scalar& a) const { return a * m_other; }
   const Scalar m_other;
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_multiple_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::MulCost, IsVectorizable = false }; };
+{ enum { Cost = NumTraits<Scalar>::MulCost, IsVectorizable = ei_packet_traits<Scalar>::size>1 }; };
 
 template<typename Scalar, bool HasFloatingPoint>
 struct ei_scalar_quotient1_impl {
diff --git a/Eigen/src/Core/Identity.h b/Eigen/src/Core/Identity.h
index 104a06e2f..0783983c1 100644
--- a/Eigen/src/Core/Identity.h
+++ b/Eigen/src/Core/Identity.h
@@ -40,7 +40,7 @@ struct ei_traits<Identity<MatrixType> >
     ColsAtCompileTime = MatrixType::ColsAtCompileTime,
     MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-    Flags = MatrixType::Flags,
+    Flags = MatrixType::Flags & ~VectorizableBit,
     CoeffReadCost = NumTraits<Scalar>::ReadCost
   };
 };
diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h
index cbb1633ad..f17107a65 100644
--- a/Eigen/src/Core/Map.h
+++ b/Eigen/src/Core/Map.h
@@ -47,7 +47,7 @@ struct ei_traits<Map<MatrixType> >
     ColsAtCompileTime = MatrixType::ColsAtCompileTime,
     MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-    Flags = MatrixType::Flags,
+    Flags = MatrixType::Flags & ~VectorizableBit,
     CoeffReadCost = NumTraits<Scalar>::ReadCost
   };
 };
diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h
index b9a47f7b0..da898f031 100644
--- a/Eigen/src/Core/Matrix.h
+++ b/Eigen/src/Core/Matrix.h
@@ -79,7 +79,10 @@ struct ei_traits<Matrix<_Scalar, _Rows, _Cols, _Flags, _MaxRows, _MaxCols> >
     ColsAtCompileTime = _Cols,
     MaxRowsAtCompileTime = _MaxRows,
     MaxColsAtCompileTime = _MaxCols,
-    Flags = _Flags,
+    Flags = (_Flags & ~VectorizableBit) // vectorizable only if fixed-size and the storage-major dimension is a packet multiple
+      | (( (ei_packet_traits<Scalar>::size>1) && (_Rows!=Dynamic) && (_Cols!=Dynamic)
+        && ((_Flags&RowMajorBit) ? ((_Cols%ei_packet_traits<Scalar>::size)==0)
+                                 : ((_Rows%ei_packet_traits<Scalar>::size)==0)) ) ? VectorizableBit  : 0),
     CoeffReadCost = NumTraits<Scalar>::ReadCost
   };
 };
@@ -119,6 +122,23 @@ class Matrix : public MatrixBase<Matrix<_Scalar, _Rows, _Cols,
         return m_storage.data()[row + col * m_storage.rows()];
     }
 
+    PacketScalar _packetCoeff(int row, int col) const
+    {
+      ei_internal_assert(Flags & VectorizableBit);
+      if(Flags & RowMajorBit)
+        return ei_pload(&m_storage.data()[col + row * m_storage.cols()]);
+      else
+        return ei_pload(&m_storage.data()[row + col * m_storage.rows()]);
+    }
+    void _writePacketCoeff(int row, int col, const PacketScalar& x)
+    {
+      ei_internal_assert(Flags & VectorizableBit);
+      if(Flags & RowMajorBit)
+        ei_pstore(&m_storage.data()[col + row * m_storage.cols()], x);
+      else
+        ei_pstore(&m_storage.data()[row + col * m_storage.rows()], x);
+    }
+
   public:
     /** \returns a const pointer to the data array of this matrix */
     const Scalar *data() const
diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index 2bc54701d..62953eded 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -59,6 +59,8 @@ template<typename Derived> class MatrixBase
     //@{
     typedef typename ei_traits<Derived>::Scalar Scalar;
 
+    typedef typename ei_packet_traits<Scalar>::type PacketScalar;
+
     enum {
 
       RowsAtCompileTime = ei_traits<Derived>::RowsAtCompileTime,
@@ -211,6 +213,9 @@ template<typename Derived> class MatrixBase
     Scalar& coeffRef(int index);
     Scalar& operator[](int index);
 
+    PacketScalar packetCoeff(int row, int col) const { return derived()._packetCoeff(row,col); }
+    void writePacketCoeff(int row, int col, const PacketScalar& x) { return derived()._writePacketCoeff(row,col,x); }
+
     const Scalar x() const;
     const Scalar y() const;
     const Scalar z() const;
@@ -484,6 +489,11 @@ template<typename Derived> class MatrixBase
     { return *static_cast<Derived*>(const_cast<MatrixBase*>(this)); }
     //@}
 
+  private:
+    // fallbacks picked up by packetCoeff()/writePacketCoeff() when Derived provides no packet access of its own
+    PacketScalar _packetCoeff(int , int) const { ei_internal_assert(false && "_packetCoeff not defined"); return PacketScalar(); }
+    void _writePacketCoeff(int , int, const PacketScalar&) { ei_internal_assert(false && "_writePacketCoeff not defined"); }
+
 };
 
 #endif // EIGEN_MATRIXBASE_H
diff --git a/Eigen/src/Core/MatrixStorage.h b/Eigen/src/Core/MatrixStorage.h
index 91290ea59..cca4414d3 100644
--- a/Eigen/src/Core/MatrixStorage.h
+++ b/Eigen/src/Core/MatrixStorage.h
@@ -6,12 +6,12 @@
 //
 // Eigen is free software; you can redistribute it and/or
 // modify it under the terms of the GNU Lesser General Public
-// License as published by the Free Software Foundation; either 
+// License as published by the Free Software Foundation; either
 // version 3 of the License, or (at your option) any later version.
 //
 // Alternatively, you can redistribute it and/or
 // modify it under the terms of the GNU General Public License as
-// published by the Free Software Foundation; either version 2 of 
+// published by the Free Software Foundation; either version 2 of
 // the License, or (at your option) any later version.
 //
 // Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
@@ -39,18 +39,28 @@
   */
 template<typename T, int Size, int _Rows, int _Cols> class ei_matrix_storage;
 
-// purely fixed-size matrix.
+template <typename T, int Size, bool Align> struct ei_aligned_array
+{
+  EIGEN_ALIGN_128 T array[Size];
+};
+
+template <typename T, int Size> struct ei_aligned_array<T,Size,false>
+{
+  T array[Size];
+};
+
+// purely fixed-size matrix
 template<typename T, int Size, int _Rows, int _Cols> class ei_matrix_storage
 {
-    T m_data[Size];
+    ei_aligned_array<T,Size,((Size*sizeof(T))%16)==0> m_data;
   public:
     ei_matrix_storage() {}
     ei_matrix_storage(int,int,int) {}
     static int rows(void) {return _Rows;}
     static int cols(void) {return _Cols;}
     void resize(int,int,int) {}
-    const T *data() const { return m_data; }
-    T *data() { return m_data; }
+    const T *data() const { return m_data.array; }
+    T *data() { return m_data.array; }
 };
 
 // dynamic-size matrix with fixed-size storage
diff --git a/Eigen/src/Core/Minor.h b/Eigen/src/Core/Minor.h
index 911ac2151..1b060928f 100644
--- a/Eigen/src/Core/Minor.h
+++ b/Eigen/src/Core/Minor.h
@@ -50,7 +50,7 @@ struct ei_traits<Minor<MatrixType> >
                                 MatrixType::MaxRowsAtCompileTime - 1 : Dynamic,
     MaxColsAtCompileTime = (MatrixType::MaxColsAtCompileTime != Dynamic) ?
                                 MatrixType::MaxColsAtCompileTime - 1 : Dynamic,
-    Flags = MatrixType::Flags,
+    Flags = MatrixType::Flags & ~VectorizableBit,
     CoeffReadCost = MatrixType::CoeffReadCost
   };
 };
diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h
index ba546e86e..137f38ee2 100644
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@@ -57,7 +57,6 @@ template<> struct NumTraits<int>
     ReadCost = 1,
     AddCost = 1,
     MulCost = 1,
-    PacketSize = 4
   };
 };
 
@@ -71,7 +70,6 @@ template<> struct NumTraits<float>
     ReadCost = 1,
     AddCost = 1,
     MulCost = 1,
-    PacketSize = 4
   };
 };
 
@@ -85,7 +83,6 @@ template<> struct NumTraits<double>
     ReadCost = 1,
     AddCost = 1,
     MulCost = 1,
-    PacketSize = 2
   };
 };
 
@@ -99,7 +96,6 @@ template<typename _Real> struct NumTraits<std::complex<_Real> >
     ReadCost = 2,
     AddCost = 2 * NumTraits<Real>::AddCost,
     MulCost = 4 * NumTraits<Real>::MulCost + 2 * NumTraits<Real>::AddCost,
-    PacketSize = 0
   };
 };
 
@@ -113,7 +109,6 @@ template<> struct NumTraits<long long int>
     ReadCost = 1,
     AddCost = 1,
     MulCost = 1,
-    PacketSize = 0
   };
 };
 
@@ -127,7 +122,6 @@ template<> struct NumTraits<long double>
     ReadCost = 1,
     AddCost = 2,
     MulCost = 2,
-    PacketSize = 0
   };
 };
 
@@ -141,7 +135,6 @@ template<> struct NumTraits<bool>
     ReadCost = 1,
     AddCost = 1,
     MulCost = 1,
-    PacketSize = 0
   };
 };
 
diff --git a/Eigen/src/Core/Ones.h b/Eigen/src/Core/Ones.h
index 4cb4bc348..bcc71764c 100644
--- a/Eigen/src/Core/Ones.h
+++ b/Eigen/src/Core/Ones.h
@@ -41,7 +41,7 @@ struct ei_traits<Ones<MatrixType> >
     ColsAtCompileTime = MatrixType::ColsAtCompileTime,
     MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-    Flags = MatrixType::Flags,
+    Flags = MatrixType::Flags & ~VectorizableBit,
     CoeffReadCost = NumTraits<Scalar>::ReadCost
   };
 };
diff --git a/Eigen/src/Core/OperatorEquals.h b/Eigen/src/Core/OperatorEquals.h
index 5529c8313..dff1954cc 100644
--- a/Eigen/src/Core/OperatorEquals.h
+++ b/Eigen/src/Core/OperatorEquals.h
@@ -63,6 +63,48 @@ struct ei_matrix_operator_equals_unroller<Derived1, Derived2, Dynamic>
   static void run(Derived1 &, const Derived2 &) {}
 };
 
+//----
+
+template<typename Derived1, typename Derived2, int UnrollCount>
+struct ei_matrix_operator_equals_packet_unroller
+{
+  enum {
+    index = UnrollCount-ei_packet_traits<typename Derived1::Scalar>::size,
+    row = Derived1::Flags&RowMajorBit ? index / Derived1::ColsAtCompileTime : index % Derived1::RowsAtCompileTime,
+    col = Derived1::Flags&RowMajorBit ? index % Derived1::ColsAtCompileTime : index / Derived1::RowsAtCompileTime
+  };
+
+  static void run(Derived1 &dst, const Derived2 &src)
+  {
+    ei_matrix_operator_equals_packet_unroller<Derived1, Derived2, index>::run(dst, src);
+    dst.writePacketCoeff(row, col, src.packetCoeff(row, col));
+  }
+};
+
+template<typename Derived1, typename Derived2>
+struct ei_matrix_operator_equals_packet_unroller<Derived1, Derived2, 2>
+{
+  static void run(Derived1 &dst, const Derived2 &src)
+  {
+    dst.writePacketCoeff(0, 0, src.packetCoeff(0, 0));
+  }
+};
+
+// end of the unrolling: reached when UnrollCount equals the packet size (e.g. 4 for float/int); nothing left to copy
+template<typename Derived1, typename Derived2>
+struct ei_matrix_operator_equals_packet_unroller<Derived1, Derived2, 0>
+{
+  static void run(Derived1 &, const Derived2 &) {}
+};
+
+template<typename Derived1, typename Derived2>
+struct ei_matrix_operator_equals_packet_unroller<Derived1, Derived2, Dynamic>
+{
+  static void run(Derived1 &, const Derived2 &) {exit(666);}
+};
+
+//----
+
 template<typename Derived1, typename Derived2, int UnrollCount>
 struct ei_vector_operator_equals_unroller
 {
@@ -97,54 +139,17 @@ struct ei_vector_operator_equals_unroller<Derived1, Derived2, Dynamic>
   static void run(Derived1 &, const Derived2 &) {}
 };
 
+template <typename Derived, typename OtherDerived,
+bool Vectorize = (Derived::Flags & OtherDerived::Flags & VectorizableBit)
+              && ((Derived::Flags&RowMajorBit)==(OtherDerived::Flags&RowMajorBit))>
+struct ei_operator_equals_impl;
+
 template<typename Derived>
 template<typename OtherDerived>
 Derived& MatrixBase<Derived>
   ::lazyAssign(const MatrixBase<OtherDerived>& other)
 {
-  const bool unroll = SizeAtCompileTime * OtherDerived::CoeffReadCost <= EIGEN_UNROLLING_LIMIT;
-  if(IsVectorAtCompileTime && OtherDerived::IsVectorAtCompileTime)
-    // copying a vector expression into a vector
-  {
-    ei_assert(size() == other.size());
-    if(unroll)
-      ei_vector_operator_equals_unroller
-        <Derived, OtherDerived,
-        unroll ? SizeAtCompileTime : Dynamic
-        >::run(derived(), other.derived());
-    else
-      for(int i = 0; i < size(); i++)
-        coeffRef(i) = other.coeff(i);
-  }
-  else // copying a matrix expression into a matrix
-  {
-    ei_assert(rows() == other.rows() && cols() == other.cols());
-    if(unroll)
-    {
-      ei_matrix_operator_equals_unroller
-        <Derived, OtherDerived,
-        unroll ? SizeAtCompileTime : Dynamic
-        >::run(derived(), other.derived());
-    }
-    else
-    {
-      if(ColsAtCompileTime == Dynamic || RowsAtCompileTime != Dynamic)
-      {
-        // traverse in column-major order
-        for(int j = 0; j < cols(); j++)
-          for(int i = 0; i < rows(); i++)
-            coeffRef(i, j) = other.coeff(i, j);
-      }
-      else
-      {
-        // traverse in row-major order
-        // in order to allow the compiler to unroll the inner loop
-        for(int i = 0; i < rows(); i++)
-          for(int j = 0; j < cols(); j++)
-            coeffRef(i, j) = other.coeff(i, j);
-      }
-    }
-  }
+  ei_operator_equals_impl<Derived,OtherDerived>::execute(derived(),other.derived());
   return derived();
 }
 
@@ -161,4 +166,87 @@ Derived& MatrixBase<Derived>
     return lazyAssign(other.derived());
 }
 
+template <typename Derived, typename OtherDerived>
+struct ei_operator_equals_impl<Derived, OtherDerived, false>
+{
+  static void execute(Derived & dst, const OtherDerived & src)
+  {
+    const bool unroll = Derived::SizeAtCompileTime * OtherDerived::CoeffReadCost <= EIGEN_UNROLLING_LIMIT;
+    if(Derived::IsVectorAtCompileTime && OtherDerived::IsVectorAtCompileTime)
+      // copying a vector expression into a vector
+    {
+      ei_assert(dst.size() == src.size());
+      if(unroll)
+        ei_vector_operator_equals_unroller
+          <Derived, OtherDerived,
+          unroll ? Derived::SizeAtCompileTime : Dynamic
+          >::run(dst.derived(), src.derived());
+      else
+        for(int i = 0; i < dst.size(); i++)
+          dst.coeffRef(i) = src.coeff(i);
+    }
+    else // copying a matrix expression into a matrix
+    {
+      ei_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+      if(unroll)
+      {
+        ei_matrix_operator_equals_unroller
+          <Derived, OtherDerived,
+          unroll ? Derived::SizeAtCompileTime : Dynamic
+          >::run(dst.derived(), src.derived());
+      }
+      else
+      {
+        if(Derived::ColsAtCompileTime == Dynamic || Derived::RowsAtCompileTime != Dynamic)
+        {
+          // traverse in column-major order
+          for(int j = 0; j < dst.cols(); j++)
+            for(int i = 0; i < dst.rows(); i++)
+              dst.coeffRef(i, j) = src.coeff(i, j);
+        }
+        else
+        {
+          // traverse in row-major order
+          // in order to allow the compiler to unroll the inner loop
+          for(int i = 0; i < dst.rows(); i++)
+            for(int j = 0; j < dst.cols(); j++)
+              dst.coeffRef(i, j) = src.coeff(i, j);
+        }
+      }
+    }
+  }
+};
+
+template <typename Derived, typename OtherDerived>
+struct ei_operator_equals_impl<Derived, OtherDerived, true>
+{
+  static void execute(Derived & dst, const OtherDerived & src)
+  {
+    const bool unroll = Derived::SizeAtCompileTime * OtherDerived::CoeffReadCost <= EIGEN_UNROLLING_LIMIT;
+    ei_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+    if(unroll)
+    {
+      ei_matrix_operator_equals_packet_unroller
+        <Derived, OtherDerived,
+          unroll ? Derived::SizeAtCompileTime : Dynamic>::run
+          (dst.const_cast_derived(), src.derived());
+    }
+    else
+    {
+      if(OtherDerived::Flags&RowMajorBit)
+      {
+        for(int i = 0; i < dst.rows(); i++)
+          for(int j = 0; j < dst.cols(); j+=ei_packet_traits<typename Derived::Scalar>::size)
+            dst.writePacketCoeff(i, j, src.packetCoeff(i, j));
+      }
+      else
+      {
+        for(int j = 0; j < dst.cols(); j++)
+          for(int i = 0; i < dst.rows(); i+=ei_packet_traits<typename Derived::Scalar>::size)
+            dst.writePacketCoeff(i, j, src.packetCoeff(i, j));
+      }
+    }
+  }
+};
+
 #endif // EIGEN_OPERATOREQUALS_H
diff --git a/Eigen/src/Core/PacketMath.h b/Eigen/src/Core/PacketMath.h
new file mode 100644
index 000000000..aab123533
--- /dev/null
+++ b/Eigen/src/Core/PacketMath.h
@@ -0,0 +1,85 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_PACKET_MATH_H
+#define EIGEN_PACKET_MATH_H
+
+#if defined(EIGEN_VECTORIZE) && defined(EIGEN_INTEL_PLATFORM) // the SSE headers are only included when both are defined
+
+template<> struct ei_packet_traits<float>  { typedef __m128  type; enum {size=4}; };
+template<> struct ei_packet_traits<double> { typedef __m128d type; enum {size=2}; };
+template<> struct ei_packet_traits<int>    { typedef __m128i type; enum {size=4}; };
+
+inline __m128  ei_padd(const __m128&  a, const __m128&  b) { return _mm_add_ps(a,b); }
+inline __m128d ei_padd(const __m128d& a, const __m128d& b) { return _mm_add_pd(a,b); }
+inline __m128i ei_padd(const __m128i& a, const __m128i& b) { return _mm_add_epi32(a,b); }
+
+inline __m128  ei_psub(const __m128&  a, const __m128&  b) { return _mm_sub_ps(a,b); }
+inline __m128d ei_psub(const __m128d& a, const __m128d& b) { return _mm_sub_pd(a,b); }
+inline __m128i ei_psub(const __m128i& a, const __m128i& b) { return _mm_sub_epi32(a,b); }
+
+inline __m128  ei_pmul(const __m128&  a, const __m128&  b) { return _mm_mul_ps(a,b); }
+inline __m128d ei_pmul(const __m128d& a, const __m128d& b) { return _mm_mul_pd(a,b); }
+inline __m128i ei_pmul(const __m128i& a, const __m128i& b) { return _mm_unpacklo_epi32(_mm_shuffle_epi32(_mm_mul_epu32(a,b), _MM_SHUFFLE(0,0,2,0)), _mm_shuffle_epi32(_mm_mul_epu32(_mm_srli_si128(a,4), _mm_srli_si128(b,4)), _MM_SHUFFLE(0,0,2,0))); } // SSE2 has no 32-bit mullo: multiply even and odd lanes separately, keep the low 32 bits, re-interleave
+
+inline __m128  ei_pmin(const __m128&  a, const __m128&  b) { return _mm_min_ps(a,b); }
+inline __m128d ei_pmin(const __m128d& a, const __m128d& b) { return _mm_min_pd(a,b); }
+inline __m128i ei_pmin(const __m128i& a, const __m128i& b)
+{
+  __m128i mask = _mm_cmplt_epi32(a,b);
+  return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
+}
+
+inline __m128  ei_pmax(const __m128&  a, const __m128&  b) { return _mm_max_ps(a,b); }
+inline __m128d ei_pmax(const __m128d& a, const __m128d& b) { return _mm_max_pd(a,b); }
+inline __m128i ei_pmax(const __m128i& a, const __m128i& b)
+{
+  __m128i mask = _mm_cmpgt_epi32(a,b);
+  return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
+}
+
+inline __m128  ei_pload(const float*   from) { return _mm_load_ps(from); }
+inline __m128d ei_pload(const double*  from) { return _mm_load_pd(from); }
+inline __m128i ei_pload(const int*     from) { return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); } // takes the scalar pointer, like the float/double overloads; must be 16-byte aligned
+
+inline __m128  ei_pload1(const float*  from) { return _mm_load1_ps(from); }
+inline __m128d ei_pload1(const double* from) { return _mm_load1_pd(from); }
+inline __m128i ei_pload1(const int*    from) { return _mm_set1_epi32(*from); }
+
+inline __m128  ei_pset1(const float&  from) { return _mm_set1_ps(from); }
+inline __m128d ei_pset1(const double& from) { return _mm_set1_pd(from); }
+inline __m128i ei_pset1(const int&    from) { return _mm_set1_epi32(from); }
+
+inline void ei_pstore(float*   to, const __m128&  from) { _mm_store_ps(to, from); }
+inline void ei_pstore(double*  to, const __m128d& from) { _mm_store_pd(to, from); }
+inline void ei_pstore(int*     to, const __m128i& from) { _mm_store_si128(reinterpret_cast<__m128i*>(to), from); } // takes the scalar pointer, like the float/double overloads; must be 16-byte aligned
+
+inline float  ei_pfirst(const __m128&  a) { return _mm_cvtss_f32(a); }
+inline double ei_pfirst(const __m128d& a) { return _mm_cvtsd_f64(a); }
+inline int    ei_pfirst(const __m128i& a) { return _mm_cvtsi128_si32(a); }
+
+#endif
+
+#endif // EIGEN_PACKET_MATH_H
+
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index 7f149075b..cfb5d3e10 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -60,6 +60,38 @@ struct ei_product_unroller<Index, 0, Lhs, Rhs>
   static void run(int, int, const Lhs&, const Rhs&, typename Lhs::Scalar&) {}
 };
 
+
+template<bool RowMajor, int Index, int Size, typename Lhs, typename Rhs, typename PacketScalar>
+struct ei_packet_product_unroller // general step: terms 0..Index-1 come from the recursion, then term Index is added
+{
+  static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
+  {
+    ei_packet_product_unroller<RowMajor, Index-1, Size, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, res);
+    if (!RowMajor) // packet read from lhs, scalar rhs(Index,col) replicated into every lane
+      res = ei_padd(res, ei_pmul(lhs.packetCoeff(row, Index), ei_pset1(rhs.coeff(Index, col))));
+    else           // scalar lhs(row,Index) replicated into every lane, packet read from rhs
+      res = ei_padd(res, ei_pmul(ei_pset1(lhs.coeff(row, Index)), rhs.packetCoeff(Index, col)));
+  }
+};
+
+template<bool RowMajor, int Size, typename Lhs, typename Rhs, typename PacketScalar>
+struct ei_packet_product_unroller<RowMajor, 0, Size, Lhs, Rhs, PacketScalar> // end of recursion: term 0
+{
+  static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
+  {
+    if (!RowMajor) // res is assigned here rather than accumulated, so its incoming value is not read
+      res = ei_pmul(lhs.packetCoeff(row, 0), ei_pset1(rhs.coeff(0, col)));
+    else
+      res = ei_pmul(ei_pset1(lhs.coeff(row, 0)), rhs.packetCoeff(0, col));
+  }
+};
+
+template<bool RowMajor, int Index, typename Lhs, typename Rhs, typename PacketScalar>
+struct ei_packet_product_unroller<RowMajor, Index, Dynamic, Lhs, Rhs, PacketScalar> // Size == Dynamic: unrolling is not requested
+{
+  static void run(int, int, const Lhs&, const Rhs&, PacketScalar&) {} // intentionally a no-op: leaves the result packet untouched
+};
+
 /** \class Product
   *
   * \brief Expression of the product of two matrices
@@ -97,11 +129,14 @@ struct ei_traits<Product<Lhs, Rhs, EvalMode> >
     ColsAtCompileTime = Rhs::ColsAtCompileTime,
     MaxRowsAtCompileTime = Lhs::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = Rhs::MaxColsAtCompileTime,
-    Flags = ( (RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic)
+    Flags = (( (RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic)
               ? (unsigned int)(LhsFlags | RhsFlags)
               : (unsigned int)(LhsFlags | RhsFlags) & ~LargeBit )
           | EvalBeforeAssigningBit
-          | (ei_product_eval_mode<Lhs, Rhs>::value == (int)CacheOptimal ? EvalBeforeNestingBit : 0),
+          | (ei_product_eval_mode<Lhs, Rhs>::value == (int)CacheOptimal ? EvalBeforeNestingBit : 0))
+          & (~(RowMajorBit|VectorizableBit)) // clear the inherited storage-order and vectorization bits; both are recomputed below
+            | (((!(Lhs::Flags&RowMajorBit)) && (Lhs::Flags&VectorizableBit)) ? VectorizableBit // lhs column-major and vectorizable; must be !(x&bit), since !x&bit parses as (!x)&bit
+              : (((Rhs::Flags&RowMajorBit) && (Rhs::Flags&VectorizableBit)) ? (RowMajorBit|VectorizableBit) : EIGEN_DEFAULT_MATRIX_STORAGE_ORDER)), // else rhs row-major and vectorizable; otherwise fall back to EIGEN_DEFAULT_MATRIX_STORAGE_ORDER
     CoeffReadCost
       = Lhs::ColsAtCompileTime == Dynamic
       ? Dynamic
@@ -157,6 +192,36 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm
       return res;
     }
 
+    PacketScalar _packetCoeff(int row, int col) const EIGEN_ALWAYS_INLINE
+    {
+      // Evaluates one packet of the product at (row, col) by accumulating over the inner dimension.
+      // Size is Dynamic when the inner dimension exceeds the limit; that specialization's run() is a no-op.
+      typedef ei_packet_product_unroller<Flags&RowMajorBit, Lhs::ColsAtCompileTime-1,
+                Lhs::ColsAtCompileTime <= EIGEN_UNROLLING_LIMIT ? Lhs::ColsAtCompileTime : Dynamic,
+                Lhs, Rhs, PacketScalar> Unroller;
+      PacketScalar res;
+      if(Lhs::ColsAtCompileTime <= EIGEN_UNROLLING_LIMIT)
+      {
+        // short inner dimension: the sum is expanded at compile time
+        Unroller::run(row, col, m_lhs, m_rhs, res);
+      }
+      else if (Flags&RowMajorBit)
+      {
+        // RowMajorBit set on the product: replicate lhs(row,k), multiply by the rhs packet at (k,col)
+        res = ei_pmul(ei_pset1(m_lhs.coeff(row, 0)), m_rhs.packetCoeff(0, col));
+        for(int k = 1; k < m_lhs.cols(); ++k)
+          res = ei_padd(res, ei_pmul(ei_pset1(m_lhs.coeff(row, k)), m_rhs.packetCoeff(k, col)));
+      }
+      else
+      {
+        // RowMajorBit clear: lhs packet at (row,k) times the replicated scalar rhs(k,col)
+        res = ei_pmul(m_lhs.packetCoeff(row, 0), ei_pset1(m_rhs.coeff(0, col)));
+        for(int k = 1; k < m_lhs.cols(); ++k)
+          res = ei_padd(res, ei_pmul(m_lhs.packetCoeff(row, k), ei_pset1(m_rhs.coeff(k, col))));
+      }
+      return res;
+    }
+
   protected:
     const LhsXprCopy m_lhs;
     const RhsXprCopy m_rhs;
diff --git a/Eigen/src/Core/Random.h b/Eigen/src/Core/Random.h
index 7ac3633fe..4d6a21da4 100644
--- a/Eigen/src/Core/Random.h
+++ b/Eigen/src/Core/Random.h
@@ -41,7 +41,7 @@ struct ei_traits<Random<MatrixType> >
     ColsAtCompileTime = ei_traits<MatrixType>::ColsAtCompileTime,
     MaxRowsAtCompileTime = ei_traits<MatrixType>::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = ei_traits<MatrixType>::MaxColsAtCompileTime,
-    Flags = ei_traits<MatrixType>::Flags | EvalBeforeNestingBit,
+    Flags = (ei_traits<MatrixType>::Flags | EvalBeforeNestingBit) & ~VectorizableBit,
     CoeffReadCost = 2 * NumTraits<Scalar>::MulCost // FIXME: arbitrary value
   };
 };
diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h
index f7fdbc077..12ceedd76 100644
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -94,9 +94,9 @@ struct ei_traits<PartialRedux<Direction, BinaryOp, MatrixType> >
     ColsAtCompileTime = Direction==Horizontal ? 1 : MatrixType::ColsAtCompileTime,
     MaxRowsAtCompileTime = Direction==Vertical   ? 1 : MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = Direction==Horizontal ? 1 : MatrixType::MaxColsAtCompileTime,
-    Flags = (RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic)
+    Flags = ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic)
           ? (unsigned int)_MatrixTypeXprCopy::Flags
-          : (unsigned int)_MatrixTypeXprCopy::Flags & ~LargeBit,
+          : (unsigned int)_MatrixTypeXprCopy::Flags & ~LargeBit) & ~VectorizableBit,
     TraversalSize = Direction==Vertical ? RowsAtCompileTime : ColsAtCompileTime,
     CoeffReadCost = TraversalSize * _MatrixTypeXprCopy::CoeffReadCost
                   + (TraversalSize - 1) * ei_functor_traits<BinaryOp>::Cost
diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h
index f2e547225..6710f3092 100644
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@@ -77,6 +77,16 @@ template<typename MatrixType> class Transpose
       return m_matrix.coeff(col, row);
     }
 
+    PacketScalar _packetCoeff(int row, int col) const // packet read, forwarded to the nested expression
+    {
+      return m_matrix.packetCoeff(col, row); // row and col are swapped, as in the scalar coeff accessor just above
+    }
+
+    void _writePacketCoeff(int row, int col, const PacketScalar& x) // packet write, forwarded to the nested expression
+    {
+      m_matrix.const_cast_derived().writePacketCoeff(col, row, x); // swapped indices; m_matrix is a const member, presumably why const_cast_derived() is needed -- confirm
+    }
+
   protected:
     const typename MatrixType::XprCopy m_matrix;
 };
diff --git a/Eigen/src/Core/Util.h b/Eigen/src/Core/Util.h
index ad8a15b07..e2c95bc53 100644
--- a/Eigen/src/Core/Util.h
+++ b/Eigen/src/Core/Util.h
@@ -1,6 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra. Eigen itself is part of the KDE project.
 //
+// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
 // Copyright (C) 2006-2008 Benoit Jacob <jacob@math.jussieu.fr>
 //
 // Eigen is free software; you can redistribute it and/or
@@ -51,11 +52,13 @@ using Eigen::MatrixBase;
 #define EIGEN_NO_DEBUG
 #endif
 
+#ifndef ei_assert // keep any ei_assert defined before this header, e.g. the one test/main.h provides
 #ifdef EIGEN_NO_DEBUG
 #define ei_assert(x)
 #else
 #define ei_assert(x) assert(x)
 #endif
+#endif // ei_assert
 
 #ifdef EIGEN_INTERNAL_DEBUGGING
 #define ei_internal_assert(x) ei_assert(x);
@@ -79,6 +82,12 @@ using Eigen::MatrixBase;
 #define EIGEN_ALWAYS_INLINE
 #endif
 
+#ifdef __GNUC__ // GNU attribute syntax: request 16-byte (128-bit) alignment
+#define EIGEN_ALIGN_128 __attribute__ ((aligned(16)))
+#else // any other compiler: expands to nothing, so no alignment is requested
+#define EIGEN_ALIGN_128
+#endif
+
 #define EIGEN_INHERIT_ASSIGNMENT_OPERATOR(Derived, Op) \
 template<typename OtherDerived> \
 Derived& operator Op(const MatrixBase<OtherDerived>& other) \
@@ -107,6 +116,7 @@ EIGEN_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, /=)
 #define _EIGEN_GENERIC_PUBLIC_INTERFACE(Derived, BaseClass) \
 typedef BaseClass Base; \
 typedef typename Eigen::ei_traits<Derived>::Scalar Scalar; \
+typedef typename Base::PacketScalar PacketScalar; \
 typedef typename Eigen::ei_xpr_copy<Derived>::type XprCopy; \
 typedef typename Eigen::ei_eval<Derived>::type Eval; \
 enum { RowsAtCompileTime = Base::RowsAtCompileTime, \
@@ -132,7 +142,11 @@ const unsigned int RowMajorBit = 0x1;
 const unsigned int EvalBeforeNestingBit = 0x2;
 const unsigned int EvalBeforeAssigningBit = 0x4;
 const unsigned int LargeBit = 0x8;
+#ifdef EIGEN_VECTORIZE
 const unsigned int VectorizableBit = 0x10;
+#else // vectorization not requested: a zero mask makes every (Flags & VectorizableBit) test false
+const unsigned int VectorizableBit = 0x0;
+#endif // EIGEN_VECTORIZE
 
 
 enum { ConditionalJumpCost = 5 };
diff --git a/Eigen/src/Core/Zero.h b/Eigen/src/Core/Zero.h
index 15108b794..1daffd1c4 100644
--- a/Eigen/src/Core/Zero.h
+++ b/Eigen/src/Core/Zero.h
@@ -41,7 +41,7 @@ struct ei_traits<Zero<MatrixType> >
     ColsAtCompileTime = MatrixType::ColsAtCompileTime,
     MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-    Flags = MatrixType::Flags,
+    Flags = MatrixType::Flags & ~VectorizableBit,
     CoeffReadCost = NumTraits<Scalar>::ReadCost
   };
 };
diff --git a/test/main.h b/test/main.h
index 388a8973d..67b5b456f 100644
--- a/test/main.h
+++ b/test/main.h
@@ -55,10 +55,6 @@
   #define EI_PP_MAKE_STRING2(S) #S
   #define EI_PP_MAKE_STRING(S) EI_PP_MAKE_STRING2(S)
 
-  #ifdef assert
-  #undef assert
-  #endif
-
   // If EIGEN_DEBUG_ASSERTS is defined and if no assertion is raised while
   // one should have been, then the list of excecuted assertions is printed out.
   //
@@ -74,7 +70,7 @@
       static std::vector<std::string> ei_assert_list;
     }
 
-    #define assert(a)                       \
+    #define ei_assert(a)                       \
       if( (!(a)) && (!no_more_assert) )     \
       {                                     \
         Eigen::no_more_assert = true;       \
@@ -103,7 +99,7 @@
 
   #else // EIGEN_DEBUG_ASSERTS
 
-    #define assert(a)                       \
+    #define ei_assert(a)                       \
       if( (!(a)) && (!no_more_assert) )     \
       {                                     \
         Eigen::no_more_assert = true;       \