diff --git a/CMakeLists.txt b/CMakeLists.txt
index b533e3b66..95a8a2311 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,7 +7,7 @@ SET(CMAKE_INCLUDE_CURRENT_DIR ON)
 
 IF(CMAKE_COMPILER_IS_GNUCXX)
   IF(CMAKE_SYSTEM_NAME MATCHES Linux)
-    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O1 -g -pedantic -Wnon-virtual-dtor -Wno-long-long -ansi -Wundef -Wcast-align -Wchar-subscripts -Wall -W -Wpointer-arith -Wwrite-strings -Wformat-security -fno-exceptions -fno-check-new -fno-common -fstrict-aliasing")
+    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pedantic -Wnon-virtual-dtor -Wno-long-long -ansi -Wundef -Wcast-align -Wchar-subscripts -Wall -W -Wpointer-arith -Wwrite-strings -Wformat-security -fno-exceptions -fno-check-new -fno-common -fstrict-aliasing")
     IF(TEST_OPENMP)
       SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
       MESSAGE("Enabling OpenMP in tests/examples")
diff --git a/Eigen/Core b/Eigen/Core
index 3e1b5184d..c81341103 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -35,7 +35,9 @@ namespace Eigen {
 #include "src/Core/CwiseBinaryOp.h"
 #include "src/Core/CwiseUnaryOp.h"
 #include "src/Core/CwiseNullaryOp.h"
-#include "src/Core/ProductWIP.h"
+#include "src/Core/InverseProduct.h"
+#include "src/Core/CacheFriendlyProduct.h"
+#include "src/Core/Product.h"
 #include "src/Core/Block.h"
 #include "src/Core/Minor.h"
 #include "src/Core/Transpose.h"
@@ -51,7 +53,6 @@ namespace Eigen {
 #include "src/Core/CommaInitializer.h"
 #include "src/Core/Extract.h"
 #include "src/Core/Part.h"
-#include "src/Core/InverseProduct.h"
 
 } // namespace Eigen
 
diff --git a/Eigen/src/Core/CwiseUnaryOp.h b/Eigen/src/Core/CwiseUnaryOp.h
index 26197b369..8d2737e12 100644
--- a/Eigen/src/Core/CwiseUnaryOp.h
+++ b/Eigen/src/Core/CwiseUnaryOp.h
@@ -118,7 +118,7 @@ template<typename Derived>
 inline const CwiseUnaryOp<ei_scalar_opposite_op<typename ei_traits<Derived>::Scalar>,Derived>
 MatrixBase<Derived>::operator-() const
 {
-  return CwiseUnaryOp<ei_scalar_opposite_op<Scalar>, Derived>(derived());
+  return derived();
 }
 
 /** \returns an expression of the coefficient-wise absolute value of \c *this
@@ -127,7 +127,7 @@ template<typename Derived>
 inline const CwiseUnaryOp<ei_scalar_abs_op<typename ei_traits<Derived>::Scalar>,Derived>
 MatrixBase<Derived>::cwiseAbs() const
 {
-  return CwiseUnaryOp<ei_scalar_abs_op<Scalar>,Derived>(derived());
+  return derived();
 }
 
 /** \returns an expression of the coefficient-wise squared absolute value of \c *this
@@ -136,7 +136,7 @@ template<typename Derived>
 inline const CwiseUnaryOp<ei_scalar_abs2_op<typename ei_traits<Derived>::Scalar>,Derived>
 MatrixBase<Derived>::cwiseAbs2() const
 {
-  return CwiseUnaryOp<ei_scalar_abs2_op<Scalar>,Derived>(derived());
+  return derived();
 }
 
 /** \returns an expression of the complex conjugate of *this.
@@ -146,7 +146,7 @@ template<typename Derived>
 inline const CwiseUnaryOp<ei_scalar_conjugate_op<typename ei_traits<Derived>::Scalar>, Derived>
 MatrixBase<Derived>::conjugate() const
 {
-  return CwiseUnaryOp<ei_scalar_conjugate_op<Scalar>, Derived>(derived());
+  return derived();
 }
 
 /** \returns an expression of *this with the \a Scalar type casted to
@@ -161,7 +161,7 @@ template<typename NewType>
 inline const CwiseUnaryOp<ei_scalar_cast_op<typename ei_traits<Derived>::Scalar, NewType>, Derived>
 MatrixBase<Derived>::cast() const
 {
-  return CwiseUnaryOp<ei_scalar_cast_op<Scalar, NewType>, Derived>(derived());
+  return derived();
 }
 
 /** \relates MatrixBase */
@@ -201,7 +201,7 @@ template<typename Derived>
 inline const CwiseUnaryOp<ei_scalar_sqrt_op<typename ei_traits<Derived>::Scalar>, Derived>
 MatrixBase<Derived>::cwiseSqrt() const
 {
-  return CwiseUnaryOp<ei_scalar_sqrt_op<Scalar>, Derived>(derived());
+  return derived();
 }
 
 /** \returns an expression of the coefficient-wise exponential of *this. */
@@ -209,7 +209,7 @@ template<typename Derived>
 inline const CwiseUnaryOp<ei_scalar_exp_op<typename ei_traits<Derived>::Scalar>, Derived>
 MatrixBase<Derived>::cwiseExp() const
 {
-  return CwiseUnaryOp<ei_scalar_exp_op<Scalar>, Derived>(derived());
+  return derived();
 }
 
 /** \returns an expression of the coefficient-wise logarithm of *this. */
@@ -217,7 +217,7 @@ template<typename Derived>
 inline const CwiseUnaryOp<ei_scalar_log_op<typename ei_traits<Derived>::Scalar>, Derived>
 MatrixBase<Derived>::cwiseLog() const
 {
-  return CwiseUnaryOp<ei_scalar_log_op<Scalar>, Derived>(derived());
+  return derived();
 }
 
 /** \returns an expression of the coefficient-wise cosine of *this. */
@@ -225,7 +225,7 @@ template<typename Derived>
 inline const CwiseUnaryOp<ei_scalar_cos_op<typename ei_traits<Derived>::Scalar>, Derived>
 MatrixBase<Derived>::cwiseCos() const
 {
-  return CwiseUnaryOp<ei_scalar_cos_op<Scalar>, Derived>(derived());
+  return derived();
 }
 
 /** \returns an expression of the coefficient-wise sine of *this. */
@@ -233,10 +233,10 @@ template<typename Derived>
 inline const CwiseUnaryOp<ei_scalar_sin_op<typename ei_traits<Derived>::Scalar>, Derived>
 MatrixBase<Derived>::cwiseSin() const
 {
-  return CwiseUnaryOp<ei_scalar_sin_op<Scalar>, Derived>(derived());
+  return derived();
 }
 
-/** \relates MatrixBase */
+/** \returns an expression of the coefficient-wise power of *this to the given exponent. */
 template<typename Derived>
 inline const CwiseUnaryOp<ei_scalar_pow_op<typename ei_traits<Derived>::Scalar>, Derived>
 MatrixBase<Derived>::cwisePow(const Scalar& exponent) const
diff --git a/Eigen/src/Core/DiagonalMatrix.h b/Eigen/src/Core/DiagonalMatrix.h
index bd420511f..0581c669c 100644
--- a/Eigen/src/Core/DiagonalMatrix.h
+++ b/Eigen/src/Core/DiagonalMatrix.h
@@ -95,7 +95,7 @@ template<typename Derived>
 inline const DiagonalMatrix<Derived>
 MatrixBase<Derived>::asDiagonal() const
 {
-  return DiagonalMatrix<Derived>(derived());
+  return derived();
 }
 
 /** \returns true if *this is approximately equal to a diagonal matrix,
diff --git a/Eigen/src/Core/Flagged.h b/Eigen/src/Core/Flagged.h
index 9167c8a97..f287abee4 100644
--- a/Eigen/src/Core/Flagged.h
+++ b/Eigen/src/Core/Flagged.h
@@ -33,7 +33,7 @@
   * \param Added the flags added to the expression
   * \param Removed the flags removed from the expression (has priority over Added).
   *
-  * This class represents an expression whose flags have been modified
+  * This class represents an expression whose flags have been modified.
   * It is the return type of MatrixBase::flagged()
   * and most of the time this is the only way it is used.
   *
@@ -94,7 +94,11 @@ template<typename ExpressionType, unsigned int Added, unsigned int Removed> clas
     }
 
   protected:
-    const ExpressionType m_matrix;
+    const typename ei_meta_if<
+      Added & ~Removed & NestByValueBit,
+      ExpressionType,
+      typename ExpressionType::Nested
+    >::ret m_matrix;
 };
 
 /** \returns an expression of *this with added flags
@@ -121,7 +125,7 @@ MatrixBase<Derived>::lazy() const
   */
 template<typename Derived>
 inline const Flagged<Derived, NestByValueBit, 0>
-MatrixBase<Derived>::temporary() const
+MatrixBase<Derived>::nestByValue() const
 {
   return derived();
 }
diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index 18525a5d1..1a634ce37 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -452,7 +452,7 @@ template<typename Derived> class MatrixBase
     template<unsigned int Added>
     const Flagged<Derived, Added, 0> marked() const;
     const Flagged<Derived, 0, EvalBeforeNestingBit | EvalBeforeAssigningBit> lazy() const;
-    const Flagged<Derived, NestByValueBit, 0> temporary() const;
+    const Flagged<Derived, NestByValueBit, 0> nestByValue() const;
 
     /** \returns number of elements to skip to pass from one row (resp. column) to another
       * for a row-major (resp. column-major) matrix.
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index 5d3e99281..15867d704 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -41,7 +41,7 @@ template<int Size, typename Lhs, typename Rhs>
 struct ei_product_unroller<0, Size, Lhs, Rhs>
 {
   inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs,
-                               typename Lhs::Scalar &res)
+                  typename Lhs::Scalar &res)
   {
     res = lhs.coeff(row, 0) * rhs.coeff(0, col);
   }
@@ -60,12 +60,6 @@ struct ei_product_unroller<Index, 0, Lhs, Rhs>
   inline static void run(int, int, const Lhs&, const Rhs&, typename Lhs::Scalar&) {}
 };
 
-template<typename Lhs, typename Rhs>
-struct ei_product_unroller<0, Dynamic, Lhs, Rhs>
-{
-  static void run(int, int, const Lhs&, const Rhs&, typename Lhs::Scalar&) {}
-};
-
 template<bool RowMajor, int Index, int Size, typename Lhs, typename Rhs, typename PacketScalar>
 struct ei_packet_product_unroller;
 
@@ -119,12 +113,6 @@ struct ei_packet_product_unroller<false, Index, Dynamic, Lhs, Rhs, PacketScalar>
   inline static void run(int, int, const Lhs&, const Rhs&, PacketScalar&) {}
 };
 
-template<typename Lhs, typename Rhs, typename PacketScalar>
-struct ei_packet_product_unroller<false, 0, Dynamic, Lhs, Rhs, PacketScalar>
-{
-  static void run(int, int, const Lhs&, const Rhs&, PacketScalar&) {}
-};
-
 template<typename Product, bool RowMajor = true> struct ProductPacketCoeffImpl {
   inline static typename Product::PacketScalar execute(const Product& product, int row, int col)
   { return product._packetCoeffRowMajor(row,col); }
@@ -153,18 +141,74 @@ template<typename Lhs, typename Rhs> struct ei_product_eval_mode
 {
   enum{ value =  Lhs::MaxRowsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
               && Rhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
-              && (!( (Lhs::Flags&RowMajorBit) && ((Rhs::Flags&RowMajorBit) ^ RowMajorBit)))
+              && Lhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
               ? CacheFriendlyProduct : NormalProduct };
 };
 
+template<typename T> class ei_product_eval_to_column_major
+{
+    typedef typename ei_traits<T>::Scalar _Scalar;
+    enum {_MaxRows = ei_traits<T>::MaxRowsAtCompileTime,
+          _MaxCols = ei_traits<T>::MaxColsAtCompileTime,
+          _Flags = ei_traits<T>::Flags
+    };
+
+  public:
+    typedef Matrix<_Scalar,
+                  ei_traits<T>::RowsAtCompileTime,
+                  ei_traits<T>::ColsAtCompileTime,
+                  ei_corrected_matrix_flags<_Scalar, ei_size_at_compile_time<_MaxRows,_MaxCols>::ret, _Flags>::ret & ~RowMajorBit,
+                  ei_traits<T>::MaxRowsAtCompileTime,
+                  ei_traits<T>::MaxColsAtCompileTime> type;
+};
+
+template<typename T, int n=1> struct ei_product_nested_rhs
+{
+  typedef typename ei_meta_if<
+    (ei_traits<T>::Flags & NestByValueBit) && (!(ei_traits<T>::Flags & RowMajorBit)) && (int(ei_traits<T>::Flags) & DirectAccessBit),
+    T,
+    typename ei_meta_if<
+        ((ei_traits<T>::Flags & EvalBeforeNestingBit)
+      || (ei_traits<T>::Flags & RowMajorBit)
+      || (!(ei_traits<T>::Flags & DirectAccessBit))
+      || (n+1) * (NumTraits<typename ei_traits<T>::Scalar>::ReadCost) < (n-1) * T::CoeffReadCost),
+      typename ei_product_eval_to_column_major<T>::type,
+      const T&
+    >::ret
+  >::ret type;
+};
+
+template<typename T, int n=1> struct ei_product_nested_lhs
+{
+  typedef typename ei_meta_if<
+    ei_traits<T>::Flags & NestByValueBit && (int(ei_traits<T>::Flags) & DirectAccessBit),
+    T,
+    typename ei_meta_if<
+         int(ei_traits<T>::Flags) & EvalBeforeNestingBit
+      || (!(int(ei_traits<T>::Flags) & DirectAccessBit))
+      || (n+1) * int(NumTraits<typename ei_traits<T>::Scalar>::ReadCost) < (n-1) * int(T::CoeffReadCost),
+      typename ei_eval<T>::type,
+      const T&
+    >::ret
+  >::ret type;
+};
+
 template<typename Lhs, typename Rhs, int EvalMode>
 struct ei_traits<Product<Lhs, Rhs, EvalMode> >
 {
   typedef typename Lhs::Scalar Scalar;
-  typedef typename ei_nested<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
-  typedef typename ei_nested<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
-  typedef typename ei_unref<LhsNested>::type _LhsNested;
-  typedef typename ei_unref<RhsNested>::type _RhsNested;
+  // the cache friendly product evals lhs once only
+  // FIXME what to do if we chose to dynamically call the normal product from the cache friendly one for small matrices ?
+  typedef typename ei_meta_if<EvalMode==CacheFriendlyProduct,
+      typename ei_product_nested_lhs<Lhs,1>::type,
+      typename ei_nested<Lhs,Rhs::ColsAtCompileTime>::type>::ret LhsNested;
+
+  // NOTE that rhs must be ColumnMajor, so we might need a special nested type calculation
+  typedef typename ei_meta_if<EvalMode==CacheFriendlyProduct,
+      typename ei_product_nested_rhs<Rhs,Lhs::RowsAtCompileTime>::type,
+      typename ei_nested<Rhs,Lhs::RowsAtCompileTime>::type>::ret RhsNested;
+  typedef typename ei_unconst<typename ei_unref<LhsNested>::type>::type _LhsNested;
+  typedef typename ei_unconst<typename ei_unref<RhsNested>::type>::type _RhsNested;
   enum {
     LhsCoeffReadCost = _LhsNested::CoeffReadCost,
     RhsCoeffReadCost = _RhsNested::CoeffReadCost,
@@ -174,6 +218,8 @@ struct ei_traits<Product<Lhs, Rhs, EvalMode> >
     ColsAtCompileTime = Rhs::ColsAtCompileTime,
     MaxRowsAtCompileTime = Lhs::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = Rhs::MaxColsAtCompileTime,
+    // the vectorization flags are only used by the normal product,
+    // the other one is always vectorized !
     _RhsVectorizable = (RhsFlags & RowMajorBit) && (RhsFlags & VectorizableBit) && (ColsAtCompileTime % ei_packet_traits<Scalar>::size == 0),
     _LhsVectorizable = (!(LhsFlags & RowMajorBit)) && (LhsFlags & VectorizableBit) && (RowsAtCompileTime % ei_packet_traits<Scalar>::size == 0),
     _Vectorizable = (_LhsVectorizable || _RhsVectorizable) ? 1 : 0,
@@ -207,6 +253,10 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm
     typedef typename ei_traits<Product>::_LhsNested _LhsNested;
     typedef typename ei_traits<Product>::_RhsNested _RhsNested;
 
+    enum {
+      PacketSize = ei_packet_traits<Scalar>::size
+    };
+
     inline Product(const Lhs& lhs, const Rhs& rhs)
       : m_lhs(lhs), m_rhs(rhs)
     {
@@ -214,12 +264,12 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm
     }
 
     /** \internal */
-    template<typename DestDerived, int AlignedMode>
-    void _cacheOptimalEval(DestDerived& res, ei_meta_false) const;
-    #ifdef EIGEN_VECTORIZE
-    template<typename DestDerived, int AlignedMode>
-    void _cacheOptimalEval(DestDerived& res, ei_meta_true) const;
-    #endif
+    template<typename DestDerived>
+    void _cacheFriendlyEval(DestDerived& res) const;
+
+    /** \internal */
+    template<typename DestDerived>
+    void _cacheFriendlyEvalAndAdd(DestDerived& res) const;
 
   private:
 
@@ -252,7 +302,7 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm
       if(Lhs::ColsAtCompileTime <= EIGEN_UNROLLING_LIMIT)
       {
         PacketScalar res;
-        ei_packet_product_unroller<Flags&RowMajorBit, Lhs::ColsAtCompileTime-1,
+        ei_packet_product_unroller<Flags&RowMajorBit ? true : false, Lhs::ColsAtCompileTime-1,
                             Lhs::ColsAtCompileTime <= EIGEN_UNROLLING_LIMIT
                               ? Lhs::ColsAtCompileTime : Dynamic,
                             _LhsNested, _RhsNested, PacketScalar>
@@ -279,16 +329,10 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm
       for(int i = 1; i < m_lhs.cols(); i++)
         res =  ei_pmadd(m_lhs.template packetCoeff<Aligned>(row, i), ei_pset1(m_rhs.coeff(i, col)), res);
       return res;
-//       const PacketScalar tmp[4];
-//       ei_punpack(m_rhs.packetCoeff(0,col), tmp);
-//
-//       return
-//         ei_pmadd(m_lhs.packetCoeff(row, 0), tmp[0],
-//           ei_pmadd(m_lhs.packetCoeff(row, 1), tmp[1],
-//             ei_pmadd(m_lhs.packetCoeff(row, 2), tmp[2]
-//               ei_pmul(m_lhs.packetCoeff(row, 3), tmp[3]))));
     }
 
+    template<typename Lhs_, typename Rhs_, int EvalMode_, typename DestDerived_, bool DirectAccess_>
+    friend struct ei_cache_friendly_selector;
 
   protected:
     const LhsNested m_lhs;
@@ -296,9 +340,6 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm
 };
 
 /** \returns the matrix product of \c *this and \a other.
-  *
-  * \note This function causes an immediate evaluation. If you want to perform a matrix product
-  * without immediate evaluation, call .lazy() on one of the matrices before taking the product.
   *
   * \sa lazy(), operator*=(const MatrixBase&)
   */
@@ -322,168 +363,107 @@ MatrixBase<Derived>::operator*=(const MatrixBase<OtherDerived> &other)
   return *this = *this * other;
 }
 
+/** \internal */
+template<typename Derived>
+template<typename Lhs,typename Rhs>
+inline Derived&
+MatrixBase<Derived>::operator+=(const Flagged<Product<Lhs,Rhs,CacheFriendlyProduct>, 0, EvalBeforeNestingBit | EvalBeforeAssigningBit>& other)
+{
+  other._expression()._cacheFriendlyEvalAndAdd(const_cast_derived());
+  return derived();
+}
+
 template<typename Derived>
 template<typename Lhs, typename Rhs>
 inline Derived& MatrixBase<Derived>::lazyAssign(const Product<Lhs,Rhs,CacheFriendlyProduct>& product)
 {
-  product.template _cacheOptimalEval<Derived, Aligned>(derived(),
-    #ifdef EIGEN_VECTORIZE
-    typename ei_meta_if<Flags & VectorizableBit, ei_meta_true, ei_meta_false>::ret()
-    #else
-    ei_meta_false()
-    #endif
-    );
+  product._cacheFriendlyEval(derived());
   return derived();
 }
 
-template<typename Lhs, typename Rhs, int EvalMode>
-template<typename DestDerived, int AlignedMode>
-void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res, ei_meta_false) const
+template<typename Lhs, typename Rhs, int EvalMode, typename DestDerived, bool DirectAccess>
+struct ei_cache_friendly_selector
 {
-  res.setZero();
-  const int cols4 = m_lhs.cols() & 0xfffffffC;
-  if (Lhs::Flags&RowMajorBit)
+  typedef Product<Lhs,Rhs,EvalMode> Prod;
+  typedef typename Prod::_LhsNested _LhsNested;
+  typedef typename Prod::_RhsNested _RhsNested;
+  typedef typename Prod::Scalar Scalar;
+  static inline void eval(const Prod& product, DestDerived& res)
   {
-//     std::cout << "opt rhs\n";
-    int j=0;
-    for(; j<cols4; j+=4)
+    if ( product._rows()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+      && product._cols()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+      && product.m_lhs.cols()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+    )
     {
-      for(int k=0; k<this->rows(); ++k)
-      {
-        const Scalar tmp0 = m_lhs.coeff(k,j  );
-        const Scalar tmp1 = m_lhs.coeff(k,j+1);
-        const Scalar tmp2 = m_lhs.coeff(k,j+2);
-        const Scalar tmp3 = m_lhs.coeff(k,j+3);
-        for (int i=0; i<this->cols(); ++i)
-          res.coeffRef(k,i) += tmp0 * m_rhs.coeff(j+0,i) + tmp1 * m_rhs.coeff(j+1,i)
-                             + tmp2 * m_rhs.coeff(j+2,i) + tmp3 * m_rhs.coeff(j+3,i);
-      }
+      res.setZero();
+      ei_cache_friendly_product<Scalar>(
+        product._rows(), product._cols(), product.m_lhs.cols(),
+        _LhsNested::Flags&RowMajorBit, &(product.m_lhs.const_cast_derived().coeffRef(0,0)), product.m_lhs.stride(),
+        _RhsNested::Flags&RowMajorBit, &(product.m_rhs.const_cast_derived().coeffRef(0,0)), product.m_rhs.stride(),
+        Prod::Flags&RowMajorBit, &(res.coeffRef(0,0)), res.stride()
+      );
     }
-    for(; j<m_lhs.cols(); ++j)
+    else
     {
-      for(int k=0; k<this->rows(); ++k)
-      {
-        const Scalar tmp = m_rhs.coeff(k,j);
-        for (int i=0; i<this->cols(); ++i)
-          res.coeffRef(k,i) += tmp * m_lhs.coeff(j,i);
-      }
+      res = Product<_LhsNested,_RhsNested,NormalProduct>(product.m_lhs, product.m_rhs).lazy();
     }
   }
-  else
+
+  static inline void eval_and_add(const Prod& product, DestDerived& res)
   {
-//     std::cout << "opt lhs\n";
-    int j = 0;
-    for(; j<cols4; j+=4)
+    if ( product._rows()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+      && product._cols()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+      && product.m_lhs.cols()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+    )
     {
-      for(int k=0; k<this->cols(); ++k)
-      {
-        const Scalar tmp0 = m_rhs.coeff(j  ,k);
-        const Scalar tmp1 = m_rhs.coeff(j+1,k);
-        const Scalar tmp2 = m_rhs.coeff(j+2,k);
-        const Scalar tmp3 = m_rhs.coeff(j+3,k);
-        for (int i=0; i<this->rows(); ++i)
-          res.coeffRef(i,k) += tmp0 * m_lhs.coeff(i,j+0) + tmp1 * m_lhs.coeff(i,j+1)
-                             + tmp2 * m_lhs.coeff(i,j+2) + tmp3 * m_lhs.coeff(i,j+3);
-      }
+      ei_cache_friendly_product<Scalar>(
+        product._rows(), product._cols(), product.m_lhs.cols(),
+        _LhsNested::Flags&RowMajorBit, &(product.m_lhs.const_cast_derived().coeffRef(0,0)), product.m_lhs.stride(),
+        _RhsNested::Flags&RowMajorBit, &(product.m_rhs.const_cast_derived().coeffRef(0,0)), product.m_rhs.stride(),
+        Prod::Flags&RowMajorBit, &(res.coeffRef(0,0)), res.stride()
+      );
     }
-    for(; j<m_lhs.cols(); ++j)
+    else
     {
-      for(int k=0; k<this->cols(); ++k)
-      {
-        const Scalar tmp = m_rhs.coeff(j,k);
-        for (int i=0; i<this->rows(); ++i)
-          res.coeffRef(i,k) += tmp * m_lhs.coeff(i,j);
-      }
+      res += Product<_LhsNested,_RhsNested,NormalProduct>(product.m_lhs, product.m_rhs).lazy();
     }
   }
+};
+
+template<typename Lhs, typename Rhs, int EvalMode, typename DestDerived>
+struct ei_cache_friendly_selector<Lhs,Rhs,EvalMode,DestDerived,false>
+{
+  typedef Product<Lhs,Rhs,EvalMode> Prod;
+  typedef typename Prod::_LhsNested _LhsNested;
+  typedef typename Prod::_RhsNested _RhsNested;
+  typedef typename Prod::Scalar Scalar;
+  static inline void eval(const Prod& product, DestDerived& res)
+  {
+    res = Product<_LhsNested,_RhsNested,NormalProduct>(product.m_lhs, product.m_rhs).lazy();
+  }
+
+  static inline void eval_and_add(const Prod& product, DestDerived& res)
+  {
+    res += Product<_LhsNested,_RhsNested,NormalProduct>(product.m_lhs, product.m_rhs).lazy();
+  }
+};
+
+template<typename Lhs, typename Rhs, int EvalMode>
+template<typename DestDerived>
+inline void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEval(DestDerived& res) const
+{
+  ei_cache_friendly_selector<Lhs,Rhs,EvalMode,DestDerived,
+                             _LhsNested::Flags&_RhsNested::Flags&DirectAccessBit>
+    ::eval(*this, res);
 }
 
-#ifdef EIGEN_VECTORIZE
 template<typename Lhs, typename Rhs, int EvalMode>
-template<typename DestDerived, int AlignedMode>
-void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res, ei_meta_true) const
+template<typename DestDerived>
+inline void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEvalAndAdd(DestDerived& res) const
 {
-
-  if (((Lhs::Flags&RowMajorBit) && (_cols() % ei_packet_traits<Scalar>::size != 0))
-    || (_rows() % ei_packet_traits<Scalar>::size != 0))
-  {
-    return _cacheOptimalEval<DestDerived, AlignedMode>(res, ei_meta_false());
-  }
-
-  res.setZero();
-  const int cols4 = m_lhs.cols() & 0xfffffffC;
-  if (Lhs::Flags&RowMajorBit)
-  {
-//     std::cout << "packet rhs\n";
-    int j=0;
-    for(; j<cols4; j+=4)
-    {
-      for(int k=0; k<this->rows(); k++)
-      {
-        const typename ei_packet_traits<Scalar>::type tmp0 = ei_pset1(m_lhs.coeff(k,j+0));
-        const typename ei_packet_traits<Scalar>::type tmp1 = ei_pset1(m_lhs.coeff(k,j+1));
-        const typename ei_packet_traits<Scalar>::type tmp2 = ei_pset1(m_lhs.coeff(k,j+2));
-        const typename ei_packet_traits<Scalar>::type tmp3 = ei_pset1(m_lhs.coeff(k,j+3));
-        for (int i=0; i<this->cols(); i+=ei_packet_traits<Scalar>::size)
-        {
-          res.template writePacketCoeff<AlignedMode>(k,i,
-            ei_pmadd(tmp0, m_rhs.template packetCoeff<AlignedMode>(j+0,i),
-              ei_pmadd(tmp1, m_rhs.template packetCoeff<AlignedMode>(j+1,i),
-                ei_pmadd(tmp2, m_rhs.template packetCoeff<AlignedMode>(j+2,i),
-                  ei_pmadd(tmp3, m_rhs.template packetCoeff<AlignedMode>(j+3,i),
-                    res.template packetCoeff<AlignedMode>(k,i)))))
-          );
-        }
-      }
-    }
-    for(; j<m_lhs.cols(); ++j)
-    {
-      for(int k=0; k<this->rows(); k++)
-      {
-        const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_lhs.coeff(k,j));
-        for (int i=0; i<this->cols(); i+=ei_packet_traits<Scalar>::size)
-          res.template writePacketCoeff<AlignedMode>(k,i,
-            ei_pmadd(tmp, m_rhs.template packetCoeff<AlignedMode>(j,i), res.template packetCoeff<AlignedMode>(k,i)));
-      }
-    }
-  }
-  else
-  {
-//     std::cout << "packet lhs\n";
-    int k=0;
-    for(; k<cols4; k+=4)
-    {
-      for(int j=0; j<this->cols(); j+=1)
-      {
-        const typename ei_packet_traits<Scalar>::type tmp0 = ei_pset1(m_rhs.coeff(k+0,j));
-        const typename ei_packet_traits<Scalar>::type tmp1 = ei_pset1(m_rhs.coeff(k+1,j));
-        const typename ei_packet_traits<Scalar>::type tmp2 = ei_pset1(m_rhs.coeff(k+2,j));
-        const typename ei_packet_traits<Scalar>::type tmp3 = ei_pset1(m_rhs.coeff(k+3,j));
-
-        for (int i=0; i<this->rows(); i+=ei_packet_traits<Scalar>::size)
-        {
-          res.template writePacketCoeff<AlignedMode>(i,j,
-            ei_pmadd(tmp0, m_lhs.template packetCoeff<AlignedMode>(i,k),
-              ei_pmadd(tmp1, m_lhs.template packetCoeff<AlignedMode>(i,k+1),
-                ei_pmadd(tmp2, m_lhs.template packetCoeff<AlignedMode>(i,k+2),
-                  ei_pmadd(tmp3, m_lhs.template packetCoeff<AlignedMode>(i,k+3),
-                    res.template packetCoeff<AlignedMode>(i,j)))))
-          );
-        }
-      }
-    }
-    for(; k<m_lhs.cols(); ++k)
-    {
-      for(int j=0; j<this->cols(); j++)
-      {
-        const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_rhs.coeff(k,j));
-        for (int i=0; i<this->rows(); i+=ei_packet_traits<Scalar>::size)
-          res.template writePacketCoeff<AlignedMode>(k,j,
-            ei_pmadd(tmp, m_lhs.template packetCoeff<AlignedMode>(i,k), res.template packetCoeff<AlignedMode>(i,j)));
-      }
-    }
-  }
+  ei_cache_friendly_selector<Lhs,Rhs,EvalMode,DestDerived,
+                             _LhsNested::Flags&_RhsNested::Flags&DirectAccessBit>
+    ::eval_and_add(*this, res);
 }
-#endif // EIGEN_VECTORIZE
 
 #endif // EIGEN_PRODUCT_H
diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h
index fd28f4bab..c536a4608 100644
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@@ -107,7 +107,7 @@ template<typename Derived>
 inline Transpose<Derived>
 MatrixBase<Derived>::transpose()
 {
-  return Transpose<Derived>(derived());
+  return derived();
 }
 
 /** This is the const version of transpose(). \sa adjoint() */
@@ -115,7 +115,7 @@ template<typename Derived>
 inline const Transpose<Derived>
 MatrixBase<Derived>::transpose() const
 {
-  return Transpose<Derived>(derived());
+  return derived();
 }
 
 /** \returns an expression of the adjoint (i.e. conjugate transpose) of *this.
@@ -130,7 +130,7 @@ inline const Transpose<
                , NestByValueBit, 0> >
 MatrixBase<Derived>::adjoint() const
 {
-  return conjugate().temporary().transpose();
+  return conjugate().nestByValue();
 }
 
 #endif // EIGEN_TRANSPOSE_H
diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h
index e599d8a3d..0e6ed4b21 100644
--- a/Eigen/src/Core/util/Constants.h
+++ b/Eigen/src/Core/util/Constants.h
@@ -53,16 +53,16 @@ const unsigned int HereditaryBits = RowMajorBit
                                   | EvalBeforeAssigningBit
                                   | LargeBit;
 
-// Possible values for the PartType parameter of part() and the ExtractType parameter of extract()
+// Possible values for the Mode parameter of part() and of extract()
 const unsigned int Upper = UpperTriangularBit;
 const unsigned int StrictlyUpper = UpperTriangularBit | ZeroDiagBit;
 const unsigned int Lower = LowerTriangularBit;
 const unsigned int StrictlyLower = LowerTriangularBit | ZeroDiagBit;
 
-// additional possible values for the PartType parameter of part()
+// additional possible values for the Mode parameter of part()
 const unsigned int SelfAdjoint = SelfAdjointBit;
 
-// additional possible values for the ExtractType parameter of extract()
+// additional possible values for the Mode parameter of extract()
 const unsigned int UnitUpper = UpperTriangularBit | UnitDiagBit;
 const unsigned int UnitLower = LowerTriangularBit | UnitDiagBit;
 
diff --git a/Eigen/src/Core/ProductWIP.h b/disabled/Product.h
similarity index 61%
rename from Eigen/src/Core/ProductWIP.h
rename to disabled/Product.h
index d1bc86a13..5d3e99281 100644
--- a/Eigen/src/Core/ProductWIP.h
+++ b/disabled/Product.h
@@ -26,8 +26,6 @@
 #ifndef EIGEN_PRODUCT_H
 #define EIGEN_PRODUCT_H
 
-#include "CacheFriendlyProduct.h"
-
 template<int Index, int Size, typename Lhs, typename Rhs>
 struct ei_product_unroller
 {
@@ -43,7 +41,7 @@ template<int Size, typename Lhs, typename Rhs>
 struct ei_product_unroller<0, Size, Lhs, Rhs>
 {
   inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs,
-                  typename Lhs::Scalar &res)
+                               typename Lhs::Scalar &res)
   {
     res = lhs.coeff(row, 0) * rhs.coeff(0, col);
   }
@@ -62,6 +60,12 @@ struct ei_product_unroller<Index, 0, Lhs, Rhs>
   inline static void run(int, int, const Lhs&, const Rhs&, typename Lhs::Scalar&) {}
 };
 
+template<typename Lhs, typename Rhs>
+struct ei_product_unroller<0, Dynamic, Lhs, Rhs>
+{
+  static void run(int, int, const Lhs&, const Rhs&, typename Lhs::Scalar&) {}
+};
+
 template<bool RowMajor, int Index, int Size, typename Lhs, typename Rhs, typename PacketScalar>
 struct ei_packet_product_unroller;
 
@@ -115,6 +119,12 @@ struct ei_packet_product_unroller<false, Index, Dynamic, Lhs, Rhs, PacketScalar>
   inline static void run(int, int, const Lhs&, const Rhs&, PacketScalar&) {}
 };
 
+template<typename Lhs, typename Rhs, typename PacketScalar>
+struct ei_packet_product_unroller<false, 0, Dynamic, Lhs, Rhs, PacketScalar>
+{
+  static void run(int, int, const Lhs&, const Rhs&, PacketScalar&) {}
+};
+
 template<typename Product, bool RowMajor = true> struct ProductPacketCoeffImpl {
   inline static typename Product::PacketScalar execute(const Product& product, int row, int col)
   { return product._packetCoeffRowMajor(row,col); }
@@ -143,74 +153,18 @@ template<typename Lhs, typename Rhs> struct ei_product_eval_mode
 {
   enum{ value =  Lhs::MaxRowsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
               && Rhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
-              && Lhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+              && (!( (Lhs::Flags&RowMajorBit) && ((Rhs::Flags&RowMajorBit) ^ RowMajorBit)))
               ? CacheFriendlyProduct : NormalProduct };
 };
 
-template<typename T> class ei_product_eval_to_column_major
-{
-    typedef typename ei_traits<T>::Scalar _Scalar;
-    enum {_MaxRows = ei_traits<T>::MaxRowsAtCompileTime,
-          _MaxCols = ei_traits<T>::MaxColsAtCompileTime,
-          _Flags = ei_traits<T>::Flags
-    };
-
-  public:
-    typedef Matrix<_Scalar,
-                  ei_traits<T>::RowsAtCompileTime,
-                  ei_traits<T>::ColsAtCompileTime,
-                  ei_corrected_matrix_flags<_Scalar, ei_size_at_compile_time<_MaxRows,_MaxCols>::ret, _Flags>::ret & ~RowMajorBit,
-                  ei_traits<T>::MaxRowsAtCompileTime,
-                  ei_traits<T>::MaxColsAtCompileTime> type;
-};
-
-template<typename T, int n=1> struct ei_product_nested_rhs
-{
-  typedef typename ei_meta_if<
-    (ei_traits<T>::Flags & NestByValueBit) && (!(ei_traits<T>::Flags & RowMajorBit)) && (int(ei_traits<T>::Flags) & DirectAccessBit),
-    T,
-    typename ei_meta_if<
-        ((ei_traits<T>::Flags & EvalBeforeNestingBit)
-      || (ei_traits<T>::Flags & RowMajorBit)
-      || (!(ei_traits<T>::Flags & DirectAccessBit))
-      || (n+1) * (NumTraits<typename ei_traits<T>::Scalar>::ReadCost) < (n-1) * T::CoeffReadCost),
-      typename ei_product_eval_to_column_major<T>::type,
-      const T&
-    >::ret
-  >::ret type;
-};
-
-template<typename T, int n=1> struct ei_product_nested_lhs
-{
-  typedef typename ei_meta_if<
-    ei_traits<T>::Flags & NestByValueBit && (int(ei_traits<T>::Flags) & DirectAccessBit),
-    T,
-    typename ei_meta_if<
-         int(ei_traits<T>::Flags) & EvalBeforeNestingBit
-      || (!(int(ei_traits<T>::Flags) & DirectAccessBit))
-      || (n+1) * int(NumTraits<typename ei_traits<T>::Scalar>::ReadCost) < (n-1) * int(T::CoeffReadCost),
-      typename ei_eval<T>::type,
-      const T&
-    >::ret
-  >::ret type;
-};
-
 template<typename Lhs, typename Rhs, int EvalMode>
 struct ei_traits<Product<Lhs, Rhs, EvalMode> >
 {
   typedef typename Lhs::Scalar Scalar;
-  // the cache friendly product evals lhs once only
-  // FIXME what to do if we chose to dynamically call the normal product from the cache friendly one for small matrices ?
-  typedef typename ei_meta_if<EvalMode==CacheFriendlyProduct,
-      typename ei_product_nested_lhs<Lhs,1>::type,
-      typename ei_nested<Lhs,Rhs::ColsAtCompileTime>::type>::ret LhsNested;
-
-  // NOTE that rhs must be ColumnMajor, so we might need a special nested type calculation
-  typedef typename ei_meta_if<EvalMode==CacheFriendlyProduct,
-      typename ei_product_nested_rhs<Rhs,Lhs::RowsAtCompileTime>::type,
-      typename ei_nested<Rhs,Lhs::RowsAtCompileTime>::type>::ret RhsNested;
-  typedef typename ei_unconst<typename ei_unref<LhsNested>::type>::type _LhsNested;
-  typedef typename ei_unconst<typename ei_unref<RhsNested>::type>::type _RhsNested;
+  typedef typename ei_nested<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
+  typedef typename ei_nested<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
+  typedef typename ei_unref<LhsNested>::type _LhsNested;
+  typedef typename ei_unref<RhsNested>::type _RhsNested;
   enum {
     LhsCoeffReadCost = _LhsNested::CoeffReadCost,
     RhsCoeffReadCost = _RhsNested::CoeffReadCost,
@@ -220,8 +174,6 @@ struct ei_traits<Product<Lhs, Rhs, EvalMode> >
     ColsAtCompileTime = Rhs::ColsAtCompileTime,
     MaxRowsAtCompileTime = Lhs::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = Rhs::MaxColsAtCompileTime,
-    // the vectorization flags are only used by the normal product,
-    // the other one is always vectorized !
     _RhsVectorizable = (RhsFlags & RowMajorBit) && (RhsFlags & VectorizableBit) && (ColsAtCompileTime % ei_packet_traits<Scalar>::size == 0),
     _LhsVectorizable = (!(LhsFlags & RowMajorBit)) && (LhsFlags & VectorizableBit) && (RowsAtCompileTime % ei_packet_traits<Scalar>::size == 0),
     _Vectorizable = (_LhsVectorizable || _RhsVectorizable) ? 1 : 0,
@@ -255,10 +207,6 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm
     typedef typename ei_traits<Product>::_LhsNested _LhsNested;
     typedef typename ei_traits<Product>::_RhsNested _RhsNested;
 
-    enum {
-      PacketSize = ei_packet_traits<Scalar>::size
-    };
-
     inline Product(const Lhs& lhs, const Rhs& rhs)
       : m_lhs(lhs), m_rhs(rhs)
     {
@@ -266,12 +214,12 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm
     }
 
     /** \internal */
-    template<typename DestDerived>
-    void _cacheFriendlyEval(DestDerived& res) const;
-
-    /** \internal */
-    template<typename DestDerived>
-    void _cacheFriendlyEvalAndAdd(DestDerived& res) const;
+    template<typename DestDerived, int AlignedMode>
+    void _cacheOptimalEval(DestDerived& res, ei_meta_false) const;
+    #ifdef EIGEN_VECTORIZE
+    template<typename DestDerived, int AlignedMode>
+    void _cacheOptimalEval(DestDerived& res, ei_meta_true) const;
+    #endif
 
   private:
 
@@ -304,7 +252,7 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm
       if(Lhs::ColsAtCompileTime <= EIGEN_UNROLLING_LIMIT)
       {
         PacketScalar res;
-        ei_packet_product_unroller<Flags&RowMajorBit ? true : false, Lhs::ColsAtCompileTime-1,
+        ei_packet_product_unroller<Flags&RowMajorBit, Lhs::ColsAtCompileTime-1,
                             Lhs::ColsAtCompileTime <= EIGEN_UNROLLING_LIMIT
                               ? Lhs::ColsAtCompileTime : Dynamic,
                             _LhsNested, _RhsNested, PacketScalar>
@@ -331,10 +279,16 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm
       for(int i = 1; i < m_lhs.cols(); i++)
         res =  ei_pmadd(m_lhs.template packetCoeff<Aligned>(row, i), ei_pset1(m_rhs.coeff(i, col)), res);
       return res;
+//       const PacketScalar tmp[4];
+//       ei_punpack(m_rhs.packetCoeff(0,col), tmp);
+//
+//       return
+//         ei_pmadd(m_lhs.packetCoeff(row, 0), tmp[0],
+//           ei_pmadd(m_lhs.packetCoeff(row, 1), tmp[1],
+//             ei_pmadd(m_lhs.packetCoeff(row, 2), tmp[2]
+//               ei_pmul(m_lhs.packetCoeff(row, 3), tmp[3]))));
     }
 
-    template<typename Lhs_, typename Rhs_, int EvalMode_, typename DestDerived_, bool DirectAccess_>
-    friend struct ei_cache_friendly_selector;
 
   protected:
     const LhsNested m_lhs;
@@ -342,6 +296,9 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm
 };
 
 /** \returns the matrix product of \c *this and \a other.
+  *
+  * \note This function causes an immediate evaluation. If you want to perform a matrix product
+  * without immediate evaluation, call .lazy() on one of the matrices before taking the product.
   *
   * \sa lazy(), operator*=(const MatrixBase&)
   */
@@ -365,107 +322,168 @@ MatrixBase<Derived>::operator*=(const MatrixBase<OtherDerived> &other)
   return *this = *this * other;
 }
 
-/** \internal */
-template<typename Derived>
-template<typename Lhs,typename Rhs>
-inline Derived&
-MatrixBase<Derived>::operator+=(const Flagged<Product<Lhs,Rhs,CacheFriendlyProduct>, 0, EvalBeforeNestingBit | EvalBeforeAssigningBit>& other)
-{
-  other._expression()._cacheFriendlyEvalAndAdd(const_cast_derived());
-  return derived();
-}
-
 template<typename Derived>
 template<typename Lhs, typename Rhs>
 inline Derived& MatrixBase<Derived>::lazyAssign(const Product<Lhs,Rhs,CacheFriendlyProduct>& product)
 {
-  product._cacheFriendlyEval(derived());
+  product.template _cacheOptimalEval<Derived, Aligned>(derived(),
+    #ifdef EIGEN_VECTORIZE
+    typename ei_meta_if<Flags & VectorizableBit, ei_meta_true, ei_meta_false>::ret()
+    #else
+    ei_meta_false()
+    #endif
+    );
   return derived();
 }
 
-template<typename Lhs, typename Rhs, int EvalMode, typename DestDerived, bool DirectAccess>
-struct ei_cache_friendly_selector
-{
-  typedef Product<Lhs,Rhs,EvalMode> Prod;
-  typedef typename Prod::_LhsNested _LhsNested;
-  typedef typename Prod::_RhsNested _RhsNested;
-  typedef typename Prod::Scalar Scalar;
-  static inline void eval(const Prod& product, DestDerived& res)
-  {
-    if ( product._rows()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
-      && product._cols()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
-      && product.m_lhs.cols()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
-    )
-    {
-      res.setZero();
-      ei_cache_friendly_product<Scalar>(
-        product._rows(), product._cols(), product.m_lhs.cols(),
-        _LhsNested::Flags&RowMajorBit, &(product.m_lhs.const_cast_derived().coeffRef(0,0)), product.m_lhs.stride(),
-        _RhsNested::Flags&RowMajorBit, &(product.m_rhs.const_cast_derived().coeffRef(0,0)), product.m_rhs.stride(),
-        Prod::Flags&RowMajorBit, &(res.coeffRef(0,0)), res.stride()
-      );
-    }
-    else
-    {
-      res = Product<_LhsNested,_RhsNested,NormalProduct>(product.m_lhs, product.m_rhs).lazy();
-    }
-  }
-
-  static inline void eval_and_add(const Prod& product, DestDerived& res)
-  {
-    if ( product._rows()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
-      && product._cols()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
-      && product.m_lhs.cols()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
-    )
-    {
-      ei_cache_friendly_product<Scalar>(
-        product._rows(), product._cols(), product.m_lhs.cols(),
-        _LhsNested::Flags&RowMajorBit, &(product.m_lhs.const_cast_derived().coeffRef(0,0)), product.m_lhs.stride(),
-        _RhsNested::Flags&RowMajorBit, &(product.m_rhs.const_cast_derived().coeffRef(0,0)), product.m_rhs.stride(),
-        Prod::Flags&RowMajorBit, &(res.coeffRef(0,0)), res.stride()
-      );
-    }
-    else
-    {
-      res += Product<_LhsNested,_RhsNested,NormalProduct>(product.m_lhs, product.m_rhs).lazy();
-    }
-  }
-};
-
-template<typename Lhs, typename Rhs, int EvalMode, typename DestDerived>
-struct ei_cache_friendly_selector<Lhs,Rhs,EvalMode,DestDerived,false>
-{
-  typedef Product<Lhs,Rhs,EvalMode> Prod;
-  typedef typename Prod::_LhsNested _LhsNested;
-  typedef typename Prod::_RhsNested _RhsNested;
-  typedef typename Prod::Scalar Scalar;
-  static inline void eval(const Prod& product, DestDerived& res)
-  {
-    res = Product<_LhsNested,_RhsNested,NormalProduct>(product.m_lhs, product.m_rhs).lazy();
-  }
-
-  static inline void eval_and_add(const Prod& product, DestDerived& res)
-  {
-    res += Product<_LhsNested,_RhsNested,NormalProduct>(product.m_lhs, product.m_rhs).lazy();
-  }
-};
-
 template<typename Lhs, typename Rhs, int EvalMode>
-template<typename DestDerived>
-inline void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEval(DestDerived& res) const
+template<typename DestDerived, int AlignedMode>
+void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res, ei_meta_false) const
 {
-  ei_cache_friendly_selector<Lhs,Rhs,EvalMode,DestDerived,
-                             _LhsNested::Flags&_RhsNested::Flags&DirectAccessBit>
-    ::eval(*this, res);
+  res.setZero();
+  const int cols4 = m_lhs.cols() & 0xfffffffC;
+  if (Lhs::Flags&RowMajorBit)
+  {
+//     std::cout << "opt rhs\n";
+    int j=0;
+    for(; j<cols4; j+=4)
+    {
+      for(int k=0; k<this->rows(); ++k)
+      {
+        const Scalar tmp0 = m_lhs.coeff(k,j  );
+        const Scalar tmp1 = m_lhs.coeff(k,j+1);
+        const Scalar tmp2 = m_lhs.coeff(k,j+2);
+        const Scalar tmp3 = m_lhs.coeff(k,j+3);
+        for (int i=0; i<this->cols(); ++i)
+          res.coeffRef(k,i) += tmp0 * m_rhs.coeff(j+0,i) + tmp1 * m_rhs.coeff(j+1,i)
+                             + tmp2 * m_rhs.coeff(j+2,i) + tmp3 * m_rhs.coeff(j+3,i);
+      }
+    }
+    for(; j<m_lhs.cols(); ++j)
+    {
+      for(int k=0; k<this->rows(); ++k)
+      {
+        const Scalar tmp = m_rhs.coeff(k,j);
+        for (int i=0; i<this->cols(); ++i)
+          res.coeffRef(k,i) += tmp * m_lhs.coeff(j,i);
+      }
+    }
+  }
+  else
+  {
+//     std::cout << "opt lhs\n";
+    int j = 0;
+    for(; j<cols4; j+=4)
+    {
+      for(int k=0; k<this->cols(); ++k)
+      {
+        const Scalar tmp0 = m_rhs.coeff(j  ,k);
+        const Scalar tmp1 = m_rhs.coeff(j+1,k);
+        const Scalar tmp2 = m_rhs.coeff(j+2,k);
+        const Scalar tmp3 = m_rhs.coeff(j+3,k);
+        for (int i=0; i<this->rows(); ++i)
+          res.coeffRef(i,k) += tmp0 * m_lhs.coeff(i,j+0) + tmp1 * m_lhs.coeff(i,j+1)
+                             + tmp2 * m_lhs.coeff(i,j+2) + tmp3 * m_lhs.coeff(i,j+3);
+      }
+    }
+    for(; j<m_lhs.cols(); ++j)
+    {
+      for(int k=0; k<this->cols(); ++k)
+      {
+        const Scalar tmp = m_rhs.coeff(j,k);
+        for (int i=0; i<this->rows(); ++i)
+          res.coeffRef(i,k) += tmp * m_lhs.coeff(i,j);
+      }
+    }
+  }
 }
 
+#ifdef EIGEN_VECTORIZE
 template<typename Lhs, typename Rhs, int EvalMode>
-template<typename DestDerived>
-inline void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEvalAndAdd(DestDerived& res) const
+template<typename DestDerived, int AlignedMode>
+void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res, ei_meta_true) const
 {
-  ei_cache_friendly_selector<Lhs,Rhs,EvalMode,DestDerived,
-                             _LhsNested::Flags&_RhsNested::Flags&DirectAccessBit>
-    ::eval_and_add(*this, res);
+
+  if (((Lhs::Flags&RowMajorBit) && (_cols() % ei_packet_traits<Scalar>::size != 0))
+    || (_rows() % ei_packet_traits<Scalar>::size != 0))
+  {
+    return _cacheOptimalEval<DestDerived, AlignedMode>(res, ei_meta_false());
+  }
+
+  res.setZero();
+  const int cols4 = m_lhs.cols() & 0xfffffffC;
+  if (Lhs::Flags&RowMajorBit)
+  {
+//     std::cout << "packet rhs\n";
+    int j=0;
+    for(; j<cols4; j+=4)
+    {
+      for(int k=0; k<this->rows(); k++)
+      {
+        const typename ei_packet_traits<Scalar>::type tmp0 = ei_pset1(m_lhs.coeff(k,j+0));
+        const typename ei_packet_traits<Scalar>::type tmp1 = ei_pset1(m_lhs.coeff(k,j+1));
+        const typename ei_packet_traits<Scalar>::type tmp2 = ei_pset1(m_lhs.coeff(k,j+2));
+        const typename ei_packet_traits<Scalar>::type tmp3 = ei_pset1(m_lhs.coeff(k,j+3));
+        for (int i=0; i<this->cols(); i+=ei_packet_traits<Scalar>::size)
+        {
+          res.template writePacketCoeff<AlignedMode>(k,i,
+            ei_pmadd(tmp0, m_rhs.template packetCoeff<AlignedMode>(j+0,i),
+              ei_pmadd(tmp1, m_rhs.template packetCoeff<AlignedMode>(j+1,i),
+                ei_pmadd(tmp2, m_rhs.template packetCoeff<AlignedMode>(j+2,i),
+                  ei_pmadd(tmp3, m_rhs.template packetCoeff<AlignedMode>(j+3,i),
+                    res.template packetCoeff<AlignedMode>(k,i)))))
+          );
+        }
+      }
+    }
+    for(; j<m_lhs.cols(); ++j)
+    {
+      for(int k=0; k<this->rows(); k++)
+      {
+        const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_lhs.coeff(k,j));
+        for (int i=0; i<this->cols(); i+=ei_packet_traits<Scalar>::size)
+          res.template writePacketCoeff<AlignedMode>(k,i,
+            ei_pmadd(tmp, m_rhs.template packetCoeff<AlignedMode>(j,i), res.template packetCoeff<AlignedMode>(k,i)));
+      }
+    }
+  }
+  else
+  {
+//     std::cout << "packet lhs\n";
+    int k=0;
+    for(; k<cols4; k+=4)
+    {
+      for(int j=0; j<this->cols(); j+=1)
+      {
+        const typename ei_packet_traits<Scalar>::type tmp0 = ei_pset1(m_rhs.coeff(k+0,j));
+        const typename ei_packet_traits<Scalar>::type tmp1 = ei_pset1(m_rhs.coeff(k+1,j));
+        const typename ei_packet_traits<Scalar>::type tmp2 = ei_pset1(m_rhs.coeff(k+2,j));
+        const typename ei_packet_traits<Scalar>::type tmp3 = ei_pset1(m_rhs.coeff(k+3,j));
+
+        for (int i=0; i<this->rows(); i+=ei_packet_traits<Scalar>::size)
+        {
+          res.template writePacketCoeff<AlignedMode>(i,j,
+            ei_pmadd(tmp0, m_lhs.template packetCoeff<AlignedMode>(i,k),
+              ei_pmadd(tmp1, m_lhs.template packetCoeff<AlignedMode>(i,k+1),
+                ei_pmadd(tmp2, m_lhs.template packetCoeff<AlignedMode>(i,k+2),
+                  ei_pmadd(tmp3, m_lhs.template packetCoeff<AlignedMode>(i,k+3),
+                    res.template packetCoeff<AlignedMode>(i,j)))))
+          );
+        }
+      }
+    }
+    for(; k<m_lhs.cols(); ++k)
+    {
+      for(int j=0; j<this->cols(); j++)
+      {
+        const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_rhs.coeff(k,j));
+        for (int i=0; i<this->rows(); i+=ei_packet_traits<Scalar>::size)
+          res.template writePacketCoeff<AlignedMode>(k,j,
+            ei_pmadd(tmp, m_lhs.template packetCoeff<AlignedMode>(i,k), res.template packetCoeff<AlignedMode>(i,j)));
+      }
+    }
+  }
 }
+#endif // EIGEN_VECTORIZE
 
 #endif // EIGEN_PRODUCT_H
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index 39af38afe..4e1a7696d 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -1,5 +1,11 @@
 IF(BUILD_DOC)
 
+IF(CMAKE_COMPILER_IS_GNUCXX)
+  IF(CMAKE_SYSTEM_NAME MATCHES Linux)
+    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O1 -g1")
+  ENDIF(CMAKE_SYSTEM_NAME MATCHES Linux)
+ENDIF(CMAKE_COMPILER_IS_GNUCXX)
+
 CONFIGURE_FILE(
   ${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in
   ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 6cd98800a..238006bdf 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,5 +1,11 @@
 IF(BUILD_TESTS)
 
+IF(CMAKE_COMPILER_IS_GNUCXX)
+  IF(CMAKE_SYSTEM_NAME MATCHES Linux)
+    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O1 -g2")
+  ENDIF(CMAKE_SYSTEM_NAME MATCHES Linux)
+ENDIF(CMAKE_COMPILER_IS_GNUCXX)
+
 OPTION(EIGEN_NO_ASSERTION_CHECKING "Disable checking of assertions" OFF)
 
 # similar to SET_TARGET_PROPERTIES but append the property instead of overwritting it
@@ -64,14 +70,14 @@ FIND_PACKAGE(Qt4 REQUIRED)
 INCLUDE_DIRECTORIES( ${QT_INCLUDE_DIR} )
 
 EI_ADD_TEST(basicstuff)
-EI_ADD_TEST(miscmatrices)
+EI_ADD_TEST(linearstructure)
+EI_ADD_TEST(cwiseop)
+EI_ADD_TEST(product)
 EI_ADD_TEST(adjoint)
 EI_ADD_TEST(submatrices)
+EI_ADD_TEST(miscmatrices)
 EI_ADD_TEST(smallvectors)
-EI_ADD_TEST(cwiseop)
 EI_ADD_TEST(map)
-EI_ADD_TEST(linearstructure)
-EI_ADD_TEST(product)
 EI_ADD_TEST(triangular)
 EI_ADD_TEST(cholesky)
 # EI_ADD_TEST(determinant)
diff --git a/test/adjoint.cpp b/test/adjoint.cpp
index ae1b002e5..9a15c4986 100644
--- a/test/adjoint.cpp
+++ b/test/adjoint.cpp
@@ -51,23 +51,15 @@ template<typename MatrixType> void adjoint(const MatrixType& m)
   Scalar s1 = ei_random<Scalar>(),
          s2 = ei_random<Scalar>();
 
-  // check involutivity of adjoint, transpose, conjugate
-  VERIFY_IS_APPROX(m1.transpose().transpose(),              m1);
-  VERIFY_IS_APPROX(m1.conjugate().conjugate(),              m1);
-  VERIFY_IS_APPROX(m1.adjoint().adjoint(),                  m1);
-
   // check basic compatibility of adjoint, transpose, conjugate
   VERIFY_IS_APPROX(m1.transpose().conjugate().adjoint(),    m1);
   VERIFY_IS_APPROX(m1.adjoint().conjugate().transpose(),    m1);
-  if(!NumTraits<Scalar>::IsComplex)
-    VERIFY_IS_APPROX(m1.adjoint().transpose(),              m1);
 
   // check multiplicative behavior
-  VERIFY_IS_APPROX((m1.transpose() * m2).transpose(),       m2.transpose() * m1);
+  std::cout << (m1.adjoint() * m2).adjoint() << std::endl;
+  std::cout << "------------------------------" << std::endl;
+  std::cout << m2.adjoint() * m1 << std::endl;
   VERIFY_IS_APPROX((m1.adjoint() * m2).adjoint(),           m2.adjoint() * m1);
-  VERIFY_IS_APPROX((m1.transpose() * m2).conjugate(),       m1.adjoint() * m2.conjugate());
-  VERIFY_IS_APPROX((s1 * m1).transpose(),                   s1 * m1.transpose());
-  VERIFY_IS_APPROX((s1 * m1).conjugate(),                   ei_conj(s1) * m1.conjugate());
   VERIFY_IS_APPROX((s1 * m1).adjoint(),                     ei_conj(s1) * m1.adjoint());
 
   // check basic properties of dot, norm, norm2
diff --git a/test/linearstructure.cpp b/test/linearstructure.cpp
index a7b058d69..8beb33925 100644
--- a/test/linearstructure.cpp
+++ b/test/linearstructure.cpp
@@ -62,11 +62,7 @@ template<typename MatrixType> void linearStructure(const MatrixType& m)
   VERIFY_IS_APPROX(-m2+m1+m2,               m1);
   VERIFY_IS_APPROX(m1*s1,                   s1*m1);
   VERIFY_IS_APPROX((m1+m2)*s1,              s1*m1+s1*m2);
-  VERIFY_IS_APPROX((s1+s2)*m1,              m1*s1+m1*s2);
-  VERIFY_IS_APPROX((m1-m2)*s1,              s1*m1-s1*m2);
-  VERIFY_IS_APPROX((s1-s2)*m1,              m1*s1-m1*s2);
   VERIFY_IS_APPROX((-m1+m2)*s1,             -s1*m1+s1*m2);
-  VERIFY_IS_APPROX((-s1+s2)*m1,             -m1*s1+m1*s2);
   m3 = m2; m3 += m1;
   VERIFY_IS_APPROX(m3,                      m1+m2);
   m3 = m2; m3 -= m1;
diff --git a/test/product.cpp b/test/product.cpp
index 70b41212a..a89497763 100644
--- a/test/product.cpp
+++ b/test/product.cpp
@@ -73,14 +73,10 @@ template<typename MatrixType> void product(const MatrixType& m)
   VERIFY_IS_APPROX(s1*(square*m1),          (s1*square)*m1);
   VERIFY_IS_APPROX(s1*(square*m1),          square*(m1*s1));
 
-  // continue testing Product.h: lazy product
-  VERIFY_IS_APPROX(square.lazy() * m1,  square*m1);
-  VERIFY_IS_APPROX(square * m1.lazy(),  square*m1);
   // again, test operator() to check const-qualification
   s1 += (square.lazy() * m1)(r,c);
 
   // test Product.h together with Identity.h
-  VERIFY_IS_APPROX(m1,                      identity*m1);
   VERIFY_IS_APPROX(v1,                      identity*v1);
   // again, test operator() to check const-qualification
   VERIFY_IS_APPROX(MatrixType::identity(rows, cols)(r,c), static_cast<Scalar>(r==c));
@@ -92,18 +88,14 @@ template<typename MatrixType> void product(const MatrixType& m)
 void test_product()
 {
   for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST( product(Matrix<float, 1, 1>()) );
-    CALL_SUBTEST( product(Matrix<float, 3, 3>()) );
-    CALL_SUBTEST( product(Matrix<float, 4, 2>()) );
+    CALL_SUBTEST( product(Matrix3i()) );
+    CALL_SUBTEST( product(Matrix<float, 3, 2>()) );
     CALL_SUBTEST( product(Matrix4d()) );
   }
   for(int i = 0; i < g_repeat; i++) {
-    int rows = ei_random<int>(1,320);
-    int cols = ei_random<int>(1,320);
-    CALL_SUBTEST( product(MatrixXf(rows, cols)) );
-    CALL_SUBTEST( product(MatrixXd(rows, cols)) );
-    CALL_SUBTEST( product(MatrixXi(rows, cols)) );
-    CALL_SUBTEST( product(MatrixXcf(rows, cols)) );
-    CALL_SUBTEST( product(MatrixXcd(rows, cols)) );
+    CALL_SUBTEST( product(MatrixXf(ei_random<int>(1,320), ei_random<int>(1,320))) );
+    CALL_SUBTEST( product(MatrixXd(ei_random<int>(1,320), ei_random<int>(1,320))) );
+    CALL_SUBTEST( product(MatrixXi(ei_random<int>(1,320), ei_random<int>(1,320))) );
+    CALL_SUBTEST( product(MatrixXcf(ei_random<int>(1,50), ei_random<int>(1,50))) );
   }
 }