Pulled the latest changes from the trunk

Benoit Steiner 2015-02-06 05:25:03 -08:00
commit c739102ef9
345 changed files with 19503 additions and 15221 deletions

View File: .hgignore

@@ -30,3 +30,5 @@ log
patch
a
a.*
lapack/testing
lapack/reference

View File: CMakeLists.txt

@@ -1,6 +1,6 @@
project(Eigen)
cmake_minimum_required(VERSION 2.8.2)
cmake_minimum_required(VERSION 2.8.4)
# guard against in-source builds
@@ -219,12 +219,26 @@ if(NOT MSVC)
message(STATUS "Enabling AltiVec in tests/examples")
endif()
option(EIGEN_TEST_VSX "Enable/Disable VSX in tests/examples" OFF)
if(EIGEN_TEST_VSX)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 -mvsx")
message(STATUS "Enabling VSX in tests/examples")
endif()
option(EIGEN_TEST_NEON "Enable/Disable Neon in tests/examples" OFF)
if(EIGEN_TEST_NEON)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -mcpu=cortex-a8")
message(STATUS "Enabling NEON in tests/examples")
endif()
option(EIGEN_TEST_NEON64 "Enable/Disable Neon in tests/examples" OFF)
if(EIGEN_TEST_NEON64)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
message(STATUS "Enabling NEON in tests/examples")
endif()
check_cxx_compiler_flag("-fopenmp" COMPILER_SUPPORT_OPENMP)
if(COMPILER_SUPPORT_OPENMP)
option(EIGEN_TEST_OPENMP "Enable/Disable OpenMP in tests/examples" OFF)
@@ -434,6 +448,7 @@ if(cmake_generator_tolower MATCHES "makefile")
message(STATUS "make check | Build and run the unit-tests. Read this page:")
message(STATUS " | http://eigen.tuxfamily.org/index.php?title=Tests")
message(STATUS "make blas | Build BLAS library (not the same thing as Eigen)")
message(STATUS "make uninstall| Removes files installed by make install")
message(STATUS "--------------+--------------------------------------------------------------")
else()
message(STATUS "To build/run the unit tests, read this page:")
@@ -469,3 +484,7 @@ install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/UseEigen3.cmake
${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
DESTINATION ${EIGEN_CONFIG_CMAKE_PATH}
)
# Add uninstall target
add_custom_target ( uninstall
COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/EigenUninstall.cmake)

View File: Eigen/Cholesky

@@ -21,7 +21,6 @@
* \endcode
*/
#include "src/misc/Solve.h"
#include "src/Cholesky/LLT.h"
#include "src/Cholesky/LDLT.h"
#ifdef EIGEN_USE_LAPACKE

View File: Eigen/CholmodSupport

@@ -33,12 +33,8 @@ extern "C" {
*
*/
#include "src/misc/Solve.h"
#include "src/misc/SparseSolve.h"
#include "src/CholmodSupport/CholmodSupport.h"
#include "src/Core/util/ReenableStupidWarnings.h"
#endif // EIGEN_CHOLMODSUPPORT_MODULE_H

View File: Eigen/Core

@@ -57,7 +57,7 @@
// Disable the ipa-cp-clone optimization flag with MinGW 6.x or newer (enabled by default with -O3)
// See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=556 for details.
#if defined(__MINGW32__) && EIGEN_GNUC_AT_LEAST(4,6)
#if EIGEN_COMP_MINGW && EIGEN_GNUC_AT_LEAST(4,6)
#pragma GCC optimize ("-fno-ipa-cp-clone")
#endif
@@ -75,18 +75,18 @@
#endif
#endif
#ifdef _MSC_VER
#if EIGEN_COMP_MSVC
#include <malloc.h> // for _aligned_malloc -- need it regardless of whether vectorization is enabled
#if (_MSC_VER >= 1500) // 2008 or later
#if (EIGEN_COMP_MSVC >= 1500) // 2008 or later
// Remember that usage of defined() in a #define is undefined by the standard.
// a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP.
#if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || defined(_M_X64)
#if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64
#define EIGEN_SSE2_ON_MSVC_2008_OR_LATER
#endif
#endif
#else
// Remember that usage of defined() in a #define is undefined by the standard
#if (defined __SSE2__) && ( (!defined __GNUC__) || (defined __INTEL_COMPILER) || EIGEN_GNUC_AT_LEAST(4,2) )
#if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_GNUC_AT_LEAST(4,2) )
#define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC
#endif
#endif
@@ -125,9 +125,7 @@
#define EIGEN_VECTORIZE_SSE4_1
#define EIGEN_VECTORIZE_SSE4_2
#endif
#ifdef __FMA__
#define EIGEN_VECTORIZE_FMA
#endif
// include files
// This extern "C" works around a MINGW-w64 compilation issue
@@ -140,7 +138,7 @@
extern "C" {
// In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
// Doing so triggers some issues with ICC. However, old gcc versions seem to not have this file, thus:
#if defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1110
#if EIGEN_COMP_ICC >= 1110
#include <immintrin.h>
#else
#include <emmintrin.h>
@@ -162,6 +160,15 @@
#endif
#endif
} // end extern "C"
#elif defined __VSX__
#define EIGEN_VECTORIZE
#define EIGEN_VECTORIZE_VSX
#include <altivec.h>
// We need to #undef all these ugly tokens defined in <altivec.h>
// => use __vector instead of vector
#undef bool
#undef vector
#undef pixel
#elif defined __ALTIVEC__
#define EIGEN_VECTORIZE
#define EIGEN_VECTORIZE_ALTIVEC
@@ -171,7 +178,7 @@
#undef bool
#undef vector
#undef pixel
#elif defined __ARM_NEON__
#elif defined __ARM_NEON
#define EIGEN_VECTORIZE
#define EIGEN_VECTORIZE_NEON
#include <arm_neon.h>
@@ -192,7 +199,7 @@
#endif
// MSVC for windows mobile does not have the errno.h file
#if !(defined(_MSC_VER) && defined(_WIN32_WCE)) && !defined(__ARMCC_VERSION)
#if !(EIGEN_COMP_MSVC && EIGEN_OS_WINCE) && !EIGEN_COMP_ARM
#define EIGEN_HAS_ERRNO
#endif
@@ -218,7 +225,7 @@
#endif
// required for __cpuid, needs to be included after cmath
#if defined(_MSC_VER) && (defined(_M_IX86)||defined(_M_X64)) && (!defined(_WIN32_WCE))
#if EIGEN_COMP_MSVC && EIGEN_ARCH_i386_OR_x86_64 && !EIGEN_OS_WINCE
#include <intrin.h>
#endif
@@ -240,6 +247,8 @@ inline static const char *SimdInstructionSetsInUse(void) {
return "SSE, SSE2";
#elif defined(EIGEN_VECTORIZE_ALTIVEC)
return "AltiVec";
#elif defined(EIGEN_VECTORIZE_VSX)
return "VSX";
#elif defined(EIGEN_VECTORIZE_NEON)
return "ARM NEON";
#else
@@ -271,8 +280,8 @@ using std::ptrdiff_t;
*/
#include "src/Core/util/Constants.h"
#include "src/Core/util/ForwardDeclarations.h"
#include "src/Core/util/Meta.h"
#include "src/Core/util/ForwardDeclarations.h"
#include "src/Core/util/StaticAssert.h"
#include "src/Core/util/XprHelper.h"
#include "src/Core/util/Memory.h"
@@ -291,7 +300,7 @@ using std::ptrdiff_t;
#include "src/Core/arch/SSE/PacketMath.h"
#include "src/Core/arch/SSE/MathFunctions.h"
#include "src/Core/arch/SSE/Complex.h"
#elif defined EIGEN_VECTORIZE_ALTIVEC
#elif defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX)
#include "src/Core/arch/AltiVec/PacketMath.h"
#include "src/Core/arch/AltiVec/Complex.h"
#elif defined EIGEN_VECTORIZE_NEON
@@ -310,19 +319,16 @@ using std::ptrdiff_t;
#include "src/Core/functors/UnaryFunctors.h"
#include "src/Core/functors/NullaryFunctors.h"
#include "src/Core/functors/StlFunctors.h"
#include "src/Core/functors/AssignmentFunctors.h"
#include "src/Core/DenseCoeffsBase.h"
#include "src/Core/DenseBase.h"
#include "src/Core/MatrixBase.h"
#include "src/Core/EigenBase.h"
#ifdef EIGEN_ENABLE_EVALUATORS
#include "src/Core/functors/AssignmentFunctors.h"
#include "src/Core/Product.h"
#include "src/Core/CoreEvaluators.h"
#include "src/Core/AssignEvaluator.h"
#include "src/Core/ProductEvaluators.h"
#endif
#ifndef EIGEN_PARSED_BY_DOXYGEN // work around Doxygen bug triggered by Assign.h r814874
// at least confirmed with Doxygen 1.5.5 and 1.5.6
@@ -333,7 +339,10 @@ using std::ptrdiff_t;
#include "src/Core/util/BlasUtil.h"
#include "src/Core/DenseStorage.h"
#include "src/Core/NestByValue.h"
#include "src/Core/ForceAlignedAccess.h"
// #include "src/Core/ForceAlignedAccess.h"
// #include "src/Core/Flagged.h"
#include "src/Core/ReturnByValue.h"
#include "src/Core/NoAlias.h"
#include "src/Core/PlainObjectBase.h"
@@ -346,12 +355,12 @@ using std::ptrdiff_t;
#include "src/Core/SelfCwiseBinaryOp.h"
#include "src/Core/Dot.h"
#include "src/Core/StableNorm.h"
#include "src/Core/MapBase.h"
#include "src/Core/Stride.h"
#include "src/Core/MapBase.h"
#include "src/Core/Map.h"
#include "src/Core/Ref.h"
#include "src/Core/Block.h"
#include "src/Core/VectorBlock.h"
#include "src/Core/Ref.h"
#include "src/Core/Transpose.h"
#include "src/Core/DiagonalMatrix.h"
#include "src/Core/Diagonal.h"
@@ -364,14 +373,15 @@ using std::ptrdiff_t;
#include "src/Core/IO.h"
#include "src/Core/Swap.h"
#include "src/Core/CommaInitializer.h"
#include "src/Core/Flagged.h"
#include "src/Core/ProductBase.h"
#include "src/Core/GeneralProduct.h"
#include "src/Core/Solve.h"
#include "src/Core/Inverse.h"
#include "src/Core/TriangularMatrix.h"
#include "src/Core/SelfAdjointView.h"
#include "src/Core/products/GeneralBlockPanelKernel.h"
#include "src/Core/products/Parallelizer.h"
#include "src/Core/products/CoeffBasedProduct.h"
#include "src/Core/ProductEvaluators.h"
#include "src/Core/products/GeneralMatrixVector.h"
#include "src/Core/products/GeneralMatrixMatrix.h"
#include "src/Core/SolveTriangular.h"

View File: Eigen/IterativeLinearSolvers

@@ -26,9 +26,7 @@
* \endcode
*/
#include "src/misc/Solve.h"
#include "src/misc/SparseSolve.h"
#include "src/IterativeLinearSolvers/SolveWithGuess.h"
#include "src/IterativeLinearSolvers/IterativeSolverBase.h"
#include "src/IterativeLinearSolvers/BasicPreconditioners.h"
#include "src/IterativeLinearSolvers/ConjugateGradient.h"

View File: Eigen/LU

@@ -16,7 +16,6 @@
* \endcode
*/
#include "src/misc/Solve.h"
#include "src/misc/Kernel.h"
#include "src/misc/Image.h"
#include "src/LU/FullPivLU.h"
@@ -25,7 +24,7 @@
#include "src/LU/PartialPivLU_MKL.h"
#endif
#include "src/LU/Determinant.h"
#include "src/LU/Inverse.h"
#include "src/LU/InverseImpl.h"
// Use the SSE optimized version whenever possible. At the moment the
// SSE version doesn't compile when AVX is enabled

View File: Eigen/PaStiXSupport

@@ -35,12 +35,8 @@ extern "C" {
*
*/
#include "src/misc/Solve.h"
#include "src/misc/SparseSolve.h"
#include "src/PaStiXSupport/PaStiXSupport.h"
#include "src/Core/util/ReenableStupidWarnings.h"
#endif // EIGEN_PASTIXSUPPORT_MODULE_H

View File: Eigen/QR

@@ -24,7 +24,6 @@
* \endcode
*/
#include "src/misc/Solve.h"
#include "src/QR/HouseholderQR.h"
#include "src/QR/FullPivHouseholderQR.h"
#include "src/QR/ColPivHouseholderQR.h"

View File: Eigen/SPQRSupport

@@ -21,8 +21,6 @@
*
*/
#include "src/misc/Solve.h"
#include "src/misc/SparseSolve.h"
#include "src/CholmodSupport/CholmodSupport.h"
#include "src/SPQRSupport/SuiteSparseQRSupport.h"

View File: Eigen/SVD

@@ -12,21 +12,25 @@
*
*
* This module provides SVD decomposition for matrices (both real and complex).
* This decomposition is accessible via the following MatrixBase method:
* Two decomposition algorithms are provided:
* - JacobiSVD implementing two-sided Jacobi iterations is numerically very accurate, fast for small matrices, but very slow for larger ones.
* - BDCSVD implementing a recursive divide & conquer strategy on top of an upper-bidiagonalization which remains fast for large problems.
* These decompositions are accessible via the respective classes and the following MatrixBase methods:
* - MatrixBase::jacobiSvd()
* - MatrixBase::bdcSvd()
*
* \code
* #include <Eigen/SVD>
* \endcode
*/
#include "src/misc/Solve.h"
#include "src/SVD/UpperBidiagonalization.h"
#include "src/SVD/SVDBase.h"
#include "src/SVD/JacobiSVD.h"
#include "src/SVD/BDCSVD.h"
#if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT)
#include "src/SVD/JacobiSVD_MKL.h"
#endif
#include "src/SVD/UpperBidiagonalization.h"
#include "src/Core/util/ReenableStupidWarnings.h"

View File: Eigen/SparseCholesky

@@ -34,8 +34,6 @@
#error The SparseCholesky module has nothing to offer in MPL2 only mode
#endif
#include "src/misc/Solve.h"
#include "src/misc/SparseSolve.h"
#include "src/SparseCholesky/SimplicialCholesky.h"
#ifndef EIGEN_MPL2_ONLY

View File: Eigen/SparseCore

@@ -26,37 +26,32 @@
* This module depends on: Core.
*/
namespace Eigen {
/** The type used to identify a general sparse storage. */
struct Sparse {};
}
#include "src/SparseCore/SparseUtil.h"
#include "src/SparseCore/SparseMatrixBase.h"
#include "src/SparseCore/SparseAssign.h"
#include "src/SparseCore/CompressedStorage.h"
#include "src/SparseCore/AmbiVector.h"
#include "src/SparseCore/SparseMatrix.h"
#include "src/SparseCore/MappedSparseMatrix.h"
#include "src/SparseCore/SparseVector.h"
#include "src/SparseCore/SparseBlock.h"
#include "src/SparseCore/SparseTranspose.h"
#include "src/SparseCore/SparseCwiseUnaryOp.h"
#include "src/SparseCore/SparseCwiseBinaryOp.h"
#include "src/SparseCore/SparseTranspose.h"
#include "src/SparseCore/SparseBlock.h"
#include "src/SparseCore/SparseDot.h"
#include "src/SparseCore/SparsePermutation.h"
#include "src/SparseCore/SparseRedux.h"
#include "src/SparseCore/SparseFuzzy.h"
#include "src/SparseCore/SparseView.h"
#include "src/SparseCore/SparseDiagonalProduct.h"
#include "src/SparseCore/ConservativeSparseSparseProduct.h"
#include "src/SparseCore/SparseSparseProductWithPruning.h"
#include "src/SparseCore/SparseProduct.h"
#include "src/SparseCore/SparseDenseProduct.h"
#include "src/SparseCore/SparseDiagonalProduct.h"
#include "src/SparseCore/SparseTriangularView.h"
#include "src/SparseCore/SparseSelfAdjointView.h"
#include "src/SparseCore/SparseTriangularView.h"
#include "src/SparseCore/TriangularSolver.h"
#include "src/SparseCore/SparseView.h"
#include "src/SparseCore/SparsePermutation.h"
#include "src/SparseCore/SparseFuzzy.h"
#include "src/SparseCore/SparseSolverBase.h"
#include "src/Core/util/ReenableStupidWarnings.h"

View File: Eigen/SparseLU

@@ -20,9 +20,6 @@
* Please, see the documentation of the SparseLU class for more details.
*/
#include "src/misc/Solve.h"
#include "src/misc/SparseSolve.h"
// Ordering interface
#include "OrderingMethods"

View File: Eigen/SparseQR

@@ -21,9 +21,6 @@
*
*/
#include "src/misc/Solve.h"
#include "src/misc/SparseSolve.h"
#include "OrderingMethods"
#include "src/SparseCore/SparseColEtree.h"
#include "src/SparseQR/SparseQR.h"

View File: Eigen/StdDeque

@@ -14,7 +14,7 @@
#include "Core"
#include <deque>
#if (defined(_MSC_VER) && defined(_WIN64)) /* MSVC auto aligns in 64 bit builds */
#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */
#define EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(...)

View File: Eigen/StdList

@@ -13,7 +13,7 @@
#include "Core"
#include <list>
#if (defined(_MSC_VER) && defined(_WIN64)) /* MSVC auto aligns in 64 bit builds */
#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */
#define EIGEN_DEFINE_STL_LIST_SPECIALIZATION(...)

View File: Eigen/StdVector

@@ -14,7 +14,7 @@
#include "Core"
#include <vector>
#if (defined(_MSC_VER) && defined(_WIN64)) /* MSVC auto aligns in 64 bit builds */
#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */
#define EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION(...)

View File: Eigen/SuperLUSupport

@@ -48,12 +48,8 @@ namespace Eigen { struct SluMatrix; }
*
*/
#include "src/misc/Solve.h"
#include "src/misc/SparseSolve.h"
#include "src/SuperLUSupport/SuperLUSupport.h"
#include "src/Core/util/ReenableStupidWarnings.h"
#endif // EIGEN_SUPERLUSUPPORT_MODULE_H

View File: Eigen/UmfPackSupport

@@ -26,9 +26,6 @@ extern "C" {
*
*/
#include "src/misc/Solve.h"
#include "src/misc/SparseSolve.h"
#include "src/UmfPackSupport/UmfPackSupport.h"
#include "src/Core/util/ReenableStupidWarnings.h"

View File: Eigen/src/Cholesky/LDLT.h

@@ -85,7 +85,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
* according to the specified problem \a size.
* \sa LDLT()
*/
LDLT(Index size)
explicit LDLT(Index size)
: m_matrix(size, size),
m_transpositions(size),
m_temporary(size),
@@ -98,7 +98,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
* This calculates the decomposition for the input \a matrix.
* \sa LDLT(Index size)
*/
LDLT(const MatrixType& matrix)
explicit LDLT(const MatrixType& matrix)
: m_matrix(matrix.rows(), matrix.cols()),
m_transpositions(matrix.rows()),
m_temporary(matrix.rows()),
@@ -175,13 +175,13 @@ template<typename _MatrixType, int _UpLo> class LDLT
* \sa MatrixBase::ldlt(), SelfAdjointView::ldlt()
*/
template<typename Rhs>
inline const internal::solve_retval<LDLT, Rhs>
inline const Solve<LDLT, Rhs>
solve(const MatrixBase<Rhs>& b) const
{
eigen_assert(m_isInitialized && "LDLT is not initialized.");
eigen_assert(m_matrix.rows()==b.rows()
&& "LDLT::solve(): invalid number of rows of the right hand side matrix b");
return internal::solve_retval<LDLT, Rhs>(*this, b.derived());
return Solve<LDLT, Rhs>(*this, b.derived());
}
template<typename Derived>
@@ -217,6 +217,12 @@ template<typename _MatrixType, int _UpLo> class LDLT
eigen_assert(m_isInitialized && "LDLT is not initialized.");
return Success;
}
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename RhsType, typename DstType>
EIGEN_DEVICE_FUNC
void _solve_impl(const RhsType &rhs, DstType &dst) const;
#endif
protected:
@@ -400,16 +406,16 @@ template<typename MatrixType> struct LDLT_Traits<MatrixType,Lower>
{
typedef const TriangularView<const MatrixType, UnitLower> MatrixL;
typedef const TriangularView<const typename MatrixType::AdjointReturnType, UnitUpper> MatrixU;
static inline MatrixL getL(const MatrixType& m) { return m; }
static inline MatrixU getU(const MatrixType& m) { return m.adjoint(); }
static inline MatrixL getL(const MatrixType& m) { return MatrixL(m); }
static inline MatrixU getU(const MatrixType& m) { return MatrixU(m.adjoint()); }
};
template<typename MatrixType> struct LDLT_Traits<MatrixType,Upper>
{
typedef const TriangularView<const typename MatrixType::AdjointReturnType, UnitLower> MatrixL;
typedef const TriangularView<const MatrixType, UnitUpper> MatrixU;
static inline MatrixL getL(const MatrixType& m) { return m.adjoint(); }
static inline MatrixU getU(const MatrixType& m) { return m; }
static inline MatrixL getL(const MatrixType& m) { return MatrixL(m.adjoint()); }
static inline MatrixU getU(const MatrixType& m) { return MatrixU(m); }
};
} // end namespace internal
@@ -427,6 +433,7 @@ LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::compute(const MatrixType& a)
m_transpositions.resize(size);
m_isInitialized = false;
m_temporary.resize(size);
m_sign = internal::ZeroSign;
internal::ldlt_inplace<UpLo>::unblocked(m_matrix, m_transpositions, m_temporary, m_sign);
@@ -466,52 +473,45 @@ LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::rankUpdate(const MatrixBase<Deri
return *this;
}
namespace internal {
template<typename _MatrixType, int _UpLo, typename Rhs>
struct solve_retval<LDLT<_MatrixType,_UpLo>, Rhs>
: solve_retval_base<LDLT<_MatrixType,_UpLo>, Rhs>
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename _MatrixType, int _UpLo>
template<typename RhsType, typename DstType>
void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
{
typedef LDLT<_MatrixType,_UpLo> LDLTType;
EIGEN_MAKE_SOLVE_HELPERS(LDLTType,Rhs)
eigen_assert(rhs.rows() == rows());
// dst = P b
dst = m_transpositions * rhs;
template<typename Dest> void evalTo(Dest& dst) const
// dst = L^-1 (P b)
matrixL().solveInPlace(dst);
// dst = D^-1 (L^-1 P b)
// more precisely, use pseudo-inverse of D (see bug 241)
using std::abs;
const typename Diagonal<const MatrixType>::RealReturnType vecD(vectorD());
// In some previous versions, tolerance was set to the max of 1/highest and the maximal diagonal entry * epsilon
// as motivated by LAPACK's xGELSS:
// RealScalar tolerance = numext::maxi(vectorD.array().abs().maxCoeff() *NumTraits<RealScalar>::epsilon(),RealScalar(1) / NumTraits<RealScalar>::highest());
// However, LDLT is not rank revealing, and so adjusting the tolerance with respect to the highest
// diagonal element is not well justified and leads to numerical issues in some cases.
// Moreover, Lapack's xSYTRS routines use 0 for the tolerance.
RealScalar tolerance = RealScalar(1) / NumTraits<RealScalar>::highest();
for (Index i = 0; i < vecD.size(); ++i)
{
eigen_assert(rhs().rows() == dec().matrixLDLT().rows());
// dst = P b
dst = dec().transpositionsP() * rhs();
// dst = L^-1 (P b)
dec().matrixL().solveInPlace(dst);
// dst = D^-1 (L^-1 P b)
// more precisely, use pseudo-inverse of D (see bug 241)
using std::abs;
EIGEN_USING_STD_MATH(max);
typedef typename LDLTType::MatrixType MatrixType;
typedef typename LDLTType::RealScalar RealScalar;
const typename Diagonal<const MatrixType>::RealReturnType vectorD(dec().vectorD());
// In some previous versions, tolerance was set to the max of 1/highest and the maximal diagonal entry * epsilon
// as motivated by LAPACK's xGELSS:
// RealScalar tolerance = (max)(vectorD.array().abs().maxCoeff() *NumTraits<RealScalar>::epsilon(),RealScalar(1) / NumTraits<RealScalar>::highest());
// However, LDLT is not rank revealing, and so adjusting the tolerance with respect to the highest
// diagonal element is not well justified and leads to numerical issues in some cases.
// Moreover, Lapack's xSYTRS routines use 0 for the tolerance.
RealScalar tolerance = RealScalar(1) / NumTraits<RealScalar>::highest();
for (Index i = 0; i < vectorD.size(); ++i) {
if(abs(vectorD(i)) > tolerance)
dst.row(i) /= vectorD(i);
else
dst.row(i).setZero();
}
// dst = L^-T (D^-1 L^-1 P b)
dec().matrixU().solveInPlace(dst);
// dst = P^-1 (L^-T D^-1 L^-1 P b) = A^-1 b
dst = dec().transpositionsP().transpose() * dst;
if(abs(vecD(i)) > tolerance)
dst.row(i) /= vecD(i);
else
dst.row(i).setZero();
}
};
// dst = L^-T (D^-1 L^-1 P b)
matrixU().solveInPlace(dst);
// dst = P^-1 (L^-T D^-1 L^-1 P b) = A^-1 b
dst = m_transpositions.transpose() * dst;
}
#endif
/** \internal use x = ldlt_object.solve(x);
*
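From user code, the new _solve_impl() above is reached through solve(), which now returns a Solve<LDLT,Rhs> expression; a small sketch with illustrative values:

```cpp
#include <Eigen/Cholesky>
#include <iostream>

int main() {
  Eigen::Matrix3d A;
  A << 4, 1, 0,
       1, 3, 1,
       0, 1, 2;                             // symmetric input
  Eigen::Vector3d b(1, 2, 3);
  Eigen::LDLT<Eigen::Matrix3d> ldlt(A);     // explicit constructor, as in this patch
  Eigen::Vector3d x = ldlt.solve(b);        // dst = P^-1 L^-T D^+ L^-1 P b
  std::cout << (A * x - b).norm() << "\n";  // residual, should be ~0
}
```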

View File: Eigen/src/Cholesky/LLT.h

@@ -83,10 +83,10 @@ template<typename _MatrixType, int _UpLo> class LLT
* according to the specified problem \a size.
* \sa LLT()
*/
LLT(Index size) : m_matrix(size, size),
explicit LLT(Index size) : m_matrix(size, size),
m_isInitialized(false) {}
LLT(const MatrixType& matrix)
explicit LLT(const MatrixType& matrix)
: m_matrix(matrix.rows(), matrix.cols()),
m_isInitialized(false)
{
@@ -118,13 +118,13 @@ template<typename _MatrixType, int _UpLo> class LLT
* \sa solveInPlace(), MatrixBase::llt(), SelfAdjointView::llt()
*/
template<typename Rhs>
inline const internal::solve_retval<LLT, Rhs>
inline const Solve<LLT, Rhs>
solve(const MatrixBase<Rhs>& b) const
{
eigen_assert(m_isInitialized && "LLT is not initialized.");
eigen_assert(m_matrix.rows()==b.rows()
&& "LLT::solve(): invalid number of rows of the right hand side matrix b");
return internal::solve_retval<LLT, Rhs>(*this, b.derived());
return Solve<LLT, Rhs>(*this, b.derived());
}
template<typename Derived>
@@ -161,6 +161,12 @@ template<typename _MatrixType, int _UpLo> class LLT
template<typename VectorType>
LLT rankUpdate(const VectorType& vec, const RealScalar& sigma = 1);
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename RhsType, typename DstType>
EIGEN_DEVICE_FUNC
void _solve_impl(const RhsType &rhs, DstType &dst) const;
#endif
protected:
/** \internal
@@ -345,8 +351,8 @@ template<typename MatrixType> struct LLT_Traits<MatrixType,Lower>
{
typedef const TriangularView<const MatrixType, Lower> MatrixL;
typedef const TriangularView<const typename MatrixType::AdjointReturnType, Upper> MatrixU;
static inline MatrixL getL(const MatrixType& m) { return m; }
static inline MatrixU getU(const MatrixType& m) { return m.adjoint(); }
static inline MatrixL getL(const MatrixType& m) { return MatrixL(m); }
static inline MatrixU getU(const MatrixType& m) { return MatrixU(m.adjoint()); }
static bool inplace_decomposition(MatrixType& m)
{ return llt_inplace<typename MatrixType::Scalar, Lower>::blocked(m)==-1; }
};
@@ -355,8 +361,8 @@ template<typename MatrixType> struct LLT_Traits<MatrixType,Upper>
{
typedef const TriangularView<const typename MatrixType::AdjointReturnType, Lower> MatrixL;
typedef const TriangularView<const MatrixType, Upper> MatrixU;
static inline MatrixL getL(const MatrixType& m) { return m.adjoint(); }
static inline MatrixU getU(const MatrixType& m) { return m; }
static inline MatrixL getL(const MatrixType& m) { return MatrixL(m.adjoint()); }
static inline MatrixU getU(const MatrixType& m) { return MatrixU(m); }
static bool inplace_decomposition(MatrixType& m)
{ return llt_inplace<typename MatrixType::Scalar, Upper>::blocked(m)==-1; }
};
@@ -404,22 +410,16 @@ LLT<_MatrixType,_UpLo> LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, c
return *this;
}
namespace internal {
template<typename _MatrixType, int UpLo, typename Rhs>
struct solve_retval<LLT<_MatrixType, UpLo>, Rhs>
: solve_retval_base<LLT<_MatrixType, UpLo>, Rhs>
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename _MatrixType,int _UpLo>
template<typename RhsType, typename DstType>
void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
{
typedef LLT<_MatrixType,UpLo> LLTType;
EIGEN_MAKE_SOLVE_HELPERS(LLTType,Rhs)
template<typename Dest> void evalTo(Dest& dst) const
{
dst = rhs();
dec().solveInPlace(dst);
}
};
dst = rhs;
solveInPlace(dst);
}
#endif
/** \internal use x = llt_object.solve(x);
*
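The same pattern for LLT, together with the rankUpdate() declared above; values are illustrative:

```cpp
#include <Eigen/Cholesky>
#include <iostream>

int main() {
  Eigen::Matrix2d A;
  A << 4, 1,
       1, 3;                            // symmetric positive definite
  Eigen::Vector2d b(1, 2);
  Eigen::LLT<Eigen::Matrix2d> llt(A);
  Eigen::Vector2d x = llt.solve(b);     // Solve<LLT,Rhs> -> _solve_impl()
  std::cout << (A * x - b).norm() << "\n";
  Eigen::Vector2d v(0.5, 0.5);
  llt.rankUpdate(v);                    // llt now factors A + v * v^T
}
```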

View File: Eigen/src/CholmodSupport/CholmodSupport.h

@@ -105,7 +105,7 @@ const cholmod_sparse viewAsCholmod(const SparseMatrix<_Scalar,_Options,_Index>&
/** Returns a view of the Eigen sparse matrix \a mat as a Cholmod sparse matrix.
* The data are not copied but shared. */
template<typename _Scalar, int _Options, typename _Index, unsigned int UpLo>
cholmod_sparse viewAsCholmod(const SparseSelfAdjointView<SparseMatrix<_Scalar,_Options,_Index>, UpLo>& mat)
cholmod_sparse viewAsCholmod(const SparseSelfAdjointView<const SparseMatrix<_Scalar,_Options,_Index>, UpLo>& mat)
{
cholmod_sparse res = viewAsCholmod(mat.matrix().const_cast_derived());
@@ -157,8 +157,12 @@ enum CholmodMode {
* \sa class CholmodSupernodalLLT, class CholmodSimplicialLDLT, class CholmodSimplicialLLT
*/
template<typename _MatrixType, int _UpLo, typename Derived>
class CholmodBase : internal::noncopyable
class CholmodBase : public SparseSolverBase<Derived>
{
protected:
typedef SparseSolverBase<Derived> Base;
using Base::derived;
using Base::m_isInitialized;
public:
typedef _MatrixType MatrixType;
enum { UpLo = _UpLo };
@@ -170,14 +174,14 @@ class CholmodBase : internal::noncopyable
public:
CholmodBase()
: m_cholmodFactor(0), m_info(Success), m_isInitialized(false)
: m_cholmodFactor(0), m_info(Success)
{
m_shiftOffset[0] = m_shiftOffset[1] = RealScalar(0.0);
cholmod_start(&m_cholmod);
}
CholmodBase(const MatrixType& matrix)
: m_cholmodFactor(0), m_info(Success), m_isInitialized(false)
explicit CholmodBase(const MatrixType& matrix)
: m_cholmodFactor(0), m_info(Success)
{
m_shiftOffset[0] = m_shiftOffset[1] = RealScalar(0.0);
cholmod_start(&m_cholmod);
@@ -194,9 +198,6 @@ class CholmodBase : internal::noncopyable
inline Index cols() const { return m_cholmodFactor->n; }
inline Index rows() const { return m_cholmodFactor->n; }
Derived& derived() { return *static_cast<Derived*>(this); }
const Derived& derived() const { return *static_cast<const Derived*>(this); }
/** \brief Reports whether previous computation was successful.
*
* \returns \c Success if computation was successful,
@@ -216,34 +217,6 @@ class CholmodBase : internal::noncopyable
return derived();
}
/** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
*
* \sa compute()
*/
template<typename Rhs>
inline const internal::solve_retval<CholmodBase, Rhs>
solve(const MatrixBase<Rhs>& b) const
{
eigen_assert(m_isInitialized && "LLT is not initialized.");
eigen_assert(rows()==b.rows()
&& "CholmodDecomposition::solve(): invalid number of rows of the right hand side matrix b");
return internal::solve_retval<CholmodBase, Rhs>(*this, b.derived());
}
/** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
*
* \sa compute()
*/
template<typename Rhs>
inline const internal::sparse_solve_retval<CholmodBase, Rhs>
solve(const SparseMatrixBase<Rhs>& b) const
{
eigen_assert(m_isInitialized && "LLT is not initialized.");
eigen_assert(rows()==b.rows()
&& "CholmodDecomposition::solve(): invalid number of rows of the right hand side matrix b");
return internal::sparse_solve_retval<CholmodBase, Rhs>(*this, b.derived());
}
/** Performs a symbolic decomposition on the sparsity pattern of \a matrix.
*
* This function is particularly useful when solving several problems having the same structure.
@@ -290,7 +263,7 @@ class CholmodBase : internal::noncopyable
#ifndef EIGEN_PARSED_BY_DOXYGEN
/** \internal */
template<typename Rhs,typename Dest>
void _solve(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const
void _solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const
{
eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or symbolic()/numeric()");
const Index size = m_cholmodFactor->n;
@@ -312,7 +285,7 @@ class CholmodBase : internal::noncopyable
/** \internal */
template<typename RhsScalar, int RhsOptions, typename RhsIndex, typename DestScalar, int DestOptions, typename DestIndex>
void _solve(const SparseMatrix<RhsScalar,RhsOptions,RhsIndex> &b, SparseMatrix<DestScalar,DestOptions,DestIndex> &dest) const
void _solve_impl(const SparseMatrix<RhsScalar,RhsOptions,RhsIndex> &b, SparseMatrix<DestScalar,DestOptions,DestIndex> &dest) const
{
eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or symbolic()/numeric()");
const Index size = m_cholmodFactor->n;
@@ -357,7 +330,6 @@ class CholmodBase : internal::noncopyable
cholmod_factor* m_cholmodFactor;
RealScalar m_shiftOffset[2];
mutable ComputationInfo m_info;
bool m_isInitialized;
int m_factorizationIsOk;
int m_analysisIsOk;
};
@@ -572,36 +544,6 @@ class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecom
}
};
namespace internal {
template<typename _MatrixType, int _UpLo, typename Derived, typename Rhs>
struct solve_retval<CholmodBase<_MatrixType,_UpLo,Derived>, Rhs>
: solve_retval_base<CholmodBase<_MatrixType,_UpLo,Derived>, Rhs>
{
typedef CholmodBase<_MatrixType,_UpLo,Derived> Dec;
EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
template<typename Dest> void evalTo(Dest& dst) const
{
dec()._solve(rhs(),dst);
}
};
template<typename _MatrixType, int _UpLo, typename Derived, typename Rhs>
struct sparse_solve_retval<CholmodBase<_MatrixType,_UpLo,Derived>, Rhs>
: sparse_solve_retval_base<CholmodBase<_MatrixType,_UpLo,Derived>, Rhs>
{
typedef CholmodBase<_MatrixType,_UpLo,Derived> Dec;
EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
template<typename Dest> void evalTo(Dest& dst) const
{
dec()._solve(rhs(),dst);
}
};
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_CHOLMODSUPPORT_H
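With CholmodBase now deriving from SparseSolverBase, solve() comes from the base class and dispatches to the _solve_impl() overloads above; a hedged sketch, assuming SuiteSparse/CHOLMOD is installed and linked:

```cpp
#include <Eigen/CholmodSupport>
#include <Eigen/SparseCore>
#include <iostream>

int main() {
  typedef Eigen::SparseMatrix<double> SpMat;
  SpMat A(2, 2);
  A.insert(0, 0) = 4.0;
  A.insert(1, 1) = 2.0;                 // trivially SPD example matrix
  A.makeCompressed();
  Eigen::VectorXd b(2);
  b << 1.0, 2.0;
  Eigen::CholmodSupernodalLLT<SpMat> solver(A);  // explicit, as in this patch
  Eigen::VectorXd x = solver.solve(b);           // SparseSolverBase::solve()
  std::cout << x.transpose() << "\n";
}
```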

View File: Eigen/src/Core/Array.h

@@ -74,6 +74,21 @@ class Array
{
return Base::operator=(other);
}
/** Set all the entries to \a value.
* \sa DenseBase::setConstant(), DenseBase::fill()
*/
/* This overload is needed because the usage of
* using Base::operator=;
* fails on MSVC. Since the code below works with both GCC and MSVC, we skip
* the 'using' declaration. This should be done only for operator=.
*/
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Array& operator=(const Scalar &value)
{
Base::setConstant(value);
return *this;
}
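A one-line usage sketch of the overload added above:

```cpp
#include <Eigen/Core>

int main() {
  Eigen::Array33d a;
  a = 1.5;  // fills every coefficient with 1.5 via Base::setConstant(value)
}
```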
/** Copies the value of the expression \a other into \c *this with automatic resizing.
*
@@ -99,7 +114,7 @@
{
return Base::_set(other);
}
/** Default constructor.
*
* For fixed-size matrices, does nothing.
@@ -144,7 +159,6 @@
}
#endif
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename T>
EIGEN_DEVICE_FUNC
@@ -244,13 +258,6 @@ class Array
*this = other;
}
/** Override MatrixBase::swap() since for dynamic-sized matrices of same type it is enough to swap the
* data pointers.
*/
template<typename OtherDerived>
void swap(ArrayBase<OtherDerived> const & other)
{ this->_swap(other.derived()); }
EIGEN_DEVICE_FUNC inline Index innerStride() const { return 1; }
EIGEN_DEVICE_FUNC inline Index outerStride() const { return this->innerSize(); }

View File: Eigen/src/Core/ArrayBase.h

@@ -64,8 +64,7 @@ template<typename Derived> class ArrayBase
using Base::MaxSizeAtCompileTime;
using Base::IsVectorAtCompileTime;
using Base::Flags;
using Base::CoeffReadCost;
using Base::derived;
using Base::const_cast_derived;
using Base::rows;
@@ -121,8 +120,15 @@ template<typename Derived> class ArrayBase
EIGEN_DEVICE_FUNC
Derived& operator=(const ArrayBase& other)
{
return internal::assign_selector<Derived,Derived>::run(derived(), other.derived());
internal::call_assignment(derived(), other.derived());
return derived();
}
/** Set all the entries to \a value.
* \sa DenseBase::setConstant(), DenseBase::fill() */
EIGEN_DEVICE_FUNC
Derived& operator=(const Scalar &value)
{ Base::setConstant(value); return derived(); }
EIGEN_DEVICE_FUNC
Derived& operator+=(const Scalar& scalar);
@@ -153,9 +159,9 @@ template<typename Derived> class ArrayBase
/** \returns an \link Eigen::MatrixBase Matrix \endlink expression of this array
* \sa MatrixBase::array() */
EIGEN_DEVICE_FUNC
MatrixWrapper<Derived> matrix() { return derived(); }
MatrixWrapper<Derived> matrix() { return MatrixWrapper<Derived>(derived()); }
EIGEN_DEVICE_FUNC
const MatrixWrapper<const Derived> matrix() const { return derived(); }
const MatrixWrapper<const Derived> matrix() const { return MatrixWrapper<const Derived>(derived()); }
// template<typename Dest>
// inline void evalTo(Dest& dst) const { dst = matrix(); }
@@ -186,8 +192,7 @@ template<typename OtherDerived>
EIGEN_STRONG_INLINE Derived &
ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived> &other)
{
SelfCwiseBinaryOp<internal::scalar_difference_op<Scalar>, Derived, OtherDerived> tmp(derived());
tmp = other.derived();
call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar>());
return derived();
}
@@ -200,8 +205,7 @@ template<typename OtherDerived>
EIGEN_STRONG_INLINE Derived &
ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other)
{
SelfCwiseBinaryOp<internal::scalar_sum_op<Scalar>, Derived, OtherDerived> tmp(derived());
tmp = other.derived();
call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar>());
return derived();
}
@@ -214,8 +218,7 @@ template<typename OtherDerived>
EIGEN_STRONG_INLINE Derived &
ArrayBase<Derived>::operator*=(const ArrayBase<OtherDerived>& other)
{
SelfCwiseBinaryOp<internal::scalar_product_op<Scalar>, Derived, OtherDerived> tmp(derived());
tmp = other.derived();
call_assignment(derived(), other.derived(), internal::mul_assign_op<Scalar,typename OtherDerived::Scalar>());
return derived();
}
@@ -228,8 +231,7 @@ template<typename OtherDerived>
EIGEN_STRONG_INLINE Derived &
ArrayBase<Derived>::operator/=(const ArrayBase<OtherDerived>& other)
{
SelfCwiseBinaryOp<internal::scalar_quotient_op<Scalar>, Derived, OtherDerived> tmp(derived());
tmp = other.derived();
call_assignment(derived(), other.derived(), internal::div_assign_op<Scalar>());
return derived();
}
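The four compound operators rewritten above now route through internal::call_assignment with the matching assignment functor; their user-visible, coefficient-wise semantics are unchanged:

```cpp
#include <Eigen/Core>
#include <iostream>

int main() {
  Eigen::Array3d a(1.0, 2.0, 3.0), b(4.0, 5.0, 6.0);
  a += b;  // add_assign_op: coefficient-wise sum
  a -= b;  // sub_assign_op: coefficient-wise difference
  a *= b;  // mul_assign_op: coefficient-wise product
  a /= b;  // div_assign_op: coefficient-wise quotient
  std::cout << a.transpose() << "\n";  // back to (1, 2, 3), up to rounding
}
```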

View File: Eigen/src/Core/ArrayWrapper.h

@@ -44,6 +44,7 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
typedef ArrayBase<ArrayWrapper> Base;
EIGEN_DENSE_PUBLIC_INTERFACE(ArrayWrapper)
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ArrayWrapper)
typedef typename internal::remove_all<ExpressionType>::type NestedExpression;
typedef typename internal::conditional<
internal::is_lvalue<ExpressionType>::value,
@@ -54,7 +55,7 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
typedef typename internal::nested<ExpressionType>::type NestedExpressionType;
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {}
explicit EIGEN_STRONG_INLINE ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {}
EIGEN_DEVICE_FUNC
inline Index rows() const { return m_expression.rows(); }
@@ -186,6 +187,7 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
typedef MatrixBase<MatrixWrapper<ExpressionType> > Base;
EIGEN_DENSE_PUBLIC_INTERFACE(MatrixWrapper)
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(MatrixWrapper)
typedef typename internal::remove_all<ExpressionType>::type NestedExpression;
typedef typename internal::conditional<
internal::is_lvalue<ExpressionType>::value,
@@ -196,7 +198,7 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
typedef typename internal::nested<ExpressionType>::type NestedExpressionType;
EIGEN_DEVICE_FUNC
inline MatrixWrapper(ExpressionType& a_matrix) : m_expression(a_matrix) {}
explicit inline MatrixWrapper(ExpressionType& a_matrix) : m_expression(a_matrix) {}
EIGEN_DEVICE_FUNC
inline Index rows() const { return m_expression.rows(); }
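ArrayWrapper and MatrixWrapper are the types behind .array() and .matrix(); with their constructors made explicit here, internal callers (as in the ArrayBase::matrix() change above) must build them explicitly. User code is unaffected:

```cpp
#include <Eigen/Core>

int main() {
  Eigen::Matrix3d m = Eigen::Matrix3d::Random();
  Eigen::Array33d a = m.array() * 2.0;  // ArrayWrapper: coefficient-wise world
  Eigen::Matrix3d p = a.matrix() * m;   // MatrixWrapper: back to matrix products
  (void)p;
}
```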

View File: Eigen/src/Core/Assign.h

@@ -14,485 +14,6 @@
namespace Eigen {
namespace internal {
/***************************************************************************
* Part 1 : the logic deciding a strategy for traversal and unrolling *
***************************************************************************/
template <typename Derived, typename OtherDerived>
struct assign_traits
{
public:
enum {
DstIsAligned = Derived::Flags & AlignedBit,
DstHasDirectAccess = Derived::Flags & DirectAccessBit,
SrcIsAligned = OtherDerived::Flags & AlignedBit,
JointAlignment = bool(DstIsAligned) && bool(SrcIsAligned) ? Aligned : Unaligned
};
private:
enum {
InnerSize = int(Derived::IsVectorAtCompileTime) ? int(Derived::SizeAtCompileTime)
: int(Derived::Flags)&RowMajorBit ? int(Derived::ColsAtCompileTime)
: int(Derived::RowsAtCompileTime),
InnerMaxSize = int(Derived::IsVectorAtCompileTime) ? int(Derived::MaxSizeAtCompileTime)
: int(Derived::Flags)&RowMajorBit ? int(Derived::MaxColsAtCompileTime)
: int(Derived::MaxRowsAtCompileTime),
MaxSizeAtCompileTime = Derived::SizeAtCompileTime,
PacketSize = packet_traits<typename Derived::Scalar>::size
};
enum {
StorageOrdersAgree = (int(Derived::IsRowMajor) == int(OtherDerived::IsRowMajor)),
MightVectorize = StorageOrdersAgree
&& (int(Derived::Flags) & int(OtherDerived::Flags) & ActualPacketAccessBit),
MayInnerVectorize = MightVectorize && int(InnerSize)!=Dynamic && int(InnerSize)%int(PacketSize)==0
&& int(DstIsAligned) && int(SrcIsAligned),
MayLinearize = StorageOrdersAgree && (int(Derived::Flags) & int(OtherDerived::Flags) & LinearAccessBit),
MayLinearVectorize = MightVectorize && MayLinearize && DstHasDirectAccess
&& (DstIsAligned || MaxSizeAtCompileTime == Dynamic),
/* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
so it's only good for large enough sizes. */
MaySliceVectorize = MightVectorize && DstHasDirectAccess
&& (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=3*PacketSize)
/* slice vectorization can be slow, so we only want it if the slices are big, which is
indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block
in a fixed-size matrix */
};
public:
enum {
Traversal = int(MayInnerVectorize) ? int(InnerVectorizedTraversal)
: int(MayLinearVectorize) ? int(LinearVectorizedTraversal)
: int(MaySliceVectorize) ? int(SliceVectorizedTraversal)
: int(MayLinearize) ? int(LinearTraversal)
: int(DefaultTraversal),
Vectorized = int(Traversal) == InnerVectorizedTraversal
|| int(Traversal) == LinearVectorizedTraversal
|| int(Traversal) == SliceVectorizedTraversal
};
private:
enum {
UnrollingLimit = EIGEN_UNROLLING_LIMIT * (Vectorized ? int(PacketSize) : 1),
MayUnrollCompletely = int(Derived::SizeAtCompileTime) != Dynamic
&& int(OtherDerived::CoeffReadCost) != Dynamic
&& int(Derived::SizeAtCompileTime) * int(OtherDerived::CoeffReadCost) <= int(UnrollingLimit),
MayUnrollInner = int(InnerSize) != Dynamic
&& int(OtherDerived::CoeffReadCost) != Dynamic
&& int(InnerSize) * int(OtherDerived::CoeffReadCost) <= int(UnrollingLimit)
};
public:
enum {
Unrolling = (int(Traversal) == int(InnerVectorizedTraversal) || int(Traversal) == int(DefaultTraversal))
? (
int(MayUnrollCompletely) ? int(CompleteUnrolling)
: int(MayUnrollInner) ? int(InnerUnrolling)
: int(NoUnrolling)
)
: int(Traversal) == int(LinearVectorizedTraversal)
? ( bool(MayUnrollCompletely) && bool(DstIsAligned) ? int(CompleteUnrolling) : int(NoUnrolling) )
: int(Traversal) == int(LinearTraversal)
? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling) : int(NoUnrolling) )
: int(NoUnrolling)
};
#ifdef EIGEN_DEBUG_ASSIGN
static void debug()
{
EIGEN_DEBUG_VAR(DstIsAligned)
EIGEN_DEBUG_VAR(SrcIsAligned)
EIGEN_DEBUG_VAR(JointAlignment)
EIGEN_DEBUG_VAR(Derived::SizeAtCompileTime)
EIGEN_DEBUG_VAR(OtherDerived::CoeffReadCost)
EIGEN_DEBUG_VAR(InnerSize)
EIGEN_DEBUG_VAR(InnerMaxSize)
EIGEN_DEBUG_VAR(PacketSize)
EIGEN_DEBUG_VAR(StorageOrdersAgree)
EIGEN_DEBUG_VAR(MightVectorize)
EIGEN_DEBUG_VAR(MayLinearize)
EIGEN_DEBUG_VAR(MayInnerVectorize)
EIGEN_DEBUG_VAR(MayLinearVectorize)
EIGEN_DEBUG_VAR(MaySliceVectorize)
EIGEN_DEBUG_VAR(Traversal)
EIGEN_DEBUG_VAR(UnrollingLimit)
EIGEN_DEBUG_VAR(MayUnrollCompletely)
EIGEN_DEBUG_VAR(MayUnrollInner)
EIGEN_DEBUG_VAR(Unrolling)
}
#endif
};
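The EIGEN_DEBUG_ASSIGN hook above prints the traversal and unrolling decision; in the pre-evaluator path further down in this file, lazyAssign() calls assign_traits<>::debug() when the macro is defined. A sketch:

```cpp
#define EIGEN_DEBUG_ASSIGN  // must be defined before including Eigen
#include <Eigen/Core>

int main() {
  Eigen::Matrix4f a, b;
  b.setRandom();
  a = b;  // prints DstIsAligned, Traversal, Unrolling, ... for this assignment
}
```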
/***************************************************************************
* Part 2 : meta-unrollers
***************************************************************************/
/************************
*** Default traversal ***
************************/
template<typename Derived1, typename Derived2, int Index, int Stop>
struct assign_DefaultTraversal_CompleteUnrolling
{
enum {
outer = Index / Derived1::InnerSizeAtCompileTime,
inner = Index % Derived1::InnerSizeAtCompileTime
};
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
{
dst.copyCoeffByOuterInner(outer, inner, src);
assign_DefaultTraversal_CompleteUnrolling<Derived1, Derived2, Index+1, Stop>::run(dst, src);
}
};
template<typename Derived1, typename Derived2, int Stop>
struct assign_DefaultTraversal_CompleteUnrolling<Derived1, Derived2, Stop, Stop>
{
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &) {}
};
template<typename Derived1, typename Derived2, int Index, int Stop>
struct assign_DefaultTraversal_InnerUnrolling
{
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src, typename Derived1::Index outer)
{
dst.copyCoeffByOuterInner(outer, Index, src);
assign_DefaultTraversal_InnerUnrolling<Derived1, Derived2, Index+1, Stop>::run(dst, src, outer);
}
};
template<typename Derived1, typename Derived2, int Stop>
struct assign_DefaultTraversal_InnerUnrolling<Derived1, Derived2, Stop, Stop>
{
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &, typename Derived1::Index) {}
};
/***********************
*** Linear traversal ***
***********************/
template<typename Derived1, typename Derived2, int Index, int Stop>
struct assign_LinearTraversal_CompleteUnrolling
{
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
{
dst.copyCoeff(Index, src);
assign_LinearTraversal_CompleteUnrolling<Derived1, Derived2, Index+1, Stop>::run(dst, src);
}
};
template<typename Derived1, typename Derived2, int Stop>
struct assign_LinearTraversal_CompleteUnrolling<Derived1, Derived2, Stop, Stop>
{
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &) {}
};
/**************************
*** Inner vectorization ***
**************************/
template<typename Derived1, typename Derived2, int Index, int Stop>
struct assign_innervec_CompleteUnrolling
{
enum {
outer = Index / Derived1::InnerSizeAtCompileTime,
inner = Index % Derived1::InnerSizeAtCompileTime,
JointAlignment = assign_traits<Derived1,Derived2>::JointAlignment
};
static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
{
dst.template copyPacketByOuterInner<Derived2, Aligned, JointAlignment>(outer, inner, src);
assign_innervec_CompleteUnrolling<Derived1, Derived2,
Index+packet_traits<typename Derived1::Scalar>::size, Stop>::run(dst, src);
}
};
template<typename Derived1, typename Derived2, int Stop>
struct assign_innervec_CompleteUnrolling<Derived1, Derived2, Stop, Stop>
{
static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &) {}
};
template<typename Derived1, typename Derived2, int Index, int Stop>
struct assign_innervec_InnerUnrolling
{
static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src, typename Derived1::Index outer)
{
dst.template copyPacketByOuterInner<Derived2, Aligned, Aligned>(outer, Index, src);
assign_innervec_InnerUnrolling<Derived1, Derived2,
Index+packet_traits<typename Derived1::Scalar>::size, Stop>::run(dst, src, outer);
}
};
template<typename Derived1, typename Derived2, int Stop>
struct assign_innervec_InnerUnrolling<Derived1, Derived2, Stop, Stop>
{
static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &, typename Derived1::Index) {}
};
/***************************************************************************
* Part 3 : implementation of all cases
***************************************************************************/
template<typename Derived1, typename Derived2,
int Traversal = assign_traits<Derived1, Derived2>::Traversal,
int Unrolling = assign_traits<Derived1, Derived2>::Unrolling,
int Version = Specialized>
struct assign_impl;
/************************
*** Default traversal ***
************************/
template<typename Derived1, typename Derived2, int Unrolling, int Version>
struct assign_impl<Derived1, Derived2, InvalidTraversal, Unrolling, Version>
{
EIGEN_DEVICE_FUNC
static inline void run(Derived1 &, const Derived2 &) { }
};
template<typename Derived1, typename Derived2, int Version>
struct assign_impl<Derived1, Derived2, DefaultTraversal, NoUnrolling, Version>
{
typedef typename Derived1::Index Index;
EIGEN_DEVICE_FUNC
static inline void run(Derived1 &dst, const Derived2 &src)
{
const Index innerSize = dst.innerSize();
const Index outerSize = dst.outerSize();
for(Index outer = 0; outer < outerSize; ++outer)
for(Index inner = 0; inner < innerSize; ++inner)
dst.copyCoeffByOuterInner(outer, inner, src);
}
};
template<typename Derived1, typename Derived2, int Version>
struct assign_impl<Derived1, Derived2, DefaultTraversal, CompleteUnrolling, Version>
{
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
{
assign_DefaultTraversal_CompleteUnrolling<Derived1, Derived2, 0, Derived1::SizeAtCompileTime>
::run(dst, src);
}
};
template<typename Derived1, typename Derived2, int Version>
struct assign_impl<Derived1, Derived2, DefaultTraversal, InnerUnrolling, Version>
{
typedef typename Derived1::Index Index;
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
{
const Index outerSize = dst.outerSize();
for(Index outer = 0; outer < outerSize; ++outer)
assign_DefaultTraversal_InnerUnrolling<Derived1, Derived2, 0, Derived1::InnerSizeAtCompileTime>
::run(dst, src, outer);
}
};
/***********************
*** Linear traversal ***
***********************/
template<typename Derived1, typename Derived2, int Version>
struct assign_impl<Derived1, Derived2, LinearTraversal, NoUnrolling, Version>
{
typedef typename Derived1::Index Index;
EIGEN_DEVICE_FUNC
static inline void run(Derived1 &dst, const Derived2 &src)
{
const Index size = dst.size();
for(Index i = 0; i < size; ++i)
dst.copyCoeff(i, src);
}
};
template<typename Derived1, typename Derived2, int Version>
struct assign_impl<Derived1, Derived2, LinearTraversal, CompleteUnrolling, Version>
{
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
{
assign_LinearTraversal_CompleteUnrolling<Derived1, Derived2, 0, Derived1::SizeAtCompileTime>
::run(dst, src);
}
};
/**************************
*** Inner vectorization ***
**************************/
template<typename Derived1, typename Derived2, int Version>
struct assign_impl<Derived1, Derived2, InnerVectorizedTraversal, NoUnrolling, Version>
{
typedef typename Derived1::Index Index;
static inline void run(Derived1 &dst, const Derived2 &src)
{
const Index innerSize = dst.innerSize();
const Index outerSize = dst.outerSize();
const Index packetSize = packet_traits<typename Derived1::Scalar>::size;
for(Index outer = 0; outer < outerSize; ++outer)
for(Index inner = 0; inner < innerSize; inner+=packetSize)
dst.template copyPacketByOuterInner<Derived2, Aligned, Aligned>(outer, inner, src);
}
};
template<typename Derived1, typename Derived2, int Version>
struct assign_impl<Derived1, Derived2, InnerVectorizedTraversal, CompleteUnrolling, Version>
{
static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
{
assign_innervec_CompleteUnrolling<Derived1, Derived2, 0, Derived1::SizeAtCompileTime>
::run(dst, src);
}
};
template<typename Derived1, typename Derived2, int Version>
struct assign_impl<Derived1, Derived2, InnerVectorizedTraversal, InnerUnrolling, Version>
{
typedef typename Derived1::Index Index;
static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
{
const Index outerSize = dst.outerSize();
for(Index outer = 0; outer < outerSize; ++outer)
assign_innervec_InnerUnrolling<Derived1, Derived2, 0, Derived1::InnerSizeAtCompileTime>
::run(dst, src, outer);
}
};
/***************************
*** Linear vectorization ***
***************************/
template <bool IsAligned = false>
struct unaligned_assign_impl
{
template <typename Derived, typename OtherDerived>
static EIGEN_STRONG_INLINE void run(const Derived&, OtherDerived&, typename Derived::Index, typename Derived::Index) {}
};
template <>
struct unaligned_assign_impl<false>
{
// MSVC must not inline this function. If it does, it fails to optimize the
// packet access path.
#ifdef _MSC_VER
template <typename Derived, typename OtherDerived>
static EIGEN_DONT_INLINE void run(const Derived& src, OtherDerived& dst, typename Derived::Index start, typename Derived::Index end)
#else
template <typename Derived, typename OtherDerived>
static EIGEN_STRONG_INLINE void run(const Derived& src, OtherDerived& dst, typename Derived::Index start, typename Derived::Index end)
#endif
{
for (typename Derived::Index index = start; index < end; ++index)
dst.copyCoeff(index, src);
}
};
template<typename Derived1, typename Derived2, int Version>
struct assign_impl<Derived1, Derived2, LinearVectorizedTraversal, NoUnrolling, Version>
{
typedef typename Derived1::Index Index;
static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
{
const Index size = dst.size();
typedef packet_traits<typename Derived1::Scalar> PacketTraits;
enum {
packetSize = PacketTraits::size,
dstAlignment = PacketTraits::AlignedOnScalar ? Aligned : int(assign_traits<Derived1,Derived2>::DstIsAligned) ,
srcAlignment = assign_traits<Derived1,Derived2>::JointAlignment
};
const Index alignedStart = assign_traits<Derived1,Derived2>::DstIsAligned ? 0
: internal::first_aligned(&dst.coeffRef(0), size);
const Index alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize;
unaligned_assign_impl<assign_traits<Derived1,Derived2>::DstIsAligned!=0>::run(src,dst,0,alignedStart);
for(Index index = alignedStart; index < alignedEnd; index += packetSize)
{
dst.template copyPacket<Derived2, dstAlignment, srcAlignment>(index, src);
}
unaligned_assign_impl<>::run(src,dst,alignedEnd,size);
}
};
template<typename Derived1, typename Derived2, int Version>
struct assign_impl<Derived1, Derived2, LinearVectorizedTraversal, CompleteUnrolling, Version>
{
typedef typename Derived1::Index Index;
static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
{
enum { size = Derived1::SizeAtCompileTime,
packetSize = packet_traits<typename Derived1::Scalar>::size,
alignedSize = (size/packetSize)*packetSize };
assign_innervec_CompleteUnrolling<Derived1, Derived2, 0, alignedSize>::run(dst, src);
assign_DefaultTraversal_CompleteUnrolling<Derived1, Derived2, alignedSize, size>::run(dst, src);
}
};
/**************************
*** Slice vectorization ***
***************************/
template<typename Derived1, typename Derived2, int Version>
struct assign_impl<Derived1, Derived2, SliceVectorizedTraversal, NoUnrolling, Version>
{
typedef typename Derived1::Index Index;
static inline void run(Derived1 &dst, const Derived2 &src)
{
typedef packet_traits<typename Derived1::Scalar> PacketTraits;
enum {
packetSize = PacketTraits::size,
alignable = PacketTraits::AlignedOnScalar,
dstAlignment = alignable ? Aligned : int(assign_traits<Derived1,Derived2>::DstIsAligned) ,
srcAlignment = assign_traits<Derived1,Derived2>::JointAlignment
};
const Index packetAlignedMask = packetSize - 1;
const Index innerSize = dst.innerSize();
const Index outerSize = dst.outerSize();
const Index alignedStep = alignable ? (packetSize - dst.outerStride() % packetSize) & packetAlignedMask : 0;
Index alignedStart = ((!alignable) || assign_traits<Derived1,Derived2>::DstIsAligned) ? 0
: internal::first_aligned(&dst.coeffRef(0,0), innerSize);
for(Index outer = 0; outer < outerSize; ++outer)
{
const Index alignedEnd = alignedStart + ((innerSize-alignedStart) & ~packetAlignedMask);
// do the non-vectorizable part of the assignment
for(Index inner = 0; inner<alignedStart ; ++inner)
dst.copyCoeffByOuterInner(outer, inner, src);
// do the vectorizable part of the assignment
for(Index inner = alignedStart; inner<alignedEnd; inner+=packetSize)
dst.template copyPacketByOuterInner<Derived2, dstAlignment, Unaligned>(outer, inner, src);
// do the non-vectorizable part of the assignment
for(Index inner = alignedEnd; inner<innerSize ; ++inner)
dst.copyCoeffByOuterInner(outer, inner, src);
alignedStart = std::min<Index>((alignedStart+alignedStep)%packetSize, innerSize);
}
}
};
} // end namespace internal
/***************************************************************************
* Part 4 : implementation of DenseBase methods
***************************************************************************/
template<typename Derived>
template<typename OtherDerived>
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
@@ -506,91 +27,35 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Derived,OtherDerived)
EIGEN_STATIC_ASSERT(SameType,YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
#ifdef EIGEN_TEST_EVALUATORS
#ifdef EIGEN_DEBUG_ASSIGN
internal::copy_using_evaluator_traits<Derived, OtherDerived>::debug();
#endif
eigen_assert(rows() == other.rows() && cols() == other.cols());
internal::call_dense_assignment_loop(derived(),other.derived());
internal::call_assignment_no_alias(derived(),other.derived());
#else // EIGEN_TEST_EVALUATORS
#ifdef EIGEN_DEBUG_ASSIGN
internal::assign_traits<Derived, OtherDerived>::debug();
#endif
eigen_assert(rows() == other.rows() && cols() == other.cols());
internal::assign_impl<Derived, OtherDerived, int(SameType) ? int(internal::assign_traits<Derived, OtherDerived>::Traversal)
: int(InvalidTraversal)>::run(derived(),other.derived());
#endif // EIGEN_TEST_EVALUATORS
#ifndef EIGEN_NO_DEBUG
checkTransposeAliasing(other.derived());
#endif
return derived();
}
namespace internal {
template<typename Derived, typename OtherDerived,
bool EvalBeforeAssigning = (int(internal::traits<OtherDerived>::Flags) & EvalBeforeAssigningBit) != 0,
bool NeedToTranspose = ((int(Derived::RowsAtCompileTime) == 1 && int(OtherDerived::ColsAtCompileTime) == 1)
| // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&".
// revert to || as soon as not needed anymore.
(int(Derived::ColsAtCompileTime) == 1 && int(OtherDerived::RowsAtCompileTime) == 1))
&& int(Derived::SizeAtCompileTime) != 1>
struct assign_selector;
template<typename Derived, typename OtherDerived>
struct assign_selector<Derived,OtherDerived,false,false> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.derived()); }
template<typename ActualDerived, typename ActualOtherDerived>
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Derived& evalTo(ActualDerived& dst, const ActualOtherDerived& other) { other.evalTo(dst); return dst; }
};
template<typename Derived, typename OtherDerived>
struct assign_selector<Derived,OtherDerived,true,false> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.eval()); }
};
template<typename Derived, typename OtherDerived>
struct assign_selector<Derived,OtherDerived,false,true> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.transpose()); }
template<typename ActualDerived, typename ActualOtherDerived>
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Derived& evalTo(ActualDerived& dst, const ActualOtherDerived& other) { Transpose<ActualDerived> dstTrans(dst); other.evalTo(dstTrans); return dst; }
};
template<typename Derived, typename OtherDerived>
struct assign_selector<Derived,OtherDerived,true,true> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.transpose().eval()); }
};
} // end namespace internal
template<typename Derived>
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator=(const DenseBase<OtherDerived>& other)
{
return internal::assign_selector<Derived,OtherDerived>::run(derived(), other.derived());
internal::call_assignment(derived(), other.derived());
return derived();
}
template<typename Derived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator=(const DenseBase& other)
{
return internal::assign_selector<Derived,Derived>::run(derived(), other.derived());
internal::call_assignment(derived(), other.derived());
return derived();
}
template<typename Derived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const MatrixBase& other)
{
return internal::assign_selector<Derived,Derived>::run(derived(), other.derived());
internal::call_assignment(derived(), other.derived());
return derived();
}
template<typename Derived>
@@ -598,7 +63,8 @@ template <typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const DenseBase<OtherDerived>& other)
{
return internal::assign_selector<Derived,OtherDerived>::run(derived(), other.derived());
internal::call_assignment(derived(), other.derived());
return derived();
}
template<typename Derived>
@@ -606,7 +72,8 @@ template <typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const EigenBase<OtherDerived>& other)
{
return internal::assign_selector<Derived,OtherDerived,false>::evalTo(derived(), other.derived());
internal::call_assignment(derived(), other.derived());
return derived();
}
template<typename Derived>
@@ -614,7 +81,8 @@ template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
{
return internal::assign_selector<Derived,OtherDerived,false>::evalTo(derived(), other.derived());
other.derived().evalTo(derived());
return derived();
}
} // end namespace Eigen
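Taken together, these overloads route every plain operator= through internal::call_assignment, including the automatic vector transposition handled by the NeedToTranspose logic above; a small usage sketch:

#include <Eigen/Dense>

int main() {
  Eigen::Vector3f col(1.f, 2.f, 3.f);
  Eigen::RowVector3f row;
  row = col;  // vector-to-vector assignment: the transpose is inserted automatically
}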
View File
@@ -2,7 +2,7 @@
// for linear algebra.
//
// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
// Copyright (C) 2011-2013 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2011-2012 Jitse Niesen <jitse@maths.leeds.ac.uk>
//
// This Source Code Form is subject to the terms of the Mozilla
@@ -24,37 +24,46 @@ namespace internal {
// copy_using_evaluator_traits is based on assign_traits
template <typename Derived, typename OtherDerived>
template <typename DstEvaluator, typename SrcEvaluator, typename AssignFunc>
struct copy_using_evaluator_traits
{
typedef typename DstEvaluator::XprType Dst;
enum {
DstFlags = DstEvaluator::Flags,
SrcFlags = SrcEvaluator::Flags
};
public:
enum {
DstIsAligned = Derived::Flags & AlignedBit,
DstHasDirectAccess = Derived::Flags & DirectAccessBit,
SrcIsAligned = OtherDerived::Flags & AlignedBit,
JointAlignment = bool(DstIsAligned) && bool(SrcIsAligned) ? Aligned : Unaligned,
SrcEvalBeforeAssign = (evaluator_traits<OtherDerived>::HasEvalTo == 1)
DstIsAligned = DstFlags & AlignedBit,
DstHasDirectAccess = DstFlags & DirectAccessBit,
SrcIsAligned = SrcFlags & AlignedBit,
JointAlignment = bool(DstIsAligned) && bool(SrcIsAligned) ? Aligned : Unaligned
};
private:
enum {
InnerSize = int(Derived::IsVectorAtCompileTime) ? int(Derived::SizeAtCompileTime)
: int(Derived::Flags)&RowMajorBit ? int(Derived::ColsAtCompileTime)
: int(Derived::RowsAtCompileTime),
InnerMaxSize = int(Derived::IsVectorAtCompileTime) ? int(Derived::MaxSizeAtCompileTime)
: int(Derived::Flags)&RowMajorBit ? int(Derived::MaxColsAtCompileTime)
: int(Derived::MaxRowsAtCompileTime),
MaxSizeAtCompileTime = Derived::SizeAtCompileTime,
PacketSize = packet_traits<typename Derived::Scalar>::size
InnerSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::SizeAtCompileTime)
: int(DstFlags)&RowMajorBit ? int(Dst::ColsAtCompileTime)
: int(Dst::RowsAtCompileTime),
InnerMaxSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::MaxSizeAtCompileTime)
: int(DstFlags)&RowMajorBit ? int(Dst::MaxColsAtCompileTime)
: int(Dst::MaxRowsAtCompileTime),
MaxSizeAtCompileTime = Dst::SizeAtCompileTime,
PacketSize = packet_traits<typename Dst::Scalar>::size
};
enum {
StorageOrdersAgree = (int(Derived::IsRowMajor) == int(OtherDerived::IsRowMajor)),
DstIsRowMajor = DstFlags&RowMajorBit,
SrcIsRowMajor = SrcFlags&RowMajorBit,
StorageOrdersAgree = (int(DstIsRowMajor) == int(SrcIsRowMajor)),
MightVectorize = StorageOrdersAgree
&& (int(Derived::Flags) & int(OtherDerived::Flags) & ActualPacketAccessBit),
&& (int(DstFlags) & int(SrcFlags) & ActualPacketAccessBit)
&& (functor_traits<AssignFunc>::PacketAccess),
MayInnerVectorize = MightVectorize && int(InnerSize)!=Dynamic && int(InnerSize)%int(PacketSize)==0
&& int(DstIsAligned) && int(SrcIsAligned),
MayLinearize = StorageOrdersAgree && (int(Derived::Flags) & int(OtherDerived::Flags) & LinearAccessBit),
MayLinearize = StorageOrdersAgree && (int(DstFlags) & int(SrcFlags) & LinearAccessBit),
MayLinearVectorize = MightVectorize && MayLinearize && DstHasDirectAccess
&& (DstIsAligned || MaxSizeAtCompileTime == Dynamic),
/* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
@@ -68,8 +77,7 @@ private:
public:
enum {
Traversal = int(SrcEvalBeforeAssign) ? int(AllAtOnceTraversal)
: int(MayInnerVectorize) ? int(InnerVectorizedTraversal)
Traversal = int(MayInnerVectorize) ? int(InnerVectorizedTraversal)
: int(MayLinearVectorize) ? int(LinearVectorizedTraversal)
: int(MaySliceVectorize) ? int(SliceVectorizedTraversal)
: int(MayLinearize) ? int(LinearTraversal)
@@ -82,12 +90,12 @@ public:
private:
enum {
UnrollingLimit = EIGEN_UNROLLING_LIMIT * (Vectorized ? int(PacketSize) : 1),
MayUnrollCompletely = int(Derived::SizeAtCompileTime) != Dynamic
&& int(OtherDerived::CoeffReadCost) != Dynamic
&& int(Derived::SizeAtCompileTime) * int(OtherDerived::CoeffReadCost) <= int(UnrollingLimit),
MayUnrollCompletely = int(Dst::SizeAtCompileTime) != Dynamic
&& int(SrcEvaluator::CoeffReadCost) != Dynamic
&& int(Dst::SizeAtCompileTime) * int(SrcEvaluator::CoeffReadCost) <= int(UnrollingLimit),
MayUnrollInner = int(InnerSize) != Dynamic
&& int(OtherDerived::CoeffReadCost) != Dynamic
&& int(InnerSize) * int(OtherDerived::CoeffReadCost) <= int(UnrollingLimit)
&& int(SrcEvaluator::CoeffReadCost) != Dynamic
&& int(InnerSize) * int(SrcEvaluator::CoeffReadCost) <= int(UnrollingLimit)
};
public:
@@ -110,6 +118,12 @@ public:
#ifdef EIGEN_DEBUG_ASSIGN
static void debug()
{
std::cerr << "DstXpr: " << typeid(typename DstEvaluator::XprType).name() << std::endl;
std::cerr << "SrcXpr: " << typeid(typename SrcEvaluator::XprType).name() << std::endl;
std::cerr.setf(std::ios::hex, std::ios::basefield);
EIGEN_DEBUG_VAR(DstFlags)
EIGEN_DEBUG_VAR(SrcFlags)
std::cerr.unsetf(std::ios::hex);
EIGEN_DEBUG_VAR(DstIsAligned)
EIGEN_DEBUG_VAR(SrcIsAligned)
EIGEN_DEBUG_VAR(JointAlignment)
@@ -127,6 +141,7 @@ public:
EIGEN_DEBUG_VAR(MayUnrollCompletely)
EIGEN_DEBUG_VAR(MayUnrollInner)
EIGEN_DEBUG_VAR(Unrolling)
std::cerr << std::endl;
}
#endif
};
@@ -142,6 +157,7 @@ public:
template<typename Kernel, int Index, int Stop>
struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling
{
// FIXME: this is not very clean, perhaps this information should be provided by the kernel?
typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
typedef typename DstEvaluatorType::XprType DstXprType;
@@ -150,7 +166,7 @@ struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling
inner = Index % DstXprType::InnerSizeAtCompileTime
};
static EIGEN_STRONG_INLINE void run(Kernel &kernel)
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
{
kernel.assignCoeffByOuterInner(outer, inner);
copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, Index+1, Stop>::run(kernel);
@@ -160,13 +176,13 @@ struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling
template<typename Kernel, int Stop>
struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, Stop, Stop>
{
static EIGEN_STRONG_INLINE void run(Kernel&) { }
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { }
};
template<typename Kernel, int Index, int Stop>
struct copy_using_evaluator_DefaultTraversal_InnerUnrolling
{
static EIGEN_STRONG_INLINE void run(Kernel &kernel, int outer)
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel, typename Kernel::Index outer)
{
kernel.assignCoeffByOuterInner(outer, Index);
copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Index+1, Stop>::run(kernel, outer);
@@ -176,7 +192,7 @@ struct copy_using_evaluator_DefaultTraversal_InnerUnrolling
template<typename Kernel, int Stop>
struct copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Stop, Stop>
{
static EIGEN_STRONG_INLINE void run(Kernel&, int) { }
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&, typename Kernel::Index) { }
};
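These unrollers are plain compile-time recursion over an index range. The same pattern in isolation, with hypothetical names (unroll_sketch and print_index are not Eigen identifiers):

#include <cstdio>

template<int Index, int Stop>
struct unroll_sketch {
  template<typename Func>
  static void run(Func& f) { f(Index); unroll_sketch<Index+1, Stop>::run(f); }
};

template<int Stop>
struct unroll_sketch<Stop, Stop> {  // terminator: Index == Stop, nothing left to do
  template<typename Func>
  static void run(Func&) {}
};

struct print_index {
  void operator()(int i) const { std::printf("%d\n", i); }
};

int main() {
  print_index p;
  unroll_sketch<0, 4>::run(p);  // expands at compile time to p(0); p(1); p(2); p(3);
}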
/***********************
@@ -186,7 +202,7 @@ struct copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Stop, Stop>
template<typename Kernel, int Index, int Stop>
struct copy_using_evaluator_LinearTraversal_CompleteUnrolling
{
static EIGEN_STRONG_INLINE void run(Kernel& kernel)
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel)
{
kernel.assignCoeff(Index);
copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Index+1, Stop>::run(kernel);
@@ -196,7 +212,7 @@ struct copy_using_evaluator_LinearTraversal_CompleteUnrolling
template<typename Kernel, int Stop>
struct copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Stop, Stop>
{
static EIGEN_STRONG_INLINE void run(Kernel&) { }
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { }
};
/**************************
@@ -206,16 +222,17 @@ struct copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Stop, Stop
template<typename Kernel, int Index, int Stop>
struct copy_using_evaluator_innervec_CompleteUnrolling
{
// FIXME: this is not very clean, perhaps this information should be provided by the kernel?
typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
typedef typename DstEvaluatorType::XprType DstXprType;
enum {
outer = Index / DstXprType::InnerSizeAtCompileTime,
inner = Index % DstXprType::InnerSizeAtCompileTime,
JointAlignment = Kernel::AssignmentTraits::JointAlignment
};
static EIGEN_STRONG_INLINE void run(Kernel &kernel)
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
{
kernel.template assignPacketByOuterInner<Aligned, JointAlignment>(outer, inner);
enum { NextIndex = Index + packet_traits<typename DstXprType::Scalar>::size };
@@ -226,17 +243,16 @@ struct copy_using_evaluator_innervec_CompleteUnrolling
template<typename Kernel, int Stop>
struct copy_using_evaluator_innervec_CompleteUnrolling<Kernel, Stop, Stop>
{
static EIGEN_STRONG_INLINE void run(Kernel&) { }
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { }
};
template<typename Kernel, int Index, int Stop>
struct copy_using_evaluator_innervec_InnerUnrolling
{
static EIGEN_STRONG_INLINE void run(Kernel &kernel, int outer)
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel, typename Kernel::Index outer)
{
kernel.template assignPacketByOuterInner<Aligned, Aligned>(outer, Index);
typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
enum { NextIndex = Index + packet_traits<typename DstXprType::Scalar>::size };
enum { NextIndex = Index + packet_traits<typename Kernel::Scalar>::size };
copy_using_evaluator_innervec_InnerUnrolling<Kernel, NextIndex, Stop>::run(kernel, outer);
}
};
@@ -244,7 +260,7 @@ struct copy_using_evaluator_innervec_InnerUnrolling
template<typename Kernel, int Stop>
struct copy_using_evaluator_innervec_InnerUnrolling<Kernel, Stop, Stop>
{
static EIGEN_STRONG_INLINE void run(Kernel &, int) { }
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &, typename Kernel::Index) { }
};
/***************************************************************************
@@ -265,7 +281,7 @@ struct dense_assignment_loop;
template<typename Kernel>
struct dense_assignment_loop<Kernel, DefaultTraversal, NoUnrolling>
{
static void run(Kernel &kernel)
EIGEN_DEVICE_FUNC static void run(Kernel &kernel)
{
typedef typename Kernel::Index Index;
@@ -280,7 +296,7 @@ struct dense_assignment_loop<Kernel, DefaultTraversal, NoUnrolling>
template<typename Kernel>
struct dense_assignment_loop<Kernel, DefaultTraversal, CompleteUnrolling>
{
static EIGEN_STRONG_INLINE void run(Kernel &kernel)
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
{
typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
@@ -291,7 +307,7 @@ template<typename Kernel>
struct dense_assignment_loop<Kernel, DefaultTraversal, InnerUnrolling>
{
typedef typename Kernel::Index Index;
static EIGEN_STRONG_INLINE void run(Kernel &kernel)
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
{
typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
@@ -314,7 +330,7 @@ struct unaligned_dense_assignment_loop
{
// if IsAligned = true, then do nothing
template <typename Kernel>
static EIGEN_STRONG_INLINE void run(Kernel&, typename Kernel::Index, typename Kernel::Index) {}
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&, typename Kernel::Index, typename Kernel::Index) {}
};
template <>
@@ -323,14 +339,14 @@ struct unaligned_dense_assignment_loop<false>
// MSVC must not inline this function. If it does, it fails to optimize the
// packet access path.
// FIXME check which version exhibits this issue
#ifdef _MSC_VER
#if EIGEN_COMP_MSVC
template <typename Kernel>
static EIGEN_DONT_INLINE void run(Kernel &kernel,
typename Kernel::Index start,
typename Kernel::Index end)
#else
template <typename Kernel>
static EIGEN_STRONG_INLINE void run(Kernel &kernel,
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel,
typename Kernel::Index start,
typename Kernel::Index end)
#endif
@@ -343,7 +359,7 @@ struct unaligned_dense_assignment_loop<false>
template<typename Kernel>
struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, NoUnrolling>
{
static EIGEN_STRONG_INLINE void run(Kernel &kernel)
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
{
typedef typename Kernel::Index Index;
@@ -371,7 +387,7 @@ template<typename Kernel>
struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, CompleteUnrolling>
{
typedef typename Kernel::Index Index;
static EIGEN_STRONG_INLINE void run(Kernel &kernel)
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
{
typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
@@ -391,7 +407,7 @@ struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, CompleteUnrollin
template<typename Kernel>
struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, NoUnrolling>
{
static inline void run(Kernel &kernel)
EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel)
{
typedef typename Kernel::Index Index;
@@ -407,7 +423,7 @@ struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, NoUnrolling>
template<typename Kernel>
struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, CompleteUnrolling>
{
static EIGEN_STRONG_INLINE void run(Kernel &kernel)
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
{
typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
copy_using_evaluator_innervec_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
@@ -418,7 +434,7 @@ template<typename Kernel>
struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, InnerUnrolling>
{
typedef typename Kernel::Index Index;
static EIGEN_STRONG_INLINE void run(Kernel &kernel)
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
{
typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
const Index outerSize = kernel.outerSize();
@@ -434,7 +450,7 @@ struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, InnerUnrolling>
template<typename Kernel>
struct dense_assignment_loop<Kernel, LinearTraversal, NoUnrolling>
{
static inline void run(Kernel &kernel)
EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel)
{
typedef typename Kernel::Index Index;
const Index size = kernel.size();
@@ -446,7 +462,7 @@ struct dense_assignment_loop<Kernel, LinearTraversal, NoUnrolling>
template<typename Kernel>
struct dense_assignment_loop<Kernel, LinearTraversal, CompleteUnrolling>
{
static EIGEN_STRONG_INLINE void run(Kernel &kernel)
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
{
typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
@@ -460,7 +476,7 @@ struct dense_assignment_loop<Kernel, LinearTraversal, CompleteUnrolling>
template<typename Kernel>
struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
{
static inline void run(Kernel &kernel)
EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel)
{
typedef typename Kernel::Index Index;
typedef packet_traits<typename Kernel::Scalar> PacketTraits;
@@ -496,25 +512,8 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
}
};
/****************************
*** All-at-once traversal ***
****************************/
// TODO: this 'AllAtOnceTraversal' should be dropped or caught earlier (Gael)
// Indeed, what to do with the kernel's functor??
template<typename Kernel>
struct dense_assignment_loop<Kernel, AllAtOnceTraversal, NoUnrolling>
{
static inline void run(Kernel & kernel)
{
// Evaluate rhs in a temporary to prevent aliasing problems in a = a * a;
// TODO: Do not pass the xpr object to evalTo() (Jitse)
kernel.srcEvaluator().evalTo(kernel.dstEvaluator(), kernel.dstExpression());
}
};
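The aliasing problem this traversal works around is the classic a = a * a case; a short sketch of the user-level behavior:

#include <Eigen/Dense>

int main() {
  Eigen::Matrix2f a;
  a << 1.f, 2.f,
       3.f, 4.f;
  a = a * a;  // safe: the product is evaluated into a temporary before being copied into 'a'
  // a.noalias() = a * a;  // would be wrong: 'a' would be overwritten while still being read
}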
/***************************************************************************
* Part 4 : Generic Assignment routine
* Part 4 : Generic dense assignment kernel
***************************************************************************/
// This class generalizes the assignment of a coefficient (or packet) from one dense evaluator
@@ -523,7 +522,7 @@ struct dense_assignment_loop<Kernel, AllAtOnceTraversal, NoUnrolling>
// This abstraction level makes it possible to keep the evaluation loops as simple and as generic as possible.
// One can customize the assignment using this generic dense_assignment_kernel with different
// functors, or by completely overloading it, bypassing the functor mechanism.
template<typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor>
template<typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor, int Version = Specialized>
class generic_dense_assignment_kernel
{
protected:
@@ -535,35 +534,44 @@ public:
typedef SrcEvaluatorTypeT SrcEvaluatorType;
typedef typename DstEvaluatorType::Scalar Scalar;
typedef typename DstEvaluatorType::Index Index;
typedef copy_using_evaluator_traits<DstXprType, SrcXprType> AssignmentTraits;
typedef copy_using_evaluator_traits<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor> AssignmentTraits;
generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
: m_dst(dst), m_src(src), m_functor(func), m_dstExpr(dstExpr)
{}
{
#ifdef EIGEN_DEBUG_ASSIGN
AssignmentTraits::debug();
#endif
}
Index size() const { return m_dstExpr.size(); }
Index innerSize() const { return m_dstExpr.innerSize(); }
Index outerSize() const { return m_dstExpr.outerSize(); }
Index outerStride() const { return m_dstExpr.outerStride(); }
EIGEN_DEVICE_FUNC Index size() const { return m_dstExpr.size(); }
EIGEN_DEVICE_FUNC Index innerSize() const { return m_dstExpr.innerSize(); }
EIGEN_DEVICE_FUNC Index outerSize() const { return m_dstExpr.outerSize(); }
EIGEN_DEVICE_FUNC Index rows() const { return m_dstExpr.rows(); }
EIGEN_DEVICE_FUNC Index cols() const { return m_dstExpr.cols(); }
EIGEN_DEVICE_FUNC Index outerStride() const { return m_dstExpr.outerStride(); }
// TODO get rid of this one:
DstXprType& dstExpression() const { return m_dstExpr; }
EIGEN_DEVICE_FUNC DstXprType& dstExpression() const { return m_dstExpr; }
DstEvaluatorType& dstEvaluator() { return m_dst; }
const SrcEvaluatorType& srcEvaluator() const { return m_src; }
EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() { return m_dst; }
EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const { return m_src; }
void assignCoeff(Index row, Index col)
/// Assign src(row,col) to dst(row,col) through the assignment functor.
EIGEN_DEVICE_FUNC void assignCoeff(Index row, Index col)
{
m_functor.assignCoeff(m_dst.coeffRef(row,col), m_src.coeff(row,col));
}
void assignCoeff(Index index)
/// \sa assignCoeff(Index,Index)
EIGEN_DEVICE_FUNC void assignCoeff(Index index)
{
m_functor.assignCoeff(m_dst.coeffRef(index), m_src.coeff(index));
}
void assignCoeffByOuterInner(Index outer, Index inner)
/// \sa assignCoeff(Index,Index)
EIGEN_DEVICE_FUNC void assignCoeffByOuterInner(Index outer, Index inner)
{
Index row = rowIndexByOuterInner(outer, inner);
Index col = colIndexByOuterInner(outer, inner);
@@ -572,40 +580,40 @@ public:
template<int StoreMode, int LoadMode>
void assignPacket(Index row, Index col)
EIGEN_DEVICE_FUNC void assignPacket(Index row, Index col)
{
m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(row,col), m_src.template packet<LoadMode>(row,col));
}
template<int StoreMode, int LoadMode>
void assignPacket(Index index)
EIGEN_DEVICE_FUNC void assignPacket(Index index)
{
m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(index), m_src.template packet<LoadMode>(index));
}
template<int StoreMode, int LoadMode>
void assignPacketByOuterInner(Index outer, Index inner)
EIGEN_DEVICE_FUNC void assignPacketByOuterInner(Index outer, Index inner)
{
Index row = rowIndexByOuterInner(outer, inner);
Index col = colIndexByOuterInner(outer, inner);
assignPacket<StoreMode,LoadMode>(row, col);
}
static Index rowIndexByOuterInner(Index outer, Index inner)
EIGEN_DEVICE_FUNC static Index rowIndexByOuterInner(Index outer, Index inner)
{
typedef typename DstEvaluatorType::ExpressionTraits Traits;
return int(Traits::RowsAtCompileTime) == 1 ? 0
: int(Traits::ColsAtCompileTime) == 1 ? inner
: int(Traits::Flags)&RowMajorBit ? outer
: int(DstEvaluatorType::Flags)&RowMajorBit ? outer
: inner;
}
static Index colIndexByOuterInner(Index outer, Index inner)
EIGEN_DEVICE_FUNC static Index colIndexByOuterInner(Index outer, Index inner)
{
typedef typename DstEvaluatorType::ExpressionTraits Traits;
return int(Traits::ColsAtCompileTime) == 1 ? 0
: int(Traits::RowsAtCompileTime) == 1 ? inner
: int(Traits::Flags)&RowMajorBit ? inner
: int(DstEvaluatorType::Flags)&RowMajorBit ? inner
: outer;
}
@@ -617,13 +625,13 @@ protected:
DstXprType& m_dstExpr;
};
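The functor handed to this kernel only has to know how to store one coefficient (and, for the vectorized paths, one packet). As an illustration of that interface, a hypothetical scalar-only functor (clamped_assign_op is not an Eigen type):

// A functor in the shape consumed by generic_dense_assignment_kernel:
// assignCoeff() implements the scalar store; Eigen's real functors
// (assign_op, add_assign_op, ...) additionally provide assignPacket().
template<typename Scalar>
struct clamped_assign_op {
  Scalar lo, hi;
  clamped_assign_op(const Scalar& l, const Scalar& h) : lo(l), hi(h) {}
  void assignCoeff(Scalar& dst, const Scalar& src) const {
    dst = src < lo ? lo : (src > hi ? hi : src);  // store with saturation
  }
};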
/***************************************************************************
* Part 5 : Entry point for dense rectangular assignment
***************************************************************************/
template<typename DstXprType, typename SrcXprType, typename Functor>
void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src, const Functor &func)
EIGEN_DEVICE_FUNC void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src, const Functor &func)
{
#ifdef EIGEN_DEBUG_ASSIGN
// TODO these traits should be computed from information provided by the evaluators
internal::copy_using_evaluator_traits<DstXprType, SrcXprType>::debug();
#endif
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
typedef typename evaluator<DstXprType>::type DstEvaluatorType;
@@ -639,201 +647,147 @@ void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src, co
}
template<typename DstXprType, typename SrcXprType>
void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src)
EIGEN_DEVICE_FUNC void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src)
{
call_dense_assignment_loop(dst, src, internal::assign_op<typename DstXprType::Scalar>());
}
/***************************************************************************
* Part 5 : Entry points
* Part 6 : Generic assignment
***************************************************************************/
// Based on DenseBase::LazyAssign()
// The following functions are just for testing and they are meant to be moved to operator= and the likes.
// Based on the respective shapes of the destination and source,
// the class AssignmentKind determines the kind of assignment mechanism.
// AssignmentKind must define a Kind typedef.
template<typename DstShape, typename SrcShape> struct AssignmentKind;
template<typename DstXprType, template <typename> class StorageBase, typename SrcXprType>
EIGEN_STRONG_INLINE
const DstXprType& copy_using_evaluator(const NoAlias<DstXprType, StorageBase>& dst,
const EigenBase<SrcXprType>& src)
{
return noalias_copy_using_evaluator(dst.expression(), src.derived(), internal::assign_op<typename DstXprType::Scalar>());
}
// Assignement kind defined in this file:
struct Dense2Dense {};
struct EigenBase2EigenBase {};
template<typename XprType, int AssumeAliasing = evaluator_traits<XprType>::AssumeAliasing>
struct AddEvalIfAssumingAliasing;
template<typename XprType>
struct AddEvalIfAssumingAliasing<XprType, 0>
{
static const XprType& run(const XprType& xpr)
{
return xpr;
}
};
template<typename XprType>
struct AddEvalIfAssumingAliasing<XprType, 1>
{
static const EvalToTemp<XprType> run(const XprType& xpr)
{
return EvalToTemp<XprType>(xpr);
}
};
template<typename DstXprType, typename SrcXprType, typename Functor>
EIGEN_STRONG_INLINE
const DstXprType& copy_using_evaluator(const EigenBase<DstXprType>& dst, const EigenBase<SrcXprType>& src, const Functor &func)
{
return noalias_copy_using_evaluator(dst.const_cast_derived(),
AddEvalIfAssumingAliasing<SrcXprType>::run(src.derived()),
func
);
}
// this mimics operator=
template<typename DstXprType, typename SrcXprType>
EIGEN_STRONG_INLINE
const DstXprType& copy_using_evaluator(const EigenBase<DstXprType>& dst, const EigenBase<SrcXprType>& src)
{
return copy_using_evaluator(dst.const_cast_derived(), src.derived(), internal::assign_op<typename DstXprType::Scalar>());
}
template<typename DstXprType, typename SrcXprType, typename Functor>
EIGEN_STRONG_INLINE
const DstXprType& noalias_copy_using_evaluator(const PlainObjectBase<DstXprType>& dst, const EigenBase<SrcXprType>& src, const Functor &func)
{
#ifdef EIGEN_DEBUG_ASSIGN
internal::copy_using_evaluator_traits<DstXprType, SrcXprType>::debug();
#endif
#ifdef EIGEN_NO_AUTOMATIC_RESIZING
eigen_assert((dst.size()==0 || (IsVectorAtCompileTime ? (dst.size() == src.size())
: (dst.rows() == src.rows() && dst.cols() == src.cols())))
&& "Size mismatch. Automatic resizing is disabled because EIGEN_NO_AUTOMATIC_RESIZING is defined");
#else
dst.const_cast_derived().resizeLike(src.derived());
#endif
call_dense_assignment_loop(dst.const_cast_derived(), src.derived(), func);
return dst.derived();
}
template<typename DstXprType, typename SrcXprType, typename Functor>
EIGEN_STRONG_INLINE
const DstXprType& noalias_copy_using_evaluator(const EigenBase<DstXprType>& dst, const EigenBase<SrcXprType>& src, const Functor &func)
{
call_dense_assignment_loop(dst.const_cast_derived(), src.derived(), func);
return dst.derived();
}
// Based on DenseBase::swap()
// TODO: Check whether we need to do something special for swapping two
// Arrays or Matrices. (Jitse)
// Overload default assignPacket behavior for swapping them
template<typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT>
class swap_kernel : public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, swap_assign_op<typename DstEvaluatorTypeT::Scalar> >
{
typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, swap_assign_op<typename DstEvaluatorTypeT::Scalar> > Base;
typedef typename DstEvaluatorTypeT::PacketScalar PacketScalar;
using Base::m_dst;
using Base::m_src;
using Base::m_functor;
public:
typedef typename Base::Scalar Scalar;
typedef typename Base::Index Index;
typedef typename Base::DstXprType DstXprType;
swap_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, DstXprType& dstExpr)
: Base(dst, src, swap_assign_op<Scalar>(), dstExpr)
{}
template<int StoreMode, int LoadMode>
void assignPacket(Index row, Index col)
{
m_functor.template swapPacket<StoreMode,LoadMode,PacketScalar>(&m_dst.coeffRef(row,col), &const_cast<SrcEvaluatorTypeT&>(m_src).coeffRef(row,col));
}
template<int StoreMode, int LoadMode>
void assignPacket(Index index)
{
m_functor.template swapPacket<StoreMode,LoadMode,PacketScalar>(&m_dst.coeffRef(index), &const_cast<SrcEvaluatorTypeT&>(m_src).coeffRef(index));
}
// TODO find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I mean no CRTP (Gael)
template<int StoreMode, int LoadMode>
void assignPacketByOuterInner(Index outer, Index inner)
{
Index row = Base::rowIndexByOuterInner(outer, inner);
Index col = Base::colIndexByOuterInner(outer, inner);
assignPacket<StoreMode,LoadMode>(row, col);
}
};
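At user level this kernel is what backs DenseBase::swap(); a minimal sketch:

#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::Matrix2i a, b;
  a.setConstant(1);
  b.setConstant(2);
  a.swap(b);  // element-wise swap driven by a dense assignment loop over swap_assign_op
  std::cout << a(0,0) << ' ' << b(0,0) << '\n';  // prints "2 1"
}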
template<typename DstXprType, typename SrcXprType>
void swap_using_evaluator(const DstXprType& dst, const SrcXprType& src)
{
// TODO there is too much redundancy with call_dense_assignment_loop
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
typedef typename evaluator<DstXprType>::type DstEvaluatorType;
typedef typename evaluator<SrcXprType>::type SrcEvaluatorType;
DstEvaluatorType dstEvaluator(dst);
SrcEvaluatorType srcEvaluator(src);
template<typename,typename> struct AssignmentKind { typedef EigenBase2EigenBase Kind; };
template<> struct AssignmentKind<DenseShape,DenseShape> { typedef Dense2Dense Kind; };
typedef swap_kernel<DstEvaluatorType,SrcEvaluatorType> Kernel;
Kernel kernel(dstEvaluator, srcEvaluator, dst.const_cast_derived());
// This is the main assignment class
template< typename DstXprType, typename SrcXprType, typename Functor,
typename Kind = typename AssignmentKind< typename evaluator_traits<DstXprType>::Shape , typename evaluator_traits<SrcXprType>::Shape >::Kind,
typename Scalar = typename DstXprType::Scalar>
struct Assignment;
// The only purpose of this call_assignment() function is to deal with noalias() / AssumeAliasing and automatic transposition.
// Indeed, I (Gael) think that this concept of AssumeAliasing was a mistake, and it makes things quite complicated.
// So this intermediate function removes everything related to AssumeAliasing such that Assignment
// does not have to bother with these annoying details.
template<typename Dst, typename Src>
EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src)
{
call_assignment(dst, src, internal::assign_op<typename Dst::Scalar>());
}
template<typename Dst, typename Src>
EIGEN_DEVICE_FUNC void call_assignment(const Dst& dst, const Src& src)
{
call_assignment(dst, src, internal::assign_op<typename Dst::Scalar>());
}
// Deal with AssumeAliasing
template<typename Dst, typename Src, typename Func>
EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if<evaluator_traits<Src>::AssumeAliasing==1, void*>::type = 0)
{
typename plain_matrix_type<Src>::type tmp(src);
call_assignment_no_alias(dst, tmp, func);
}
template<typename Dst, typename Src, typename Func>
EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if<evaluator_traits<Src>::AssumeAliasing==0, void*>::type = 0)
{
call_assignment_no_alias(dst, src, func);
}
// by-pass AssumeAliasing
// FIXME the const version should probably not be needed
// When there is no aliasing, we require that 'dst' has been properly resized
template<typename Dst, template <typename> class StorageBase, typename Src, typename Func>
EIGEN_DEVICE_FUNC void call_assignment(const NoAlias<Dst,StorageBase>& dst, const Src& src, const Func& func)
{
call_assignment_no_alias(dst.expression(), src, func);
}
template<typename Dst, template <typename> class StorageBase, typename Src, typename Func>
EIGEN_DEVICE_FUNC void call_assignment(NoAlias<Dst,StorageBase>& dst, const Src& src, const Func& func)
{
call_assignment_no_alias(dst.expression(), src, func);
}
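These NoAlias overloads are what .noalias() resolves to; a usage sketch:

#include <Eigen/Dense>

int main() {
  Eigen::Matrix3f a = Eigen::Matrix3f::Random();
  Eigen::Matrix3f b = Eigen::Matrix3f::Random();
  Eigen::Matrix3f c;
  c.noalias() = a * b;  // the product is written straight into 'c', with no temporary
}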
template<typename Dst, typename Src, typename Func>
EIGEN_DEVICE_FUNC void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func)
{
enum {
NeedToTranspose = ( (int(Dst::RowsAtCompileTime) == 1 && int(Src::ColsAtCompileTime) == 1)
| // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&".
// revert to || as soon as not needed anymore.
(int(Dst::ColsAtCompileTime) == 1 && int(Src::RowsAtCompileTime) == 1))
&& int(Dst::SizeAtCompileTime) != 1
};
typename Dst::Index dstRows = NeedToTranspose ? src.cols() : src.rows();
typename Dst::Index dstCols = NeedToTranspose ? src.rows() : src.cols();
if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
dst.resize(dstRows, dstCols);
dense_assignment_loop<Kernel>::run(kernel);
}
typedef typename internal::conditional<NeedToTranspose, Transpose<Dst>, Dst>::type ActualDstTypeCleaned;
typedef typename internal::conditional<NeedToTranspose, Transpose<Dst>, Dst&>::type ActualDstType;
ActualDstType actualDst(dst);
// TODO check whether this is the right place to perform these checks:
EIGEN_STATIC_ASSERT_LVALUE(Dst)
EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(ActualDstTypeCleaned,Src)
// Based on MatrixBase::operator+= (in CwiseBinaryOp.h)
template<typename DstXprType, typename SrcXprType>
void add_assign_using_evaluator(const MatrixBase<DstXprType>& dst, const MatrixBase<SrcXprType>& src)
// TODO this check is commented out to allow matrix = permutation
// Actually, the "Scalar" type for a permutation matrix does not really make sense,
// perhaps it could be void, and EIGEN_CHECK_BINARY_COMPATIBILIY could allow mixing void with anything...?
// EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename ActualDstTypeCleaned::Scalar,typename Src::Scalar);
Assignment<ActualDstTypeCleaned,Src,Func>::run(actualDst, src, func);
}
template<typename Dst, typename Src>
EIGEN_DEVICE_FUNC void call_assignment_no_alias(Dst& dst, const Src& src)
{
typedef typename DstXprType::Scalar Scalar;
copy_using_evaluator(dst.derived(), src.derived(), add_assign_op<Scalar>());
call_assignment_no_alias(dst, src, internal::assign_op<typename Dst::Scalar>());
}
// Based on ArrayBase::operator+=
template<typename DstXprType, typename SrcXprType>
void add_assign_using_evaluator(const ArrayBase<DstXprType>& dst, const ArrayBase<SrcXprType>& src)
// forward declaration
template<typename Dst, typename Src> void check_for_aliasing(const Dst &dst, const Src &src);
// Generic Dense to Dense assignment
template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Scalar>
{
typedef typename DstXprType::Scalar Scalar;
copy_using_evaluator(dst.derived(), src.derived(), add_assign_op<Scalar>());
}
EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
{
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
#ifndef EIGEN_NO_DEBUG
internal::check_for_aliasing(dst, src);
#endif
call_dense_assignment_loop(dst, src, func);
}
};
// TODO: Add add_assign_using_evaluator for EigenBase ? (Jitse)
template<typename DstXprType, typename SrcXprType>
void subtract_assign_using_evaluator(const MatrixBase<DstXprType>& dst, const MatrixBase<SrcXprType>& src)
// Generic assignment through evalTo.
// TODO: not sure we have to keep that one, but it helps porting current code to new evaluator mechanism.
template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Scalar>
{
typedef typename DstXprType::Scalar Scalar;
copy_using_evaluator(dst.derived(), src.derived(), sub_assign_op<Scalar>());
}
template<typename DstXprType, typename SrcXprType>
void subtract_assign_using_evaluator(const ArrayBase<DstXprType>& dst, const ArrayBase<SrcXprType>& src)
{
typedef typename DstXprType::Scalar Scalar;
copy_using_evaluator(dst.derived(), src.derived(), sub_assign_op<Scalar>());
}
template<typename DstXprType, typename SrcXprType>
void multiply_assign_using_evaluator(const ArrayBase<DstXprType>& dst, const ArrayBase<SrcXprType>& src)
{
typedef typename DstXprType::Scalar Scalar;
copy_using_evaluator(dst.derived(), src.derived(), mul_assign_op<Scalar>());
}
template<typename DstXprType, typename SrcXprType>
void divide_assign_using_evaluator(const ArrayBase<DstXprType>& dst, const ArrayBase<SrcXprType>& src)
{
typedef typename DstXprType::Scalar Scalar;
copy_using_evaluator(dst.derived(), src.derived(), div_assign_op<Scalar>());
}
EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/)
{
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
src.evalTo(dst);
}
};
} // namespace internal
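The matrix = permutation case mentioned in the comments above is one client of this EigenBase2EigenBase fallback; a sketch, assuming the usual PermutationMatrix-to-dense conversion:

#include <Eigen/Dense>

int main() {
  Eigen::PermutationMatrix<3> p;
  p.setIdentity();
  Eigen::Matrix3i m;
  m = p;  // the right-hand side has no dense shape, so this ends up in src.evalTo(dst)
}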
View File
@@ -204,7 +204,7 @@ class BandMatrix : public BandMatrixBase<BandMatrix<_Scalar,Rows,Cols,Supers,Sub
typedef typename internal::traits<BandMatrix>::Index Index;
typedef typename internal::traits<BandMatrix>::CoefficientsType CoefficientsType;
inline BandMatrix(Index rows=Rows, Index cols=Cols, Index supers=Supers, Index subs=Subs)
explicit inline BandMatrix(Index rows=Rows, Index cols=Cols, Index supers=Supers, Index subs=Subs)
: m_coeffs(1+supers+subs,cols),
m_rows(rows), m_supers(supers), m_subs(subs)
{
@@ -266,7 +266,7 @@ class BandMatrixWrapper : public BandMatrixBase<BandMatrixWrapper<_CoefficientsT
typedef typename internal::traits<BandMatrixWrapper>::CoefficientsType CoefficientsType;
typedef typename internal::traits<BandMatrixWrapper>::Index Index;
inline BandMatrixWrapper(const CoefficientsType& coeffs, Index rows=_Rows, Index cols=_Cols, Index supers=_Supers, Index subs=_Subs)
explicit inline BandMatrixWrapper(const CoefficientsType& coeffs, Index rows=_Rows, Index cols=_Cols, Index supers=_Supers, Index subs=_Subs)
: m_coeffs(coeffs),
m_rows(rows), m_supers(supers), m_subs(subs)
{
@@ -314,7 +314,7 @@ class TridiagonalMatrix : public BandMatrix<Scalar,Size,Size,Options&SelfAdjoint
typedef BandMatrix<Scalar,Size,Size,Options&SelfAdjoint?0:1,1,Options|RowMajor> Base;
typedef typename Base::Index Index;
public:
TridiagonalMatrix(Index size = Size) : Base(size,size,Options&SelfAdjoint?0:1,1) {}
explicit TridiagonalMatrix(Index size = Size) : Base(size,size,Options&SelfAdjoint?0:1,1) {}
inline typename Base::template DiagonalIntReturnType<1>::Type super()
{ return Base::template diagonal<1>(); }
@@ -327,6 +327,25 @@ class TridiagonalMatrix : public BandMatrix<Scalar,Size,Size,Options&SelfAdjoint
protected:
};
struct BandShape {};
template<typename _Scalar, int _Rows, int _Cols, int _Supers, int _Subs, int _Options>
struct evaluator_traits<BandMatrix<_Scalar,_Rows,_Cols,_Supers,_Subs,_Options> >
: public evaluator_traits_base<BandMatrix<_Scalar,_Rows,_Cols,_Supers,_Subs,_Options> >
{
typedef BandShape Shape;
};
template<typename _CoefficientsType,int _Rows, int _Cols, int _Supers, int _Subs,int _Options>
struct evaluator_traits<BandMatrixWrapper<_CoefficientsType,_Rows,_Cols,_Supers,_Subs,_Options> >
: public evaluator_traits_base<BandMatrixWrapper<_CoefficientsType,_Rows,_Cols,_Supers,_Subs,_Options> >
{
typedef BandShape Shape;
};
template<> struct AssignmentKind<DenseShape,BandShape> { typedef EigenBase2EigenBase Kind; };
} // end namespace internal
} // end namespace Eigen
View File
@@ -68,6 +68,7 @@ struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprTyp
MaxColsAtCompileTime = BlockCols==0 ? 0
: ColsAtCompileTime != Dynamic ? int(ColsAtCompileTime)
: int(traits<XprType>::MaxColsAtCompileTime),
XprTypeIsRowMajor = (int(traits<XprType>::Flags)&RowMajorBit) != 0,
IsRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
: (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
@@ -80,18 +81,14 @@ struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprTyp
OuterStrideAtCompileTime = HasSameStorageOrderAsXprType
? int(outer_stride_at_compile_time<XprType>::ret)
: int(inner_stride_at_compile_time<XprType>::ret),
MaskPacketAccessBit = (InnerSize == Dynamic || (InnerSize % packet_traits<Scalar>::size) == 0)
&& (InnerStrideAtCompileTime == 1)
? PacketAccessBit : 0,
MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % EIGEN_ALIGN_BYTES) == 0)) ? AlignedBit : 0,
FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (traits<XprType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0,
// IsAligned is needed by MapBase's assertions
// We can safely set it to false here. Internal alignment errors will be detected by an eigen_internal_assert in the respective evaluator
IsAligned = 0,
// FIXME, this traits is rather specialized for dense object and it needs to be cleaned further
FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0,
Flags0 = traits<XprType>::Flags & ( (HereditaryBits & ~RowMajorBit) |
DirectAccessBit |
MaskPacketAccessBit |
MaskAlignedBit),
Flags = Flags0 | FlagsLinearAccessBit | FlagsLvalueBit | FlagsRowMajorBit
Flags = (traits<XprType>::Flags & DirectAccessBit) | FlagsLvalueBit | FlagsRowMajorBit
// FIXME DirectAccessBit should not be handled by expressions
};
};
@@ -111,6 +108,8 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class
typedef Impl Base;
EIGEN_GENERIC_PUBLIC_INTERFACE(Block)
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Block)
typedef typename internal::remove_all<XprType>::type NestedExpression;
/** Column or Row constructor
*/
@@ -179,7 +178,7 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
EIGEN_DENSE_PUBLIC_INTERFACE(BlockType)
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl_dense)
class InnerIterator;
// class InnerIterator; // FIXME apparently never used
/** Column or Row constructor
*/
@@ -333,6 +332,9 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
: public MapBase<Block<XprType, BlockRows, BlockCols, InnerPanel> >
{
typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
enum {
XprTypeIsRowMajor = (int(traits<XprType>::Flags)&RowMajorBit) != 0
};
public:
typedef MapBase<BlockType> Base;
@@ -343,9 +345,8 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
*/
EIGEN_DEVICE_FUNC
inline BlockImpl_dense(XprType& xpr, Index i)
: Base(internal::const_cast_ptr(&xpr.coeffRef(
(BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) ? i : 0,
(BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) ? i : 0)),
: Base(xpr.data() + i * ( ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor))
|| ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride()),
BlockRows==1 ? 1 : xpr.rows(),
BlockCols==1 ? 1 : xpr.cols()),
m_xpr(xpr)
@@ -357,7 +358,8 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
*/
EIGEN_DEVICE_FUNC
inline BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
: Base(internal::const_cast_ptr(&xpr.coeffRef(startRow,startCol))), m_xpr(xpr)
: Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)),
m_xpr(xpr)
{
init();
}
@@ -368,7 +370,7 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
inline BlockImpl_dense(XprType& xpr,
Index startRow, Index startCol,
Index blockRows, Index blockCols)
: Base(internal::const_cast_ptr(&xpr.coeffRef(startRow,startCol)), blockRows, blockCols),
: Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol), blockRows, blockCols),
m_xpr(xpr)
{
init();
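The data()-plus-stride construction above is what lets a dense block alias the underlying array directly; a usage sketch:

#include <Eigen/Dense>

int main() {
  Eigen::Matrix4f m = Eigen::Matrix4f::Random();
  m.block<2,2>(1,1).setZero();  // the block maps m's own storage through data() and the strides; no copy
}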
View File
@@ -17,9 +17,10 @@ namespace internal {
template<typename Derived, int UnrollCount>
struct all_unroller
{
typedef typename Derived::ExpressionTraits Traits;
enum {
col = (UnrollCount-1) / Derived::RowsAtCompileTime,
row = (UnrollCount-1) % Derived::RowsAtCompileTime
col = (UnrollCount-1) / Traits::RowsAtCompileTime,
row = (UnrollCount-1) % Traits::RowsAtCompileTime
};
static inline bool run(const Derived &mat)
@@ -43,11 +44,12 @@ struct all_unroller<Derived, Dynamic>
template<typename Derived, int UnrollCount>
struct any_unroller
{
typedef typename Derived::ExpressionTraits Traits;
enum {
col = (UnrollCount-1) / Derived::RowsAtCompileTime,
row = (UnrollCount-1) % Derived::RowsAtCompileTime
col = (UnrollCount-1) / Traits::RowsAtCompileTime,
row = (UnrollCount-1) % Traits::RowsAtCompileTime
};
static inline bool run(const Derived &mat)
{
return any_unroller<Derived, UnrollCount-1>::run(mat) || mat.coeff(row, col);
@@ -78,19 +80,21 @@ struct any_unroller<Derived, Dynamic>
template<typename Derived>
inline bool DenseBase<Derived>::all() const
{
typedef typename internal::evaluator<Derived>::type Evaluator;
enum {
unroll = SizeAtCompileTime != Dynamic
&& CoeffReadCost != Dynamic
&& Evaluator::CoeffReadCost != Dynamic
&& NumTraits<Scalar>::AddCost != Dynamic
&& SizeAtCompileTime * (CoeffReadCost + NumTraits<Scalar>::AddCost) <= EIGEN_UNROLLING_LIMIT
&& SizeAtCompileTime * (Evaluator::CoeffReadCost + NumTraits<Scalar>::AddCost) <= EIGEN_UNROLLING_LIMIT
};
Evaluator evaluator(derived());
if(unroll)
return internal::all_unroller<Derived, unroll ? int(SizeAtCompileTime) : Dynamic>::run(derived());
return internal::all_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic>::run(evaluator);
else
{
for(Index j = 0; j < cols(); ++j)
for(Index i = 0; i < rows(); ++i)
if (!coeff(i, j)) return false;
if (!evaluator.coeff(i, j)) return false;
return true;
}
}
@@ -102,19 +106,21 @@ inline bool DenseBase<Derived>::any() const
template<typename Derived>
inline bool DenseBase<Derived>::any() const
{
typedef typename internal::evaluator<Derived>::type Evaluator;
enum {
unroll = SizeAtCompileTime != Dynamic
&& CoeffReadCost != Dynamic
&& Evaluator::CoeffReadCost != Dynamic
&& NumTraits<Scalar>::AddCost != Dynamic
&& SizeAtCompileTime * (CoeffReadCost + NumTraits<Scalar>::AddCost) <= EIGEN_UNROLLING_LIMIT
&& SizeAtCompileTime * (Evaluator::CoeffReadCost + NumTraits<Scalar>::AddCost) <= EIGEN_UNROLLING_LIMIT
};
Evaluator evaluator(derived());
if(unroll)
return internal::any_unroller<Derived, unroll ? int(SizeAtCompileTime) : Dynamic>::run(derived());
return internal::any_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic>::run(evaluator);
else
{
for(Index j = 0; j < cols(); ++j)
for(Index i = 0; i < rows(); ++i)
if (coeff(i, j)) return true;
if (evaluator.coeff(i, j)) return true;
return false;
}
}
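Usage of the two reductions above, for reference:

#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::Array3f a(1.f, 2.f, 3.f);
  std::cout << (a > 0.f).all() << ' '    // 1: every coefficient is positive
            << (a > 2.f).any() << '\n';  // 1: at least one coefficient exceeds 2
}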
File diff suppressed because it is too large

View File
@@ -1,7 +1,7 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
@@ -15,47 +15,116 @@ namespace Eigen {
/* This file contains the respective InnerIterator definition of the expressions defined in Eigen/Core
*/
/** \ingroup SparseCore_Module
* \class InnerIterator
* \brief An InnerIterator allows one to loop over the elements of a sparse (or dense) matrix or expression
*
* todo
namespace internal {
template<typename XprType, typename EvaluatorKind>
class inner_iterator_selector;
}
/** \class InnerIterator
* \brief An InnerIterator allows one to loop over the elements of any matrix expression.
*
* \warning To be used with care because an evaluator is constructed every time an InnerIterator is constructed.
*
* TODO: add a usage example
*/
// generic version for dense matrix and expressions
template<typename Derived> class DenseBase<Derived>::InnerIterator
template<typename XprType>
class InnerIterator
{
protected:
typedef typename Derived::Scalar Scalar;
typedef typename Derived::Index Index;
enum { IsRowMajor = (Derived::Flags&RowMajorBit)==RowMajorBit };
public:
EIGEN_STRONG_INLINE InnerIterator(const Derived& expr, Index outer)
: m_expression(expr), m_inner(0), m_outer(outer), m_end(expr.innerSize())
{}
EIGEN_STRONG_INLINE Scalar value() const
{
return (IsRowMajor) ? m_expression.coeff(m_outer, m_inner)
: m_expression.coeff(m_inner, m_outer);
}
EIGEN_STRONG_INLINE InnerIterator& operator++() { m_inner++; return *this; }
EIGEN_STRONG_INLINE Index index() const { return m_inner; }
inline Index row() const { return IsRowMajor ? m_outer : index(); }
inline Index col() const { return IsRowMajor ? index() : m_outer; }
EIGEN_STRONG_INLINE operator bool() const { return m_inner < m_end && m_inner>=0; }
protected:
const Derived& m_expression;
Index m_inner;
const Index m_outer;
const Index m_end;
protected:
typedef internal::inner_iterator_selector<XprType, typename internal::evaluator_traits<XprType>::Kind> IteratorType;
typedef typename internal::evaluator<XprType>::type EvaluatorType;
typedef typename internal::traits<XprType>::Scalar Scalar;
typedef typename internal::traits<XprType>::Index Index;
public:
/** Construct an iterator over the \a outerId -th row or column of \a xpr */
InnerIterator(const XprType &xpr, const Index &outerId)
: m_eval(xpr), m_iter(m_eval, outerId, xpr.innerSize())
{}
/// \returns the value of the current coefficient.
EIGEN_STRONG_INLINE Scalar value() const { return m_iter.value(); }
/** Increment the iterator \c *this to the next non-zero coefficient.
* Explicit zeros are not skipped over. To skip explicit zeros, see class SparseView
*/
EIGEN_STRONG_INLINE InnerIterator& operator++() { m_iter.operator++(); return *this; }
/// \returns the column or row index of the current coefficient.
EIGEN_STRONG_INLINE Index index() const { return m_iter.index(); }
/// \returns the row index of the current coefficient.
EIGEN_STRONG_INLINE Index row() const { return m_iter.row(); }
/// \returns the column index of the current coefficient.
EIGEN_STRONG_INLINE Index col() const { return m_iter.col(); }
/// \returns \c true if the iterator \c *this still references a valid coefficient.
EIGEN_STRONG_INLINE operator bool() const { return m_iter; }
protected:
EvaluatorType m_eval;
IteratorType m_iter;
private:
// If you get here, then you're not using the right InnerIterator type, e.g.:
// SparseMatrix<double,RowMajor> A;
// SparseMatrix<double>::InnerIterator it(A,0);
template<typename T> InnerIterator(const EigenBase<T>&,Index outer);
};
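A usage sketch for this generic InnerIterator over a dense expression (column-major here, so outer index 0 is the first column):

#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::Matrix2d m;
  m << 1, 2,
       3, 4;
  for (Eigen::InnerIterator<Eigen::Matrix2d> it(m, 0); it; ++it)
    std::cout << "(" << it.row() << "," << it.col() << ") = " << it.value() << '\n';
}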
namespace internal {
// Generic inner iterator implementation for dense objects
template<typename XprType>
class inner_iterator_selector<XprType, IndexBased>
{
protected:
typedef typename evaluator<XprType>::type EvaluatorType;
typedef typename traits<XprType>::Scalar Scalar;
typedef typename traits<XprType>::Index Index;
enum { IsRowMajor = (XprType::Flags&RowMajorBit)==RowMajorBit };
public:
EIGEN_STRONG_INLINE inner_iterator_selector(const EvaluatorType &eval, const Index &outerId, const Index &innerSize)
: m_eval(eval), m_inner(0), m_outer(outerId), m_end(innerSize)
{}
EIGEN_STRONG_INLINE Scalar value() const
{
return (IsRowMajor) ? m_eval.coeff(m_outer, m_inner)
: m_eval.coeff(m_inner, m_outer);
}
EIGEN_STRONG_INLINE inner_iterator_selector& operator++() { m_inner++; return *this; }
EIGEN_STRONG_INLINE Index index() const { return m_inner; }
inline Index row() const { return IsRowMajor ? m_outer : index(); }
inline Index col() const { return IsRowMajor ? index() : m_outer; }
EIGEN_STRONG_INLINE operator bool() const { return m_inner < m_end && m_inner>=0; }
protected:
const EvaluatorType& m_eval;
Index m_inner;
const Index m_outer;
const Index m_end;
};
// For iterator-based evaluators, the inner iterator is already implemented as
// evaluator<>::InnerIterator
template<typename XprType>
class inner_iterator_selector<XprType, IteratorBased>
: public evaluator<XprType>::InnerIterator
{
protected:
typedef typename evaluator<XprType>::InnerIterator Base;
typedef typename evaluator<XprType>::type EvaluatorType;
typedef typename traits<XprType>::Index Index;
public:
EIGEN_STRONG_INLINE inner_iterator_selector(const EvaluatorType &eval, const Index &outerId, const Index &/*innerSize*/)
: Base(eval, outerId)
{}
};
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_COREITERATORS_H
View File
@ -1,7 +1,7 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
@@ -56,8 +56,9 @@ struct traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
typename Rhs::Scalar
)
>::type Scalar;
typedef typename promote_storage_type<typename traits<Lhs>::StorageKind,
typename traits<Rhs>::StorageKind>::ret StorageKind;
typedef typename cwise_promote_storage_type<typename traits<Lhs>::StorageKind,
typename traits<Rhs>::StorageKind,
BinaryOp>::ret StorageKind;
typedef typename promote_index_type<typename traits<Lhs>::Index,
typename traits<Rhs>::Index>::type Index;
typedef typename Lhs::Nested LhsNested;
@@ -65,60 +66,37 @@ struct traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
typedef typename remove_reference<LhsNested>::type _LhsNested;
typedef typename remove_reference<RhsNested>::type _RhsNested;
enum {
LhsCoeffReadCost = _LhsNested::CoeffReadCost,
RhsCoeffReadCost = _RhsNested::CoeffReadCost,
LhsFlags = _LhsNested::Flags,
RhsFlags = _RhsNested::Flags,
SameType = is_same<typename _LhsNested::Scalar,typename _RhsNested::Scalar>::value,
StorageOrdersAgree = (int(Lhs::Flags)&RowMajorBit)==(int(Rhs::Flags)&RowMajorBit),
Flags0 = (int(LhsFlags) | int(RhsFlags)) & (
HereditaryBits
| (int(LhsFlags) & int(RhsFlags) &
( AlignedBit
| (StorageOrdersAgree ? LinearAccessBit : 0)
| (functor_traits<BinaryOp>::PacketAccess && StorageOrdersAgree && SameType ? PacketAccessBit : 0)
)
)
),
Flags = (Flags0 & ~RowMajorBit) | (LhsFlags & RowMajorBit),
CoeffReadCost = LhsCoeffReadCost + RhsCoeffReadCost + functor_traits<BinaryOp>::Cost
Flags = _LhsNested::Flags & RowMajorBit
};
};
} // end namespace internal
// we require Lhs and Rhs to have the same scalar type. Currently there is no example of a binary functor
// that would take two operands of different types. If there were such an example, then this check should be
// moved to the BinaryOp functors, on a per-case basis. This would however require a change in the BinaryOp functors, as
// currently they take only one typename Scalar template parameter.
// It is tempting to always allow mixing different types but remember that this is often impossible in the vectorized paths.
// So allowing mixed types would give very unexpected errors when vectorization is enabled, e.g. when the user
// tries to add together a float matrix and a double matrix.
#define EIGEN_CHECK_BINARY_COMPATIBILIY(BINOP,LHS,RHS) \
EIGEN_STATIC_ASSERT((internal::functor_is_product_like<BINOP>::ret \
? int(internal::scalar_product_traits<LHS, RHS>::Defined) \
: int(internal::is_same<LHS, RHS>::value)), \
YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
template<typename BinaryOp, typename Lhs, typename Rhs, typename StorageKind>
class CwiseBinaryOpImpl;
template<typename BinaryOp, typename Lhs, typename Rhs>
class CwiseBinaryOp : internal::no_assignment_operator,
template<typename BinaryOp, typename LhsType, typename RhsType>
class CwiseBinaryOp :
public CwiseBinaryOpImpl<
BinaryOp, Lhs, Rhs,
typename internal::promote_storage_type<typename internal::traits<Lhs>::StorageKind,
typename internal::traits<Rhs>::StorageKind>::ret>
BinaryOp, LhsType, RhsType,
typename internal::cwise_promote_storage_type<typename internal::traits<LhsType>::StorageKind,
typename internal::traits<RhsType>::StorageKind,
BinaryOp>::ret>,
internal::no_assignment_operator
{
public:
typedef typename internal::remove_all<LhsType>::type Lhs;
typedef typename internal::remove_all<RhsType>::type Rhs;
typedef typename CwiseBinaryOpImpl<
BinaryOp, Lhs, Rhs,
typename internal::promote_storage_type<typename internal::traits<Lhs>::StorageKind,
typename internal::traits<Rhs>::StorageKind>::ret>::Base Base;
BinaryOp, LhsType, RhsType,
typename internal::cwise_promote_storage_type<typename internal::traits<LhsType>::StorageKind,
typename internal::traits<Rhs>::StorageKind,
BinaryOp>::ret>::Base Base;
EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseBinaryOp)
typedef typename internal::nested<Lhs>::type LhsNested;
typedef typename internal::nested<Rhs>::type RhsNested;
typedef typename internal::nested<LhsType>::type LhsNested;
typedef typename internal::nested<RhsType>::type RhsNested;
typedef typename internal::remove_reference<LhsNested>::type _LhsNested;
typedef typename internal::remove_reference<RhsNested>::type _RhsNested;
@@ -165,43 +143,13 @@ class CwiseBinaryOp : internal::no_assignment_operator,
const BinaryOp m_functor;
};
template<typename BinaryOp, typename Lhs, typename Rhs>
class CwiseBinaryOpImpl<BinaryOp, Lhs, Rhs, Dense>
: public internal::dense_xpr_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type
// Generic API dispatcher
template<typename BinaryOp, typename Lhs, typename Rhs, typename StorageKind>
class CwiseBinaryOpImpl
: public internal::generic_xpr_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type
{
typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> Derived;
public:
typedef typename internal::dense_xpr_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type Base;
EIGEN_DENSE_PUBLIC_INTERFACE( Derived )
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar coeff(Index rowId, Index colId) const
{
return derived().functor()(derived().lhs().coeff(rowId, colId),
derived().rhs().coeff(rowId, colId));
}
template<int LoadMode>
EIGEN_STRONG_INLINE PacketScalar packet(Index rowId, Index colId) const
{
return derived().functor().packetOp(derived().lhs().template packet<LoadMode>(rowId, colId),
derived().rhs().template packet<LoadMode>(rowId, colId));
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
{
return derived().functor()(derived().lhs().coeff(index),
derived().rhs().coeff(index));
}
template<int LoadMode>
EIGEN_STRONG_INLINE PacketScalar packet(Index index) const
{
return derived().functor().packetOp(derived().lhs().template packet<LoadMode>(index),
derived().rhs().template packet<LoadMode>(index));
}
public:
typedef typename internal::generic_xpr_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type Base;
};
/** replaces \c *this by \c *this - \a other.
@ -213,8 +161,7 @@ template<typename OtherDerived>
EIGEN_STRONG_INLINE Derived &
MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
{
SelfCwiseBinaryOp<internal::scalar_difference_op<Scalar>, Derived, OtherDerived> tmp(derived());
tmp = other.derived();
call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar>());
return derived();
}
@ -227,8 +174,7 @@ template<typename OtherDerived>
EIGEN_STRONG_INLINE Derived &
MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
{
SelfCwiseBinaryOp<internal::scalar_sum_op<Scalar>, Derived, OtherDerived> tmp(derived());
tmp = other.derived();
call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar>());
return derived();
}
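For reference, a small usage sketch (not part of the patch) showing the unchanged user-facing behavior of these operators:

#include <Eigen/Dense>

int main() {
  Eigen::Matrix2d a = Eigen::Matrix2d::Identity();
  Eigen::Matrix2d b = Eigen::Matrix2d::Constant(0.5);
  a += b;  // now lowered to call_assignment(a, b, add_assign_op)
  a -= b;  // and sub_assign_op, instead of going through a SelfCwiseBinaryOp temporary
  return a.isIdentity() ? 0 : 1;
}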

View File

@ -35,19 +35,13 @@ template<typename NullaryOp, typename PlainObjectType>
struct traits<CwiseNullaryOp<NullaryOp, PlainObjectType> > : traits<PlainObjectType>
{
enum {
Flags = (traits<PlainObjectType>::Flags
& ( HereditaryBits
| (functor_has_linear_access<NullaryOp>::ret ? LinearAccessBit : 0)
| (functor_traits<NullaryOp>::PacketAccess ? PacketAccessBit : 0)))
| (functor_traits<NullaryOp>::IsRepeatable ? 0 : EvalBeforeNestingBit),
CoeffReadCost = functor_traits<NullaryOp>::Cost
Flags = traits<PlainObjectType>::Flags & RowMajorBit
};
};
}
template<typename NullaryOp, typename PlainObjectType>
class CwiseNullaryOp : internal::no_assignment_operator,
public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp, PlainObjectType> >::type
class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp, PlainObjectType> >::type, internal::no_assignment_operator
{
public:

View File

@ -1,7 +1,7 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
@ -44,10 +44,7 @@ struct traits<CwiseUnaryOp<UnaryOp, XprType> >
typedef typename XprType::Nested XprTypeNested;
typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
enum {
Flags = _XprTypeNested::Flags & (
HereditaryBits | LinearAccessBit | AlignedBit
| (functor_traits<UnaryOp>::PacketAccess ? PacketAccessBit : 0)),
CoeffReadCost = _XprTypeNested::CoeffReadCost + functor_traits<UnaryOp>::Cost
Flags = _XprTypeNested::Flags & RowMajorBit
};
};
}
@ -56,16 +53,16 @@ template<typename UnaryOp, typename XprType, typename StorageKind>
class CwiseUnaryOpImpl;
template<typename UnaryOp, typename XprType>
class CwiseUnaryOp : internal::no_assignment_operator,
public CwiseUnaryOpImpl<UnaryOp, XprType, typename internal::traits<XprType>::StorageKind>
class CwiseUnaryOp : public CwiseUnaryOpImpl<UnaryOp, XprType, typename internal::traits<XprType>::StorageKind>, internal::no_assignment_operator
{
public:
typedef typename CwiseUnaryOpImpl<UnaryOp, XprType,typename internal::traits<XprType>::StorageKind>::Base Base;
EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseUnaryOp)
typedef typename internal::remove_all<XprType>::type NestedExpression;
EIGEN_DEVICE_FUNC
inline CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
explicit inline CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
: m_xpr(xpr), m_functor(func) {}
EIGEN_DEVICE_FUNC
@ -92,42 +89,13 @@ class CwiseUnaryOp : internal::no_assignment_operator,
const UnaryOp m_functor;
};
// This is the generic implementation for dense storage.
// It can be used for any expression types implementing the dense concept.
template<typename UnaryOp, typename XprType>
class CwiseUnaryOpImpl<UnaryOp,XprType,Dense>
: public internal::dense_xpr_base<CwiseUnaryOp<UnaryOp, XprType> >::type
// Generic API dispatcher
template<typename UnaryOp, typename XprType, typename StorageKind>
class CwiseUnaryOpImpl
: public internal::generic_xpr_base<CwiseUnaryOp<UnaryOp, XprType> >::type
{
public:
typedef CwiseUnaryOp<UnaryOp, XprType> Derived;
typedef typename internal::dense_xpr_base<CwiseUnaryOp<UnaryOp, XprType> >::type Base;
EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar coeff(Index rowId, Index colId) const
{
return derived().functor()(derived().nestedExpression().coeff(rowId, colId));
}
template<int LoadMode>
EIGEN_STRONG_INLINE PacketScalar packet(Index rowId, Index colId) const
{
return derived().functor().packetOp(derived().nestedExpression().template packet<LoadMode>(rowId, colId));
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
{
return derived().functor()(derived().nestedExpression().coeff(index));
}
template<int LoadMode>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE PacketScalar packet(Index index) const
{
return derived().functor().packetOp(derived().nestedExpression().template packet<LoadMode>(index));
}
public:
typedef typename internal::generic_xpr_base<CwiseUnaryOp<UnaryOp, XprType> >::type Base;
};
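A quick sketch (not part of the patch) of a CwiseUnaryOp in user code; with this refactoring its coefficients are produced by the evaluator at assignment time rather than by CwiseUnaryOpImpl itself:

#include <Eigen/Dense>

int main() {
  Eigen::Array3d a(1, 4, 9);
  Eigen::Array3d r = a.sqrt();  // builds a CwiseUnaryOp, evaluated on assignment
  return (r == Eigen::Array3d(1, 2, 3)).all() ? 0 : 1;
}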
} // end namespace Eigen

View File

@ -37,8 +37,8 @@ struct traits<CwiseUnaryView<ViewOp, MatrixType> >
typedef typename MatrixType::Nested MatrixTypeNested;
typedef typename remove_all<MatrixTypeNested>::type _MatrixTypeNested;
enum {
Flags = (traits<_MatrixTypeNested>::Flags & (HereditaryBits | LvalueBit | LinearAccessBit | DirectAccessBit)),
CoeffReadCost = traits<_MatrixTypeNested>::CoeffReadCost + functor_traits<ViewOp>::Cost,
FlagsLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
Flags = traits<_MatrixTypeNested>::Flags & (RowMajorBit | FlagsLvalueBit | DirectAccessBit), // FIXME DirectAccessBit should not be handled by expressions
MatrixTypeInnerStride = inner_stride_at_compile_time<MatrixType>::ret,
// need to cast the sizeof's from size_t to int explicitly, otherwise:
// "error: no integral type can represent all of the enumerator values
@ -62,8 +62,9 @@ class CwiseUnaryView : public CwiseUnaryViewImpl<ViewOp, MatrixType, typename in
typedef typename CwiseUnaryViewImpl<ViewOp, MatrixType,typename internal::traits<MatrixType>::StorageKind>::Base Base;
EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseUnaryView)
typedef typename internal::remove_all<MatrixType>::type NestedExpression;
inline CwiseUnaryView(const MatrixType& mat, const ViewOp& func = ViewOp())
explicit inline CwiseUnaryView(MatrixType& mat, const ViewOp& func = ViewOp())
: m_matrix(mat), m_functor(func) {}
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryView)
@ -88,6 +89,15 @@ class CwiseUnaryView : public CwiseUnaryViewImpl<ViewOp, MatrixType, typename in
ViewOp m_functor;
};
// Generic API dispatcher
template<typename ViewOp, typename XprType, typename StorageKind>
class CwiseUnaryViewImpl
: public internal::generic_xpr_base<CwiseUnaryView<ViewOp, XprType> >::type
{
public:
typedef typename internal::generic_xpr_base<CwiseUnaryView<ViewOp, XprType> >::type Base;
};
template<typename ViewOp, typename MatrixType>
class CwiseUnaryViewImpl<ViewOp,MatrixType,Dense>
: public internal::dense_xpr_base< CwiseUnaryView<ViewOp, MatrixType> >::type
@ -100,38 +110,18 @@ class CwiseUnaryViewImpl<ViewOp,MatrixType,Dense>
EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryViewImpl)
inline Scalar* data() { return &coeffRef(0); }
inline const Scalar* data() const { return &coeff(0); }
EIGEN_DEVICE_FUNC inline Scalar* data() { return &(this->coeffRef(0)); }
EIGEN_DEVICE_FUNC inline const Scalar* data() const { return &(this->coeff(0)); }
inline Index innerStride() const
EIGEN_DEVICE_FUNC inline Index innerStride() const
{
return derived().nestedExpression().innerStride() * sizeof(typename internal::traits<MatrixType>::Scalar) / sizeof(Scalar);
}
inline Index outerStride() const
EIGEN_DEVICE_FUNC inline Index outerStride() const
{
return derived().nestedExpression().outerStride() * sizeof(typename internal::traits<MatrixType>::Scalar) / sizeof(Scalar);
}
EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const
{
return derived().functor()(derived().nestedExpression().coeff(row, col));
}
EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
return derived().functor()(derived().nestedExpression().coeff(index));
}
EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col)
{
return derived().functor()(const_cast_derived().nestedExpression().coeffRef(row, col));
}
EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
{
return derived().functor()(const_cast_derived().nestedExpression().coeffRef(index));
}
};
} // end namespace Eigen

View File

@ -50,7 +50,11 @@ template<typename Derived> class DenseBase
using internal::special_scalar_op_base<Derived,typename internal::traits<Derived>::Scalar,
typename NumTraits<typename internal::traits<Derived>::Scalar>::Real>::operator*;
class InnerIterator;
/** Inner iterator type to iterate over the coefficients of a row or column.
* \sa class InnerIterator
*/
typedef Eigen::InnerIterator<Derived> InnerIterator;
typedef typename internal::traits<Derived>::StorageKind StorageKind;
@ -74,16 +78,6 @@ template<typename Derived> class DenseBase
using Base::colIndexByOuterInner;
using Base::coeff;
using Base::coeffByOuterInner;
using Base::packet;
using Base::packetByOuterInner;
using Base::writePacket;
using Base::writePacketByOuterInner;
using Base::coeffRef;
using Base::coeffRefByOuterInner;
using Base::copyCoeff;
using Base::copyCoeffByOuterInner;
using Base::copyPacket;
using Base::copyPacketByOuterInner;
using Base::operator();
using Base::operator[];
using Base::x;
@ -169,16 +163,11 @@ template<typename Derived> class DenseBase
InnerSizeAtCompileTime = int(IsVectorAtCompileTime) ? int(SizeAtCompileTime)
: int(IsRowMajor) ? int(ColsAtCompileTime) : int(RowsAtCompileTime),
CoeffReadCost = internal::traits<Derived>::CoeffReadCost,
/**< This is a rough measure of how expensive it is to read one coefficient from
* this expression.
*/
InnerStrideAtCompileTime = internal::inner_stride_at_compile_time<Derived>::ret,
OuterStrideAtCompileTime = internal::outer_stride_at_compile_time<Derived>::ret
};
enum { ThisConstantIsPrivateInPlainObjectBase };
enum { IsPlainObjectBase = 0 };
/** \returns the number of nonzero coefficients which is in practice the number
* of stored coefficients. */
@ -278,7 +267,8 @@ template<typename Derived> class DenseBase
Derived& operator=(const ReturnByValue<OtherDerived>& func);
#ifndef EIGEN_PARSED_BY_DOXYGEN
/** Copies \a other into *this without evaluating other. \returns a reference to *this. */
/** Copies \a other into *this without evaluating other. \returns a reference to *this.
* \deprecated */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
Derived& lazyAssign(const DenseBase<OtherDerived>& other);
@ -287,27 +277,24 @@ template<typename Derived> class DenseBase
EIGEN_DEVICE_FUNC
CommaInitializer<Derived> operator<< (const Scalar& s);
// TODO flagged is temporarily disabled. It seems useless now
template<unsigned int Added,unsigned int Removed>
const Flagged<Derived, Added, Removed> flagged() const;
EIGEN_DEPRECATED
const Derived& flagged() const
{ return derived(); }
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
CommaInitializer<Derived> operator<< (const DenseBase<OtherDerived>& other);
typedef Transpose<Derived> TransposeReturnType;
EIGEN_DEVICE_FUNC
Eigen::Transpose<Derived> transpose();
TransposeReturnType transpose();
typedef typename internal::add_const<Transpose<const Derived> >::type ConstTransposeReturnType;
EIGEN_DEVICE_FUNC
ConstTransposeReturnType transpose() const;
EIGEN_DEVICE_FUNC
void transposeInPlace();
#ifndef EIGEN_NO_DEBUG
protected:
template<typename OtherDerived>
void checkTransposeAliasing(const OtherDerived& other) const;
public:
#endif
EIGEN_DEVICE_FUNC static const ConstantReturnType
Constant(Index rows, Index cols, const Scalar& value);
@ -387,16 +374,17 @@ template<typename Derived> class DenseBase
// size types on MSVC.
return typename internal::eval<Derived>::type(derived());
}
/** swaps *this with the expression \a other.
*
*/
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
void swap(const DenseBase<OtherDerived>& other,
int = OtherDerived::ThisConstantIsPrivateInPlainObjectBase)
void swap(const DenseBase<OtherDerived>& other)
{
SwapWrapper<Derived>(derived()).lazyAssign(other.derived());
EIGEN_STATIC_ASSERT(!OtherDerived::IsPlainObjectBase,THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
eigen_assert(rows()==other.rows() && cols()==other.cols());
call_assignment(derived(), other.const_cast_derived(), internal::swap_assign_op<Scalar>());
}
/** swaps *this with the matrix or array \a other.
@ -406,10 +394,10 @@ template<typename Derived> class DenseBase
EIGEN_DEVICE_FUNC
void swap(PlainObjectBase<OtherDerived>& other)
{
SwapWrapper<Derived>(derived()).lazyAssign(other.derived());
eigen_assert(rows()==other.rows() && cols()==other.cols());
call_assignment(derived(), other.derived(), internal::swap_assign_op<Scalar>());
}
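Both overloads now route through call_assignment with swap_assign_op; a usage sketch (not part of the patch):

#include <Eigen/Dense>

int main() {
  Eigen::Matrix2d a = Eigen::Matrix2d::Zero();
  Eigen::Matrix2d b = Eigen::Matrix2d::Ones();
  a.swap(b);                // plain-object overload above
  a.col(0).swap(b.col(0));  // expression overload, guarded by the static assert
  return (a(0, 0) == 0 && b(0, 0) == 1) ? 0 : 1;
}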
EIGEN_DEVICE_FUNC inline const NestByValue<Derived> nestByValue() const;
EIGEN_DEVICE_FUNC inline const ForceAlignedAccess<Derived> forceAlignedAccess() const;
EIGEN_DEVICE_FUNC inline ForceAlignedAccess<Derived> forceAlignedAccess();

View File

@ -97,8 +97,8 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const
{
eigen_internal_assert(row >= 0 && row < rows()
&& col >= 0 && col < cols());
return derived().coeff(row, col);
&& col >= 0 && col < cols());
return typename internal::evaluator<Derived>::type(derived()).coeff(row,col);
}
EIGEN_DEVICE_FUNC
@ -117,7 +117,7 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
{
eigen_assert(row >= 0 && row < rows()
&& col >= 0 && col < cols());
return derived().coeff(row, col);
return coeff(row, col);
}
/** Short version: don't use this function, use
@ -140,7 +140,7 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
coeff(Index index) const
{
eigen_internal_assert(index >= 0 && index < size());
return derived().coeff(index);
return typename internal::evaluator<Derived>::type(derived()).coeff(index);
}
@ -159,7 +159,7 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
EIGEN_STATIC_ASSERT(Derived::IsVectorAtCompileTime,
THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD)
eigen_assert(index >= 0 && index < size());
return derived().coeff(index);
return coeff(index);
}
/** \returns the coefficient at given index.
@ -177,7 +177,7 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
operator()(Index index) const
{
eigen_assert(index >= 0 && index < size());
return derived().coeff(index);
return coeff(index);
}
/** equivalent to operator[](0). */
@ -217,9 +217,8 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
template<int LoadMode>
EIGEN_STRONG_INLINE PacketReturnType packet(Index row, Index col) const
{
eigen_internal_assert(row >= 0 && row < rows()
&& col >= 0 && col < cols());
return derived().template packet<LoadMode>(row,col);
eigen_internal_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
return typename internal::evaluator<Derived>::type(derived()).template packet<LoadMode>(row,col);
}
@ -245,7 +244,7 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
eigen_internal_assert(index >= 0 && index < size());
return derived().template packet<LoadMode>(index);
return typename internal::evaluator<Derived>::type(derived()).template packet<LoadMode>(index);
}
protected:
@ -325,8 +324,8 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col)
{
eigen_internal_assert(row >= 0 && row < rows()
&& col >= 0 && col < cols());
return derived().coeffRef(row, col);
&& col >= 0 && col < cols());
return typename internal::evaluator<Derived>::type(derived()).coeffRef(row,col);
}
EIGEN_DEVICE_FUNC
@ -348,7 +347,7 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
{
eigen_assert(row >= 0 && row < rows()
&& col >= 0 && col < cols());
return derived().coeffRef(row, col);
return coeffRef(row, col);
}
@ -372,7 +371,7 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
coeffRef(Index index)
{
eigen_internal_assert(index >= 0 && index < size());
return derived().coeffRef(index);
return typename internal::evaluator<Derived>::type(derived()).coeffRef(index);
}
/** \returns a reference to the coefficient at given index.
@ -389,7 +388,7 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
EIGEN_STATIC_ASSERT(Derived::IsVectorAtCompileTime,
THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD)
eigen_assert(index >= 0 && index < size());
return derived().coeffRef(index);
return coeffRef(index);
}
/** \returns a reference to the coefficient at given index.
@ -406,7 +405,7 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
operator()(Index index)
{
eigen_assert(index >= 0 && index < size());
return derived().coeffRef(index);
return coeffRef(index);
}
/** equivalent to operator[](0). */
@ -432,144 +431,6 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar&
w() { return (*this)[3]; }
/** \internal
* Stores the given packet of coefficients, at the given row and column of this expression. It is your responsibility
* to ensure that a packet really starts there. This method is only available on expressions having the
* PacketAccessBit.
*
* The \a LoadMode parameter may have the value \a #Aligned or \a #Unaligned. Its effect is to select
* the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
* starting at an address which is a multiple of the packet size.
*/
template<int StoreMode>
EIGEN_STRONG_INLINE void writePacket
(Index row, Index col, const typename internal::packet_traits<Scalar>::type& val)
{
eigen_internal_assert(row >= 0 && row < rows()
&& col >= 0 && col < cols());
derived().template writePacket<StoreMode>(row,col,val);
}
/** \internal */
template<int StoreMode>
EIGEN_STRONG_INLINE void writePacketByOuterInner
(Index outer, Index inner, const typename internal::packet_traits<Scalar>::type& val)
{
writePacket<StoreMode>(rowIndexByOuterInner(outer, inner),
colIndexByOuterInner(outer, inner),
val);
}
/** \internal
* Stores the given packet of coefficients, at the given index in this expression. It is your responsibility
* to ensure that a packet really starts there. This method is only available on expressions having the
* PacketAccessBit and the LinearAccessBit.
*
* The \a LoadMode parameter may have the value \a Aligned or \a Unaligned. Its effect is to select
* the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
* starting at an address which is a multiple of the packet size.
*/
template<int StoreMode>
EIGEN_STRONG_INLINE void writePacket
(Index index, const typename internal::packet_traits<Scalar>::type& val)
{
eigen_internal_assert(index >= 0 && index < size());
derived().template writePacket<StoreMode>(index,val);
}
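These methods thinly wrap the internal packet primitives; a standalone sketch (not part of the patch, and internal API, so subject to change) of the aligned/unaligned distinction the comments above describe:

#include <Eigen/Core>

int main() {
  typedef Eigen::internal::packet_traits<float>::type Packet;
  EIGEN_ALIGN16 float buf[16] = {};
  Packet p = Eigen::internal::pset1<Packet>(3.f);  // broadcast 3.f into a packet
  Eigen::internal::pstore(buf, p);       // aligned store, like writePacket<Aligned>
  Eigen::internal::pstoreu(buf + 1, p);  // unaligned store, like writePacket<Unaligned>
  return buf[0] == 3.f ? 0 : 1;
}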
#ifndef EIGEN_PARSED_BY_DOXYGEN
/** \internal Copies the coefficient at position (row,col) of other into *this.
*
* This method is overridden in SwapWrapper, allowing swap() assignments to share 99% of their code
* with usual assignments.
*
* Outside of this internal usage, this method has probably no usefulness. It is hidden in the public API dox.
*/
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void copyCoeff(Index row, Index col, const DenseBase<OtherDerived>& other)
{
eigen_internal_assert(row >= 0 && row < rows()
&& col >= 0 && col < cols());
derived().coeffRef(row, col) = other.derived().coeff(row, col);
}
/** \internal Copies the coefficient at the given index of other into *this.
*
* This method is overridden in SwapWrapper, allowing swap() assignments to share 99% of their code
* with usual assignments.
*
* Outside of this internal usage, this method has probably no usefulness. It is hidden in the public API dox.
*/
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void copyCoeff(Index index, const DenseBase<OtherDerived>& other)
{
eigen_internal_assert(index >= 0 && index < size());
derived().coeffRef(index) = other.derived().coeff(index);
}
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void copyCoeffByOuterInner(Index outer, Index inner, const DenseBase<OtherDerived>& other)
{
const Index row = rowIndexByOuterInner(outer,inner);
const Index col = colIndexByOuterInner(outer,inner);
// derived() is important here: copyCoeff() may be reimplemented in Derived!
derived().copyCoeff(row, col, other);
}
/** \internal Copies the packet at position (row,col) of other into *this.
*
* This method is overridden in SwapWrapper, allowing swap() assignments to share 99% of their code
* with usual assignments.
*
* Outside of this internal usage, this method has probably no usefulness. It is hidden in the public API dox.
*/
template<typename OtherDerived, int StoreMode, int LoadMode>
EIGEN_STRONG_INLINE void copyPacket(Index row, Index col, const DenseBase<OtherDerived>& other)
{
eigen_internal_assert(row >= 0 && row < rows()
&& col >= 0 && col < cols());
derived().template writePacket<StoreMode>(row, col,
other.derived().template packet<LoadMode>(row, col));
}
/** \internal Copies the packet at the given index of other into *this.
*
* This method is overridden in SwapWrapper, allowing swap() assignments to share 99% of their code
* with usual assignments.
*
* Outside of this internal usage, this method has probably no usefulness. It is hidden in the public API dox.
*/
template<typename OtherDerived, int StoreMode, int LoadMode>
EIGEN_STRONG_INLINE void copyPacket(Index index, const DenseBase<OtherDerived>& other)
{
eigen_internal_assert(index >= 0 && index < size());
derived().template writePacket<StoreMode>(index,
other.derived().template packet<LoadMode>(index));
}
/** \internal */
template<typename OtherDerived, int StoreMode, int LoadMode>
EIGEN_STRONG_INLINE void copyPacketByOuterInner(Index outer, Index inner, const DenseBase<OtherDerived>& other)
{
const Index row = rowIndexByOuterInner(outer,inner);
const Index col = colIndexByOuterInner(outer,inner);
// derived() is important here: copyCoeff() may be reimplemented in Derived!
derived().template copyPacket< OtherDerived, StoreMode, LoadMode>(row, col, other);
}
#endif
};
/** \brief Base class providing direct read-only coefficient access to matrices and arrays.

View File

@ -130,7 +130,7 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
public:
EIGEN_DEVICE_FUNC DenseStorage() {}
EIGEN_DEVICE_FUNC
DenseStorage(internal::constructor_without_unaligned_array_assert)
explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
: m_data(internal::constructor_without_unaligned_array_assert()) {}
EIGEN_DEVICE_FUNC
DenseStorage(const DenseStorage& other) : m_data(other.m_data) {}
@ -155,7 +155,7 @@ template<typename T, int _Rows, int _Cols, int _Options> class DenseStorage<T, 0
{
public:
EIGEN_DEVICE_FUNC DenseStorage() {}
EIGEN_DEVICE_FUNC DenseStorage(internal::constructor_without_unaligned_array_assert) {}
EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert) {}
EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage&) {}
EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage&) { return *this; }
EIGEN_DEVICE_FUNC DenseStorage(DenseIndex,DenseIndex,DenseIndex) {}
@ -186,7 +186,7 @@ template<typename T, int Size, int _Options> class DenseStorage<T, Size, Dynamic
DenseIndex m_cols;
public:
EIGEN_DEVICE_FUNC DenseStorage() : m_rows(0), m_cols(0) {}
DenseStorage(internal::constructor_without_unaligned_array_assert)
explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
: m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0), m_cols(0) {}
DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_rows(other.m_rows), m_cols(other.m_cols) {}
DenseStorage& operator=(const DenseStorage& other)
@ -217,7 +217,7 @@ template<typename T, int Size, int _Cols, int _Options> class DenseStorage<T, Si
DenseIndex m_rows;
public:
EIGEN_DEVICE_FUNC DenseStorage() : m_rows(0) {}
DenseStorage(internal::constructor_without_unaligned_array_assert)
explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
: m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0) {}
DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_rows(other.m_rows) {}
DenseStorage& operator=(const DenseStorage& other)
@ -246,7 +246,7 @@ template<typename T, int Size, int _Rows, int _Options> class DenseStorage<T, Si
DenseIndex m_cols;
public:
EIGEN_DEVICE_FUNC DenseStorage() : m_cols(0) {}
DenseStorage(internal::constructor_without_unaligned_array_assert)
explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
: m_data(internal::constructor_without_unaligned_array_assert()), m_cols(0) {}
DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_cols(other.m_cols) {}
DenseStorage& operator=(const DenseStorage& other)
@ -276,7 +276,7 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
DenseIndex m_cols;
public:
EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_rows(0), m_cols(0) {}
DenseStorage(internal::constructor_without_unaligned_array_assert)
explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
: m_data(0), m_rows(0), m_cols(0) {}
DenseStorage(DenseIndex size, DenseIndex nbRows, DenseIndex nbCols)
: m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(nbRows), m_cols(nbCols)
@ -350,7 +350,7 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
DenseIndex m_cols;
public:
EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_cols(0) {}
DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_cols(0) {}
explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_cols(0) {}
DenseStorage(DenseIndex size, DenseIndex, DenseIndex nbCols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_cols(nbCols)
{ EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN }
DenseStorage(const DenseStorage& other)
@ -416,7 +416,7 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
DenseIndex m_rows;
public:
EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_rows(0) {}
DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_rows(0) {}
explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_rows(0) {}
DenseStorage(DenseIndex size, DenseIndex nbRows, DenseIndex) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(nbRows)
{ EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN }
DenseStorage(const DenseStorage& other)

View File

@ -52,8 +52,7 @@ struct traits<Diagonal<MatrixType,DiagIndex> >
MatrixType::MaxColsAtCompileTime - EIGEN_PLAIN_ENUM_MAX( DiagIndex, 0))),
MaxColsAtCompileTime = 1,
MaskLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
Flags = (unsigned int)_MatrixTypeNested::Flags & (HereditaryBits | LinearAccessBit | MaskLvalueBit | DirectAccessBit) & ~RowMajorBit,
CoeffReadCost = _MatrixTypeNested::CoeffReadCost,
Flags = (unsigned int)_MatrixTypeNested::Flags & (RowMajorBit | MaskLvalueBit | DirectAccessBit) & ~RowMajorBit, // FIXME DirectAccessBit should not be handled by expressions
MatrixTypeOuterStride = outer_stride_at_compile_time<MatrixType>::ret,
InnerStrideAtCompileTime = MatrixTypeOuterStride == Dynamic ? Dynamic : MatrixTypeOuterStride+1,
OuterStrideAtCompileTime = 0
@ -71,17 +70,15 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
EIGEN_DENSE_PUBLIC_INTERFACE(Diagonal)
EIGEN_DEVICE_FUNC
inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index) {}
explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index) {}
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Diagonal)
EIGEN_DEVICE_FUNC
inline Index rows() const
{
EIGEN_USING_STD_MATH(min);
return m_index.value()<0 ? (min)(Index(m_matrix.cols()),Index(m_matrix.rows()+m_index.value()))
: (min)(Index(m_matrix.rows()),Index(m_matrix.cols()-m_index.value()));
return m_index.value()<0 ? numext::mini(Index(m_matrix.cols()),Index(m_matrix.rows()+m_index.value()))
: numext::mini(Index(m_matrix.rows()),Index(m_matrix.cols()-m_index.value()));
}
EIGEN_DEVICE_FUNC
@ -149,14 +146,14 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
}
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename MatrixType::Nested>::type&
inline const typename internal::remove_all<typename MatrixType::Nested>::type&
nestedExpression() const
{
return m_matrix;
}
EIGEN_DEVICE_FUNC
int index() const
inline Index index() const
{
return m_index.value();
}
@ -173,7 +170,7 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
EIGEN_STRONG_INLINE Index rowOffset() const { return m_index.value()>0 ? 0 : -m_index.value(); }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Index colOffset() const { return m_index.value()>0 ? m_index.value() : 0; }
// triger a compile time error is someone try to call packet
// trigger a compile-time error if someone tries to call packet
template<int LoadMode> typename MatrixType::PacketReturnType packet(Index) const;
template<int LoadMode> typename MatrixType::PacketReturnType packet(Index,Index) const;
};
@ -190,7 +187,7 @@ template<typename Derived>
inline typename MatrixBase<Derived>::DiagonalReturnType
MatrixBase<Derived>::diagonal()
{
return derived();
return DiagonalReturnType(derived());
}
/** This is the const version of diagonal(). */
@ -213,18 +210,18 @@ MatrixBase<Derived>::diagonal() const
*
* \sa MatrixBase::diagonal(), class Diagonal */
template<typename Derived>
inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<DynamicIndex>::Type
inline typename MatrixBase<Derived>::DiagonalDynamicIndexReturnType
MatrixBase<Derived>::diagonal(Index index)
{
return typename DiagonalIndexReturnType<DynamicIndex>::Type(derived(), index);
return DiagonalDynamicIndexReturnType(derived(), index);
}
/** This is the const version of diagonal(Index). */
template<typename Derived>
inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<DynamicIndex>::Type
inline typename MatrixBase<Derived>::ConstDiagonalDynamicIndexReturnType
MatrixBase<Derived>::diagonal(Index index) const
{
return typename ConstDiagonalIndexReturnType<DynamicIndex>::Type(derived(), index);
return ConstDiagonalDynamicIndexReturnType(derived(), index);
}
/** \returns an expression of the \a DiagIndex-th sub or super diagonal of the matrix \c *this
@ -239,20 +236,20 @@ MatrixBase<Derived>::diagonal(Index index) const
*
* \sa MatrixBase::diagonal(), class Diagonal */
template<typename Derived>
template<int Index>
inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<Index>::Type
template<int Index_>
inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<Index_>::Type
MatrixBase<Derived>::diagonal()
{
return derived();
return typename DiagonalIndexReturnType<Index_>::Type(derived());
}
/** This is the const version of diagonal<int>(). */
template<typename Derived>
template<int Index>
inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<Index>::Type
template<int Index_>
inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<Index_>::Type
MatrixBase<Derived>::diagonal() const
{
return derived();
return typename ConstDiagonalIndexReturnType<Index_>::Type(derived());
}
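A usage sketch (not part of the patch) covering both the compile-time and the runtime-index accessors defined above:

#include <Eigen/Dense>

int main() {
  Eigen::Matrix3d m;
  m << 1, 2, 3,
       4, 5, 6,
       7, 8, 9;
  double super = m.diagonal<1>().sum();  // 2 + 6, compile-time offset
  double sub   = m.diagonal(-1).sum();   // 4 + 8, runtime offset
  return (super == 8 && sub == 12) ? 0 : 1;
}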
} // end namespace Eigen

View File

@ -30,7 +30,7 @@ class DiagonalBase : public EigenBase<Derived>
MaxRowsAtCompileTime = DiagonalVectorType::MaxSizeAtCompileTime,
MaxColsAtCompileTime = DiagonalVectorType::MaxSizeAtCompileTime,
IsVectorAtCompileTime = 0,
Flags = 0
Flags = NoPreferredStorageOrderBit
};
typedef Matrix<Scalar, RowsAtCompileTime, ColsAtCompileTime, 0, MaxRowsAtCompileTime, MaxColsAtCompileTime> DenseMatrixType;
@ -44,18 +44,7 @@ class DiagonalBase : public EigenBase<Derived>
EIGEN_DEVICE_FUNC
DenseMatrixType toDenseMatrix() const { return derived(); }
template<typename DenseDerived>
EIGEN_DEVICE_FUNC
void evalTo(MatrixBase<DenseDerived> &other) const;
template<typename DenseDerived>
EIGEN_DEVICE_FUNC
void addTo(MatrixBase<DenseDerived> &other) const
{ other.diagonal() += diagonal(); }
template<typename DenseDerived>
EIGEN_DEVICE_FUNC
void subTo(MatrixBase<DenseDerived> &other) const
{ other.diagonal() -= diagonal(); }
EIGEN_DEVICE_FUNC
inline const DiagonalVectorType& diagonal() const { return derived().diagonal(); }
EIGEN_DEVICE_FUNC
@ -66,44 +55,37 @@ class DiagonalBase : public EigenBase<Derived>
EIGEN_DEVICE_FUNC
inline Index cols() const { return diagonal().size(); }
/** \returns the diagonal matrix product of \c *this by the matrix \a matrix.
*/
template<typename MatrixDerived>
EIGEN_DEVICE_FUNC
const DiagonalProduct<MatrixDerived, Derived, OnTheLeft>
const Product<Derived,MatrixDerived,LazyProduct>
operator*(const MatrixBase<MatrixDerived> &matrix) const
{
return DiagonalProduct<MatrixDerived, Derived, OnTheLeft>(matrix.derived(), derived());
return Product<Derived, MatrixDerived, LazyProduct>(derived(),matrix.derived());
}
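Both sides of a diagonal product now build a lazy Product expression; a sketch (not part of the patch):

#include <Eigen/Dense>

int main() {
  Eigen::Vector3d d(1, 2, 3);
  Eigen::Matrix3d m = Eigen::Matrix3d::Ones();
  Eigen::Matrix3d lhs = d.asDiagonal() * m;  // scales row i of m by d(i)
  Eigen::Matrix3d rhs = m * d.asDiagonal();  // scales column j of m by d(j)
  return (lhs(2, 0) == 3 && rhs(0, 2) == 3) ? 0 : 1;
}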
typedef DiagonalWrapper<const CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const DiagonalVectorType> > InverseReturnType;
EIGEN_DEVICE_FUNC
inline const DiagonalWrapper<const CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const DiagonalVectorType> >
inline const InverseReturnType
inverse() const
{
return diagonal().cwiseInverse();
return InverseReturnType(diagonal().cwiseInverse());
}
typedef DiagonalWrapper<const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DiagonalVectorType> > ScalarMultipleReturnType;
EIGEN_DEVICE_FUNC
inline const DiagonalWrapper<const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DiagonalVectorType> >
inline const ScalarMultipleReturnType
operator*(const Scalar& scalar) const
{
return diagonal() * scalar;
return ScalarMultipleReturnType(diagonal() * scalar);
}
EIGEN_DEVICE_FUNC
friend inline const DiagonalWrapper<const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DiagonalVectorType> >
friend inline const ScalarMultipleReturnType
operator*(const Scalar& scalar, const DiagonalBase& other)
{
return other.diagonal() * scalar;
return ScalarMultipleReturnType(other.diagonal() * scalar);
}
};
template<typename Derived>
template<typename DenseDerived>
void DiagonalBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
{
other.setZero();
other.diagonal() = diagonal();
}
#endif
/** \class DiagonalMatrix
@ -125,10 +107,10 @@ struct traits<DiagonalMatrix<_Scalar,SizeAtCompileTime,MaxSizeAtCompileTime> >
: traits<Matrix<_Scalar,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
{
typedef Matrix<_Scalar,SizeAtCompileTime,1,0,MaxSizeAtCompileTime,1> DiagonalVectorType;
typedef Dense StorageKind;
typedef DiagonalShape StorageKind;
typedef DenseIndex Index;
enum {
Flags = LvalueBit
Flags = LvalueBit | NoPreferredStorageOrderBit
};
};
}
@ -164,7 +146,7 @@ class DiagonalMatrix
/** Constructs a diagonal matrix with given dimension */
EIGEN_DEVICE_FUNC
inline DiagonalMatrix(Index dim) : m_diagonal(dim) {}
explicit inline DiagonalMatrix(Index dim) : m_diagonal(dim) {}
/** 2D constructor. */
EIGEN_DEVICE_FUNC
@ -249,13 +231,14 @@ struct traits<DiagonalWrapper<_DiagonalVectorType> >
typedef _DiagonalVectorType DiagonalVectorType;
typedef typename DiagonalVectorType::Scalar Scalar;
typedef typename DiagonalVectorType::Index Index;
typedef typename DiagonalVectorType::StorageKind StorageKind;
typedef DiagonalShape StorageKind;
typedef typename traits<DiagonalVectorType>::XprKind XprKind;
enum {
RowsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
ColsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
MaxRowsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
MaxColsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
Flags = traits<DiagonalVectorType>::Flags & LvalueBit
MaxRowsAtCompileTime = DiagonalVectorType::MaxSizeAtCompileTime,
MaxColsAtCompileTime = DiagonalVectorType::MaxSizeAtCompileTime,
Flags = (traits<DiagonalVectorType>::Flags & LvalueBit) | NoPreferredStorageOrderBit
};
};
}
@ -272,7 +255,7 @@ class DiagonalWrapper
/** Constructor from expression of diagonal coefficients to wrap. */
EIGEN_DEVICE_FUNC
inline DiagonalWrapper(DiagonalVectorType& a_diagonal) : m_diagonal(a_diagonal) {}
explicit inline DiagonalWrapper(DiagonalVectorType& a_diagonal) : m_diagonal(a_diagonal) {}
/** \returns a const reference to the wrapped expression of diagonal coefficients. */
EIGEN_DEVICE_FUNC
@ -295,7 +278,7 @@ template<typename Derived>
inline const DiagonalWrapper<const Derived>
MatrixBase<Derived>::asDiagonal() const
{
return derived();
return DiagonalWrapper<const Derived>(derived());
}
/** \returns true if *this is approximately equal to a diagonal matrix,
@ -326,6 +309,33 @@ bool MatrixBase<Derived>::isDiagonal(const RealScalar& prec) const
return true;
}
namespace internal {
template<> struct storage_kind_to_shape<DiagonalShape> { typedef DiagonalShape Shape; };
struct Diagonal2Dense {};
template<> struct AssignmentKind<DenseShape,DiagonalShape> { typedef Diagonal2Dense Kind; };
// Diagonal matrix to Dense assignment
template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Dense, Scalar>
{
static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/)
{
dst.setZero();
dst.diagonal() = src.diagonal();
}
static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar> &/*func*/)
{ dst.diagonal() += src.diagonal(); }
static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar> &/*func*/)
{ dst.diagonal() -= src.diagonal(); }
};
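In user code the three run() overloads correspond to plain, compound-add, and compound-subtract assignment from a diagonal expression; a sketch (not part of the patch):

#include <Eigen/Dense>

int main() {
  Eigen::Vector3d d(1, 2, 3);
  Eigen::Matrix3d m = Eigen::Matrix3d::Ones();
  m = d.asDiagonal();   // setZero() then copy of the diagonal
  m += d.asDiagonal();  // touches only the diagonal
  return (m(0, 0) == 2 && m(0, 1) == 0) ? 0 : 1;
}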
} // namespace internal
} // end namespace Eigen
#endif // EIGEN_DIAGONALMATRIX_H

View File

@ -13,116 +13,14 @@
namespace Eigen {
namespace internal {
template<typename MatrixType, typename DiagonalType, int ProductOrder>
struct traits<DiagonalProduct<MatrixType, DiagonalType, ProductOrder> >
: traits<MatrixType>
{
typedef typename scalar_product_traits<typename MatrixType::Scalar, typename DiagonalType::Scalar>::ReturnType Scalar;
enum {
RowsAtCompileTime = MatrixType::RowsAtCompileTime,
ColsAtCompileTime = MatrixType::ColsAtCompileTime,
MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
_StorageOrder = MatrixType::Flags & RowMajorBit ? RowMajor : ColMajor,
_ScalarAccessOnDiag = !((int(_StorageOrder) == ColMajor && int(ProductOrder) == OnTheLeft)
||(int(_StorageOrder) == RowMajor && int(ProductOrder) == OnTheRight)),
_SameTypes = is_same<typename MatrixType::Scalar, typename DiagonalType::Scalar>::value,
// FIXME currently we need same types, but in the future the next rule should be the one
//_Vectorizable = bool(int(MatrixType::Flags)&PacketAccessBit) && ((!_PacketOnDiag) || (_SameTypes && bool(int(DiagonalType::DiagonalVectorType::Flags)&PacketAccessBit))),
_Vectorizable = bool(int(MatrixType::Flags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagonalType::DiagonalVectorType::Flags)&PacketAccessBit))),
_LinearAccessMask = (RowsAtCompileTime==1 || ColsAtCompileTime==1) ? LinearAccessBit : 0,
Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixType::Flags)) | (_Vectorizable ? PacketAccessBit : 0) | AlignedBit,//(int(MatrixType::Flags)&int(DiagonalType::DiagonalVectorType::Flags)&AlignedBit),
CoeffReadCost = NumTraits<Scalar>::MulCost + MatrixType::CoeffReadCost + DiagonalType::DiagonalVectorType::CoeffReadCost
};
};
}
template<typename MatrixType, typename DiagonalType, int ProductOrder>
class DiagonalProduct : internal::no_assignment_operator,
public MatrixBase<DiagonalProduct<MatrixType, DiagonalType, ProductOrder> >
{
public:
typedef MatrixBase<DiagonalProduct> Base;
EIGEN_DENSE_PUBLIC_INTERFACE(DiagonalProduct)
inline DiagonalProduct(const MatrixType& matrix, const DiagonalType& diagonal)
: m_matrix(matrix), m_diagonal(diagonal)
{
eigen_assert(diagonal.diagonal().size() == (ProductOrder == OnTheLeft ? matrix.rows() : matrix.cols()));
}
EIGEN_STRONG_INLINE Index rows() const { return m_matrix.rows(); }
EIGEN_STRONG_INLINE Index cols() const { return m_matrix.cols(); }
EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
{
return m_diagonal.diagonal().coeff(ProductOrder == OnTheLeft ? row : col) * m_matrix.coeff(row, col);
}
EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const
{
enum {
StorageOrder = int(MatrixType::Flags) & RowMajorBit ? RowMajor : ColMajor
};
return coeff(int(StorageOrder)==ColMajor?idx:0,int(StorageOrder)==ColMajor?0:idx);
}
template<int LoadMode>
EIGEN_STRONG_INLINE PacketScalar packet(Index row, Index col) const
{
enum {
StorageOrder = Flags & RowMajorBit ? RowMajor : ColMajor
};
const Index indexInDiagonalVector = ProductOrder == OnTheLeft ? row : col;
return packet_impl<LoadMode>(row,col,indexInDiagonalVector,typename internal::conditional<
((int(StorageOrder) == RowMajor && int(ProductOrder) == OnTheLeft)
||(int(StorageOrder) == ColMajor && int(ProductOrder) == OnTheRight)), internal::true_type, internal::false_type>::type());
}
template<int LoadMode>
EIGEN_STRONG_INLINE PacketScalar packet(Index idx) const
{
enum {
StorageOrder = int(MatrixType::Flags) & RowMajorBit ? RowMajor : ColMajor
};
return packet<LoadMode>(int(StorageOrder)==ColMajor?idx:0,int(StorageOrder)==ColMajor?0:idx);
}
protected:
template<int LoadMode>
EIGEN_STRONG_INLINE PacketScalar packet_impl(Index row, Index col, Index id, internal::true_type) const
{
return internal::pmul(m_matrix.template packet<LoadMode>(row, col),
internal::pset1<PacketScalar>(m_diagonal.diagonal().coeff(id)));
}
template<int LoadMode>
EIGEN_STRONG_INLINE PacketScalar packet_impl(Index row, Index col, Index id, internal::false_type) const
{
enum {
InnerSize = (MatrixType::Flags & RowMajorBit) ? MatrixType::ColsAtCompileTime : MatrixType::RowsAtCompileTime,
DiagonalVectorPacketLoadMode = (LoadMode == Aligned && (((InnerSize%16) == 0) || (int(DiagonalType::DiagonalVectorType::Flags)&AlignedBit)==AlignedBit) ? Aligned : Unaligned)
};
return internal::pmul(m_matrix.template packet<LoadMode>(row, col),
m_diagonal.diagonal().template packet<DiagonalVectorPacketLoadMode>(id));
}
typename MatrixType::Nested m_matrix;
typename DiagonalType::Nested m_diagonal;
};
/** \returns the diagonal matrix product of \c *this by the diagonal matrix \a diagonal.
*/
template<typename Derived>
template<typename DiagonalDerived>
inline const DiagonalProduct<Derived, DiagonalDerived, OnTheRight>
inline const Product<Derived, DiagonalDerived, LazyProduct>
MatrixBase<Derived>::operator*(const DiagonalBase<DiagonalDerived> &a_diagonal) const
{
return DiagonalProduct<Derived, DiagonalDerived, OnTheRight>(derived(), a_diagonal.derived());
return Product<Derived, DiagonalDerived, LazyProduct>(derived(),a_diagonal.derived());
}
} // end namespace Eigen

View File

@ -113,8 +113,7 @@ template<typename Derived>
inline const typename MatrixBase<Derived>::PlainObject
MatrixBase<Derived>::normalized() const
{
typedef typename internal::nested<Derived>::type Nested;
typedef typename internal::remove_reference<Nested>::type _Nested;
typedef typename internal::nested_eval<Derived,2>::type _Nested;
_Nested n(derived());
return n / n.norm();
}
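A sketch (not part of the patch); nested_eval ensures the expression is evaluated only once before its two uses in the division:

#include <Eigen/Dense>
#include <cmath>

int main() {
  Eigen::Vector3d v(3, 0, 4);
  Eigen::Vector3d n = v.normalized();  // v is evaluated once, then divided by its norm
  return std::abs(n.norm() - 1.0) < 1e-12 ? 0 : 1;
}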
@ -206,8 +205,8 @@ template<typename OtherDerived>
bool MatrixBase<Derived>::isOrthogonal
(const MatrixBase<OtherDerived>& other, const RealScalar& prec) const
{
typename internal::nested<Derived,2>::type nested(derived());
typename internal::nested<OtherDerived,2>::type otherNested(other.derived());
typename internal::nested_eval<Derived,2>::type nested(derived());
typename internal::nested_eval<OtherDerived,2>::type otherNested(other.derived());
return numext::abs2(nested.dot(otherNested)) <= prec * prec * nested.squaredNorm() * otherNested.squaredNorm();
}
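A sketch (not part of the patch) of the criterion used above, |x.dot(y)|^2 <= prec^2 * |x|^2 * |y|^2:

#include <Eigen/Dense>

int main() {
  Eigen::Vector3d x = Eigen::Vector3d::UnitX();
  Eigen::Vector3d y = Eigen::Vector3d::UnitY();
  return (x.isOrthogonal(y) && !x.isOrthogonal(x + y)) ? 0 : 1;
}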

View File

@ -121,7 +121,7 @@ template<typename Derived>
template<typename OtherDerived>
Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived> &other)
{
other.derived().evalTo(derived());
call_assignment(derived(), other.derived());
return derived();
}
@ -129,7 +129,7 @@ template<typename Derived>
template<typename OtherDerived>
Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)
{
other.derived().addTo(derived());
call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar>());
return derived();
}
@ -137,7 +137,7 @@ template<typename Derived>
template<typename OtherDerived>
Derived& DenseBase<Derived>::operator-=(const EigenBase<OtherDerived> &other)
{
other.derived().subTo(derived());
call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar>());
return derived();
}

View File

@ -48,39 +48,39 @@ template<typename ExpressionType, unsigned int Added, unsigned int Removed> clas
ExpressionType, const ExpressionType&>::type ExpressionTypeNested;
typedef typename ExpressionType::InnerIterator InnerIterator;
inline Flagged(const ExpressionType& matrix) : m_matrix(matrix) {}
explicit inline Flagged(const ExpressionType& matrix) : m_matrix(matrix) {}
inline Index rows() const { return m_matrix.rows(); }
inline Index cols() const { return m_matrix.cols(); }
inline Index outerStride() const { return m_matrix.outerStride(); }
inline Index innerStride() const { return m_matrix.innerStride(); }
EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.rows(); }
EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.cols(); }
EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_matrix.outerStride(); }
EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_matrix.innerStride(); }
inline CoeffReturnType coeff(Index row, Index col) const
EIGEN_DEVICE_FUNC inline CoeffReturnType coeff(Index row, Index col) const
{
return m_matrix.coeff(row, col);
}
inline CoeffReturnType coeff(Index index) const
EIGEN_DEVICE_FUNC inline CoeffReturnType coeff(Index index) const
{
return m_matrix.coeff(index);
}
inline const Scalar& coeffRef(Index row, Index col) const
EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index row, Index col) const
{
return m_matrix.const_cast_derived().coeffRef(row, col);
}
inline const Scalar& coeffRef(Index index) const
EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index index) const
{
return m_matrix.const_cast_derived().coeffRef(index);
}
inline Scalar& coeffRef(Index row, Index col)
EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col)
{
return m_matrix.const_cast_derived().coeffRef(row, col);
}
inline Scalar& coeffRef(Index index)
EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index)
{
return m_matrix.const_cast_derived().coeffRef(index);
}
@ -109,13 +109,13 @@ template<typename ExpressionType, unsigned int Added, unsigned int Removed> clas
m_matrix.const_cast_derived().template writePacket<LoadMode>(index, x);
}
const ExpressionType& _expression() const { return m_matrix; }
EIGEN_DEVICE_FUNC const ExpressionType& _expression() const { return m_matrix; }
template<typename OtherDerived>
typename ExpressionType::PlainObject solveTriangular(const MatrixBase<OtherDerived>& other) const;
EIGEN_DEVICE_FUNC typename ExpressionType::PlainObject solveTriangular(const MatrixBase<OtherDerived>& other) const;
template<typename OtherDerived>
void solveTriangularInPlace(const MatrixBase<OtherDerived>& other) const;
EIGEN_DEVICE_FUNC void solveTriangularInPlace(const MatrixBase<OtherDerived>& other) const;
protected:
ExpressionTypeNested m_matrix;
@ -132,7 +132,7 @@ template<unsigned int Added,unsigned int Removed>
inline const Flagged<Derived, Added, Removed>
DenseBase<Derived>::flagged() const
{
return derived();
return Flagged<Derived, Added, Removed>(derived());
}
} // end namespace Eigen

View File

@ -39,29 +39,29 @@ template<typename ExpressionType> class ForceAlignedAccess
typedef typename internal::dense_xpr_base<ForceAlignedAccess>::type Base;
EIGEN_DENSE_PUBLIC_INTERFACE(ForceAlignedAccess)
inline ForceAlignedAccess(const ExpressionType& matrix) : m_expression(matrix) {}
EIGEN_DEVICE_FUNC explicit inline ForceAlignedAccess(const ExpressionType& matrix) : m_expression(matrix) {}
inline Index rows() const { return m_expression.rows(); }
inline Index cols() const { return m_expression.cols(); }
inline Index outerStride() const { return m_expression.outerStride(); }
inline Index innerStride() const { return m_expression.innerStride(); }
EIGEN_DEVICE_FUNC inline Index rows() const { return m_expression.rows(); }
EIGEN_DEVICE_FUNC inline Index cols() const { return m_expression.cols(); }
EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_expression.outerStride(); }
EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_expression.innerStride(); }
inline const CoeffReturnType coeff(Index row, Index col) const
EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const
{
return m_expression.coeff(row, col);
}
inline Scalar& coeffRef(Index row, Index col)
EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col)
{
return m_expression.const_cast_derived().coeffRef(row, col);
}
inline const CoeffReturnType coeff(Index index) const
EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index index) const
{
return m_expression.coeff(index);
}
inline Scalar& coeffRef(Index index)
EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index)
{
return m_expression.const_cast_derived().coeffRef(index);
}
@ -90,7 +90,7 @@ template<typename ExpressionType> class ForceAlignedAccess
m_expression.const_cast_derived().template writePacket<Aligned>(index, x);
}
operator const ExpressionType&() const { return m_expression; }
EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; }
protected:
const ExpressionType& m_expression;
@ -127,7 +127,7 @@ template<bool Enable>
inline typename internal::add_const_on_value_type<typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type>::type
MatrixBase<Derived>::forceAlignedAccessIf() const
{
return derived();
return derived(); // FIXME This should not work but apparently is never used
}
/** \returns an expression of *this with forced aligned access if \a Enable is true.
@ -138,7 +138,7 @@ template<bool Enable>
inline typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type
MatrixBase<Derived>::forceAlignedAccessIf()
{
return derived();
return derived(); // FIXME This should not work but apparently is never used
}
} // end namespace Eigen

View File

@ -22,10 +22,9 @@ struct isApprox_selector
EIGEN_DEVICE_FUNC
static bool run(const Derived& x, const OtherDerived& y, const typename Derived::RealScalar& prec)
{
EIGEN_USING_STD_MATH(min);
typename internal::nested<Derived,2>::type nested(x);
typename internal::nested<OtherDerived,2>::type otherNested(y);
return (nested - otherNested).cwiseAbs2().sum() <= prec * prec * (min)(nested.cwiseAbs2().sum(), otherNested.cwiseAbs2().sum());
typename internal::nested_eval<Derived,2>::type nested(x);
typename internal::nested_eval<OtherDerived,2>::type otherNested(y);
return (nested - otherNested).cwiseAbs2().sum() <= prec * prec * numext::mini(nested.cwiseAbs2().sum(), otherNested.cwiseAbs2().sum());
}
};

View File

@ -13,28 +13,6 @@
namespace Eigen {
/** \class GeneralProduct
* \ingroup Core_Module
*
* \brief Expression of the product of two general matrices or vectors
*
* \param LhsNested the type used to store the left-hand side
* \param RhsNested the type used to store the right-hand side
* \param ProductMode the type of the product
*
* This class represents an expression of the product of two general matrices.
* We call a general matrix a dense matrix with full storage. For instance,
* this excludes triangular, selfadjoint, and sparse matrices.
* It is the return type of the operator* between general matrices. Its template
* arguments are determined automatically by ProductReturnType. Therefore,
* GeneralProduct should never be used directly. To determine the result type of a
* function which involves a matrix product, use ProductReturnType::Type.
*
* \sa ProductReturnType, MatrixBase::operator*(const MatrixBase<OtherDerived>&)
*/
template<typename Lhs, typename Rhs, int ProductType = internal::product_type<Lhs,Rhs>::value>
class GeneralProduct;
enum {
Large = 2,
Small = 3
@ -59,14 +37,14 @@ template<typename Lhs, typename Rhs> struct product_type
typedef typename remove_all<Lhs>::type _Lhs;
typedef typename remove_all<Rhs>::type _Rhs;
enum {
MaxRows = _Lhs::MaxRowsAtCompileTime,
Rows = _Lhs::RowsAtCompileTime,
MaxCols = _Rhs::MaxColsAtCompileTime,
Cols = _Rhs::ColsAtCompileTime,
MaxDepth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::MaxColsAtCompileTime,
_Rhs::MaxRowsAtCompileTime),
Depth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::ColsAtCompileTime,
_Rhs::RowsAtCompileTime)
MaxRows = traits<_Lhs>::MaxRowsAtCompileTime,
Rows = traits<_Lhs>::RowsAtCompileTime,
MaxCols = traits<_Rhs>::MaxColsAtCompileTime,
Cols = traits<_Rhs>::ColsAtCompileTime,
MaxDepth = EIGEN_SIZE_MIN_PREFER_FIXED(traits<_Lhs>::MaxColsAtCompileTime,
traits<_Rhs>::MaxRowsAtCompileTime),
Depth = EIGEN_SIZE_MIN_PREFER_FIXED(traits<_Lhs>::ColsAtCompileTime,
traits<_Rhs>::RowsAtCompileTime)
};
// the splitting into different lines of code here, introducing the _select enums and the typedef below,
@ -81,7 +59,8 @@ private:
public:
enum {
value = selector::ret
value = selector::ret,
ret = selector::ret
};
#ifdef EIGEN_DEBUG_PRODUCT
static void debug()
@ -97,6 +76,31 @@ public:
#endif
};
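product_type is internal, but its value can be inspected to see which kernel a given product will select; a sketch (not part of the patch, internal API):

#include <Eigen/Dense>
#include <iostream>

int main() {
  using namespace Eigen;
  // The selector maps the (rows, depth, cols) size pattern to a product kind.
  std::cout << internal::product_type<Matrix3d, Matrix3d>::value << "\n";  // small fixed sizes: coefficient-based path
  std::cout << internal::product_type<MatrixXd, VectorXd>::value << "\n";  // large matrix * vector: GEMV path
  std::cout << internal::product_type<MatrixXd, MatrixXd>::value << "\n";  // large matrix * matrix: GEMM path
  return 0;
}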
// template<typename Lhs, typename Rhs> struct product_tag
// {
// private:
//
// typedef typename remove_all<Lhs>::type _Lhs;
// typedef typename remove_all<Rhs>::type _Rhs;
// enum {
// Rows = _Lhs::RowsAtCompileTime,
// Cols = _Rhs::ColsAtCompileTime,
// Depth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::ColsAtCompileTime, _Rhs::RowsAtCompileTime)
// };
//
// enum {
// rows_select = Rows==1 ? int(Rows) : int(Large),
// cols_select = Cols==1 ? int(Cols) : int(Large),
// depth_select = Depth==1 ? int(Depth) : int(Large)
// };
// typedef product_type_selector<rows_select, cols_select, depth_select> selector;
//
// public:
// enum {
// ret = selector::ret
// };
//
// };
/* The following allows to select the kind of product at compile time
* based on the three dimensions of the product.
@ -127,54 +131,6 @@ template<> struct product_type_selector<Large,Large,Small> { enum
} // end namespace internal
/** \class ProductReturnType
* \ingroup Core_Module
*
* \brief Helper class to get the correct and optimized returned type of operator*
*
* \param Lhs the type of the left-hand side
* \param Rhs the type of the right-hand side
* \param ProductMode the type of the product (determined automatically by internal::product_mode)
*
* This class defines the typename Type representing the optimized product expression
* between two matrix expressions. In practice, using ProductReturnType<Lhs,Rhs>::Type
* is the recommended way to define the result type of a function returning an expression
 * which involves a matrix product. The class Product should never be
* used directly.
*
* \sa class Product, MatrixBase::operator*(const MatrixBase<OtherDerived>&)
*/
template<typename Lhs, typename Rhs, int ProductType>
struct ProductReturnType
{
// TODO use the nested type to reduce instantiations ????
// typedef typename internal::nested<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
// typedef typename internal::nested<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
typedef GeneralProduct<Lhs/*Nested*/, Rhs/*Nested*/, ProductType> Type;
};
template<typename Lhs, typename Rhs>
struct ProductReturnType<Lhs,Rhs,CoeffBasedProductMode>
{
typedef typename internal::nested<Lhs, Rhs::ColsAtCompileTime, typename internal::plain_matrix_type<Lhs>::type >::type LhsNested;
typedef typename internal::nested<Rhs, Lhs::RowsAtCompileTime, typename internal::plain_matrix_type<Rhs>::type >::type RhsNested;
typedef CoeffBasedProduct<LhsNested, RhsNested, EvalBeforeAssigningBit | EvalBeforeNestingBit> Type;
};
template<typename Lhs, typename Rhs>
struct ProductReturnType<Lhs,Rhs,LazyCoeffBasedProductMode>
{
typedef typename internal::nested<Lhs, Rhs::ColsAtCompileTime, typename internal::plain_matrix_type<Lhs>::type >::type LhsNested;
typedef typename internal::nested<Rhs, Lhs::RowsAtCompileTime, typename internal::plain_matrix_type<Rhs>::type >::type RhsNested;
typedef CoeffBasedProduct<LhsNested, RhsNested, NestByRefBit> Type;
};
// this is a workaround for sun CC
template<typename Lhs, typename Rhs>
struct LazyProductReturnType : public ProductReturnType<Lhs,Rhs,LazyCoeffBasedProductMode>
{};
/***********************************************************************
* Implementation of Inner Vector Vector Product
***********************************************************************/
@ -186,119 +142,10 @@ struct LazyProductReturnType : public ProductReturnType<Lhs,Rhs,LazyCoeffBasedPr
// product ends up as a row-vector times col-vector product... To tackle this use
// case, we could have a specialization for Block<MatrixType,1,1> with: operator=(Scalar x);
namespace internal {
template<typename Lhs, typename Rhs>
struct traits<GeneralProduct<Lhs,Rhs,InnerProduct> >
: traits<Matrix<typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType,1,1> >
{};
}
template<typename Lhs, typename Rhs>
class GeneralProduct<Lhs, Rhs, InnerProduct>
: internal::no_assignment_operator,
public Matrix<typename internal::scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType,1,1>
{
typedef Matrix<typename internal::scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType,1,1> Base;
public:
GeneralProduct(const Lhs& lhs, const Rhs& rhs)
{
EIGEN_STATIC_ASSERT((internal::is_same<typename Lhs::RealScalar, typename Rhs::RealScalar>::value),
YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
Base::coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum();
}
/** Conversion to scalar */
operator const typename Base::Scalar() const {
return Base::coeff(0,0);
}
};
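As the conversion operator above implies, a 1x1 inner-product expression is usable as a scalar. A minimal editorial sketch (assumes <Eigen/Dense>):

#include <Eigen/Dense>
int main() {
  Eigen::Vector3d a(1, 2, 3), b(4, 5, 6);
  double s = a.transpose() * b; // 1x1 product, converted to a scalar (32 here)
  double t = a.dot(b);          // the idiomatic equivalent
  return s == t ? 0 : 1;
}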
/***********************************************************************
* Implementation of Outer Vector Vector Product
***********************************************************************/
namespace internal {
// Column major
template<typename ProductType, typename Dest, typename Func>
EIGEN_DONT_INLINE void outer_product_selector_run(const ProductType& prod, Dest& dest, const Func& func, const false_type&)
{
typedef typename Dest::Index Index;
// FIXME make sure lhs is sequentially stored
// FIXME not very good if rhs is real and lhs complex while alpha is real too
const Index cols = dest.cols();
for (Index j=0; j<cols; ++j)
func(dest.col(j), prod.rhs().coeff(0,j) * prod.lhs());
}
// Row major
template<typename ProductType, typename Dest, typename Func>
EIGEN_DONT_INLINE void outer_product_selector_run(const ProductType& prod, Dest& dest, const Func& func, const true_type&) {
typedef typename Dest::Index Index;
// FIXME make sure rhs is sequentially stored
// FIXME not very good if lhs is real and rhs complex while alpha is real too
const Index rows = dest.rows();
for (Index i=0; i<rows; ++i)
func(dest.row(i), prod.lhs().coeff(i,0) * prod.rhs());
}
template<typename Lhs, typename Rhs>
struct traits<GeneralProduct<Lhs,Rhs,OuterProduct> >
: traits<ProductBase<GeneralProduct<Lhs,Rhs,OuterProduct>, Lhs, Rhs> >
{};
}
template<typename Lhs, typename Rhs>
class GeneralProduct<Lhs, Rhs, OuterProduct>
: public ProductBase<GeneralProduct<Lhs,Rhs,OuterProduct>, Lhs, Rhs>
{
template<typename T> struct IsRowMajor : internal::conditional<(int(T::Flags)&RowMajorBit), internal::true_type, internal::false_type>::type {};
public:
EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct)
GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
{
EIGEN_STATIC_ASSERT((internal::is_same<typename Lhs::RealScalar, typename Rhs::RealScalar>::value),
YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
}
struct set { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() = src; } };
struct add { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
struct sub { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } };
struct adds {
Scalar m_scale;
adds(const Scalar& s) : m_scale(s) {}
template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const {
dst.const_cast_derived() += m_scale * src;
}
};
template<typename Dest>
inline void evalTo(Dest& dest) const {
internal::outer_product_selector_run(*this, dest, set(), IsRowMajor<Dest>());
}
template<typename Dest>
inline void addTo(Dest& dest) const {
internal::outer_product_selector_run(*this, dest, add(), IsRowMajor<Dest>());
}
template<typename Dest>
inline void subTo(Dest& dest) const {
internal::outer_product_selector_run(*this, dest, sub(), IsRowMajor<Dest>());
}
template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
{
internal::outer_product_selector_run(*this, dest, adds(alpha), IsRowMajor<Dest>());
}
};
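The set/add/sub/adds functors above correspond to the ways a rank-1 update can be assigned; an editorial usage sketch (assumes <Eigen/Dense>):

#include <Eigen/Dense>
int main() {
  Eigen::Vector3d u(1, 0, 2);
  Eigen::RowVector2d v(3, 4);
  Eigen::Matrix<double, 3, 2> M = u * v; // evalTo: filled column by column (M is col-major)
  M += u * v;                            // addTo
  M += 2.0 * (u * v);                    // scaleAndAddTo with alpha = 2
  return 0;
}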
/***********************************************************************
* Implementation of General Matrix Vector Product
***********************************************************************/
@ -312,60 +159,13 @@ class GeneralProduct<Lhs, Rhs, OuterProduct>
*/
namespace internal {
template<typename Lhs, typename Rhs>
struct traits<GeneralProduct<Lhs,Rhs,GemvProduct> >
: traits<ProductBase<GeneralProduct<Lhs,Rhs,GemvProduct>, Lhs, Rhs> >
{};
template<int Side, int StorageOrder, bool BlasCompatible>
struct gemv_selector;
struct gemv_dense_sense_selector;
} // end namespace internal
template<typename Lhs, typename Rhs>
class GeneralProduct<Lhs, Rhs, GemvProduct>
: public ProductBase<GeneralProduct<Lhs,Rhs,GemvProduct>, Lhs, Rhs>
{
public:
EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct)
typedef typename Lhs::Scalar LhsScalar;
typedef typename Rhs::Scalar RhsScalar;
GeneralProduct(const Lhs& a_lhs, const Rhs& a_rhs) : Base(a_lhs,a_rhs)
{
// EIGEN_STATIC_ASSERT((internal::is_same<typename Lhs::Scalar, typename Rhs::Scalar>::value),
// YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
}
enum { Side = Lhs::IsVectorAtCompileTime ? OnTheLeft : OnTheRight };
typedef typename internal::conditional<int(Side)==OnTheRight,_LhsNested,_RhsNested>::type MatrixType;
template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
{
eigen_assert(m_lhs.rows() == dst.rows() && m_rhs.cols() == dst.cols());
internal::gemv_selector<Side,(int(MatrixType::Flags)&RowMajorBit) ? RowMajor : ColMajor,
bool(internal::blas_traits<MatrixType>::HasUsableDirectAccess)>::run(*this, dst, alpha);
}
};
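scaleAndAddTo above dispatches on the matrix operand's storage order and on whether a BLAS-compatible direct-access path exists; from the user's side it is simply (editorial sketch, assumes <Eigen/Dense>):

#include <Eigen/Dense>
int main() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(4, 3);
  Eigen::VectorXd x = Eigen::VectorXd::Random(3), y(4);
  y.noalias() = A * x;          // GemvProduct; ColMajor kernel for the default layout
  y.noalias() += 0.5 * (A * x); // scaleAndAddTo with alpha = 0.5
  return 0;
}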
namespace internal {
// The vector is on the left => transposition
template<int StorageOrder, bool BlasCompatible>
struct gemv_selector<OnTheLeft,StorageOrder,BlasCompatible>
{
template<typename ProductType, typename Dest>
static void run(const ProductType& prod, Dest& dest, const typename ProductType::Scalar& alpha)
{
Transpose<Dest> destT(dest);
enum { OtherStorageOrder = StorageOrder == RowMajor ? ColMajor : RowMajor };
gemv_selector<OnTheRight,OtherStorageOrder,BlasCompatible>
::run(GeneralProduct<Transpose<const typename ProductType::_RhsNested>,Transpose<const typename ProductType::_LhsNested>, GemvProduct>
(prod.rhs().transpose(), prod.lhs().transpose()), destT, alpha);
}
};
template<typename Scalar,int Size,int MaxSize,bool Cond> struct gemv_static_vector_if;
template<typename Scalar,int Size,int MaxSize>
@ -402,27 +202,43 @@ struct gemv_static_vector_if<Scalar,Size,MaxSize,true>
#endif
};
template<> struct gemv_selector<OnTheRight,ColMajor,true>
// The vector is on the left => transposition
template<int StorageOrder, bool BlasCompatible>
struct gemv_dense_sense_selector<OnTheLeft,StorageOrder,BlasCompatible>
{
template<typename ProductType, typename Dest>
static inline void run(const ProductType& prod, Dest& dest, const typename ProductType::Scalar& alpha)
template<typename Lhs, typename Rhs, typename Dest>
static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
{
typedef typename ProductType::Index Index;
typedef typename ProductType::LhsScalar LhsScalar;
typedef typename ProductType::RhsScalar RhsScalar;
typedef typename ProductType::Scalar ResScalar;
typedef typename ProductType::RealScalar RealScalar;
typedef typename ProductType::ActualLhsType ActualLhsType;
typedef typename ProductType::ActualRhsType ActualRhsType;
typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
typedef typename ProductType::RhsBlasTraits RhsBlasTraits;
Transpose<Dest> destT(dest);
enum { OtherStorageOrder = StorageOrder == RowMajor ? ColMajor : RowMajor };
gemv_dense_sense_selector<OnTheRight,OtherStorageOrder,BlasCompatible>
::run(rhs.transpose(), lhs.transpose(), destT, alpha);
}
};
template<> struct gemv_dense_sense_selector<OnTheRight,ColMajor,true>
{
template<typename Lhs, typename Rhs, typename Dest>
static inline void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
{
typedef typename Dest::Index Index;
typedef typename Lhs::Scalar LhsScalar;
typedef typename Rhs::Scalar RhsScalar;
typedef typename Dest::Scalar ResScalar;
typedef typename Dest::RealScalar RealScalar;
typedef internal::blas_traits<Lhs> LhsBlasTraits;
typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
typedef internal::blas_traits<Rhs> RhsBlasTraits;
typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
ActualLhsType actualLhs = LhsBlasTraits::extract(prod.lhs());
ActualRhsType actualRhs = RhsBlasTraits::extract(prod.rhs());
ActualLhsType actualLhs = LhsBlasTraits::extract(lhs);
ActualRhsType actualRhs = RhsBlasTraits::extract(rhs);
ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs())
* RhsBlasTraits::extractScalarFactor(prod.rhs());
ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs)
* RhsBlasTraits::extractScalarFactor(rhs);
enum {
// FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
@ -477,34 +293,35 @@ template<> struct gemv_selector<OnTheRight,ColMajor,true>
}
};
template<> struct gemv_selector<OnTheRight,RowMajor,true>
template<> struct gemv_dense_sense_selector<OnTheRight,RowMajor,true>
{
template<typename ProductType, typename Dest>
static void run(const ProductType& prod, Dest& dest, const typename ProductType::Scalar& alpha)
template<typename Lhs, typename Rhs, typename Dest>
static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
{
typedef typename ProductType::LhsScalar LhsScalar;
typedef typename ProductType::RhsScalar RhsScalar;
typedef typename ProductType::Scalar ResScalar;
typedef typename ProductType::Index Index;
typedef typename ProductType::ActualLhsType ActualLhsType;
typedef typename ProductType::ActualRhsType ActualRhsType;
typedef typename ProductType::_ActualRhsType _ActualRhsType;
typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
typedef typename ProductType::RhsBlasTraits RhsBlasTraits;
typedef typename Dest::Index Index;
typedef typename Lhs::Scalar LhsScalar;
typedef typename Rhs::Scalar RhsScalar;
typedef typename Dest::Scalar ResScalar;
typedef internal::blas_traits<Lhs> LhsBlasTraits;
typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
typedef internal::blas_traits<Rhs> RhsBlasTraits;
typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(prod.rhs());
typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);
ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs())
* RhsBlasTraits::extractScalarFactor(prod.rhs());
ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs)
* RhsBlasTraits::extractScalarFactor(rhs);
enum {
// FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
// on the other hand, it is good for the cache to pack the vector anyway...
DirectlyUseRhs = _ActualRhsType::InnerStrideAtCompileTime==1
DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1
};
gemv_static_vector_if<RhsScalar,_ActualRhsType::SizeAtCompileTime,_ActualRhsType::MaxSizeAtCompileTime,!DirectlyUseRhs> static_rhs;
gemv_static_vector_if<RhsScalar,ActualRhsTypeCleaned::SizeAtCompileTime,ActualRhsTypeCleaned::MaxSizeAtCompileTime,!DirectlyUseRhs> static_rhs;
ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhsPtr,actualRhs.size(),
DirectlyUseRhs ? const_cast<RhsScalar*>(actualRhs.data()) : static_rhs.data());
@ -515,7 +332,7 @@ template<> struct gemv_selector<OnTheRight,RowMajor,true>
Index size = actualRhs.size();
EIGEN_DENSE_STORAGE_CTOR_PLUGIN
#endif
Map<typename _ActualRhsType::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
}
typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper;
@ -530,29 +347,29 @@ template<> struct gemv_selector<OnTheRight,RowMajor,true>
}
};
template<> struct gemv_selector<OnTheRight,ColMajor,false>
template<> struct gemv_dense_sense_selector<OnTheRight,ColMajor,false>
{
template<typename ProductType, typename Dest>
static void run(const ProductType& prod, Dest& dest, const typename ProductType::Scalar& alpha)
template<typename Lhs, typename Rhs, typename Dest>
static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
{
typedef typename Dest::Index Index;
// TODO make sure dest is sequentially stored in memory, otherwise use a temp
const Index size = prod.rhs().rows();
const Index size = rhs.rows();
for(Index k=0; k<size; ++k)
dest += (alpha*prod.rhs().coeff(k)) * prod.lhs().col(k);
dest += (alpha*rhs.coeff(k)) * lhs.col(k);
}
};
template<> struct gemv_selector<OnTheRight,RowMajor,false>
template<> struct gemv_dense_sense_selector<OnTheRight,RowMajor,false>
{
template<typename ProductType, typename Dest>
static void run(const ProductType& prod, Dest& dest, const typename ProductType::Scalar& alpha)
template<typename Lhs, typename Rhs, typename Dest>
static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
{
typedef typename Dest::Index Index;
// TODO make sure rhs is sequentially stored in memory, otherwise use a temp
const Index rows = prod.rows();
const Index rows = dest.rows();
for(Index i=0; i<rows; ++i)
dest.coeffRef(i) += alpha * (prod.lhs().row(i).cwiseProduct(prod.rhs().transpose())).sum();
dest.coeffRef(i) += alpha * (lhs.row(i).cwiseProduct(rhs.transpose())).sum();
}
};
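The two non-BLAS fallbacks above are the classic axpy-style (column-major) and dot-style (row-major) loops. An editorial sketch checking that both agree with operator* (assumes <Eigen/Dense>):

#include <Eigen/Dense>
#include <cassert>
int main() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(5, 4);
  Eigen::VectorXd x = Eigen::VectorXd::Random(4);
  Eigen::VectorXd y1 = Eigen::VectorXd::Zero(5), y2 = y1;
  for (int k = 0; k < A.cols(); ++k) y1 += x(k) * A.col(k);    // ColMajor strategy
  for (int i = 0; i < A.rows(); ++i) y2(i) += A.row(i).dot(x); // RowMajor strategy
  assert(y1.isApprox(A * x) && y2.isApprox(A * x));
  return 0;
}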
@ -570,7 +387,6 @@ template<> struct gemv_selector<OnTheRight,RowMajor,false>
*/
#ifndef __CUDACC__
#ifdef EIGEN_TEST_EVALUATORS
template<typename Derived>
template<typename OtherDerived>
inline const Product<Derived, OtherDerived>
@ -601,39 +417,9 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
return Product<Derived, OtherDerived>(derived(), other.derived());
}
#else
template<typename Derived>
template<typename OtherDerived>
inline const typename ProductReturnType<Derived, OtherDerived>::Type
MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
{
// A note regarding the function declaration: In MSVC, this function will sometimes
// not be inlined since DenseStorage is an unwindable object for dynamic
// matrices and product types are holding a member to store the result.
// Thus it does not help tagging this function with EIGEN_STRONG_INLINE.
enum {
ProductIsValid = Derived::ColsAtCompileTime==Dynamic
|| OtherDerived::RowsAtCompileTime==Dynamic
|| int(Derived::ColsAtCompileTime)==int(OtherDerived::RowsAtCompileTime),
AreVectors = Derived::IsVectorAtCompileTime && OtherDerived::IsVectorAtCompileTime,
SameSizes = EIGEN_PREDICATE_SAME_MATRIX_SIZE(Derived,OtherDerived)
};
// note to the lost user:
// * for a dot product use: v1.dot(v2)
// * for a coeff-wise product use: v1.cwiseProduct(v2)
EIGEN_STATIC_ASSERT(ProductIsValid || !(AreVectors && SameSizes),
INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS)
EIGEN_STATIC_ASSERT(ProductIsValid || !(SameSizes && !AreVectors),
INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION)
EIGEN_STATIC_ASSERT(ProductIsValid || SameSizes, INVALID_MATRIX_PRODUCT)
#ifdef EIGEN_DEBUG_PRODUCT
internal::product_type<Derived,OtherDerived>::debug();
#endif
return typename ProductReturnType<Derived,OtherDerived>::Type(derived(), other.derived());
}
#endif
#endif
#endif // __CUDACC__
/** \returns an expression of the matrix product of \c *this and \a other without implicit evaluation.
*
* The returned product will behave like any other expressions: the coefficients of the product will be
@ -647,7 +433,7 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
*/
template<typename Derived>
template<typename OtherDerived>
const typename LazyProductReturnType<Derived,OtherDerived>::Type
const Product<Derived,OtherDerived,LazyProduct>
MatrixBase<Derived>::lazyProduct(const MatrixBase<OtherDerived> &other) const
{
enum {
@ -666,7 +452,7 @@ MatrixBase<Derived>::lazyProduct(const MatrixBase<OtherDerived> &other) const
INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION)
EIGEN_STATIC_ASSERT(ProductIsValid || SameSizes, INVALID_MATRIX_PRODUCT)
return typename LazyProductReturnType<Derived,OtherDerived>::Type(derived(), other.derived());
return Product<Derived,OtherDerived,LazyProduct>(derived(), other.derived());
}
} // end namespace Eigen
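For reference, the user-visible difference between the two entry points (editorial sketch, assumes <Eigen/Dense>):

#include <Eigen/Dense>
int main() {
  Eigen::Matrix2d A = Eigen::Matrix2d::Random(), B = Eigen::Matrix2d::Random();
  Eigen::Matrix2d C = A * B;            // full product path, may go through a temporary
  Eigen::Matrix2d D = A.lazyProduct(B); // coefficient-based evaluation, no temporary
  return C.isApprox(D) ? 0 : 1;
}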
View File
@ -129,12 +129,12 @@ pdiv(const Packet& a,
/** \internal \returns the min of \a a and \a b (coeff-wise) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
pmin(const Packet& a,
const Packet& b) { EIGEN_USING_STD_MATH(min); return (min)(a, b); }
const Packet& b) { return numext::mini(a, b); }
/** \internal \returns the max of \a a and \a b (coeff-wise) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
pmax(const Packet& a,
const Packet& b) { EIGEN_USING_STD_MATH(max); return (max)(a, b); }
const Packet& b) { return numext::maxi(a, b); }
/** \internal \returns the absolute value of \a a */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
@ -245,8 +245,8 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu
/** \internal tries to do cache prefetching of \a addr */
template<typename Scalar> inline void prefetch(const Scalar* addr)
{
#if !defined(_MSC_VER)
__builtin_prefetch(addr);
#if !EIGEN_COMP_MSVC
__builtin_prefetch(addr);
#endif
}
View File
@ -15,7 +15,7 @@
template<typename Derived> \
inline const Eigen::CwiseUnaryOp<Eigen::internal::FUNCTOR<typename Derived::Scalar>, const Derived> \
NAME(const Eigen::ArrayBase<Derived>& x) { \
return x.derived(); \
return Eigen::CwiseUnaryOp<Eigen::internal::FUNCTOR<typename Derived::Scalar>, const Derived>(x.derived()); \
}
#define EIGEN_ARRAY_DECLARE_GLOBAL_EIGEN_UNARY(NAME,FUNCTOR) \
@ -30,7 +30,7 @@
{ \
static inline typename NAME##_retval<ArrayBase<Derived> >::type run(const Eigen::ArrayBase<Derived>& x) \
{ \
return x.derived(); \
return typename NAME##_retval<ArrayBase<Derived> >::type(x.derived()); \
} \
};
129
Eigen/src/Core/Inverse.h Normal file
View File
@ -0,0 +1,129 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_INVERSE_H
#define EIGEN_INVERSE_H
namespace Eigen {
// TODO move the general declaration to Core, and rename this file DenseInverseImpl.h, or something like this...
template<typename XprType,typename StorageKind> class InverseImpl;
namespace internal {
template<typename XprType>
struct traits<Inverse<XprType> >
: traits<typename XprType::PlainObject>
{
typedef typename XprType::PlainObject PlainObject;
typedef traits<PlainObject> BaseTraits;
enum {
Flags = BaseTraits::Flags & RowMajorBit
};
};
} // end namespace internal
/** \class Inverse
*
* \brief Expression of the inverse of another expression
*
 * \tparam XprType the type of the expression of which we take the inverse
*
* This class represents an abstract expression of A.inverse()
* and most of the time this is the only way it is used.
*
*/
template<typename XprType>
class Inverse : public InverseImpl<XprType,typename internal::traits<XprType>::StorageKind>
{
public:
typedef typename XprType::Index Index;
typedef typename XprType::PlainObject PlainObject;
typedef typename internal::nested<XprType>::type XprTypeNested;
typedef typename internal::remove_all<XprTypeNested>::type XprTypeNestedCleaned;
explicit Inverse(const XprType &xpr)
: m_xpr(xpr)
{}
EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
EIGEN_DEVICE_FUNC const XprTypeNestedCleaned& nestedExpression() const { return m_xpr; }
protected:
XprTypeNested m_xpr;
};
/** \internal
* Specialization of the Inverse expression for dense expressions.
 * Direct access to the coefficients is discarded.
* FIXME this intermediate class is probably not needed anymore.
*/
template<typename XprType>
class InverseImpl<XprType,Dense>
: public MatrixBase<Inverse<XprType> >
{
typedef Inverse<XprType> Derived;
public:
typedef MatrixBase<Derived> Base;
EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
typedef typename internal::remove_all<XprType>::type NestedExpression;
private:
Scalar coeff(Index row, Index col) const;
Scalar coeff(Index i) const;
};
namespace internal {
/** \internal
* \brief Default evaluator for Inverse expression.
*
 * This default evaluator for Inverse expressions simply evaluates the inverse into a temporary
 * by a call to internal::call_assignment_no_alias.
 * Therefore, inverse implementers only have to specialize Assignment<Dst,Inverse<...>, ...> for
 * their own nested expression.
*
* \sa class Inverse
*/
template<typename ArgType>
struct unary_evaluator<Inverse<ArgType> >
: public evaluator<typename Inverse<ArgType>::PlainObject>::type
{
typedef Inverse<ArgType> InverseType;
typedef typename InverseType::PlainObject PlainObject;
typedef typename evaluator<PlainObject>::type Base;
typedef evaluator<InverseType> type;
typedef evaluator<InverseType> nestedType;
enum { Flags = Base::Flags | EvalBeforeNestingBit };
unary_evaluator(const InverseType& inv_xpr)
: m_result(inv_xpr.rows(), inv_xpr.cols())
{
::new (static_cast<Base*>(this)) Base(m_result);
internal::call_assignment_no_alias(m_result, inv_xpr);
}
protected:
PlainObject m_result;
};
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_INVERSE_H
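With this expression type, inverse() itself does no work; the evaluator above materializes the result only when the expression is assigned. Editorial sketch (assumes <Eigen/Dense>):

#include <Eigen/Dense>
int main() {
  Eigen::Matrix2d A; A << 2, 0, 0, 4;
  Eigen::Matrix2d Ainv = A.inverse(); // builds an Inverse<...> expression; evaluated on assignment
  return (Ainv(0,0) == 0.5 && Ainv(1,1) == 0.25) ? 0 : 1;
}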
View File
@ -79,22 +79,9 @@ struct traits<Map<PlainObjectType, MapOptions, StrideType> >
OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
? int(PlainObjectType::OuterStrideAtCompileTime)
: int(StrideType::OuterStrideAtCompileTime),
HasNoInnerStride = InnerStrideAtCompileTime == 1,
HasNoOuterStride = StrideType::OuterStrideAtCompileTime == 0,
HasNoStride = HasNoInnerStride && HasNoOuterStride,
IsAligned = bool(EIGEN_ALIGN) && ((int(MapOptions)&Aligned)==Aligned),
IsDynamicSize = PlainObjectType::SizeAtCompileTime==Dynamic,
KeepsPacketAccess = bool(HasNoInnerStride)
&& ( bool(IsDynamicSize)
|| HasNoOuterStride
|| ( OuterStrideAtCompileTime!=Dynamic
&& ((static_cast<int>(sizeof(Scalar))*OuterStrideAtCompileTime)%EIGEN_ALIGN_BYTES)==0 ) ),
Flags0 = TraitsBase::Flags & (~NestByRefBit),
Flags1 = IsAligned ? (int(Flags0) | AlignedBit) : (int(Flags0) & ~AlignedBit),
Flags2 = (bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime))
? int(Flags1) : int(Flags1 & ~LinearAccessBit),
Flags3 = is_lvalue<PlainObjectType>::value ? int(Flags2) : (int(Flags2) & ~LvalueBit),
Flags = KeepsPacketAccess ? int(Flags3) : (int(Flags3) & ~PacketAccessBit)
Flags = is_lvalue<PlainObjectType>::value ? int(Flags0) : (int(Flags0) & ~LvalueBit)
};
private:
enum { Options }; // Expressions don't have Options
@ -135,7 +122,7 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
* \param a_stride optional Stride object, passing the strides.
*/
EIGEN_DEVICE_FUNC
inline Map(PointerArgType dataPtr, const StrideType& a_stride = StrideType())
explicit inline Map(PointerArgType dataPtr, const StrideType& a_stride = StrideType())
: Base(cast_to_pointer_type(dataPtr)), m_stride(a_stride)
{
PlainObjectType::Base::_check_template_params();
View File
@ -12,7 +12,7 @@
#define EIGEN_MAPBASE_H
#define EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived) \
EIGEN_STATIC_ASSERT((int(internal::traits<Derived>::Flags) & LinearAccessBit) || Derived::IsVectorAtCompileTime, \
EIGEN_STATIC_ASSERT((int(internal::evaluator<Derived>::Flags) & LinearAccessBit) || Derived::IsVectorAtCompileTime, \
YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT)
namespace Eigen {
@ -85,7 +85,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
*
* \sa innerStride(), outerStride()
*/
inline const Scalar* data() const { return m_data; }
EIGEN_DEVICE_FUNC inline const Scalar* data() const { return m_data; }
EIGEN_DEVICE_FUNC
inline const Scalar& coeff(Index rowId, Index colId) const
@ -128,7 +128,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
}
EIGEN_DEVICE_FUNC
inline MapBase(PointerType dataPtr) : m_data(dataPtr), m_rows(RowsAtCompileTime), m_cols(ColsAtCompileTime)
explicit inline MapBase(PointerType dataPtr) : m_data(dataPtr), m_rows(RowsAtCompileTime), m_cols(ColsAtCompileTime)
{
EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
checkSanity();
@ -161,11 +161,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
EIGEN_DEVICE_FUNC
void checkSanity() const
{
EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(internal::traits<Derived>::Flags&PacketAccessBit,
internal::inner_stride_at_compile_time<Derived>::ret==1),
PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::Flags&AlignedBit, (size_t(m_data) % EIGEN_ALIGN_BYTES) == 0)
&& "data is not aligned");
eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::IsAligned, (size_t(m_data) % EIGEN_ALIGN_BYTES) == 0) && "data is not aligned");
}
PointerType m_data;
@ -176,6 +172,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
template<typename Derived> class MapBase<Derived, WriteAccessors>
: public MapBase<Derived, ReadOnlyAccessors>
{
typedef MapBase<Derived, ReadOnlyAccessors> ReadOnlyMapBase;
public:
typedef MapBase<Derived, ReadOnlyAccessors> Base;
@ -243,11 +240,13 @@ template<typename Derived> class MapBase<Derived, WriteAccessors>
EIGEN_DEVICE_FUNC
Derived& operator=(const MapBase& other)
{
Base::Base::operator=(other);
ReadOnlyMapBase::Base::operator=(other);
return derived();
}
using Base::Base::operator=;
// In theory we could simply refer to Base::Base::operator=, but MSVC does not like Base::Base,
// see bugs 821 and 920.
using ReadOnlyMapBase::Base::operator=;
};
#undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS
View File
@ -14,7 +14,7 @@ namespace Eigen {
// On WINCE, std::abs is defined for int only, so let's define our own overloads:
// This issue has been confirmed with MSVC 2008 only, but the issue might exist for more recent versions too.
#if defined(_WIN32_WCE) && defined(_MSC_VER) && _MSC_VER<=1500
#if EIGEN_OS_WINCE && EIGEN_COMP_MSVC && EIGEN_COMP_MSVC<=1500
long abs(long x) { return (labs(x)); }
double abs(double x) { return (fabs(x)); }
float abs(float x) { return (fabsf(x)); }
@ -360,50 +360,31 @@ inline NewType cast(const OldType& x)
}
/****************************************************************************
* Implementation of atanh2 *
* Implementation of log1p *
****************************************************************************/
template<typename Scalar>
struct atanh2_impl
struct log1p_impl
{
static inline Scalar run(const Scalar& x, const Scalar& r)
static inline Scalar run(const Scalar& x)
{
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
#if (__cplusplus >= 201103L) && !defined(__CYGWIN__)
// Let's be conservative and enable the default C++11 implementation only if we are sure it exists
#if (__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC) \
&& (EIGEN_ARCH_i386_OR_x86_64) && (EIGEN_OS_GNULINUX || EIGEN_OS_WIN_STRICT || EIGEN_OS_MAC)
using std::log1p;
return log1p(2 * x / (r - x)) / 2;
return log1p(x);
#else
using std::abs;
typedef typename NumTraits<Scalar>::Real RealScalar;
using std::log;
using std::sqrt;
Scalar z = x / r;
if (r == 0 || abs(z) > sqrt(NumTraits<Scalar>::epsilon()))
return log((r + x) / (r - x)) / 2;
else
return z + z*z*z / 3;
Scalar x1p = RealScalar(1) + x;
return ( x1p == Scalar(1) ) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) );
#endif
}
};
template<typename RealScalar>
struct atanh2_impl<std::complex<RealScalar> >
{
typedef std::complex<RealScalar> Scalar;
static inline Scalar run(const Scalar& x, const Scalar& r)
{
using std::log;
using std::norm;
using std::sqrt;
Scalar z = x / r;
if (r == Scalar(0) || norm(z) > NumTraits<RealScalar>::epsilon())
return RealScalar(0.5) * log((r + x) / (r - x));
else
return z + z*z*z / RealScalar(3);
}
};
template<typename Scalar>
struct atanh2_retval
struct log1p_retval
{
typedef Scalar type;
};
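The fallback branch relies on the identity log1p(x) = x * log(1+x) / ((1+x) - 1), which stays accurate even when 1+x rounds to exactly 1. A quick editorial check of the trick in isolation:

#include <cmath>
#include <cassert>
int main() {
  double x = 1e-17;                 // smaller than machine epsilon
  double naive = std::log(1.0 + x); // 1.0 + x rounds to 1.0, so this is exactly 0
  double x1p = 1.0 + x;
  double stable = (x1p == 1.0) ? x : x * (std::log(x1p) / (x1p - 1.0));
  assert(naive == 0.0 && stable == x);
  return 0;
}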
@ -591,6 +572,22 @@ inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random()
****************************************************************************/
namespace numext {
template<typename T>
EIGEN_DEVICE_FUNC
inline T mini(const T& x, const T& y)
{
EIGEN_USING_STD_MATH(min);
return min EIGEN_NOT_A_MACRO (x,y);
}
template<typename T>
EIGEN_DEVICE_FUNC
inline T maxi(const T& x, const T& y)
{
EIGEN_USING_STD_MATH(max);
return max EIGEN_NOT_A_MACRO (x,y);
}
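mini/maxi replace the bare (min)/(max) calls so the same code compiles on host and device; usage is straightforward (editorial sketch; note numext is an internal namespace, subject to change):

#include <Eigen/Dense>
int main() {
  double lo = Eigen::numext::mini(1.0, 2.0); // 1.0
  double hi = Eigen::numext::maxi(1.0, 2.0); // 2.0
  return (lo == 1.0 && hi == 2.0) ? 0 : 1;
}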
template<typename Scalar>
EIGEN_DEVICE_FUNC
@ -664,9 +661,9 @@ inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar) hypot(const Scalar& x, const Scalar&
template<typename Scalar>
EIGEN_DEVICE_FUNC
inline EIGEN_MATHFUNC_RETVAL(atanh2, Scalar) atanh2(const Scalar& x, const Scalar& y)
inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x)
{
return EIGEN_MATHFUNC_IMPL(atanh2, Scalar)::run(x, y);
return EIGEN_MATHFUNC_IMPL(log1p, Scalar)::run(x);
}
template<typename Scalar>
@ -694,6 +691,21 @@ bool (isfinite)(const std::complex<T>& x)
return isfinite(real(x)) && isfinite(imag(x));
}
// Log base 2 for 32-bit positive integers.
// Conveniently returns 0 for x==0.
inline int log2(int x)
{
eigen_assert(x>=0);
unsigned int v(x);
static const int table[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
v |= v >> 1;
v |= v >> 2;
v |= v >> 4;
v |= v >> 8;
v |= v >> 16;
return table[(v * 0x07C4ACDDU) >> 27];
}
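This is the well-known de Bruijn-style floor(log2) for 32-bit values: the shifts smear the highest set bit into every lower bit, and the multiply maps the resulting 2^k-1 pattern to a unique table slot. A standalone editorial replica:

#include <cassert>
static int floor_log2(unsigned int v) {
  static const int table[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
                                 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
  v |= v >> 1; v |= v >> 2; v |= v >> 4; v |= v >> 8; v |= v >> 16;
  return table[(v * 0x07C4ACDDU) >> 27];
}
int main() {
  assert(floor_log2(1) == 0 && floor_log2(8) == 3 && floor_log2(9) == 3 && floor_log2(0) == 0);
  return 0;
}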
} // end namespace numext
namespace internal {
View File
@ -115,7 +115,8 @@ struct traits<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
MaxRowsAtCompileTime = _MaxRows,
MaxColsAtCompileTime = _MaxCols,
Flags = compute_matrix_flags<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>::ret,
CoeffReadCost = NumTraits<Scalar>::ReadCost,
// FIXME, the following flag is only used to define NeedsToAlign in PlainObjectBase
EvaluatorFlags = compute_matrix_evaluator_flags<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>::ret,
Options = _Options,
InnerStrideAtCompileTime = 1,
OuterStrideAtCompileTime = (Options&RowMajor) ? ColsAtCompileTime : RowsAtCompileTime
@ -213,7 +214,7 @@ class Matrix
// FIXME is it still needed
EIGEN_DEVICE_FUNC
Matrix(internal::constructor_without_unaligned_array_assert)
explicit Matrix(internal::constructor_without_unaligned_array_assert)
: Base(internal::constructor_without_unaligned_array_assert())
{ Base::_check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED }
@ -359,15 +360,6 @@ class Matrix
*this = other;
}
/** \internal
* \brief Override MatrixBase::swap() since for dynamic-sized matrices
* of same type it is enough to swap the data pointers.
*/
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
void swap(MatrixBase<OtherDerived> const & other)
{ this->_swap(other.derived()); }
EIGEN_DEVICE_FUNC inline Index innerStride() const { return 1; }
EIGEN_DEVICE_FUNC inline Index outerStride() const { return this->innerSize(); }
View File
@ -66,8 +66,7 @@ template<typename Derived> class MatrixBase
using Base::MaxSizeAtCompileTime;
using Base::IsVectorAtCompileTime;
using Base::Flags;
using Base::CoeffReadCost;
using Base::derived;
using Base::const_cast_derived;
using Base::rows;
@ -181,26 +180,20 @@ template<typename Derived> class MatrixBase
#ifdef __CUDACC__
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
const typename LazyProductReturnType<Derived,OtherDerived>::Type
const Product<Derived,OtherDerived,LazyProduct>
operator*(const MatrixBase<OtherDerived> &other) const
{ return this->lazyProduct(other); }
#else
#ifdef EIGEN_TEST_EVALUATORS
template<typename OtherDerived>
const Product<Derived,OtherDerived>
operator*(const MatrixBase<OtherDerived> &other) const;
#else
template<typename OtherDerived>
const typename ProductReturnType<Derived,OtherDerived>::Type
operator*(const MatrixBase<OtherDerived> &other) const;
#endif
#endif
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
const typename LazyProductReturnType<Derived,OtherDerived>::Type
const Product<Derived,OtherDerived,LazyProduct>
lazyProduct(const MatrixBase<OtherDerived> &other) const;
template<typename OtherDerived>
@ -214,7 +207,7 @@ template<typename Derived> class MatrixBase
template<typename DiagonalDerived>
EIGEN_DEVICE_FUNC
const DiagonalProduct<Derived, DiagonalDerived, OnTheRight>
const Product<Derived, DiagonalDerived, LazyProduct>
operator*(const DiagonalBase<DiagonalDerived> &diagonal) const;
template<typename OtherDerived>
@ -251,19 +244,14 @@ template<typename Derived> class MatrixBase
template<int Index>
EIGEN_DEVICE_FUNC
typename ConstDiagonalIndexReturnType<Index>::Type diagonal() const;
// Note: The "MatrixBase::" prefixes are added to help MSVC9 to match these declarations with the later implementations.
// On the other hand they confuse MSVC8...
#if (defined _MSC_VER) && (_MSC_VER >= 1500) // 2008 or later
typename MatrixBase::template DiagonalIndexReturnType<DynamicIndex>::Type diagonal(Index index);
typename MatrixBase::template ConstDiagonalIndexReturnType<DynamicIndex>::Type diagonal(Index index) const;
#else
EIGEN_DEVICE_FUNC
typename DiagonalIndexReturnType<DynamicIndex>::Type diagonal(Index index);
typedef Diagonal<Derived,DynamicIndex> DiagonalDynamicIndexReturnType;
typedef typename internal::add_const<Diagonal<const Derived,DynamicIndex> >::type ConstDiagonalDynamicIndexReturnType;
EIGEN_DEVICE_FUNC
typename ConstDiagonalIndexReturnType<DynamicIndex>::Type diagonal(Index index) const;
#endif
DiagonalDynamicIndexReturnType diagonal(Index index);
EIGEN_DEVICE_FUNC
ConstDiagonalDynamicIndexReturnType diagonal(Index index) const;
template<unsigned int Mode> struct TriangularViewReturnType { typedef TriangularView<Derived, Mode> Type; };
template<unsigned int Mode> struct ConstTriangularViewReturnType { typedef const TriangularView<const Derived, Mode> Type; };
@ -334,10 +322,12 @@ template<typename Derived> class MatrixBase
NoAlias<Derived,Eigen::MatrixBase > noalias();
inline const ForceAlignedAccess<Derived> forceAlignedAccess() const;
inline ForceAlignedAccess<Derived> forceAlignedAccess();
template<bool Enable> inline typename internal::add_const_on_value_type<typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type>::type forceAlignedAccessIf() const;
template<bool Enable> inline typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type forceAlignedAccessIf();
// TODO forceAlignedAccess is temporarily disabled
// Need to find a nicer workaround.
inline const Derived& forceAlignedAccess() const { return derived(); }
inline Derived& forceAlignedAccess() { return derived(); }
template<bool Enable> inline const Derived& forceAlignedAccessIf() const { return derived(); }
template<bool Enable> inline Derived& forceAlignedAccessIf() { return derived(); }
Scalar trace() const;
@ -348,10 +338,10 @@ template<typename Derived> class MatrixBase
/** \returns an \link Eigen::ArrayBase Array \endlink expression of this matrix
* \sa ArrayBase::matrix() */
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ArrayWrapper<Derived> array() { return derived(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ArrayWrapper<Derived> array() { return ArrayWrapper<Derived>(derived()); }
/** \returns a const \link Eigen::ArrayBase Array \endlink expression of this matrix
* \sa ArrayBase::matrix() */
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const ArrayWrapper<const Derived> array() const { return derived(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const ArrayWrapper<const Derived> array() const { return ArrayWrapper<const Derived>(derived()); }
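The change above only makes the wrapper construction explicit; user-facing behaviour is unchanged (editorial sketch, assumes <Eigen/Dense>):

#include <Eigen/Dense>
int main() {
  Eigen::Vector3d v(1, 2, 3);
  Eigen::Array3d a = v.array() + 1.0; // coefficient-wise ops via ArrayWrapper
  Eigen::Vector3d w = a.matrix();     // back to linear-algebra semantics
  return w(0) == 2.0 ? 0 : 1;
}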
/////////// LU module ///////////
@ -361,7 +351,8 @@ template<typename Derived> class MatrixBase
const PartialPivLU<PlainObject> lu() const;
EIGEN_DEVICE_FUNC
const internal::inverse_impl<Derived> inverse() const;
const Inverse<Derived> inverse() const;
template<typename ResultType>
void computeInverseAndDetWithCheck(
ResultType& inverse,
@ -394,6 +385,7 @@ template<typename Derived> class MatrixBase
/////////// SVD module ///////////
JacobiSVD<PlainObject> jacobiSvd(unsigned int computationOptions = 0) const;
BDCSVD<PlainObject> bdcSvd(unsigned int computationOptions = 0) const;
/////////// Geometry module ///////////
View File
@ -40,29 +40,29 @@ template<typename ExpressionType> class NestByValue
typedef typename internal::dense_xpr_base<NestByValue>::type Base;
EIGEN_DENSE_PUBLIC_INTERFACE(NestByValue)
inline NestByValue(const ExpressionType& matrix) : m_expression(matrix) {}
EIGEN_DEVICE_FUNC explicit inline NestByValue(const ExpressionType& matrix) : m_expression(matrix) {}
inline Index rows() const { return m_expression.rows(); }
inline Index cols() const { return m_expression.cols(); }
inline Index outerStride() const { return m_expression.outerStride(); }
inline Index innerStride() const { return m_expression.innerStride(); }
EIGEN_DEVICE_FUNC inline Index rows() const { return m_expression.rows(); }
EIGEN_DEVICE_FUNC inline Index cols() const { return m_expression.cols(); }
EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_expression.outerStride(); }
EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_expression.innerStride(); }
inline const CoeffReturnType coeff(Index row, Index col) const
EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const
{
return m_expression.coeff(row, col);
}
inline Scalar& coeffRef(Index row, Index col)
EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col)
{
return m_expression.const_cast_derived().coeffRef(row, col);
}
inline const CoeffReturnType coeff(Index index) const
EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index index) const
{
return m_expression.coeff(index);
}
inline Scalar& coeffRef(Index index)
EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index)
{
return m_expression.const_cast_derived().coeffRef(index);
}
@ -91,7 +91,7 @@ template<typename ExpressionType> class NestByValue
m_expression.const_cast_derived().template writePacket<LoadMode>(index, x);
}
operator const ExpressionType&() const { return m_expression; }
EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; }
protected:
const ExpressionType m_expression;
View File
@ -30,68 +30,35 @@ namespace Eigen {
template<typename ExpressionType, template <typename> class StorageBase>
class NoAlias
{
typedef typename ExpressionType::Scalar Scalar;
public:
NoAlias(ExpressionType& expression) : m_expression(expression) {}
/** Behaves like MatrixBase::lazyAssign(other)
* \sa MatrixBase::lazyAssign() */
typedef typename ExpressionType::Scalar Scalar;
explicit NoAlias(ExpressionType& expression) : m_expression(expression) {}
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE ExpressionType& operator=(const StorageBase<OtherDerived>& other)
{ return internal::assign_selector<ExpressionType,OtherDerived,false>::run(m_expression,other.derived()); }
/** \sa MatrixBase::operator+= */
{
call_assignment_no_alias(m_expression, other.derived(), internal::assign_op<Scalar>());
return m_expression;
}
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE ExpressionType& operator+=(const StorageBase<OtherDerived>& other)
{
typedef SelfCwiseBinaryOp<internal::scalar_sum_op<Scalar>, ExpressionType, OtherDerived> SelfAdder;
SelfAdder tmp(m_expression);
typedef typename internal::nested<OtherDerived>::type OtherDerivedNested;
typedef typename internal::remove_all<OtherDerivedNested>::type _OtherDerivedNested;
internal::assign_selector<SelfAdder,_OtherDerivedNested,false>::run(tmp,OtherDerivedNested(other.derived()));
call_assignment_no_alias(m_expression, other.derived(), internal::add_assign_op<Scalar>());
return m_expression;
}
/** \sa MatrixBase::operator-= */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE ExpressionType& operator-=(const StorageBase<OtherDerived>& other)
{
typedef SelfCwiseBinaryOp<internal::scalar_difference_op<Scalar>, ExpressionType, OtherDerived> SelfAdder;
SelfAdder tmp(m_expression);
typedef typename internal::nested<OtherDerived>::type OtherDerivedNested;
typedef typename internal::remove_all<OtherDerivedNested>::type _OtherDerivedNested;
internal::assign_selector<SelfAdder,_OtherDerivedNested,false>::run(tmp,OtherDerivedNested(other.derived()));
call_assignment_no_alias(m_expression, other.derived(), internal::sub_assign_op<Scalar>());
return m_expression;
}
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename ProductDerived, typename Lhs, typename Rhs>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE ExpressionType& operator+=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
{ other.derived().addTo(m_expression); return m_expression; }
template<typename ProductDerived, typename Lhs, typename Rhs>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE ExpressionType& operator-=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
{ other.derived().subTo(m_expression); return m_expression; }
template<typename Lhs, typename Rhs, int NestingFlags>
EIGEN_STRONG_INLINE ExpressionType& operator+=(const CoeffBasedProduct<Lhs,Rhs,NestingFlags>& other)
{ return m_expression.derived() += CoeffBasedProduct<Lhs,Rhs,NestByRefBit>(other.lhs(), other.rhs()); }
template<typename Lhs, typename Rhs, int NestingFlags>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE ExpressionType& operator-=(const CoeffBasedProduct<Lhs,Rhs,NestingFlags>& other)
{ return m_expression.derived() -= CoeffBasedProduct<Lhs,Rhs,NestByRefBit>(other.lhs(), other.rhs()); }
template<typename OtherDerived>
ExpressionType& operator=(const ReturnByValue<OtherDerived>& func)
{ return m_expression = func; }
#endif
EIGEN_DEVICE_FUNC
ExpressionType& expression() const
{
@ -133,7 +100,7 @@ class NoAlias
template<typename Derived>
NoAlias<Derived,MatrixBase> MatrixBase<Derived>::noalias()
{
return derived();
return NoAlias<Derived, Eigen::MatrixBase >(derived());
}
} // end namespace Eigen
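After the rewrite above, all three operators funnel into call_assignment_no_alias with the matching functor; usage is unchanged (editorial sketch, assumes <Eigen/Dense>):

#include <Eigen/Dense>
int main() {
  Eigen::Matrix3d A = Eigen::Matrix3d::Random(), B = Eigen::Matrix3d::Random(), C;
  C.noalias() = A * B;  // assign_op: the product is written straight into C
  C.noalias() += A * B; // add_assign_op
  C.noalias() -= A * B; // sub_assign_op
  return 0;
}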
View File
@ -13,7 +13,8 @@
namespace Eigen {
template<int RowCol,typename IndicesType,typename MatrixType, typename StorageKind> class PermutedImpl;
// TODO: this does not seem to be needed at all:
// template<int RowCol,typename IndicesType,typename MatrixType, typename StorageKind> class PermutedImpl;
/** \class PermutationBase
* \ingroup Core_Module
@ -60,7 +61,6 @@ class PermutationBase : public EigenBase<Derived>
typedef typename Traits::IndicesType IndicesType;
enum {
Flags = Traits::Flags,
CoeffReadCost = Traits::CoeffReadCost,
RowsAtCompileTime = Traits::RowsAtCompileTime,
ColsAtCompileTime = Traits::ColsAtCompileTime,
MaxRowsAtCompileTime = Traits::MaxRowsAtCompileTime,
@ -73,6 +73,7 @@ class PermutationBase : public EigenBase<Derived>
typedef PermutationMatrix<IndicesType::SizeAtCompileTime,IndicesType::MaxSizeAtCompileTime,StorageIndexType>
PlainPermutationType;
using Base::derived;
typedef Transpose<PermutationBase> TransposeReturnType;
#endif
/** Copies the other permutation into *this */
@ -118,7 +119,7 @@ class PermutationBase : public EigenBase<Derived>
void evalTo(MatrixBase<DenseDerived>& other) const
{
other.setZero();
for (int i=0; i<rows();++i)
for (Index i=0; i<rows(); ++i)
other.coeffRef(indices().coeff(i),i) = typename DenseDerived::Scalar(1);
}
#endif
@ -163,10 +164,10 @@ class PermutationBase : public EigenBase<Derived>
*
* \returns a reference to *this.
*
* \warning This is much slower than applyTranspositionOnTheRight(int,int):
* \warning This is much slower than applyTranspositionOnTheRight(Index,Index):
* this has linear complexity and requires a lot of branching.
*
* \sa applyTranspositionOnTheRight(int,int)
* \sa applyTranspositionOnTheRight(Index,Index)
*/
Derived& applyTranspositionOnTheLeft(Index i, Index j)
{
@ -185,7 +186,7 @@ class PermutationBase : public EigenBase<Derived>
*
* This is a fast operation, it only consists in swapping two indices.
*
* \sa applyTranspositionOnTheLeft(int,int)
* \sa applyTranspositionOnTheLeft(Index,Index)
*/
Derived& applyTranspositionOnTheRight(Index i, Index j)
{
@ -198,14 +199,14 @@ class PermutationBase : public EigenBase<Derived>
*
* \note \note_try_to_help_rvo
*/
inline Transpose<PermutationBase> inverse() const
{ return derived(); }
inline TransposeReturnType inverse() const
{ return TransposeReturnType(derived()); }
/** \returns the transpose permutation matrix.
*
* \note \note_try_to_help_rvo
*/
inline Transpose<PermutationBase> transpose() const
{ return derived(); }
inline TransposeReturnType transpose() const
{ return TransposeReturnType(derived()); }
/**** multiplication helpers to hopefully get RVO ****/
@ -215,13 +216,13 @@ class PermutationBase : public EigenBase<Derived>
template<typename OtherDerived>
void assignTranspose(const PermutationBase<OtherDerived>& other)
{
for (int i=0; i<rows();++i) indices().coeffRef(other.indices().coeff(i)) = i;
for (Index i=0; i<rows();++i) indices().coeffRef(other.indices().coeff(i)) = i;
}
template<typename Lhs,typename Rhs>
void assignProduct(const Lhs& lhs, const Rhs& rhs)
{
eigen_assert(lhs.cols() == rhs.rows());
for (int i=0; i<rows();++i) indices().coeffRef(i) = lhs.indices().coeff(rhs.indices().coeff(i));
for (Index i=0; i<rows();++i) indices().coeffRef(i) = lhs.indices().coeff(rhs.indices().coeff(i));
}
#endif
@ -274,6 +275,7 @@ template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex
struct traits<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndexType> >
: traits<Matrix<_StorageIndexType,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
{
typedef PermutationStorage StorageKind;
typedef Matrix<_StorageIndexType, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
typedef typename IndicesType::Index Index;
typedef _StorageIndexType StorageIndexType;
@ -287,6 +289,8 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
typedef internal::traits<PermutationMatrix> Traits;
public:
typedef const PermutationMatrix& Nested;
#ifndef EIGEN_PARSED_BY_DOXYGEN
typedef typename Traits::IndicesType IndicesType;
typedef typename Traits::StorageIndexType StorageIndexType;
@ -298,8 +302,10 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
/** Constructs an uninitialized permutation matrix of given size.
*/
inline PermutationMatrix(Index size) : m_indices(size)
{}
explicit inline PermutationMatrix(Index size) : m_indices(size)
{
eigen_internal_assert(size <= NumTraits<StorageIndexType>::highest());
}
/** Copy constructor. */
template<typename OtherDerived>
@ -370,7 +376,10 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
PermutationMatrix(const Transpose<PermutationBase<Other> >& other)
: m_indices(other.nestedPermutation().size())
{
for (int i=0; i<m_indices.size();++i) m_indices.coeffRef(other.nestedPermutation().indices().coeff(i)) = i;
eigen_internal_assert(m_indices.size() <= NumTraits<StorageIndexType>::highest());
StorageIndexType end = StorageIndexType(m_indices.size());
for (StorageIndexType i=0; i<end;++i)
m_indices.coeffRef(other.nestedPermutation().indices().coeff(i)) = i;
}
template<typename Lhs,typename Rhs>
PermutationMatrix(internal::PermPermProduct_t, const Lhs& lhs, const Rhs& rhs)
@ -391,6 +400,7 @@ template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex
struct traits<Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndexType>,_PacketAccess> >
: traits<Matrix<_StorageIndexType,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
{
typedef PermutationStorage StorageKind;
typedef Map<const Matrix<_StorageIndexType, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1>, _PacketAccess> IndicesType;
typedef typename IndicesType::Index Index;
typedef _StorageIndexType StorageIndexType;
@ -462,8 +472,6 @@ class Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageInd
* \sa class PermutationBase, class PermutationMatrix
*/
struct PermutationStorage {};
template<typename _IndicesType> class TranspositionsWrapper;
namespace internal {
template<typename _IndicesType>
@ -477,10 +485,9 @@ struct traits<PermutationWrapper<_IndicesType> >
enum {
RowsAtCompileTime = _IndicesType::SizeAtCompileTime,
ColsAtCompileTime = _IndicesType::SizeAtCompileTime,
MaxRowsAtCompileTime = IndicesType::MaxRowsAtCompileTime,
MaxColsAtCompileTime = IndicesType::MaxColsAtCompileTime,
Flags = 0,
CoeffReadCost = _IndicesType::CoeffReadCost
MaxRowsAtCompileTime = IndicesType::MaxSizeAtCompileTime,
MaxColsAtCompileTime = IndicesType::MaxSizeAtCompileTime,
Flags = 0
};
};
}
@ -509,35 +516,39 @@ class PermutationWrapper : public PermutationBase<PermutationWrapper<_IndicesTyp
typename IndicesType::Nested m_indices;
};
// TODO: Do we need to define these operator* functions? Would it be better to have them inherited
// from MatrixBase?
/** \returns the matrix with the permutation applied to the columns.
*/
template<typename Derived, typename PermutationDerived>
inline const internal::permut_matrix_product_retval<PermutationDerived, Derived, OnTheRight>
operator*(const MatrixBase<Derived>& matrix,
const PermutationBase<PermutationDerived> &permutation)
template<typename MatrixDerived, typename PermutationDerived>
EIGEN_DEVICE_FUNC
const Product<MatrixDerived, PermutationDerived, DefaultProduct>
operator*(const MatrixBase<MatrixDerived> &matrix,
const PermutationBase<PermutationDerived>& permutation)
{
return internal::permut_matrix_product_retval
<PermutationDerived, Derived, OnTheRight>
(permutation.derived(), matrix.derived());
return Product<MatrixDerived, PermutationDerived, DefaultProduct>
(matrix.derived(), permutation.derived());
}
/** \returns the matrix with the permutation applied to the rows.
*/
template<typename Derived, typename PermutationDerived>
inline const internal::permut_matrix_product_retval
<PermutationDerived, Derived, OnTheLeft>
template<typename PermutationDerived, typename MatrixDerived>
EIGEN_DEVICE_FUNC
const Product<PermutationDerived, MatrixDerived, DefaultProduct>
operator*(const PermutationBase<PermutationDerived> &permutation,
const MatrixBase<Derived>& matrix)
const MatrixBase<MatrixDerived>& matrix)
{
return internal::permut_matrix_product_retval
<PermutationDerived, Derived, OnTheLeft>
(permutation.derived(), matrix.derived());
return Product<PermutationDerived, MatrixDerived, DefaultProduct>
(permutation.derived(), matrix.derived());
}
namespace internal {
template<typename PermutationType, typename MatrixType, int Side, bool Transposed>
struct traits<permut_matrix_product_retval<PermutationType, MatrixType, Side, Transposed> >
: traits<typename MatrixType::PlainObject>
{
typedef typename MatrixType::PlainObject ReturnType;
};
@ -590,7 +601,7 @@ struct permut_matrix_product_retval
}
else
{
for(int i = 0; i < n; ++i)
for(Index i = 0; i < n; ++i)
{
Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>
(dst, ((Side==OnTheLeft) ^ Transposed) ? m_permutation.indices().coeff(i) : i)
@ -617,6 +628,8 @@ struct traits<Transpose<PermutationBase<Derived> > >
} // end namespace internal
// TODO: the specificities should be handled by the evaluator,
// at the very least we should only specialize TransposeImpl
template<typename Derived>
class Transpose<PermutationBase<Derived> >
: public EigenBase<Transpose<PermutationBase<Derived> > >
@ -631,26 +644,26 @@ class Transpose<PermutationBase<Derived> >
typedef typename Derived::DenseMatrixType DenseMatrixType;
enum {
Flags = Traits::Flags,
CoeffReadCost = Traits::CoeffReadCost,
RowsAtCompileTime = Traits::RowsAtCompileTime,
ColsAtCompileTime = Traits::ColsAtCompileTime,
MaxRowsAtCompileTime = Traits::MaxRowsAtCompileTime,
MaxColsAtCompileTime = Traits::MaxColsAtCompileTime
};
typedef typename Traits::Scalar Scalar;
typedef typename Traits::Index Index;
#endif
Transpose(const PermutationType& p) : m_permutation(p) {}
inline int rows() const { return m_permutation.rows(); }
inline int cols() const { return m_permutation.cols(); }
inline Index rows() const { return m_permutation.rows(); }
inline Index cols() const { return m_permutation.cols(); }
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename DenseDerived>
void evalTo(MatrixBase<DenseDerived>& other) const
{
other.setZero();
for (int i=0; i<rows();++i)
for (Index i=0; i<rows();++i)
other.coeffRef(i, m_permutation.indices().coeff(i)) = typename DenseDerived::Scalar(1);
}
#endif
@ -663,19 +676,19 @@ class Transpose<PermutationBase<Derived> >
/** \returns the matrix with the inverse permutation applied to the columns.
*/
template<typename OtherDerived> friend
inline const internal::permut_matrix_product_retval<PermutationType, OtherDerived, OnTheRight, true>
const Product<OtherDerived, Transpose, DefaultProduct>
operator*(const MatrixBase<OtherDerived>& matrix, const Transpose& trPerm)
{
return internal::permut_matrix_product_retval<PermutationType, OtherDerived, OnTheRight, true>(trPerm.m_permutation, matrix.derived());
return Product<OtherDerived, Transpose, DefaultProduct>(matrix.derived(), trPerm.derived());
}
/** \returns the matrix with the inverse permutation applied to the rows.
*/
template<typename OtherDerived>
inline const internal::permut_matrix_product_retval<PermutationType, OtherDerived, OnTheLeft, true>
const Product<Transpose, OtherDerived, DefaultProduct>
operator*(const MatrixBase<OtherDerived>& matrix) const
{
return internal::permut_matrix_product_retval<PermutationType, OtherDerived, OnTheLeft, true>(m_permutation, matrix.derived());
return Product<Transpose, OtherDerived, DefaultProduct>(*this, matrix.derived());
}
const PermutationType& nestedPermutation() const { return m_permutation; }
@ -690,6 +703,38 @@ const PermutationWrapper<const Derived> MatrixBase<Derived>::asPermutation() con
return derived();
}
namespace internal {
// TODO currently a permutation matrix expression has the form PermutationMatrix or PermutationWrapper
// or their transpose; in the future shape should be defined by the expression traits
template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType>
struct evaluator_traits<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType> >
{
typedef typename storage_kind_to_evaluator_kind<Dense>::Kind Kind;
typedef PermutationShape Shape;
static const int AssumeAliasing = 0;
};
template<typename IndicesType>
struct evaluator_traits<PermutationWrapper<IndicesType> >
{
typedef typename storage_kind_to_evaluator_kind<Dense>::Kind Kind;
typedef PermutationShape Shape;
static const int AssumeAliasing = 0;
};
template<typename Derived>
struct evaluator_traits<Transpose<PermutationBase<Derived> > >
{
typedef typename storage_kind_to_evaluator_kind<Dense>::Kind Kind;
typedef PermutationShape Shape;
static const int AssumeAliasing = 0;
};
template<> struct AssignmentKind<DenseShape,PermutationShape> { typedef EigenBase2EigenBase Kind; };
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_PERMUTATIONMATRIX_H

View File

@ -128,7 +128,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
DenseStorage<Scalar, Base::MaxSizeAtCompileTime, Base::RowsAtCompileTime, Base::ColsAtCompileTime, Options> m_storage;
public:
enum { NeedsToAlign = SizeAtCompileTime != Dynamic && (internal::traits<Derived>::Flags & AlignedBit) != 0 };
enum { NeedsToAlign = SizeAtCompileTime != Dynamic && (internal::traits<Derived>::EvaluatorFlags & AlignedBit) != 0 };
EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)
EIGEN_DEVICE_FUNC
@ -221,11 +221,11 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
}
/** \returns a const pointer to the data array of this matrix */
EIGEN_STRONG_INLINE const Scalar *data() const
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const
{ return m_storage.data(); }
/** \returns a pointer to the data array of this matrix */
EIGEN_STRONG_INLINE Scalar *data()
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data()
{ return m_storage.data(); }
/** Resizes \c *this to a \a rows x \a cols matrix.
@ -457,7 +457,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
// FIXME is it still needed ?
/** \internal */
EIGEN_DEVICE_FUNC
PlainObjectBase(internal::constructor_without_unaligned_array_assert)
explicit PlainObjectBase(internal::constructor_without_unaligned_array_assert)
: m_storage(internal::constructor_without_unaligned_array_assert())
{
// _check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
@ -639,22 +639,16 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
*
* \internal
*/
// aliasing is dealt with once in internal::call_assignment
// so at this stage we have to assume aliasing... and resizing has to be done later.
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Derived& _set(const DenseBase<OtherDerived>& other)
{
_set_selector(other.derived(), typename internal::conditional<static_cast<bool>(int(OtherDerived::Flags) & EvalBeforeAssigningBit), internal::true_type, internal::false_type>::type());
internal::call_assignment(this->derived(), other.derived());
return this->derived();
}
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _set_selector(const OtherDerived& other, const internal::true_type&) { _set_noalias(other.eval()); }
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _set_selector(const OtherDerived& other, const internal::false_type&) { _set_noalias(other); }
/** \internal Like _set() but additionally makes the assumption that no aliasing effect can happen (which
* is the case when creating a new matrix) so one can enforce lazy evaluation.
*
@ -669,7 +663,8 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
//_resize_to_match(other);
// the 'false' below means to enforce lazy evaluation. We don't use lazyAssign() because
// it wouldn't allow copying a row-vector into a column-vector.
return internal::assign_selector<Derived,OtherDerived,false>::run(this->derived(), other.derived());
internal::call_assignment_no_alias(this->derived(), other.derived(), internal::assign_op<Scalar>());
return this->derived();
}
template<typename T0, typename T1>
@ -704,9 +699,12 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
m_storage.data()[1] = Scalar(val1);
}
// If the argument is convertible to the Index type and we have either a non-1x1 Matrix or a dynamic-sized Array,
// then the argument is meant to be the size of the object.
template<typename T>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init1(Index size, typename internal::enable_if<Base::SizeAtCompileTime!=1 || !internal::is_convertible<T, Scalar>::value,T>::type* = 0)
EIGEN_STRONG_INLINE void _init1(Index size, typename internal::enable_if< (Base::SizeAtCompileTime!=1 || !internal::is_convertible<T, Scalar>::value)
&& ((!internal::is_same<typename internal::traits<Derived>::XprKind,ArrayXpr>::value || Base::SizeAtCompileTime==Dynamic)),T>::type* = 0)
{
// NOTE MSVC 2008 complains if we directly put bool(NumTraits<T>::IsInteger) as the EIGEN_STATIC_ASSERT argument.
const bool is_integer = NumTraits<T>::IsInteger;
@ -714,6 +712,8 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
resize(size);
}
// We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where the scalar type can be implicitly converted)
template<typename T>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init1(const Scalar& val0, typename internal::enable_if<Base::SizeAtCompileTime==1 && internal::is_convertible<T, Scalar>::value,T>::type* = 0)
@ -722,6 +722,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
m_storage.data()[0] = val0;
}
// We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where the scalar type matches the index type)
template<typename T>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init1(const Index& val0,
@ -734,18 +735,21 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
m_storage.data()[0] = Scalar(val0);
}
// Initialize a fixed size matrix from a pointer to raw data
template<typename T>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init1(const Scalar* data){
this->_set_noalias(ConstMapType(data));
}
// Initialize an arbitrary matrix from a dense expression
template<typename T, typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init1(const DenseBase<OtherDerived>& other){
this->_set_noalias(other);
}
// Initialize an arbitrary matrix from a generic Eigen expression
template<typename T, typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init1(const EigenBase<OtherDerived>& other){
@ -766,23 +770,58 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
{
this->derived() = r;
}
// For fixed-size arrays:
template<typename T>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init1(const Scalar& val0,
typename internal::enable_if< Base::SizeAtCompileTime!=Dynamic
&& Base::SizeAtCompileTime!=1
&& internal::is_convertible<T, Scalar>::value
&& internal::is_same<typename internal::traits<Derived>::XprKind,ArrayXpr>::value,T>::type* = 0)
{
Base::setConstant(val0);
}
template<typename T>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init1(const Index& val0,
typename internal::enable_if< (!internal::is_same<Index,Scalar>::value)
&& (internal::is_same<Index,T>::value)
&& Base::SizeAtCompileTime!=Dynamic
&& Base::SizeAtCompileTime!=1
&& internal::is_convertible<T, Scalar>::value
&& internal::is_same<typename internal::traits<Derived>::XprKind,ArrayXpr>::value,T*>::type* = 0)
{
Base::setConstant(val0);
}
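A hedged sketch of the single-argument constructor semantics these _init1 overloads dispatch between (the fixed-size-array fill relies on the new overloads just above):
#include <Eigen/Dense>
using namespace Eigen;
int main() {
  VectorXd v(5);                // dynamic object: the argument is a size
  Matrix<double, 1, 1> s(2.5);  // 1x1 object: the argument is the coefficient value
  Array3f a(1.0f);              // fixed-size array: all coefficients set to 1 (per the setConstant overload above)
}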
template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers>
friend struct internal::matrix_swap_impl;
/** \internal generic implementation of swap for dense storage since for dynamic-sized matrices of same type it is enough to swap the
* data pointers.
public:
#ifndef EIGEN_PARSED_BY_DOXYGEN
/** \internal
* \brief Override DenseBase::swap() since for dynamic-sized matrices
* of same type it is enough to swap the data pointers.
*/
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
void _swap(DenseBase<OtherDerived> const & other)
void swap(DenseBase<OtherDerived> & other)
{
enum { SwapPointers = internal::is_same<Derived, OtherDerived>::value && Base::SizeAtCompileTime==Dynamic };
internal::matrix_swap_impl<Derived, OtherDerived, bool(SwapPointers)>::run(this->derived(), other.const_cast_derived());
internal::matrix_swap_impl<Derived, OtherDerived, bool(SwapPointers)>::run(this->derived(), other.derived());
}
public:
#ifndef EIGEN_PARSED_BY_DOXYGEN
/** \internal
* \brief const version forwarded to DenseBase::swap
*/
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
void swap(DenseBase<OtherDerived> const & other)
{ Base::swap(other.derived()); }
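A sketch of what the pointer-swap specialization buys: for two dynamic-sized matrices of the same type, swap() exchanges the data pointers instead of copying coefficients:
#include <Eigen/Dense>
using namespace Eigen;
int main() {
  MatrixXd a = MatrixXd::Random(1000, 1000);
  MatrixXd b = MatrixXd::Random(10, 10);
  a.swap(b);  // same type, dynamic size: O(1) exchange of storage, no element copies
}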
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void _check_template_params()
{
@ -797,10 +836,9 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
&& (Options & (DontAlign|RowMajor)) == Options),
INVALID_MATRIX_TEMPLATE_PARAMETERS)
}
#endif
private:
enum { ThisConstantIsPrivateInPlainObjectBase };
enum { IsPlainObjectBase = 1 };
#endif
};
namespace internal {

View File

@ -12,8 +12,7 @@
namespace Eigen {
template<typename Lhs, typename Rhs> class Product;
template<typename Lhs, typename Rhs, typename StorageKind> class ProductImpl;
template<typename Lhs, typename Rhs, int Option, typename StorageKind> class ProductImpl;
/** \class Product
* \ingroup Core_Module
@ -24,53 +23,108 @@ template<typename Lhs, typename Rhs, typename StorageKind> class ProductImpl;
* \param Rhs the type of the right-hand side expression
*
* This class represents an expression of the product of two arbitrary matrices.
*
* The other template parameters are:
* \tparam Option can be DefaultProduct or LazyProduct
*
*/
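A sketch contrasting the two Option values, assuming the public operator* (which builds a DefaultProduct) and MatrixBase::lazyProduct (which builds a LazyProduct):
#include <Eigen/Dense>
using namespace Eigen;
int main() {
  Matrix3d a = Matrix3d::Random(), b = Matrix3d::Random();
  Matrix3d c = a * b;             // Product<..., DefaultProduct>: handled by the product evaluator
  Matrix3d d = a.lazyProduct(b);  // Product<..., LazyProduct>: coefficient-based evaluation
}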
// Use ProductReturnType to get correct traits, in particular vectorization flags
namespace internal {
template<typename Lhs, typename Rhs>
struct traits<Product<Lhs, Rhs> >
: traits<typename ProductReturnType<Lhs, Rhs>::Type>
{
// We want A+B*C to be of type Product<Matrix, Sum> and not Product<Matrix, Matrix>
// TODO: This flag should eventually go in a separate evaluator traits class
// Determine the scalar of Product<Lhs, Rhs>. This is normally the same as Lhs::Scalar times
// Rhs::Scalar, but product with permutation matrices inherit the scalar of the other factor.
template<typename Lhs, typename Rhs, typename LhsShape = typename evaluator_traits<Lhs>::Shape,
typename RhsShape = typename evaluator_traits<Rhs>::Shape >
struct product_result_scalar
{
typedef typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
};
template<typename Lhs, typename Rhs, typename RhsShape>
struct product_result_scalar<Lhs, Rhs, PermutationShape, RhsShape>
{
typedef typename Rhs::Scalar Scalar;
};
template<typename Lhs, typename Rhs, typename LhsShape>
struct product_result_scalar<Lhs, Rhs, LhsShape, PermutationShape>
{
typedef typename Lhs::Scalar Scalar;
};
template<typename Lhs, typename Rhs, int Option>
struct traits<Product<Lhs, Rhs, Option> >
{
typedef typename remove_all<Lhs>::type LhsCleaned;
typedef typename remove_all<Rhs>::type RhsCleaned;
typedef traits<LhsCleaned> LhsTraits;
typedef traits<RhsCleaned> RhsTraits;
typedef MatrixXpr XprKind;
typedef typename product_result_scalar<LhsCleaned,RhsCleaned>::Scalar Scalar;
typedef typename product_promote_storage_type<typename LhsTraits::StorageKind,
typename RhsTraits::StorageKind,
internal::product_type<Lhs,Rhs>::ret>::ret StorageKind;
typedef typename promote_index_type<typename LhsTraits::Index,
typename RhsTraits::Index>::type Index;
enum {
Flags = traits<typename ProductReturnType<Lhs, Rhs>::Type>::Flags & ~(EvalBeforeNestingBit | DirectAccessBit)
RowsAtCompileTime = LhsTraits::RowsAtCompileTime,
ColsAtCompileTime = RhsTraits::ColsAtCompileTime,
MaxRowsAtCompileTime = LhsTraits::MaxRowsAtCompileTime,
MaxColsAtCompileTime = RhsTraits::MaxColsAtCompileTime,
// FIXME: only needed by GeneralMatrixMatrixTriangular
InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(LhsTraits::ColsAtCompileTime, RhsTraits::RowsAtCompileTime),
// The storage order is somewhat arbitrary here. The correct one will be determined through the evaluator.
Flags = ( (MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1)
|| ((LhsTraits::Flags&NoPreferredStorageOrderBit) && (RhsTraits::Flags&RowMajorBit))
|| ((RhsTraits::Flags&NoPreferredStorageOrderBit) && (LhsTraits::Flags&RowMajorBit)) )
? RowMajorBit : (MaxColsAtCompileTime==1 ? 0 : NoPreferredStorageOrderBit)
};
};
} // end namespace internal
template<typename Lhs, typename Rhs>
class Product : public ProductImpl<Lhs,Rhs,typename internal::promote_storage_type<typename internal::traits<Lhs>::StorageKind,
typename internal::traits<Rhs>::StorageKind>::ret>
template<typename _Lhs, typename _Rhs, int Option>
class Product : public ProductImpl<_Lhs,_Rhs,Option,
typename internal::product_promote_storage_type<typename internal::traits<_Lhs>::StorageKind,
typename internal::traits<_Rhs>::StorageKind,
internal::product_type<_Lhs,_Rhs>::ret>::ret>
{
public:
typedef _Lhs Lhs;
typedef _Rhs Rhs;
typedef typename ProductImpl<
Lhs, Rhs,
typename internal::promote_storage_type<typename Lhs::StorageKind,
typename Rhs::StorageKind>::ret>::Base Base;
Lhs, Rhs, Option,
typename internal::product_promote_storage_type<typename internal::traits<Lhs>::StorageKind,
typename internal::traits<Rhs>::StorageKind,
internal::product_type<Lhs,Rhs>::ret>::ret>::Base Base;
EIGEN_GENERIC_PUBLIC_INTERFACE(Product)
typedef typename Lhs::Nested LhsNested;
typedef typename Rhs::Nested RhsNested;
typedef typename internal::nested<Lhs>::type LhsNested;
typedef typename internal::nested<Rhs>::type RhsNested;
typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;
Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs)
EIGEN_DEVICE_FUNC Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs)
{
eigen_assert(lhs.cols() == rhs.rows()
&& "invalid matrix product"
&& "if you wanted a coeff-wise or a dot product use the respective explicit functions");
}
inline Index rows() const { return m_lhs.rows(); }
inline Index cols() const { return m_rhs.cols(); }
EIGEN_DEVICE_FUNC inline Index rows() const { return m_lhs.rows(); }
EIGEN_DEVICE_FUNC inline Index cols() const { return m_rhs.cols(); }
const LhsNestedCleaned& lhs() const { return m_lhs; }
const RhsNestedCleaned& rhs() const { return m_rhs; }
EIGEN_DEVICE_FUNC const LhsNestedCleaned& lhs() const { return m_lhs; }
EIGEN_DEVICE_FUNC const RhsNestedCleaned& rhs() const { return m_rhs; }
protected:
@ -78,14 +132,77 @@ class Product : public ProductImpl<Lhs,Rhs,typename internal::promote_storage_ty
RhsNested m_rhs;
};
template<typename Lhs, typename Rhs>
class ProductImpl<Lhs,Rhs,Dense> : public internal::dense_xpr_base<Product<Lhs,Rhs> >::type
{
typedef Product<Lhs, Rhs> Derived;
public:
namespace internal {
template<typename Lhs, typename Rhs, int Option, int ProductTag = internal::product_type<Lhs,Rhs>::ret>
class dense_product_base
: public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type
{};
typedef typename internal::dense_xpr_base<Product<Lhs, Rhs> >::type Base;
/** Conversion to scalar for inner-products */
template<typename Lhs, typename Rhs, int Option>
class dense_product_base<Lhs, Rhs, Option, InnerProduct>
: public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type
{
typedef Product<Lhs,Rhs,Option> ProductXpr;
typedef typename internal::dense_xpr_base<ProductXpr>::type Base;
public:
using Base::derived;
typedef typename Base::Scalar Scalar;
typedef typename Base::Index Index;
operator const Scalar() const
{
return typename internal::evaluator<ProductXpr>::type(derived()).coeff(0,0);
}
};
} // namespace internal
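A sketch of the implicit scalar conversion this specialization enables for 1x1 (inner) products:
#include <Eigen/Dense>
using namespace Eigen;
int main() {
  Vector3d u = Vector3d::Random(), v = Vector3d::Random();
  double d = u.transpose() * v;  // 1x1 product converts to its scalar value
}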
// Generic API dispatcher
template<typename Lhs, typename Rhs, int Option, typename StorageKind>
class ProductImpl : public internal::generic_xpr_base<Product<Lhs,Rhs,Option>, MatrixXpr, StorageKind>::type
{
public:
typedef typename internal::generic_xpr_base<Product<Lhs,Rhs,Option>, MatrixXpr, StorageKind>::type Base;
};
template<typename Lhs, typename Rhs, int Option>
class ProductImpl<Lhs,Rhs,Option,Dense>
: public internal::dense_product_base<Lhs,Rhs,Option>
{
typedef Product<Lhs, Rhs, Option> Derived;
public:
typedef typename internal::dense_product_base<Lhs, Rhs, Option> Base;
EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
protected:
enum {
IsOneByOne = (RowsAtCompileTime == 1 || RowsAtCompileTime == Dynamic) &&
(ColsAtCompileTime == 1 || ColsAtCompileTime == Dynamic),
EnableCoeff = IsOneByOne || Option==LazyProduct
};
public:
EIGEN_DEVICE_FUNC Scalar coeff(Index row, Index col) const
{
EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
return typename internal::evaluator<Derived>::type(derived()).coeff(row,col);
}
EIGEN_DEVICE_FUNC Scalar coeff(Index i) const
{
EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
return typename internal::evaluator<Derived>::type(derived()).coeff(i);
}
};
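A sketch of the coeff() restriction above: coefficient access is fine on a lazy product, while a DefaultProduct would hit the static assertion:
#include <Eigen/Dense>
using namespace Eigen;
int main() {
  Matrix3d a = Matrix3d::Random(), b = Matrix3d::Random();
  double x = a.lazyProduct(b).coeff(0, 0);  // allowed: Option == LazyProduct
  // double y = (a * b).coeff(0, 0);        // would trigger THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS
}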
/***************************************************************************
@ -102,6 +219,15 @@ prod(const Lhs& lhs, const Rhs& rhs)
return Product<Lhs,Rhs>(lhs,rhs);
}
/** \internal used to test the evaluator only
*/
template<typename Lhs,typename Rhs>
const Product<Lhs,Rhs,LazyProduct>
lazyprod(const Lhs& lhs, const Rhs& rhs)
{
return Product<Lhs,Rhs,LazyProduct>(lhs,rhs);
}
} // end namespace Eigen
#endif // EIGEN_PRODUCT_H

View File

@ -12,253 +12,6 @@
namespace Eigen {
/** \class ProductBase
* \ingroup Core_Module
*
*/
namespace internal {
template<typename Derived, typename _Lhs, typename _Rhs>
struct traits<ProductBase<Derived,_Lhs,_Rhs> >
{
typedef MatrixXpr XprKind;
typedef typename remove_all<_Lhs>::type Lhs;
typedef typename remove_all<_Rhs>::type Rhs;
typedef typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
typedef typename promote_storage_type<typename traits<Lhs>::StorageKind,
typename traits<Rhs>::StorageKind>::ret StorageKind;
typedef typename promote_index_type<typename traits<Lhs>::Index,
typename traits<Rhs>::Index>::type Index;
enum {
RowsAtCompileTime = traits<Lhs>::RowsAtCompileTime,
ColsAtCompileTime = traits<Rhs>::ColsAtCompileTime,
MaxRowsAtCompileTime = traits<Lhs>::MaxRowsAtCompileTime,
MaxColsAtCompileTime = traits<Rhs>::MaxColsAtCompileTime,
Flags = (MaxRowsAtCompileTime==1 ? RowMajorBit : 0)
| EvalBeforeNestingBit | EvalBeforeAssigningBit | NestByRefBit,
// Note that EvalBeforeNestingBit and NestByRefBit
// are not used in practice because nested is overloaded for products
CoeffReadCost = 0 // FIXME why is it needed ?
};
};
}
#define EIGEN_PRODUCT_PUBLIC_INTERFACE(Derived) \
typedef ProductBase<Derived, Lhs, Rhs > Base; \
EIGEN_DENSE_PUBLIC_INTERFACE(Derived) \
typedef typename Base::LhsNested LhsNested; \
typedef typename Base::_LhsNested _LhsNested; \
typedef typename Base::LhsBlasTraits LhsBlasTraits; \
typedef typename Base::ActualLhsType ActualLhsType; \
typedef typename Base::_ActualLhsType _ActualLhsType; \
typedef typename Base::RhsNested RhsNested; \
typedef typename Base::_RhsNested _RhsNested; \
typedef typename Base::RhsBlasTraits RhsBlasTraits; \
typedef typename Base::ActualRhsType ActualRhsType; \
typedef typename Base::_ActualRhsType _ActualRhsType; \
using Base::m_lhs; \
using Base::m_rhs;
template<typename Derived, typename Lhs, typename Rhs>
class ProductBase : public MatrixBase<Derived>
{
public:
typedef MatrixBase<Derived> Base;
EIGEN_DENSE_PUBLIC_INTERFACE(ProductBase)
typedef typename Lhs::Nested LhsNested;
typedef typename internal::remove_all<LhsNested>::type _LhsNested;
typedef internal::blas_traits<_LhsNested> LhsBlasTraits;
typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
typedef typename internal::remove_all<ActualLhsType>::type _ActualLhsType;
typedef typename internal::traits<Lhs>::Scalar LhsScalar;
typedef typename Rhs::Nested RhsNested;
typedef typename internal::remove_all<RhsNested>::type _RhsNested;
typedef internal::blas_traits<_RhsNested> RhsBlasTraits;
typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
typedef typename internal::remove_all<ActualRhsType>::type _ActualRhsType;
typedef typename internal::traits<Rhs>::Scalar RhsScalar;
// Diagonal of a product: no need to evaluate the arguments because they are going to be evaluated only once
typedef CoeffBasedProduct<LhsNested, RhsNested, 0> FullyLazyCoeffBaseProductType;
public:
typedef typename Base::PlainObject PlainObject;
ProductBase(const Lhs& a_lhs, const Rhs& a_rhs)
: m_lhs(a_lhs), m_rhs(a_rhs)
{
eigen_assert(a_lhs.cols() == a_rhs.rows()
&& "invalid matrix product"
&& "if you wanted a coeff-wise or a dot product use the respective explicit functions");
}
inline Index rows() const { return m_lhs.rows(); }
inline Index cols() const { return m_rhs.cols(); }
template<typename Dest>
inline void evalTo(Dest& dst) const { dst.setZero(); scaleAndAddTo(dst,Scalar(1)); }
template<typename Dest>
inline void addTo(Dest& dst) const { scaleAndAddTo(dst,Scalar(1)); }
template<typename Dest>
inline void subTo(Dest& dst) const { scaleAndAddTo(dst,Scalar(-1)); }
template<typename Dest>
inline void scaleAndAddTo(Dest& dst, const Scalar& alpha) const { derived().scaleAndAddTo(dst,alpha); }
const _LhsNested& lhs() const { return m_lhs; }
const _RhsNested& rhs() const { return m_rhs; }
// Implicit conversion to the nested type (trigger the evaluation of the product)
operator const PlainObject& () const
{
m_result.resize(m_lhs.rows(), m_rhs.cols());
derived().evalTo(m_result);
return m_result;
}
const Diagonal<const FullyLazyCoeffBaseProductType,0> diagonal() const
{ return FullyLazyCoeffBaseProductType(m_lhs, m_rhs); }
template<int Index>
const Diagonal<FullyLazyCoeffBaseProductType,Index> diagonal() const
{ return FullyLazyCoeffBaseProductType(m_lhs, m_rhs); }
const Diagonal<FullyLazyCoeffBaseProductType,Dynamic> diagonal(Index index) const
{ return FullyLazyCoeffBaseProductType(m_lhs, m_rhs).diagonal(index); }
// restrict coeff accessors to 1x1 expressions. No need to care about mutators here since this isn't an Lvalue expression
typename Base::CoeffReturnType coeff(Index row, Index col) const
{
EIGEN_STATIC_ASSERT_SIZE_1x1(Derived)
eigen_assert(this->rows() == 1 && this->cols() == 1);
Matrix<Scalar,1,1> result = *this;
return result.coeff(row,col);
}
typename Base::CoeffReturnType coeff(Index i) const
{
EIGEN_STATIC_ASSERT_SIZE_1x1(Derived)
eigen_assert(this->rows() == 1 && this->cols() == 1);
Matrix<Scalar,1,1> result = *this;
return result.coeff(i);
}
const Scalar& coeffRef(Index row, Index col) const
{
EIGEN_STATIC_ASSERT_SIZE_1x1(Derived)
eigen_assert(this->rows() == 1 && this->cols() == 1);
return derived().coeffRef(row,col);
}
const Scalar& coeffRef(Index i) const
{
EIGEN_STATIC_ASSERT_SIZE_1x1(Derived)
eigen_assert(this->rows() == 1 && this->cols() == 1);
return derived().coeffRef(i);
}
protected:
LhsNested m_lhs;
RhsNested m_rhs;
mutable PlainObject m_result;
};
// here we need to overload the nested rule for products
// such that the nested type is a const reference to a plain matrix
namespace internal {
template<typename Lhs, typename Rhs, int Mode, int N, typename PlainObject>
struct nested<GeneralProduct<Lhs,Rhs,Mode>, N, PlainObject>
{
typedef PlainObject const& type;
};
}
template<typename NestedProduct>
class ScaledProduct;
// Note that these two operator* functions are not defined as member
// functions of ProductBase, because, otherwise we would have to
// define all overloads defined in MatrixBase. Furthermore, Using
// "using Base::operator*" would not work with MSVC.
//
// Also note that here we accept any compatible scalar types
template<typename Derived,typename Lhs,typename Rhs>
const ScaledProduct<Derived>
operator*(const ProductBase<Derived,Lhs,Rhs>& prod, const typename Derived::Scalar& x)
{ return ScaledProduct<Derived>(prod.derived(), x); }
template<typename Derived,typename Lhs,typename Rhs>
typename internal::enable_if<!internal::is_same<typename Derived::Scalar,typename Derived::RealScalar>::value,
const ScaledProduct<Derived> >::type
operator*(const ProductBase<Derived,Lhs,Rhs>& prod, const typename Derived::RealScalar& x)
{ return ScaledProduct<Derived>(prod.derived(), x); }
template<typename Derived,typename Lhs,typename Rhs>
const ScaledProduct<Derived>
operator*(const typename Derived::Scalar& x,const ProductBase<Derived,Lhs,Rhs>& prod)
{ return ScaledProduct<Derived>(prod.derived(), x); }
template<typename Derived,typename Lhs,typename Rhs>
typename internal::enable_if<!internal::is_same<typename Derived::Scalar,typename Derived::RealScalar>::value,
const ScaledProduct<Derived> >::type
operator*(const typename Derived::RealScalar& x,const ProductBase<Derived,Lhs,Rhs>& prod)
{ return ScaledProduct<Derived>(prod.derived(), x); }
namespace internal {
template<typename NestedProduct>
struct traits<ScaledProduct<NestedProduct> >
: traits<ProductBase<ScaledProduct<NestedProduct>,
typename NestedProduct::_LhsNested,
typename NestedProduct::_RhsNested> >
{
typedef typename traits<NestedProduct>::StorageKind StorageKind;
};
}
template<typename NestedProduct>
class ScaledProduct
: public ProductBase<ScaledProduct<NestedProduct>,
typename NestedProduct::_LhsNested,
typename NestedProduct::_RhsNested>
{
public:
typedef ProductBase<ScaledProduct<NestedProduct>,
typename NestedProduct::_LhsNested,
typename NestedProduct::_RhsNested> Base;
typedef typename Base::Scalar Scalar;
typedef typename Base::PlainObject PlainObject;
// EIGEN_PRODUCT_PUBLIC_INTERFACE(ScaledProduct)
ScaledProduct(const NestedProduct& prod, const Scalar& x)
: Base(prod.lhs(),prod.rhs()), m_prod(prod), m_alpha(x) {}
template<typename Dest>
inline void evalTo(Dest& dst) const { dst.setZero(); scaleAndAddTo(dst, Scalar(1)); }
template<typename Dest>
inline void addTo(Dest& dst) const { scaleAndAddTo(dst, Scalar(1)); }
template<typename Dest>
inline void subTo(Dest& dst) const { scaleAndAddTo(dst, Scalar(-1)); }
template<typename Dest>
inline void scaleAndAddTo(Dest& dst, const Scalar& a_alpha) const { m_prod.derived().scaleAndAddTo(dst,a_alpha * m_alpha); }
const Scalar& alpha() const { return m_alpha; }
protected:
const NestedProduct& m_prod;
Scalar m_alpha;
};
/** \internal
* Overloaded to perform an efficient C = (A*B).lazy() */
template<typename Derived>

File diff suppressed because it is too large

View File

@ -65,6 +65,25 @@ public:
? CompleteUnrolling
: NoUnrolling
};
#ifdef EIGEN_DEBUG_ASSIGN
static void debug()
{
std::cerr << "Xpr: " << typeid(typename Derived::XprType).name() << std::endl;
std::cerr.setf(std::ios::hex, std::ios::basefield);
EIGEN_DEBUG_VAR(Derived::Flags)
std::cerr.unsetf(std::ios::hex);
EIGEN_DEBUG_VAR(InnerMaxSize)
EIGEN_DEBUG_VAR(PacketSize)
EIGEN_DEBUG_VAR(MightVectorize)
EIGEN_DEBUG_VAR(MayLinearVectorize)
EIGEN_DEBUG_VAR(MaySliceVectorize)
EIGEN_DEBUG_VAR(Traversal)
EIGEN_DEBUG_VAR(UnrollingLimit)
EIGEN_DEBUG_VAR(Unrolling)
std::cerr << std::endl;
}
#endif
};
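A hedged sketch of exercising this debug() hook; the enclosing trait name is assumed to be internal::redux_traits<Func, Evaluator> (consistent with the members shown), and the macro must be defined before any Eigen header:
#define EIGEN_DEBUG_ASSIGN  // enables the debug() printers
#include <iostream>
#include <Eigen/Dense>
int main() {
  // Assumed names: redux_traits and redux_evaluator as defined in this file.
  typedef Eigen::internal::redux_evaluator<Eigen::Matrix4f> Evaluator;
  typedef Eigen::internal::scalar_sum_op<float> Sum;
  Eigen::internal::redux_traits<Sum, Evaluator>::debug();  // prints Traversal, Unrolling, ...
}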
/***************************************************************************
@ -174,7 +193,7 @@ struct redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>
typedef typename Derived::Scalar Scalar;
typedef typename Derived::Index Index;
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Scalar run(const Derived& mat, const Func& func)
static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
{
eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
Scalar res;
@ -200,10 +219,10 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
typedef typename packet_traits<Scalar>::type PacketScalar;
typedef typename Derived::Index Index;
static Scalar run(const Derived& mat, const Func& func)
static Scalar run(const Derived &mat, const Func& func)
{
const Index size = mat.size();
eigen_assert(size && "you are using an empty matrix");
const Index packetSize = packet_traits<Scalar>::size;
const Index alignedStart = internal::first_aligned(mat);
enum {
@ -258,7 +277,7 @@ struct redux_impl<Func, Derived, SliceVectorizedTraversal, NoUnrolling>
typedef typename packet_traits<Scalar>::type PacketScalar;
typedef typename Derived::Index Index;
static Scalar run(const Derived& mat, const Func& func)
EIGEN_DEVICE_FUNC static Scalar run(const Derived &mat, const Func& func)
{
eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
const Index innerSize = mat.innerSize();
@ -300,7 +319,7 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling>
Size = Derived::SizeAtCompileTime,
VectorizedSize = (Size / PacketSize) * PacketSize
};
static EIGEN_STRONG_INLINE Scalar run(const Derived& mat, const Func& func)
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
{
eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
if (VectorizedSize > 0) {
@ -315,6 +334,66 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling>
}
};
// evaluator adaptor
template<typename _XprType>
class redux_evaluator
{
public:
typedef _XprType XprType;
EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {}
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketScalar PacketScalar;
typedef typename XprType::PacketReturnType PacketReturnType;
enum {
MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime,
MaxColsAtCompileTime = XprType::MaxColsAtCompileTime,
// TODO we should not remove DirectAccessBit and rather find an elegant way to query the alignment offset at runtime from the evaluator
Flags = evaluator<XprType>::Flags & ~DirectAccessBit,
IsRowMajor = XprType::IsRowMajor,
SizeAtCompileTime = XprType::SizeAtCompileTime,
InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime,
CoeffReadCost = evaluator<XprType>::CoeffReadCost
};
EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); }
EIGEN_DEVICE_FUNC Index innerSize() const { return m_xpr.innerSize(); }
EIGEN_DEVICE_FUNC Index outerSize() const { return m_xpr.outerSize(); }
EIGEN_DEVICE_FUNC
CoeffReturnType coeff(Index row, Index col) const
{ return m_evaluator.coeff(row, col); }
EIGEN_DEVICE_FUNC
CoeffReturnType coeff(Index index) const
{ return m_evaluator.coeff(index); }
template<int LoadMode>
PacketReturnType packet(Index row, Index col) const
{ return m_evaluator.template packet<LoadMode>(row, col); }
template<int LoadMode>
PacketReturnType packet(Index index) const
{ return m_evaluator.template packet<LoadMode>(index); }
EIGEN_DEVICE_FUNC
CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
{ return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
template<int LoadMode>
PacketReturnType packetByOuterInner(Index outer, Index inner) const
{ return m_evaluator.template packet<LoadMode>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
protected:
typename internal::evaluator<XprType>::nestedType m_evaluator;
const XprType &m_xpr;
};
} // end namespace internal
/***************************************************************************
@ -325,7 +404,7 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling>
/** \returns the result of a full redux operation on the whole matrix or vector using \a func
*
* The template parameter \a BinaryOp is the type of the functor \a func which must be
* an associative operator. Both current STL and TR1 functor styles are handled.
* an associative operator. Both current C++98 and C++11 functor styles are handled.
*
* \sa DenseBase::sum(), DenseBase::minCoeff(), DenseBase::maxCoeff(), MatrixBase::colwise(), MatrixBase::rowwise()
*/
@ -334,9 +413,22 @@ template<typename Func>
EIGEN_STRONG_INLINE typename internal::result_of<Func(typename internal::traits<Derived>::Scalar)>::type
DenseBase<Derived>::redux(const Func& func) const
{
typedef typename internal::remove_all<typename Derived::Nested>::type ThisNested;
return internal::redux_impl<Func, ThisNested>
::run(derived(), func);
eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
// FIXME, eval_nest should be handled by redux_evaluator, however:
// - it is currently difficult to provide the right Flags since they are still handled by the expressions
// - handling it here might reduce the number of template instantiations
// typedef typename internal::nested_eval<Derived,1>::type ThisNested;
// typedef typename internal::remove_all<ThisNested>::type ThisNestedCleaned;
// typedef typename internal::redux_evaluator<ThisNestedCleaned> ThisEvaluator;
//
// ThisNested thisNested(derived());
// ThisEvaluator thisEval(thisNested);
typedef typename internal::redux_evaluator<Derived> ThisEvaluator;
ThisEvaluator thisEval(derived());
return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func);
}
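A usage sketch for redux() with a standard associative functor:
#include <functional>
#include <Eigen/Dense>
using namespace Eigen;
int main() {
  Matrix3d m = Matrix3d::Random();
  double s = m.redux(std::plus<double>());  // equivalent to m.sum()
}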
/** \returns the minimum of all coefficients of \c *this.
@ -346,7 +438,7 @@ template<typename Derived>
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
DenseBase<Derived>::minCoeff() const
{
return this->redux(Eigen::internal::scalar_min_op<Scalar>());
return derived().redux(Eigen::internal::scalar_min_op<Scalar>());
}
/** \returns the maximum of all coefficients of \c *this.
@ -356,7 +448,7 @@ template<typename Derived>
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
DenseBase<Derived>::maxCoeff() const
{
return this->redux(Eigen::internal::scalar_max_op<Scalar>());
return derived().redux(Eigen::internal::scalar_max_op<Scalar>());
}
/** \returns the sum of all coefficients of *this
@ -369,7 +461,7 @@ DenseBase<Derived>::sum() const
{
if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
return Scalar(0);
return this->redux(Eigen::internal::scalar_sum_op<Scalar>());
return derived().redux(Eigen::internal::scalar_sum_op<Scalar>());
}
/** \returns the mean of all coefficients of *this
@ -380,7 +472,7 @@ template<typename Derived>
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
DenseBase<Derived>::mean() const
{
return Scalar(this->redux(Eigen::internal::scalar_sum_op<Scalar>())) / Scalar(this->size());
return Scalar(derived().redux(Eigen::internal::scalar_sum_op<Scalar>())) / Scalar(this->size());
}
/** \returns the product of all coefficients of *this
@ -396,7 +488,7 @@ DenseBase<Derived>::prod() const
{
if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
return Scalar(1);
return this->redux(Eigen::internal::scalar_product_op<Scalar>());
return derived().redux(Eigen::internal::scalar_product_op<Scalar>());
}
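The reductions above, in one illustrative sketch:
#include <Eigen/Dense>
using namespace Eigen;
int main() {
  Matrix2d m;
  m << 1, 2,
       3, 4;
  double mn = m.minCoeff();  // 1
  double mx = m.maxCoeff();  // 4
  double s  = m.sum();       // 10
  double p  = m.prod();      // 24
  double a  = m.mean();      // 2.5
}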
/** \returns the trace of \c *this, i.e. the sum of the coefficients on the main diagonal.

View File

@ -12,10 +12,6 @@
namespace Eigen {
template<typename Derived> class RefBase;
template<typename PlainObjectType, int Options = 0,
typename StrideType = typename internal::conditional<PlainObjectType::IsVectorAtCompileTime,InnerStride<1>,OuterStride<> >::type > class Ref;
/** \class Ref
* \ingroup Core_Module
*
@ -131,12 +127,12 @@ public:
typedef MapBase<Derived> Base;
EIGEN_DENSE_PUBLIC_INTERFACE(RefBase)
inline Index innerStride() const
EIGEN_DEVICE_FUNC inline Index innerStride() const
{
return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1;
}
inline Index outerStride() const
EIGEN_DEVICE_FUNC inline Index outerStride() const
{
return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
: IsVectorAtCompileTime ? this->size()
@ -144,7 +140,7 @@ public:
: this->rows();
}
RefBase()
EIGEN_DEVICE_FUNC RefBase()
: Base(0,RowsAtCompileTime==Dynamic?0:RowsAtCompileTime,ColsAtCompileTime==Dynamic?0:ColsAtCompileTime),
// Stride<> does not allow default ctor for Dynamic strides, so let' initialize it with dummy values:
m_stride(StrideType::OuterStrideAtCompileTime==Dynamic?0:StrideType::OuterStrideAtCompileTime,
@ -158,7 +154,7 @@ protected:
typedef Stride<StrideType::OuterStrideAtCompileTime,StrideType::InnerStrideAtCompileTime> StrideBase;
template<typename Expression>
void construct(Expression& expr)
EIGEN_DEVICE_FUNC void construct(Expression& expr)
{
if(PlainObjectType::RowsAtCompileTime==1)
{
@ -188,6 +184,8 @@ template<typename PlainObjectType, int Options, typename StrideType> class Ref
: public RefBase<Ref<PlainObjectType, Options, StrideType> >
{
typedef internal::traits<Ref> Traits;
template<typename Derived>
EIGEN_DEVICE_FUNC inline Ref(const PlainObjectBase<Derived>& expr);
public:
typedef RefBase<Ref> Base;
@ -196,20 +194,21 @@ template<typename PlainObjectType, int Options, typename StrideType> class Ref
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename Derived>
inline Ref(PlainObjectBase<Derived>& expr,
typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0)
EIGEN_DEVICE_FUNC inline Ref(PlainObjectBase<Derived>& expr)
{
Base::construct(expr);
EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
Base::construct(expr.derived());
}
template<typename Derived>
inline Ref(const DenseBase<Derived>& expr,
typename internal::enable_if<bool(internal::is_lvalue<Derived>::value&&bool(Traits::template match<Derived>::MatchAtCompileTime)),Derived>::type* = 0,
int = Derived::ThisConstantIsPrivateInPlainObjectBase)
EIGEN_DEVICE_FUNC inline Ref(const DenseBase<Derived>& expr)
#else
template<typename Derived>
inline Ref(DenseBase<Derived>& expr)
#endif
{
EIGEN_STATIC_ASSERT(bool(internal::is_lvalue<Derived>::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
EIGEN_STATIC_ASSERT(!Derived::IsPlainObjectBase,THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
Base::construct(expr.const_cast_derived());
}
@ -228,7 +227,7 @@ template<typename TPlainObjectType, int Options, typename StrideType> class Ref<
EIGEN_DENSE_PUBLIC_INTERFACE(Ref)
template<typename Derived>
inline Ref(const DenseBase<Derived>& expr)
EIGEN_DEVICE_FUNC inline Ref(const DenseBase<Derived>& expr)
{
// std::cout << match_helper<Derived>::HasDirectAccess << "," << match_helper<Derived>::OuterStrideMatch << "," << match_helper<Derived>::InnerStrideMatch << "\n";
// std::cout << int(StrideType::OuterStrideAtCompileTime) << " - " << int(Derived::OuterStrideAtCompileTime) << "\n";
@ -236,18 +235,27 @@ template<typename TPlainObjectType, int Options, typename StrideType> class Ref<
construct(expr.derived(), typename Traits::template match<Derived>::type());
}
EIGEN_DEVICE_FUNC inline Ref(const Ref& other) : Base(other) {
// the copy constructor shall not copy m_object, to avoid an unnecessary malloc and copy
}
template<typename OtherRef>
EIGEN_DEVICE_FUNC inline Ref(const RefBase<OtherRef>& other) {
construct(other.derived(), typename Traits::template match<OtherRef>::type());
}
protected:
template<typename Expression>
void construct(const Expression& expr,internal::true_type)
EIGEN_DEVICE_FUNC void construct(const Expression& expr,internal::true_type)
{
Base::construct(expr);
}
template<typename Expression>
void construct(const Expression& expr, internal::false_type)
EIGEN_DEVICE_FUNC void construct(const Expression& expr, internal::false_type)
{
m_object.lazyAssign(expr);
internal::call_assignment_no_alias(m_object,expr,internal::assign_op<Scalar>());
Base::construct(m_object);
}
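A sketch of the two construct() paths: a plain object with matching layout binds directly, while a general expression is evaluated into the member m_object (hence only Ref<const ...> accepts it):
#include <Eigen/Dense>
using namespace Eigen;
double firstCoeff(const Ref<const VectorXd>& r) { return r(0); }
int main() {
  VectorXd v = VectorXd::Random(5);
  firstCoeff(v);        // direct binding, no copy
  firstCoeff(2.0 * v);  // expression: evaluated into a temporary held by the Ref
}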

View File

@ -53,8 +53,9 @@ struct traits<Replicate<MatrixType,RowFactor,ColFactor> >
IsRowMajor = MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1 ? 1
: MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1 ? 0
: (MatrixType::Flags & RowMajorBit) ? 1 : 0,
Flags = (_MatrixTypeNested::Flags & HereditaryBits & ~RowMajorBit) | (IsRowMajor ? RowMajorBit : 0),
CoeffReadCost = _MatrixTypeNested::CoeffReadCost
// FIXME enable DirectAccess with negative strides?
Flags = IsRowMajor ? RowMajorBit : 0
};
};
}
@ -68,6 +69,7 @@ template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
typedef typename internal::dense_xpr_base<Replicate>::type Base;
EIGEN_DENSE_PUBLIC_INTERFACE(Replicate)
typedef typename internal::remove_all<MatrixType>::type NestedExpression;
template<typename OriginalMatrixType>
inline explicit Replicate(const OriginalMatrixType& a_matrix)

View File

@ -38,9 +38,10 @@ struct traits<ReturnByValue<Derived> >
* So internal::nested always gives the plain return matrix type.
*
* FIXME: I don't understand why we need this specialization: isn't this taken care of by the EvalBeforeNestingBit ??
* Answer: EvalBeforeNestingBit should be deprecated since we have the evaluators
*/
template<typename Derived,int n,typename PlainObject>
struct nested<ReturnByValue<Derived>, n, PlainObject>
struct nested_eval<ReturnByValue<Derived>, n, PlainObject>
{
typedef typename traits<Derived>::ReturnType type;
};
@ -48,7 +49,7 @@ struct nested<ReturnByValue<Derived>, n, PlainObject>
} // end namespace internal
template<typename Derived> class ReturnByValue
: internal::no_assignment_operator, public internal::dense_xpr_base< ReturnByValue<Derived> >::type
: public internal::dense_xpr_base< ReturnByValue<Derived> >::type, internal::no_assignment_operator
{
public:
typedef typename internal::traits<Derived>::ReturnType ReturnType;
@ -73,6 +74,7 @@ template<typename Derived> class ReturnByValue
const Unusable& coeff(Index,Index) const { return *reinterpret_cast<const Unusable*>(this); }
Unusable& coeffRef(Index) { return *reinterpret_cast<Unusable*>(this); }
Unusable& coeffRef(Index,Index) { return *reinterpret_cast<Unusable*>(this); }
#undef Unusable
#endif
};
@ -84,6 +86,36 @@ Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
return derived();
}
namespace internal {
// Expression is evaluated in a temporary; default implementation of Assignment is bypassed so that
// when a ReturnByValue expression is assigned, the evaluator is not constructed.
// TODO: Finalize port to new regime; ReturnByValue should not exist in the expression world
template<typename Derived>
struct evaluator<ReturnByValue<Derived> >
: public evaluator<typename internal::traits<Derived>::ReturnType>::type
{
typedef ReturnByValue<Derived> XprType;
typedef typename internal::traits<Derived>::ReturnType PlainObject;
typedef typename evaluator<PlainObject>::type Base;
typedef evaluator type;
typedef evaluator nestedType;
EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
: m_result(xpr.rows(), xpr.cols())
{
::new (static_cast<Base*>(this)) Base(m_result);
xpr.evalTo(m_result);
}
protected:
PlainObject m_result;
};
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_RETURNBYVALUE_H

View File

@ -44,14 +44,7 @@ struct traits<Reverse<MatrixType, Direction> >
ColsAtCompileTime = MatrixType::ColsAtCompileTime,
MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
// let's enable LinearAccess only with vectorization because of the product overhead
LinearAccess = ( (Direction==BothDirections) && (int(_MatrixTypeNested::Flags)&PacketAccessBit) )
? LinearAccessBit : 0,
Flags = int(_MatrixTypeNested::Flags) & (HereditaryBits | LvalueBit | PacketAccessBit | LinearAccess),
CoeffReadCost = _MatrixTypeNested::CoeffReadCost
Flags = _MatrixTypeNested::Flags & (RowMajorBit | LvalueBit)
};
};
@ -74,6 +67,7 @@ template<typename MatrixType, int Direction> class Reverse
typedef typename internal::dense_xpr_base<Reverse>::type Base;
EIGEN_DENSE_PUBLIC_INTERFACE(Reverse)
typedef typename internal::remove_all<MatrixType>::type NestedExpression;
using Base::IsRowMajor;
// next line is necessary because otherwise const version of operator()
@ -95,47 +89,47 @@ template<typename MatrixType, int Direction> class Reverse
typedef internal::reverse_packet_cond<PacketScalar,ReversePacket> reverse_packet;
public:
inline Reverse(const MatrixType& matrix) : m_matrix(matrix) { }
EIGEN_DEVICE_FUNC explicit inline Reverse(const MatrixType& matrix) : m_matrix(matrix) { }
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Reverse)
inline Index rows() const { return m_matrix.rows(); }
inline Index cols() const { return m_matrix.cols(); }
EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.rows(); }
EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.cols(); }
inline Index innerStride() const
EIGEN_DEVICE_FUNC inline Index innerStride() const
{
return -m_matrix.innerStride();
}
inline Scalar& operator()(Index row, Index col)
EIGEN_DEVICE_FUNC inline Scalar& operator()(Index row, Index col)
{
eigen_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
return coeffRef(row, col);
}
inline Scalar& coeffRef(Index row, Index col)
EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col)
{
return m_matrix.const_cast_derived().coeffRef(ReverseRow ? m_matrix.rows() - row - 1 : row,
ReverseCol ? m_matrix.cols() - col - 1 : col);
}
inline CoeffReturnType coeff(Index row, Index col) const
EIGEN_DEVICE_FUNC inline CoeffReturnType coeff(Index row, Index col) const
{
return m_matrix.coeff(ReverseRow ? m_matrix.rows() - row - 1 : row,
ReverseCol ? m_matrix.cols() - col - 1 : col);
}
inline CoeffReturnType coeff(Index index) const
EIGEN_DEVICE_FUNC inline CoeffReturnType coeff(Index index) const
{
return m_matrix.coeff(m_matrix.size() - index - 1);
}
inline Scalar& coeffRef(Index index)
EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index)
{
return m_matrix.const_cast_derived().coeffRef(m_matrix.size() - index - 1);
}
inline Scalar& operator()(Index index)
EIGEN_DEVICE_FUNC inline Scalar& operator()(Index index)
{
eigen_assert(index >= 0 && index < m_matrix.size());
return coeffRef(index);
@ -170,7 +164,7 @@ template<typename MatrixType, int Direction> class Reverse
m_matrix.const_cast_derived().template writePacket<LoadMode>(m_matrix.size() - index - PacketSize, internal::preverse(x));
}
const typename internal::remove_all<typename MatrixType::Nested>::type&
EIGEN_DEVICE_FUNC const typename internal::remove_all<typename MatrixType::Nested>::type&
nestedExpression() const
{
return m_matrix;
@ -190,7 +184,7 @@ template<typename Derived>
inline typename DenseBase<Derived>::ReverseReturnType
DenseBase<Derived>::reverse()
{
return derived();
return ReverseReturnType(derived());
}
/** This is the const version of reverse(). */
@ -198,7 +192,7 @@ template<typename Derived>
inline const typename DenseBase<Derived>::ConstReverseReturnType
DenseBase<Derived>::reverse() const
{
return derived();
return ConstReverseReturnType(derived());
}
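A usage sketch for these reverse expressions:
#include <Eigen/Dense>
using namespace Eigen;
int main() {
  Vector3d v(1, 2, 3);
  Vector3d r = v.reverse();  // (3, 2, 1)
  v.reverseInPlace();        // v itself becomes (3, 2, 1)
}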
/** This is the "in place" version of reverse: it reverses \c *this.

View File

@ -43,23 +43,21 @@ struct traits<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
ColsAtCompileTime = ConditionMatrixType::ColsAtCompileTime,
MaxRowsAtCompileTime = ConditionMatrixType::MaxRowsAtCompileTime,
MaxColsAtCompileTime = ConditionMatrixType::MaxColsAtCompileTime,
Flags = (unsigned int)ThenMatrixType::Flags & ElseMatrixType::Flags & HereditaryBits,
CoeffReadCost = traits<typename remove_all<ConditionMatrixNested>::type>::CoeffReadCost
+ EIGEN_SIZE_MAX(traits<typename remove_all<ThenMatrixNested>::type>::CoeffReadCost,
traits<typename remove_all<ElseMatrixNested>::type>::CoeffReadCost)
Flags = (unsigned int)ThenMatrixType::Flags & ElseMatrixType::Flags & RowMajorBit
};
};
}
template<typename ConditionMatrixType, typename ThenMatrixType, typename ElseMatrixType>
class Select : internal::no_assignment_operator,
public internal::dense_xpr_base< Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >::type
class Select : public internal::dense_xpr_base< Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >::type,
internal::no_assignment_operator
{
public:
typedef typename internal::dense_xpr_base<Select>::type Base;
EIGEN_DENSE_PUBLIC_INTERFACE(Select)
inline EIGEN_DEVICE_FUNC
Select(const ConditionMatrixType& a_conditionMatrix,
const ThenMatrixType& a_thenMatrix,
const ElseMatrixType& a_elseMatrix)
@ -69,9 +67,10 @@ class Select : internal::no_assignment_operator,
eigen_assert(m_condition.cols() == m_then.cols() && m_condition.cols() == m_else.cols());
}
Index rows() const { return m_condition.rows(); }
Index cols() const { return m_condition.cols(); }
inline EIGEN_DEVICE_FUNC Index rows() const { return m_condition.rows(); }
inline EIGEN_DEVICE_FUNC Index cols() const { return m_condition.cols(); }
inline EIGEN_DEVICE_FUNC
const Scalar coeff(Index i, Index j) const
{
if (m_condition.coeff(i,j))
@ -80,6 +79,7 @@ class Select : internal::no_assignment_operator,
return m_else.coeff(i,j);
}
inline EIGEN_DEVICE_FUNC
const Scalar coeff(Index i) const
{
if (m_condition.coeff(i))
@ -88,17 +88,17 @@ class Select : internal::no_assignment_operator,
return m_else.coeff(i);
}
const ConditionMatrixType& conditionMatrix() const
inline EIGEN_DEVICE_FUNC const ConditionMatrixType& conditionMatrix() const
{
return m_condition;
}
const ThenMatrixType& thenMatrix() const
inline EIGEN_DEVICE_FUNC const ThenMatrixType& thenMatrix() const
{
return m_then;
}
const ElseMatrixType& elseMatrix() const
inline EIGEN_DEVICE_FUNC const ElseMatrixType& elseMatrix() const
{
return m_else;
}
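A usage sketch via DenseBase::select(), which builds this Select expression:
#include <Eigen/Dense>
using namespace Eigen;
int main() {
  ArrayXXd a = ArrayXXd::Random(3, 3);
  ArrayXXd clamped = (a > 0.0).select(a, 0.0);  // keep positive entries, zero the rest
}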

View File

@ -35,26 +35,23 @@ struct traits<SelfAdjointView<MatrixType, UpLo> > : traits<MatrixType>
typedef typename nested<MatrixType>::type MatrixTypeNested;
typedef typename remove_all<MatrixTypeNested>::type MatrixTypeNestedCleaned;
typedef MatrixType ExpressionType;
typedef typename MatrixType::PlainObject DenseMatrixType;
typedef typename MatrixType::PlainObject FullMatrixType;
enum {
Mode = UpLo | SelfAdjoint,
Flags = MatrixTypeNestedCleaned::Flags & (HereditaryBits)
& (~(PacketAccessBit | DirectAccessBit | LinearAccessBit)), // FIXME these flags should be preserved
CoeffReadCost = MatrixTypeNestedCleaned::CoeffReadCost
FlagsLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
Flags = MatrixTypeNestedCleaned::Flags & (HereditaryBits|FlagsLvalueBit)
& (~(PacketAccessBit | DirectAccessBit | LinearAccessBit)) // FIXME these flags should be preserved
};
};
}
template <typename Lhs, int LhsMode, bool LhsIsVector,
typename Rhs, int RhsMode, bool RhsIsVector>
struct SelfadjointProductMatrix;
// FIXME could also be called SelfAdjointWrapper to be consistent with DiagonalWrapper ??
template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
: public TriangularBase<SelfAdjointView<MatrixType, UpLo> >
template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
: public TriangularBase<SelfAdjointView<_MatrixType, UpLo> >
{
public:
typedef _MatrixType MatrixType;
typedef TriangularBase<SelfAdjointView> Base;
typedef typename internal::traits<SelfAdjointView>::MatrixTypeNested MatrixTypeNested;
typedef typename internal::traits<SelfAdjointView>::MatrixTypeNestedCleaned MatrixTypeNestedCleaned;
@ -65,12 +62,13 @@ template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
typedef typename MatrixType::Index Index;
enum {
Mode = internal::traits<SelfAdjointView>::Mode
Mode = internal::traits<SelfAdjointView>::Mode,
Flags = internal::traits<SelfAdjointView>::Flags
};
typedef typename MatrixType::PlainObject PlainObject;
EIGEN_DEVICE_FUNC
inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix)
explicit inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix)
{}
EIGEN_DEVICE_FUNC
@ -98,6 +96,7 @@ template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index row, Index col)
{
EIGEN_STATIC_ASSERT_LVALUE(SelfAdjointView);
Base::check_coordinates_internal(row, col);
return m_matrix.const_cast_derived().coeffRef(row, col);
}
@ -111,26 +110,29 @@ template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
EIGEN_DEVICE_FUNC
MatrixTypeNestedCleaned& nestedExpression() { return *const_cast<MatrixTypeNestedCleaned*>(&m_matrix); }
/** Efficient self-adjoint matrix times vector/matrix product */
/** Efficient triangular matrix times vector/matrix product */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
SelfadjointProductMatrix<MatrixType,Mode,false,OtherDerived,0,OtherDerived::IsVectorAtCompileTime>
const Product<SelfAdjointView,OtherDerived>
operator*(const MatrixBase<OtherDerived>& rhs) const
{
return SelfadjointProductMatrix
<MatrixType,Mode,false,OtherDerived,0,OtherDerived::IsVectorAtCompileTime>
(m_matrix, rhs.derived());
return Product<SelfAdjointView,OtherDerived>(*this, rhs.derived());
}
/** Efficient vector/matrix times self-adjoint matrix product */
/** Efficient vector/matrix times triangular matrix product */
template<typename OtherDerived> friend
EIGEN_DEVICE_FUNC
SelfadjointProductMatrix<OtherDerived,0,OtherDerived::IsVectorAtCompileTime,MatrixType,Mode,false>
const Product<OtherDerived,SelfAdjointView>
operator*(const MatrixBase<OtherDerived>& lhs, const SelfAdjointView& rhs)
{
return SelfadjointProductMatrix
<OtherDerived,0,OtherDerived::IsVectorAtCompileTime,MatrixType,Mode,false>
(lhs.derived(),rhs.m_matrix);
return Product<OtherDerived,SelfAdjointView>(lhs.derived(),rhs);
}
friend EIGEN_DEVICE_FUNC
const SelfAdjointView<const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,MatrixType>,UpLo>
operator*(const Scalar& s, const SelfAdjointView& mat)
{
return (s*mat.nestedExpression()).template selfadjointView<UpLo>();
}
/** Perform a symmetric rank 2 update of the selfadjoint matrix \c *this:
@ -194,96 +196,57 @@ template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
namespace internal {
template<typename Derived1, typename Derived2, int UnrollCount, bool ClearOpposite>
struct triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Upper), UnrollCount, ClearOpposite>
// TODO currently a selfadjoint expression has the form SelfAdjointView<.,.>
// in the future selfadjoint-ness should be defined by the expression traits
// such that Transpose<SelfAdjointView<.,.> > is valid. (currently TriangularBase::transpose() is overloaded to make it work)
template<typename MatrixType, unsigned int Mode>
struct evaluator_traits<SelfAdjointView<MatrixType,Mode> >
{
enum {
col = (UnrollCount-1) / Derived1::RowsAtCompileTime,
row = (UnrollCount-1) % Derived1::RowsAtCompileTime
};
typedef typename storage_kind_to_evaluator_kind<typename MatrixType::StorageKind>::Kind Kind;
typedef SelfAdjointShape Shape;
static const int AssumeAliasing = 0;
};
EIGEN_DEVICE_FUNC
static inline void run(Derived1 &dst, const Derived2 &src)
template<int UpLo, int SetOpposite, typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor, int Version>
class triangular_dense_assignment_kernel<UpLo,SelfAdjoint,SetOpposite,DstEvaluatorTypeT,SrcEvaluatorTypeT,Functor,Version>
: public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, Version>
{
protected:
typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, Version> Base;
typedef typename Base::DstXprType DstXprType;
typedef typename Base::SrcXprType SrcXprType;
using Base::m_dst;
using Base::m_src;
using Base::m_functor;
public:
typedef typename Base::DstEvaluatorType DstEvaluatorType;
typedef typename Base::SrcEvaluatorType SrcEvaluatorType;
typedef typename Base::Scalar Scalar;
typedef typename Base::Index Index;
typedef typename Base::AssignmentTraits AssignmentTraits;
EIGEN_DEVICE_FUNC triangular_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
: Base(dst, src, func, dstExpr)
{}
EIGEN_DEVICE_FUNC void assignCoeff(Index row, Index col)
{
triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Upper), UnrollCount-1, ClearOpposite>::run(dst, src);
if(row == col)
dst.coeffRef(row, col) = numext::real(src.coeff(row, col));
else if(row < col)
dst.coeffRef(col, row) = numext::conj(dst.coeffRef(row, col) = src.coeff(row, col));
eigen_internal_assert(row!=col);
Scalar tmp = m_src.coeff(row,col);
m_functor.assignCoeff(m_dst.coeffRef(row,col), tmp);
m_functor.assignCoeff(m_dst.coeffRef(col,row), numext::conj(tmp));
}
};
template<typename Derived1, typename Derived2, bool ClearOpposite>
struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Upper, 0, ClearOpposite>
{
EIGEN_DEVICE_FUNC
static inline void run(Derived1 &, const Derived2 &) {}
};
template<typename Derived1, typename Derived2, int UnrollCount, bool ClearOpposite>
struct triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Lower), UnrollCount, ClearOpposite>
{
enum {
col = (UnrollCount-1) / Derived1::RowsAtCompileTime,
row = (UnrollCount-1) % Derived1::RowsAtCompileTime
};
EIGEN_DEVICE_FUNC
static inline void run(Derived1 &dst, const Derived2 &src)
EIGEN_DEVICE_FUNC void assignDiagonalCoeff(Index id)
{
triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Lower), UnrollCount-1, ClearOpposite>::run(dst, src);
if(row == col)
dst.coeffRef(row, col) = numext::real(src.coeff(row, col));
else if(row > col)
dst.coeffRef(col, row) = numext::conj(dst.coeffRef(row, col) = src.coeff(row, col));
}
};
template<typename Derived1, typename Derived2, bool ClearOpposite>
struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Lower, 0, ClearOpposite>
{
EIGEN_DEVICE_FUNC
static inline void run(Derived1 &, const Derived2 &) {}
};
template<typename Derived1, typename Derived2, bool ClearOpposite>
struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Upper, Dynamic, ClearOpposite>
{
typedef typename Derived1::Index Index;
EIGEN_DEVICE_FUNC
static inline void run(Derived1 &dst, const Derived2 &src)
{
for(Index j = 0; j < dst.cols(); ++j)
{
for(Index i = 0; i < j; ++i)
{
dst.copyCoeff(i, j, src);
dst.coeffRef(j,i) = numext::conj(dst.coeff(i,j));
}
dst.copyCoeff(j, j, src);
}
}
};
template<typename Derived1, typename Derived2, bool ClearOpposite>
struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Lower, Dynamic, ClearOpposite>
{
EIGEN_DEVICE_FUNC
static inline void run(Derived1 &dst, const Derived2 &src)
{
typedef typename Derived1::Index Index;
for(Index i = 0; i < dst.rows(); ++i)
{
for(Index j = 0; j < i; ++j)
{
dst.copyCoeff(i, j, src);
dst.coeffRef(j,i) = numext::conj(dst.coeff(i,j));
}
dst.copyCoeff(i, i, src);
}
Base::assignCoeff(id,id);
}
EIGEN_DEVICE_FUNC void assignOppositeCoeff(Index, Index)
{ eigen_internal_assert(false && "should never be called"); }
};
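As a usage sketch (our own example, not part of the patch): assigning a selfadjointView to a dense matrix goes through this kernel, which writes each coefficient of the source triangle together with its conjugate at the mirrored position.
#include <Eigen/Dense>
int main() {
  Eigen::MatrixXcd a = Eigen::MatrixXcd::Random(3,3);
  // Only the upper triangle of 'a' is read; the opposite triangle of 'h'
  // is filled with the conjugates, making 'h' Hermitian.
  Eigen::MatrixXcd h = a.selfadjointView<Eigen::Upper>();
  return 0;
}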
} // end namespace internal
@ -297,7 +260,7 @@ template<unsigned int UpLo>
typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
MatrixBase<Derived>::selfadjointView() const
{
return derived();
return typename ConstSelfAdjointViewReturnType<UpLo>::Type(derived());
}
template<typename Derived>
@ -305,7 +268,7 @@ template<unsigned int UpLo>
typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
MatrixBase<Derived>::selfadjointView()
{
return derived();
return typename SelfAdjointViewReturnType<UpLo>::Type(derived());
}
} // end namespace Eigen

View File

@ -12,179 +12,11 @@
namespace Eigen {
/** \class SelfCwiseBinaryOp
* \ingroup Core_Module
*
* \internal
*
* \brief Internal helper class for optimizing operators like +=, -=
*
* This is a pseudo expression class re-implementing the copyCoeff/copyPacket
* methods to directly perform +=/-= operations in an optimal way. In particular,
* this makes it possible to ensure that the input/output data are loaded only once using
* aligned packet loads.
*
* \sa class SwapWrapper for a similar trick.
*/
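For illustration, a minimal sketch of the compound assignments this machinery serves, using only Eigen's public API (variable names are ours):
#include <Eigen/Dense>
int main() {
  Eigen::ArrayXd a = Eigen::ArrayXd::Random(8);
  // Each compound assignment reads and writes every coefficient exactly
  // once, using aligned packet loads/stores where possible.
  a *= 2.0;
  a += 1.0;
  a -= 0.5;
  return 0;
}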
namespace internal {
template<typename BinaryOp, typename Lhs, typename Rhs>
struct traits<SelfCwiseBinaryOp<BinaryOp,Lhs,Rhs> >
: traits<CwiseBinaryOp<BinaryOp,Lhs,Rhs> >
{
enum {
// Note that it is still a good idea to preserve the DirectAccessBit
// so that assign can correctly align the data.
Flags = traits<CwiseBinaryOp<BinaryOp,Lhs,Rhs> >::Flags | (Lhs::Flags&AlignedBit) | (Lhs::Flags&DirectAccessBit) | (Lhs::Flags&LvalueBit),
OuterStrideAtCompileTime = Lhs::OuterStrideAtCompileTime,
InnerStrideAtCompileTime = Lhs::InnerStrideAtCompileTime
};
};
}
template<typename BinaryOp, typename Lhs, typename Rhs> class SelfCwiseBinaryOp
: public internal::dense_xpr_base< SelfCwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type
{
public:
typedef typename internal::dense_xpr_base<SelfCwiseBinaryOp>::type Base;
EIGEN_DENSE_PUBLIC_INTERFACE(SelfCwiseBinaryOp)
typedef typename internal::packet_traits<Scalar>::type Packet;
EIGEN_DEVICE_FUNC
inline SelfCwiseBinaryOp(Lhs& xpr, const BinaryOp& func = BinaryOp()) : m_matrix(xpr), m_functor(func) {}
EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.rows(); }
EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.cols(); }
EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_matrix.outerStride(); }
EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_matrix.innerStride(); }
EIGEN_DEVICE_FUNC inline const Scalar* data() const { return m_matrix.data(); }
// note that this function is needed by assign to correctly align loads/stores
// TODO make Assign use .data()
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index row, Index col)
{
EIGEN_STATIC_ASSERT_LVALUE(Lhs)
return m_matrix.const_cast_derived().coeffRef(row, col);
}
EIGEN_DEVICE_FUNC
inline const Scalar& coeffRef(Index row, Index col) const
{
return m_matrix.coeffRef(row, col);
}
// note that this function is needed by assign to correctly align loads/stores
// TODO make Assign use .data()
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index index)
{
EIGEN_STATIC_ASSERT_LVALUE(Lhs)
return m_matrix.const_cast_derived().coeffRef(index);
}
EIGEN_DEVICE_FUNC
inline const Scalar& coeffRef(Index index) const
{
return m_matrix.const_cast_derived().coeffRef(index);
}
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
void copyCoeff(Index row, Index col, const DenseBase<OtherDerived>& other)
{
OtherDerived& _other = other.const_cast_derived();
eigen_internal_assert(row >= 0 && row < rows()
&& col >= 0 && col < cols());
Scalar& tmp = m_matrix.coeffRef(row,col);
tmp = m_functor(tmp, _other.coeff(row,col));
}
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
void copyCoeff(Index index, const DenseBase<OtherDerived>& other)
{
OtherDerived& _other = other.const_cast_derived();
eigen_internal_assert(index >= 0 && index < m_matrix.size());
Scalar& tmp = m_matrix.coeffRef(index);
tmp = m_functor(tmp, _other.coeff(index));
}
template<typename OtherDerived, int StoreMode, int LoadMode>
void copyPacket(Index row, Index col, const DenseBase<OtherDerived>& other)
{
OtherDerived& _other = other.const_cast_derived();
eigen_internal_assert(row >= 0 && row < rows()
&& col >= 0 && col < cols());
m_matrix.template writePacket<StoreMode>(row, col,
m_functor.packetOp(m_matrix.template packet<StoreMode>(row, col),_other.template packet<LoadMode>(row, col)) );
}
template<typename OtherDerived, int StoreMode, int LoadMode>
void copyPacket(Index index, const DenseBase<OtherDerived>& other)
{
OtherDerived& _other = other.const_cast_derived();
eigen_internal_assert(index >= 0 && index < m_matrix.size());
m_matrix.template writePacket<StoreMode>(index,
m_functor.packetOp(m_matrix.template packet<StoreMode>(index),_other.template packet<LoadMode>(index)) );
}
// reimplement lazyAssign to handle complex *= real
// see CwiseBinaryOp ctor for details
template<typename RhsDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE SelfCwiseBinaryOp& lazyAssign(const DenseBase<RhsDerived>& rhs)
{
EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Lhs,RhsDerived)
EIGEN_CHECK_BINARY_COMPATIBILIY(BinaryOp,typename Lhs::Scalar,typename RhsDerived::Scalar);
#ifdef EIGEN_DEBUG_ASSIGN
internal::assign_traits<SelfCwiseBinaryOp, RhsDerived>::debug();
#endif
eigen_assert(rows() == rhs.rows() && cols() == rhs.cols());
internal::assign_impl<SelfCwiseBinaryOp, RhsDerived>::run(*this,rhs.derived());
#ifndef EIGEN_NO_DEBUG
this->checkTransposeAliasing(rhs.derived());
#endif
return *this;
}
// overloaded to honor evaluation of special matrices
// maybe another solution would be to not use SelfCwiseBinaryOp
// at first...
EIGEN_DEVICE_FUNC
SelfCwiseBinaryOp& operator=(const Rhs& _rhs)
{
typename internal::nested<Rhs>::type rhs(_rhs);
return Base::operator=(rhs);
}
EIGEN_DEVICE_FUNC
Lhs& expression() const
{
return m_matrix;
}
EIGEN_DEVICE_FUNC
const BinaryOp& functor() const
{
return m_functor;
}
protected:
Lhs& m_matrix;
const BinaryOp& m_functor;
private:
SelfCwiseBinaryOp& operator=(const SelfCwiseBinaryOp&);
};
template<typename Derived>
inline Derived& DenseBase<Derived>::operator*=(const Scalar& other)
{
typedef typename Derived::PlainObject PlainObject;
SelfCwiseBinaryOp<internal::scalar_product_op<Scalar>, Derived, typename PlainObject::ConstantReturnType> tmp(derived());
tmp = PlainObject::Constant(rows(),cols(),other);
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op<Scalar>());
return derived();
}
@ -192,8 +24,7 @@ template<typename Derived>
inline Derived& ArrayBase<Derived>::operator+=(const Scalar& other)
{
typedef typename Derived::PlainObject PlainObject;
SelfCwiseBinaryOp<internal::scalar_sum_op<Scalar>, Derived, typename PlainObject::ConstantReturnType> tmp(derived());
tmp = PlainObject::Constant(rows(),cols(),other);
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::add_assign_op<Scalar>());
return derived();
}
@ -201,8 +32,7 @@ template<typename Derived>
inline Derived& ArrayBase<Derived>::operator-=(const Scalar& other)
{
typedef typename Derived::PlainObject PlainObject;
SelfCwiseBinaryOp<internal::scalar_difference_op<Scalar>, Derived, typename PlainObject::ConstantReturnType> tmp(derived());
tmp = PlainObject::Constant(rows(),cols(),other);
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op<Scalar>());
return derived();
}
@ -210,8 +40,7 @@ template<typename Derived>
inline Derived& DenseBase<Derived>::operator/=(const Scalar& other)
{
typedef typename Derived::PlainObject PlainObject;
SelfCwiseBinaryOp<internal::scalar_quotient_op<Scalar>, Derived, typename PlainObject::ConstantReturnType> tmp(derived());
tmp = PlainObject::Constant(rows(),cols(), other);
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op<Scalar>());
return derived();
}

Eigen/src/Core/Solve.h Normal file
View File

@ -0,0 +1,152 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_SOLVE_H
#define EIGEN_SOLVE_H
namespace Eigen {
template<typename Decomposition, typename RhsType, typename StorageKind> class SolveImpl;
/** \class Solve
* \ingroup Core_Module
*
* \brief Pseudo expression representing a solving operation
*
* \tparam Decomposition the type of the matrix or decomposition object
* \tparam RhsType the type of the right-hand side
*
* This class represents an expression of A.solve(B)
* and most of the time this is the only way it is used.
*
*/
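A minimal usage sketch of the expression this class represents (our own example; the Solve<> object is built by the decomposition's solve() and only evaluated on assignment):
#include <Eigen/Dense>
int main() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(4,4);
  Eigen::VectorXd b = Eigen::VectorXd::Random(4);
  // A.partialPivLu().solve(b) returns a Solve<> expression; the actual
  // triangular solves run only when the expression is assigned to x.
  Eigen::VectorXd x = A.partialPivLu().solve(b);
  return 0;
}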
namespace internal {
// this solve_traits class is used to determine the evaluation type with respect to the storage kind (Dense vs Sparse)
template<typename Decomposition, typename RhsType,typename StorageKind> struct solve_traits;
template<typename Decomposition, typename RhsType>
struct solve_traits<Decomposition,RhsType,Dense>
{
typedef typename Decomposition::MatrixType MatrixType;
typedef Matrix<typename RhsType::Scalar,
MatrixType::ColsAtCompileTime,
RhsType::ColsAtCompileTime,
RhsType::PlainObject::Options,
MatrixType::MaxColsAtCompileTime,
RhsType::MaxColsAtCompileTime> PlainObject;
};
template<typename Decomposition, typename RhsType>
struct traits<Solve<Decomposition, RhsType> >
: traits<typename solve_traits<Decomposition,RhsType,typename internal::traits<RhsType>::StorageKind>::PlainObject>
{
typedef typename solve_traits<Decomposition,RhsType,typename internal::traits<RhsType>::StorageKind>::PlainObject PlainObject;
typedef traits<PlainObject> BaseTraits;
enum {
Flags = BaseTraits::Flags & RowMajorBit,
CoeffReadCost = Dynamic
};
};
}
template<typename Decomposition, typename RhsType>
class Solve : public SolveImpl<Decomposition,RhsType,typename internal::traits<RhsType>::StorageKind>
{
public:
typedef typename RhsType::Index Index;
typedef typename internal::traits<Solve>::PlainObject PlainObject;
Solve(const Decomposition &dec, const RhsType &rhs)
: m_dec(dec), m_rhs(rhs)
{}
EIGEN_DEVICE_FUNC Index rows() const { return m_dec.cols(); }
EIGEN_DEVICE_FUNC Index cols() const { return m_rhs.cols(); }
EIGEN_DEVICE_FUNC const Decomposition& dec() const { return m_dec; }
EIGEN_DEVICE_FUNC const RhsType& rhs() const { return m_rhs; }
protected:
const Decomposition &m_dec;
const RhsType &m_rhs;
};
// Specialization of the Solve expression for dense results
template<typename Decomposition, typename RhsType>
class SolveImpl<Decomposition,RhsType,Dense>
: public MatrixBase<Solve<Decomposition,RhsType> >
{
typedef Solve<Decomposition,RhsType> Derived;
public:
typedef MatrixBase<Solve<Decomposition,RhsType> > Base;
EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
private:
Scalar coeff(Index row, Index col) const;
Scalar coeff(Index i) const;
};
// Generic API dispatcher
template<typename Decomposition, typename RhsType, typename StorageKind>
class SolveImpl : public internal::generic_xpr_base<Solve<Decomposition,RhsType>, MatrixXpr, StorageKind>::type
{
public:
typedef typename internal::generic_xpr_base<Solve<Decomposition,RhsType>, MatrixXpr, StorageKind>::type Base;
};
namespace internal {
// Evaluator of Solve -> eval into a temporary
template<typename Decomposition, typename RhsType>
struct evaluator<Solve<Decomposition,RhsType> >
: public evaluator<typename Solve<Decomposition,RhsType>::PlainObject>::type
{
typedef Solve<Decomposition,RhsType> SolveType;
typedef typename SolveType::PlainObject PlainObject;
typedef typename evaluator<PlainObject>::type Base;
typedef evaluator type;
typedef evaluator nestedType;
EIGEN_DEVICE_FUNC explicit evaluator(const SolveType& solve)
: m_result(solve.rows(), solve.cols())
{
::new (static_cast<Base*>(this)) Base(m_result);
solve.dec()._solve_impl(solve.rhs(), m_result);
}
protected:
PlainObject m_result;
};
// Specialization for "dst = dec.solve(rhs)"
// NOTE we need to specialize it for Dense2Dense to avoid an ambiguous specialization error, and a Sparse2Sparse specialization must exist somewhere
template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
struct Assignment<DstXprType, Solve<DecType,RhsType>, internal::assign_op<Scalar>, Dense2Dense, Scalar>
{
typedef Solve<DecType,RhsType> SrcXprType;
static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
{
// FIXME shall we resize dst here?
src.dec()._solve_impl(src.rhs(), dst);
}
};
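A sketch of the path this specialization covers (our own example): with a dense destination on the left, the solution is written straight into it, with no intermediate temporary.
#include <Eigen/Dense>
int main() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(4,4);
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(4,2);  // two right-hand sides
  // Dense2Dense assignment: the call is routed to _solve_impl(B, X),
  // filling X directly.
  Eigen::MatrixXd X = A.partialPivLu().solve(B);
  return 0;
}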
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_SOLVE_H

View File

@ -171,10 +171,10 @@ struct triangular_solver_selector<Lhs,Rhs,OnTheRight,Mode,CompleteUnrolling,1> {
*/
template<typename MatrixType, unsigned int Mode>
template<int Side, typename OtherDerived>
void TriangularView<MatrixType,Mode>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
{
OtherDerived& other = _other.const_cast_derived();
eigen_assert( cols() == rows() && ((Side==OnTheLeft && cols() == other.rows()) || (Side==OnTheRight && cols() == other.cols())) );
eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) );
eigen_assert((!(Mode & ZeroDiag)) && bool(Mode & (Upper|Lower)));
enum { copy = internal::traits<OtherDerived>::Flags & RowMajorBit && OtherDerived::IsVectorAtCompileTime };
@ -183,7 +183,7 @@ void TriangularView<MatrixType,Mode>::solveInPlace(const MatrixBase<OtherDerived
OtherCopy otherCopy(other);
internal::triangular_solver_selector<MatrixType, typename internal::remove_reference<OtherCopy>::type,
Side, Mode>::run(nestedExpression(), otherCopy);
Side, Mode>::run(derived().nestedExpression(), otherCopy);
if (copy)
other = otherCopy;
@ -213,9 +213,9 @@ void TriangularView<MatrixType,Mode>::solveInPlace(const MatrixBase<OtherDerived
template<typename Derived, unsigned int Mode>
template<int Side, typename Other>
const internal::triangular_solve_retval<Side,TriangularView<Derived,Mode>,Other>
TriangularView<Derived,Mode>::solve(const MatrixBase<Other>& other) const
TriangularViewImpl<Derived,Mode,Dense>::solve(const MatrixBase<Other>& other) const
{
return internal::triangular_solve_retval<Side,TriangularView,Other>(*this, other.derived());
return internal::triangular_solve_retval<Side,TriangularViewType,Other>(derived(), other.derived());
}
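A short usage sketch of both entry points (our own example, standard Eigen API):
#include <Eigen/Dense>
int main() {
  Eigen::MatrixXd L = Eigen::MatrixXd::Random(3,3);
  Eigen::VectorXd b = Eigen::VectorXd::Random(3);
  // Expression form: evaluated when assigned.
  Eigen::VectorXd x = L.triangularView<Eigen::Lower>().solve(b);
  // In-place form: b is overwritten with the solution.
  L.triangularView<Eigen::Lower>().solveInPlace(b);
  return 0;
}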
namespace internal {

View File

@ -17,7 +17,6 @@ namespace internal {
template<typename ExpressionType, typename Scalar>
inline void stable_norm_kernel(const ExpressionType& bl, Scalar& ssq, Scalar& scale, Scalar& invScale)
{
using std::max;
Scalar maxCoeff = bl.cwiseAbs().maxCoeff();
if(maxCoeff>scale)
@ -58,8 +57,6 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
typedef typename Derived::RealScalar RealScalar;
typedef typename Derived::Index Index;
using std::pow;
EIGEN_USING_STD_MATH(min);
EIGEN_USING_STD_MATH(max);
using std::sqrt;
using std::abs;
const Derived& vec(_vec.derived());
@ -136,8 +133,8 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
}
else
return sqrt(amed);
asml = (min)(abig, amed);
abig = (max)(abig, amed);
asml = numext::mini(abig, amed);
abig = numext::maxi(abig, amed);
if(asml <= abig*relerr)
return abig;
else
@ -160,7 +157,6 @@ template<typename Derived>
inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
MatrixBase<Derived>::stableNorm() const
{
EIGEN_USING_STD_MATH(min);
using std::sqrt;
const Index blockSize = 4096;
RealScalar scale(0);
@ -174,7 +170,7 @@ MatrixBase<Derived>::stableNorm() const
if (bi>0)
internal::stable_norm_kernel(this->head(bi), ssq, scale, invScale);
for (; bi<n; bi+=blockSize)
internal::stable_norm_kernel(this->segment(bi,(min)(blockSize, n - bi)).template forceAlignedAccessIf<Alignment>(), ssq, scale, invScale);
internal::stable_norm_kernel(this->segment(bi,numext::mini(blockSize, n - bi)).template forceAlignedAccessIf<Alignment>(), ssq, scale, invScale);
return scale * sqrt(ssq);
}
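The point of the blockwise rescaling kernel shows up with coefficients near the overflow threshold; a minimal sketch (our own example):
#include <Eigen/Dense>
#include <iostream>
int main() {
  // norm() squares coefficients first, so it overflows here;
  // the rescaled blockwise kernel above does not.
  Eigen::Vector2d v(1e308, 1e308);
  std::cout << v.norm()       << "\n";  // inf
  std::cout << v.stableNorm() << "\n";  // about 1.414e308
  return 0;
}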

View File

@ -86,26 +86,26 @@ class Stride
/** \brief Convenience specialization of Stride to specify only an inner stride
* See class Map for some examples */
template<int Value = Dynamic>
template<int Value>
class InnerStride : public Stride<0, Value>
{
typedef Stride<0, Value> Base;
public:
typedef DenseIndex Index;
EIGEN_DEVICE_FUNC InnerStride() : Base() {}
EIGEN_DEVICE_FUNC InnerStride(Index v) : Base(0, v) {}
EIGEN_DEVICE_FUNC InnerStride(Index v) : Base(0, v) {} // FIXME making this explicit could break valid code
};
/** \brief Convenience specialization of Stride to specify only an outer stride
* See class Map for some examples */
template<int Value = Dynamic>
template<int Value>
class OuterStride : public Stride<Value, 0>
{
typedef Stride<Value, 0> Base;
public:
typedef DenseIndex Index;
EIGEN_DEVICE_FUNC OuterStride() : Base() {}
EIGEN_DEVICE_FUNC OuterStride(Index v) : Base(v,0) {}
EIGEN_DEVICE_FUNC OuterStride(Index v) : Base(v,0) {} // FIXME making this explicit could break valid code
};
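A minimal sketch of how these convenience specializations are used with Map (our own example; buffer contents are arbitrary):
#include <Eigen/Dense>
int main() {
  double data[12];
  for (int i = 0; i < 12; ++i) data[i] = i;
  // Inner stride only: view every other entry of the buffer.
  Eigen::Map<Eigen::VectorXd, 0, Eigen::InnerStride<2> > v(data, 6);
  // Outer stride only: 3x2 column-major view with columns 4 doubles apart.
  Eigen::Map<Eigen::MatrixXd, 0, Eigen::OuterStride<> > m(data, 3, 2, Eigen::OuterStride<>(4));
  (void)v; (void)m;
  return 0;
}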
} // end namespace Eigen

View File

@ -12,129 +12,54 @@
namespace Eigen {
/** \class SwapWrapper
* \ingroup Core_Module
*
* \internal
*
* \brief Internal helper class for swapping two expressions
*/
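The user-facing operation is unchanged by this rework; a minimal sketch (our own example):
#include <Eigen/Dense>
int main() {
  Eigen::MatrixXf a = Eigen::MatrixXf::Random(3,3);
  Eigen::MatrixXf b = Eigen::MatrixXf::Random(3,3);
  // Exchanges the coefficients packet-wise; no temporary matrix is allocated.
  a.swap(b);
  return 0;
}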
namespace internal {
template<typename ExpressionType>
struct traits<SwapWrapper<ExpressionType> > : traits<ExpressionType> {};
}
template<typename ExpressionType> class SwapWrapper
: public internal::dense_xpr_base<SwapWrapper<ExpressionType> >::type
// Overload default assignPacket behavior for swapping them
template<typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT>
class generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, swap_assign_op<typename DstEvaluatorTypeT::Scalar>, Specialized>
: public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, swap_assign_op<typename DstEvaluatorTypeT::Scalar>, BuiltIn>
{
public:
typedef typename internal::dense_xpr_base<SwapWrapper>::type Base;
EIGEN_DENSE_PUBLIC_INTERFACE(SwapWrapper)
typedef typename internal::packet_traits<Scalar>::type Packet;
EIGEN_DEVICE_FUNC
inline SwapWrapper(ExpressionType& xpr) : m_expression(xpr) {}
EIGEN_DEVICE_FUNC
inline Index rows() const { return m_expression.rows(); }
EIGEN_DEVICE_FUNC
inline Index cols() const { return m_expression.cols(); }
EIGEN_DEVICE_FUNC
inline Index outerStride() const { return m_expression.outerStride(); }
EIGEN_DEVICE_FUNC
inline Index innerStride() const { return m_expression.innerStride(); }
typedef typename internal::conditional<
internal::is_lvalue<ExpressionType>::value,
Scalar,
const Scalar
>::type ScalarWithConstIfNotLvalue;
EIGEN_DEVICE_FUNC
inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
EIGEN_DEVICE_FUNC
inline const Scalar* data() const { return m_expression.data(); }
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index rowId, Index colId)
{
return m_expression.const_cast_derived().coeffRef(rowId, colId);
}
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index index)
{
return m_expression.const_cast_derived().coeffRef(index);
}
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index rowId, Index colId) const
{
return m_expression.coeffRef(rowId, colId);
}
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index index) const
{
return m_expression.coeffRef(index);
}
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
void copyCoeff(Index rowId, Index colId, const DenseBase<OtherDerived>& other)
{
OtherDerived& _other = other.const_cast_derived();
eigen_internal_assert(rowId >= 0 && rowId < rows()
&& colId >= 0 && colId < cols());
Scalar tmp = m_expression.coeff(rowId, colId);
m_expression.coeffRef(rowId, colId) = _other.coeff(rowId, colId);
_other.coeffRef(rowId, colId) = tmp;
}
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
void copyCoeff(Index index, const DenseBase<OtherDerived>& other)
{
OtherDerived& _other = other.const_cast_derived();
eigen_internal_assert(index >= 0 && index < m_expression.size());
Scalar tmp = m_expression.coeff(index);
m_expression.coeffRef(index) = _other.coeff(index);
_other.coeffRef(index) = tmp;
}
template<typename OtherDerived, int StoreMode, int LoadMode>
void copyPacket(Index rowId, Index colId, const DenseBase<OtherDerived>& other)
{
OtherDerived& _other = other.const_cast_derived();
eigen_internal_assert(rowId >= 0 && rowId < rows()
&& colId >= 0 && colId < cols());
Packet tmp = m_expression.template packet<StoreMode>(rowId, colId);
m_expression.template writePacket<StoreMode>(rowId, colId,
_other.template packet<LoadMode>(rowId, colId)
);
_other.template writePacket<LoadMode>(rowId, colId, tmp);
}
template<typename OtherDerived, int StoreMode, int LoadMode>
void copyPacket(Index index, const DenseBase<OtherDerived>& other)
{
OtherDerived& _other = other.const_cast_derived();
eigen_internal_assert(index >= 0 && index < m_expression.size());
Packet tmp = m_expression.template packet<StoreMode>(index);
m_expression.template writePacket<StoreMode>(index,
_other.template packet<LoadMode>(index)
);
_other.template writePacket<LoadMode>(index, tmp);
}
EIGEN_DEVICE_FUNC
ExpressionType& expression() const { return m_expression; }
protected:
ExpressionType& m_expression;
protected:
typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, swap_assign_op<typename DstEvaluatorTypeT::Scalar>, BuiltIn> Base;
typedef typename DstEvaluatorTypeT::PacketScalar PacketScalar;
using Base::m_dst;
using Base::m_src;
using Base::m_functor;
public:
typedef typename Base::Scalar Scalar;
typedef typename Base::Index Index;
typedef typename Base::DstXprType DstXprType;
typedef swap_assign_op<Scalar> Functor;
EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr)
: Base(dst, src, func, dstExpr)
{}
template<int StoreMode, int LoadMode>
void assignPacket(Index row, Index col)
{
m_functor.template swapPacket<StoreMode,LoadMode,PacketScalar>(&m_dst.coeffRef(row,col), &const_cast<SrcEvaluatorTypeT&>(m_src).coeffRef(row,col));
}
template<int StoreMode, int LoadMode>
void assignPacket(Index index)
{
m_functor.template swapPacket<StoreMode,LoadMode,PacketScalar>(&m_dst.coeffRef(index), &const_cast<SrcEvaluatorTypeT&>(m_src).coeffRef(index));
}
// TODO find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I mean no CRTP (Gael)
template<int StoreMode, int LoadMode>
void assignPacketByOuterInner(Index outer, Index inner)
{
Index row = Base::rowIndexByOuterInner(outer, inner);
Index col = Base::colIndexByOuterInner(outer, inner);
assignPacket<StoreMode,LoadMode>(row, col);
}
};
} // namespace internal
} // end namespace Eigen
#endif // EIGEN_SWAP_H

View File

@ -2,7 +2,7 @@
// for linear algebra.
//
// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2009-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
@ -29,9 +29,10 @@ namespace Eigen {
namespace internal {
template<typename MatrixType>
struct traits<Transpose<MatrixType> > : traits<MatrixType>
struct traits<Transpose<MatrixType> >
{
typedef typename MatrixType::Scalar Scalar;
typedef typename traits<MatrixType>::Scalar Scalar;
typedef typename traits<MatrixType>::Index Index;
typedef typename nested<MatrixType>::type MatrixTypeNested;
typedef typename remove_reference<MatrixTypeNested>::type MatrixTypeNestedPlain;
typedef typename traits<MatrixType>::StorageKind StorageKind;
@ -45,7 +46,6 @@ struct traits<Transpose<MatrixType> > : traits<MatrixType>
Flags0 = MatrixTypeNestedPlain::Flags & ~(LvalueBit | NestByRefBit),
Flags1 = Flags0 | FlagsLvalueBit,
Flags = Flags1 ^ RowMajorBit,
CoeffReadCost = MatrixTypeNestedPlain::CoeffReadCost,
InnerStrideAtCompileTime = inner_stride_at_compile_time<MatrixType>::ret,
OuterStrideAtCompileTime = outer_stride_at_compile_time<MatrixType>::ret
};
@ -61,9 +61,10 @@ template<typename MatrixType> class Transpose
typedef typename TransposeImpl<MatrixType,typename internal::traits<MatrixType>::StorageKind>::Base Base;
EIGEN_GENERIC_PUBLIC_INTERFACE(Transpose)
typedef typename internal::remove_all<MatrixType>::type NestedExpression;
EIGEN_DEVICE_FUNC
inline Transpose(MatrixType& a_matrix) : m_matrix(a_matrix) {}
explicit inline Transpose(MatrixType& a_matrix) : m_matrix(a_matrix) {}
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Transpose)
@ -100,12 +101,22 @@ struct TransposeImpl_base<MatrixType, false>
} // end namespace internal
// Generic API dispatcher
template<typename XprType, typename StorageKind>
class TransposeImpl
: public internal::generic_xpr_base<Transpose<XprType> >::type
{
public:
typedef typename internal::generic_xpr_base<Transpose<XprType> >::type Base;
};
template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
: public internal::TransposeImpl_base<MatrixType>::type
{
public:
typedef typename internal::TransposeImpl_base<MatrixType>::type Base;
using Base::coeffRef;
EIGEN_DENSE_PUBLIC_INTERFACE(Transpose<MatrixType>)
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TransposeImpl)
@ -118,23 +129,10 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
const Scalar
>::type ScalarWithConstIfNotLvalue;
inline ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); }
inline const Scalar* data() const { return derived().nestedExpression().data(); }
EIGEN_DEVICE_FUNC
inline ScalarWithConstIfNotLvalue& coeffRef(Index rowId, Index colId)
{
EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
return derived().nestedExpression().const_cast_derived().coeffRef(colId, rowId);
}
EIGEN_DEVICE_FUNC
inline ScalarWithConstIfNotLvalue& coeffRef(Index index)
{
EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
return derived().nestedExpression().const_cast_derived().coeffRef(index);
}
EIGEN_DEVICE_FUNC inline ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); }
EIGEN_DEVICE_FUNC inline const Scalar* data() const { return derived().nestedExpression().data(); }
// FIXME: shall we keep the const version of coeffRef?
EIGEN_DEVICE_FUNC
inline const Scalar& coeffRef(Index rowId, Index colId) const
{
@ -146,42 +144,6 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
{
return derived().nestedExpression().coeffRef(index);
}
EIGEN_DEVICE_FUNC
inline CoeffReturnType coeff(Index rowId, Index colId) const
{
return derived().nestedExpression().coeff(colId, rowId);
}
EIGEN_DEVICE_FUNC
inline CoeffReturnType coeff(Index index) const
{
return derived().nestedExpression().coeff(index);
}
template<int LoadMode>
inline const PacketScalar packet(Index rowId, Index colId) const
{
return derived().nestedExpression().template packet<LoadMode>(colId, rowId);
}
template<int LoadMode>
inline void writePacket(Index rowId, Index colId, const PacketScalar& x)
{
derived().nestedExpression().const_cast_derived().template writePacket<LoadMode>(colId, rowId, x);
}
template<int LoadMode>
inline const PacketScalar packet(Index index) const
{
return derived().nestedExpression().template packet<LoadMode>(index);
}
template<int LoadMode>
inline void writePacket(Index index, const PacketScalar& x)
{
derived().nestedExpression().const_cast_derived().template writePacket<LoadMode>(index, x);
}
};
/** \returns an expression of the transpose of *this.
@ -207,7 +169,7 @@ template<typename Derived>
inline Transpose<Derived>
DenseBase<Derived>::transpose()
{
return derived();
return TransposeReturnType(derived());
}
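Since transpose() returns an expression rather than a copy, assigning it back to its own source aliases; a minimal sketch (our own example):
#include <Eigen/Dense>
int main() {
  Eigen::MatrixXi m(2,2);
  m << 1, 2,
       3, 4;
  Eigen::MatrixXi t = m.transpose();  // distinct destination: fine
  // m = m.transpose() would alias and trips the debug-mode aliasing
  // check further below; the in-place variant is the safe spelling:
  m.transposeInPlace();
  return 0;
}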
/** This is the const version of transpose().
@ -245,8 +207,7 @@ template<typename Derived>
inline const typename MatrixBase<Derived>::AdjointReturnType
MatrixBase<Derived>::adjoint() const
{
return this->transpose(); // in the complex case, the .conjugate() is implicit here
// due to implicit conversion to return type
return AdjointReturnType(this->transpose());
}
/***************************************************************************
@ -256,18 +217,39 @@ MatrixBase<Derived>::adjoint() const
namespace internal {
template<typename MatrixType,
bool IsSquare = (MatrixType::RowsAtCompileTime == MatrixType::ColsAtCompileTime) && MatrixType::RowsAtCompileTime!=Dynamic>
bool IsSquare = (MatrixType::RowsAtCompileTime == MatrixType::ColsAtCompileTime) && MatrixType::RowsAtCompileTime!=Dynamic,
bool MatchPacketSize =
(int(MatrixType::RowsAtCompileTime) == int(internal::packet_traits<typename MatrixType::Scalar>::size))
&& (internal::evaluator<MatrixType>::Flags&PacketAccessBit) >
struct inplace_transpose_selector;
template<typename MatrixType>
struct inplace_transpose_selector<MatrixType,true> { // square matrix
struct inplace_transpose_selector<MatrixType,true,false> { // square matrix
static void run(MatrixType& m) {
m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose());
}
};
// TODO: vectorized path is currently limited to LargestPacketSize x LargestPacketSize cases only.
template<typename MatrixType>
struct inplace_transpose_selector<MatrixType,false> { // non square matrix
struct inplace_transpose_selector<MatrixType,true,true> { // PacketSize x PacketSize
static void run(MatrixType& m) {
typedef typename MatrixType::Scalar Scalar;
typedef typename internal::packet_traits<typename MatrixType::Scalar>::type Packet;
typedef typename MatrixType::Index Index;
const Index PacketSize = internal::packet_traits<Scalar>::size;
const Index Alignment = internal::evaluator<MatrixType>::Flags&AlignedBit ? Aligned : Unaligned;
PacketBlock<Packet> A;
for (Index i=0; i<PacketSize; ++i)
A.packet[i] = m.template packetByOuterInner<Alignment>(i,0);
internal::ptranspose(A);
for (Index i=0; i<PacketSize; ++i)
m.template writePacket<Alignment>(m.rowIndexByOuterInner(i,0), m.colIndexByOuterInner(i,0), A.packet[i]);
}
};
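A sketch of a case that can take this register-level path (our own example, assuming SSE where packet_traits<float>::size is 4):
#include <Eigen/Dense>
int main() {
  // A fixed-size PacketSize x PacketSize float matrix qualifies for the
  // PacketBlock/ptranspose path above instead of the triangular swap.
  Eigen::Matrix4f m = Eigen::Matrix4f::Random();
  m.transposeInPlace();
  return 0;
}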
template<typename MatrixType,bool MatchPacketSize>
struct inplace_transpose_selector<MatrixType,false,MatchPacketSize> { // non square matrix
static void run(MatrixType& m) {
if (m.rows()==m.cols())
m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose());
@ -413,15 +395,15 @@ struct checkTransposeAliasing_impl<Derived, OtherDerived, false>
}
};
template<typename Dst, typename Src>
void check_for_aliasing(const Dst &dst, const Src &src)
{
internal::checkTransposeAliasing_impl<Dst, Src>::run(dst, src);
}
} // end namespace internal
template<typename Derived>
template<typename OtherDerived>
void DenseBase<Derived>::checkTransposeAliasing(const OtherDerived& other) const
{
internal::checkTransposeAliasing_impl<Derived, OtherDerived>::run(derived(), other);
}
#endif
#endif // EIGEN_NO_DEBUG
} // end namespace Eigen

View File

@ -240,7 +240,7 @@ class Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndexTyp
typedef typename IndicesType::Scalar StorageIndexType;
typedef typename IndicesType::Index Index;
inline Map(const StorageIndexType* indicesPtr)
explicit inline Map(const StorageIndexType* indicesPtr)
: m_indices(indicesPtr)
{}
@ -299,7 +299,7 @@ class TranspositionsWrapper
typedef typename IndicesType::Scalar StorageIndexType;
typedef typename IndicesType::Index Index;
inline TranspositionsWrapper(IndicesType& a_indices)
explicit inline TranspositionsWrapper(IndicesType& a_indices)
: m_indices(a_indices)
{}
@ -414,7 +414,7 @@ class Transpose<TranspositionsBase<TranspositionsDerived> >
typedef typename TranspositionType::IndicesType IndicesType;
public:
Transpose(const TranspositionType& t) : m_transpositions(t) {}
explicit Transpose(const TranspositionType& t) : m_transpositions(t) {}
inline int size() const { return m_transpositions.size(); }

File diff suppressed because it is too large

View File

@ -48,25 +48,15 @@ struct traits<PartialReduxExpr<MatrixType, MemberOp, Direction> >
ColsAtCompileTime = Direction==Horizontal ? 1 : MatrixType::ColsAtCompileTime,
MaxRowsAtCompileTime = Direction==Vertical ? 1 : MatrixType::MaxRowsAtCompileTime,
MaxColsAtCompileTime = Direction==Horizontal ? 1 : MatrixType::MaxColsAtCompileTime,
Flags0 = (unsigned int)_MatrixTypeNested::Flags & HereditaryBits,
Flags = (Flags0 & ~RowMajorBit) | (RowsAtCompileTime == 1 ? RowMajorBit : 0),
Flags = RowsAtCompileTime == 1 ? RowMajorBit : 0,
TraversalSize = Direction==Vertical ? MatrixType::RowsAtCompileTime : MatrixType::ColsAtCompileTime
};
#if EIGEN_GNUC_AT_LEAST(3,4)
typedef typename MemberOp::template Cost<InputScalar,int(TraversalSize)> CostOpType;
#else
typedef typename MemberOp::template Cost<InputScalar,TraversalSize> CostOpType;
#endif
enum {
CoeffReadCost = TraversalSize==Dynamic ? Dynamic
: TraversalSize * traits<_MatrixTypeNested>::CoeffReadCost + int(CostOpType::value)
};
};
}
template< typename MatrixType, typename MemberOp, int Direction>
class PartialReduxExpr : internal::no_assignment_operator,
public internal::dense_xpr_base< PartialReduxExpr<MatrixType, MemberOp, Direction> >::type
class PartialReduxExpr : public internal::dense_xpr_base< PartialReduxExpr<MatrixType, MemberOp, Direction> >::type,
internal::no_assignment_operator
{
public:
@ -75,7 +65,7 @@ class PartialReduxExpr : internal::no_assignment_operator,
typedef typename internal::traits<PartialReduxExpr>::MatrixTypeNested MatrixTypeNested;
typedef typename internal::traits<PartialReduxExpr>::_MatrixTypeNested _MatrixTypeNested;
PartialReduxExpr(const MatrixType& mat, const MemberOp& func = MemberOp())
explicit PartialReduxExpr(const MatrixType& mat, const MemberOp& func = MemberOp())
: m_matrix(mat), m_functor(func) {}
Index rows() const { return (Direction==Vertical ? 1 : m_matrix.rows()); }
@ -138,7 +128,7 @@ struct member_redux {
>::type result_type;
template<typename _Scalar, int Size> struct Cost
{ enum { value = (Size-1) * functor_traits<BinaryOp>::Cost }; };
member_redux(const BinaryOp func) : m_functor(func) {}
explicit member_redux(const BinaryOp func) : m_functor(func) {}
template<typename Derived>
inline result_type operator()(const DenseBase<Derived>& mat) const
{ return mat.redux(m_functor); }
@ -175,10 +165,10 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
typedef typename internal::remove_all<ExpressionTypeNested>::type ExpressionTypeNestedCleaned;
template<template<typename _Scalar> class Functor,
typename Scalar=typename internal::traits<ExpressionType>::Scalar> struct ReturnType
typename Scalar_=Scalar> struct ReturnType
{
typedef PartialReduxExpr<ExpressionType,
Functor<Scalar>,
Functor<Scalar_>,
Direction
> Type;
};
@ -186,7 +176,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
template<typename BinaryOp> struct ReduxReturnType
{
typedef PartialReduxExpr<ExpressionType,
internal::member_redux<BinaryOp,typename internal::traits<ExpressionType>::Scalar>,
internal::member_redux<BinaryOp,Scalar>,
Direction
> Type;
};
@ -259,7 +249,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
public:
inline VectorwiseOp(ExpressionType& matrix) : m_matrix(matrix) {}
explicit inline VectorwiseOp(ExpressionType& matrix) : m_matrix(matrix) {}
/** \internal */
inline const ExpressionType& _expression() const { return m_matrix; }
@ -274,7 +264,22 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
template<typename BinaryOp>
const typename ReduxReturnType<BinaryOp>::Type
redux(const BinaryOp& func = BinaryOp()) const
{ return typename ReduxReturnType<BinaryOp>::Type(_expression(), func); }
{ return typename ReduxReturnType<BinaryOp>::Type(_expression(), internal::member_redux<BinaryOp,Scalar>(func)); }
typedef typename ReturnType<internal::member_minCoeff>::Type MinCoeffReturnType;
typedef typename ReturnType<internal::member_maxCoeff>::Type MaxCoeffReturnType;
typedef typename ReturnType<internal::member_squaredNorm,RealScalar>::Type SquaredNormReturnType;
typedef typename ReturnType<internal::member_norm,RealScalar>::Type NormReturnType;
typedef typename ReturnType<internal::member_blueNorm,RealScalar>::Type BlueNormReturnType;
typedef typename ReturnType<internal::member_stableNorm,RealScalar>::Type StableNormReturnType;
typedef typename ReturnType<internal::member_hypotNorm,RealScalar>::Type HypotNormReturnType;
typedef typename ReturnType<internal::member_sum>::Type SumReturnType;
typedef typename ReturnType<internal::member_mean>::Type MeanReturnType;
typedef typename ReturnType<internal::member_all>::Type AllReturnType;
typedef typename ReturnType<internal::member_any>::Type AnyReturnType;
typedef PartialReduxExpr<ExpressionType, internal::member_count<Index>, Direction> CountReturnType;
typedef typename ReturnType<internal::member_prod>::Type ProdReturnType;
typedef Reverse<ExpressionType, Direction> ReverseReturnType;
/** \returns a row (or column) vector expression of the smallest coefficient
* of each column (or row) of the referenced expression.
@ -285,8 +290,8 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
* Output: \verbinclude PartialRedux_minCoeff.out
*
* \sa DenseBase::minCoeff() */
const typename ReturnType<internal::member_minCoeff>::Type minCoeff() const
{ return _expression(); }
const MinCoeffReturnType minCoeff() const
{ return MinCoeffReturnType(_expression()); }
/** \returns a row (or column) vector expression of the largest coefficient
* of each column (or row) of the referenced expression.
@ -297,8 +302,8 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
* Output: \verbinclude PartialRedux_maxCoeff.out
*
* \sa DenseBase::maxCoeff() */
const typename ReturnType<internal::member_maxCoeff>::Type maxCoeff() const
{ return _expression(); }
const MaxCoeffReturnType maxCoeff() const
{ return MaxCoeffReturnType(_expression()); }
/** \returns a row (or column) vector expression of the squared norm
* of each column (or row) of the referenced expression.
@ -308,8 +313,8 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
* Output: \verbinclude PartialRedux_squaredNorm.out
*
* \sa DenseBase::squaredNorm() */
const typename ReturnType<internal::member_squaredNorm,RealScalar>::Type squaredNorm() const
{ return _expression(); }
const SquaredNormReturnType squaredNorm() const
{ return SquaredNormReturnType(_expression()); }
/** \returns a row (or column) vector expression of the norm
* of each column (or row) of the referenced expression.
@ -319,8 +324,8 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
* Output: \verbinclude PartialRedux_norm.out
*
* \sa DenseBase::norm() */
const typename ReturnType<internal::member_norm,RealScalar>::Type norm() const
{ return _expression(); }
const NormReturnType norm() const
{ return NormReturnType(_expression()); }
/** \returns a row (or column) vector expression of the norm
@ -329,8 +334,8 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
* This is a vector with real entries, even if the original matrix has complex entries.
*
* \sa DenseBase::blueNorm() */
const typename ReturnType<internal::member_blueNorm,RealScalar>::Type blueNorm() const
{ return _expression(); }
const BlueNormReturnType blueNorm() const
{ return BlueNormReturnType(_expression()); }
/** \returns a row (or column) vector expression of the norm
@ -339,8 +344,8 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
* This is a vector with real entries, even if the original matrix has complex entries.
*
* \sa DenseBase::stableNorm() */
const typename ReturnType<internal::member_stableNorm,RealScalar>::Type stableNorm() const
{ return _expression(); }
const StableNormReturnType stableNorm() const
{ return StableNormReturnType(_expression()); }
/** \returns a row (or column) vector expression of the norm
@ -349,8 +354,8 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
* This is a vector with real entries, even if the original matrix has complex entries.
*
* \sa DenseBase::hypotNorm() */
const typename ReturnType<internal::member_hypotNorm,RealScalar>::Type hypotNorm() const
{ return _expression(); }
const HypotNormReturnType hypotNorm() const
{ return HypotNormReturnType(_expression()); }
/** \returns a row (or column) vector expression of the sum
* of each column (or row) of the referenced expression.
@ -359,31 +364,31 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
* Output: \verbinclude PartialRedux_sum.out
*
* \sa DenseBase::sum() */
const typename ReturnType<internal::member_sum>::Type sum() const
{ return _expression(); }
const SumReturnType sum() const
{ return SumReturnType(_expression()); }
/** \returns a row (or column) vector expression of the mean
* of each column (or row) of the referenced expression.
*
* \sa DenseBase::mean() */
const typename ReturnType<internal::member_mean>::Type mean() const
{ return _expression(); }
const MeanReturnType mean() const
{ return MeanReturnType(_expression()); }
/** \returns a row (or column) vector expression representing
* whether \b all coefficients of each respective column (or row) are \c true.
* This expression can be assigned to a vector with entries of type \c bool.
*
* \sa DenseBase::all() */
const typename ReturnType<internal::member_all>::Type all() const
{ return _expression(); }
const AllReturnType all() const
{ return AllReturnType(_expression()); }
/** \returns a row (or column) vector expression representing
* whether \b at \b least one coefficient of each respective column (or row) is \c true.
* This expression can be assigned to a vector with entries of type \c bool.
*
* \sa DenseBase::any() */
const typename ReturnType<internal::member_any>::Type any() const
{ return _expression(); }
const AnyReturnType any() const
{ return AnyReturnType(_expression()); }
/** \returns a row (or column) vector expression representing
* the number of \c true coefficients of each respective column (or row).
@ -394,8 +399,8 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
* Output: \verbinclude PartialRedux_count.out
*
* \sa DenseBase::count() */
const PartialReduxExpr<ExpressionType, internal::member_count<Index>, Direction> count() const
{ return _expression(); }
const CountReturnType count() const
{ return CountReturnType(_expression()); }
/** \returns a row (or column) vector expression of the product
* of each column (or row) of the referenced expression.
@ -404,8 +409,8 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
* Output: \verbinclude PartialRedux_prod.out
*
* \sa DenseBase::prod() */
const typename ReturnType<internal::member_prod>::Type prod() const
{ return _expression(); }
const ProdReturnType prod() const
{ return ProdReturnType(_expression()); }
/** \returns a matrix expression
@ -415,8 +420,8 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
* Output: \verbinclude Vectorwise_reverse.out
*
* \sa DenseBase::reverse() */
const Reverse<ExpressionType, Direction> reverse() const
{ return Reverse<ExpressionType, Direction>( _expression() ); }
const ReverseReturnType reverse() const
{ return ReverseReturnType( _expression() ); }
typedef Replicate<ExpressionType,Direction==Vertical?Dynamic:1,Direction==Horizontal?Dynamic:1> ReplicateReturnType;
const ReplicateReturnType replicate(Index factor) const;
@ -560,7 +565,8 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
/////////// Geometry module ///////////
Homogeneous<ExpressionType,Direction> homogeneous() const;
typedef Homogeneous<ExpressionType,Direction> HomogeneousReturnType;
HomogeneousReturnType homogeneous() const;
typedef typename ExpressionType::PlainObject CrossReturnType;
template<typename OtherDerived>
@ -605,7 +611,7 @@ template<typename Derived>
inline const typename DenseBase<Derived>::ConstColwiseReturnType
DenseBase<Derived>::colwise() const
{
return derived();
return ConstColwiseReturnType(derived());
}
/** \returns a writable VectorwiseOp wrapper of *this providing additional partial reduction operations
@ -616,7 +622,7 @@ template<typename Derived>
inline typename DenseBase<Derived>::ColwiseReturnType
DenseBase<Derived>::colwise()
{
return derived();
return ColwiseReturnType(derived());
}
/** \returns a VectorwiseOp wrapper of *this providing additional partial reduction operations
@ -630,7 +636,7 @@ template<typename Derived>
inline const typename DenseBase<Derived>::ConstRowwiseReturnType
DenseBase<Derived>::rowwise() const
{
return derived();
return ConstRowwiseReturnType(derived());
}
/** \returns a writable VectorwiseOp wrapper of *this providing additional partial reduction operations
@ -641,7 +647,7 @@ template<typename Derived>
inline typename DenseBase<Derived>::RowwiseReturnType
DenseBase<Derived>::rowwise()
{
return derived();
return RowwiseReturnType(derived());
}
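A minimal sketch tying these wrappers together (our own example; each partial reduction returns a PartialReduxExpr that is evaluated on use):
#include <Eigen/Dense>
#include <iostream>
int main() {
  Eigen::MatrixXd m = Eigen::MatrixXd::Random(3,4);
  Eigen::RowVectorXd colSums = m.colwise().sum();      // one sum per column
  Eigen::VectorXd    rowMins = m.rowwise().minCoeff(); // one min per row
  std::cout << colSums << "\n" << rowMins.transpose() << "\n";
  return 0;
}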
} // end namespace Eigen

View File

@ -53,6 +53,33 @@ struct visitor_impl<Visitor, Derived, Dynamic>
}
};
// evaluator adaptor
template<typename XprType>
class visitor_evaluator
{
public:
explicit visitor_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {}
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
enum {
RowsAtCompileTime = XprType::RowsAtCompileTime,
CoeffReadCost = internal::evaluator<XprType>::CoeffReadCost
};
Index rows() const { return m_xpr.rows(); }
Index cols() const { return m_xpr.cols(); }
Index size() const { return m_xpr.size(); }
CoeffReturnType coeff(Index row, Index col) const
{ return m_evaluator.coeff(row, col); }
protected:
typename internal::evaluator<XprType>::nestedType m_evaluator;
const XprType &m_xpr;
};
} // end namespace internal
/** Applies the visitor \a visitor to the whole coefficients of the matrix or vector.
@ -76,14 +103,17 @@ template<typename Derived>
template<typename Visitor>
void DenseBase<Derived>::visit(Visitor& visitor) const
{
enum { unroll = SizeAtCompileTime != Dynamic
&& CoeffReadCost != Dynamic
&& (SizeAtCompileTime == 1 || internal::functor_traits<Visitor>::Cost != Dynamic)
&& SizeAtCompileTime * CoeffReadCost + (SizeAtCompileTime-1) * internal::functor_traits<Visitor>::Cost
<= EIGEN_UNROLLING_LIMIT };
return internal::visitor_impl<Visitor, Derived,
typedef typename internal::visitor_evaluator<Derived> ThisEvaluator;
ThisEvaluator thisEval(derived());
enum { unroll = SizeAtCompileTime != Dynamic
&& ThisEvaluator::CoeffReadCost != Dynamic
&& (SizeAtCompileTime == 1 || internal::functor_traits<Visitor>::Cost != Dynamic)
&& SizeAtCompileTime * ThisEvaluator::CoeffReadCost + (SizeAtCompileTime-1) * internal::functor_traits<Visitor>::Cost
<= EIGEN_UNROLLING_LIMIT };
return internal::visitor_impl<Visitor, ThisEvaluator,
unroll ? int(SizeAtCompileTime) : Dynamic
>::run(derived(), visitor);
>::run(thisEval, visitor);
}
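A minimal visitor sketch (the struct is our own; visit() calls init() on the first coefficient and operator() on every other one):
#include <Eigen/Dense>
#include <cmath>
#include <iostream>
struct AbsMaxVisitor {
  double absMax;
  void init(const double& value, Eigen::DenseIndex, Eigen::DenseIndex)
  { absMax = std::abs(value); }
  void operator()(const double& value, Eigen::DenseIndex, Eigen::DenseIndex)
  { if (std::abs(value) > absMax) absMax = std::abs(value); }
};
int main() {
  Eigen::MatrixXd m = Eigen::MatrixXd::Random(3,3);
  AbsMaxVisitor visitor;
  m.visit(visitor);
  std::cout << visitor.absMax << "\n";
  return 0;
}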
namespace internal {

View File

@ -22,9 +22,9 @@ namespace internal {
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
#endif
#ifdef EIGEN_VECTORIZE_FMA
#ifndef EIGEN_HAS_FUSED_MADD
#define EIGEN_HAS_FUSED_MADD 1
#ifdef __FMA__
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
#endif
@ -137,7 +137,7 @@ template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, co
#ifdef EIGEN_VECTORIZE_FMA
template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
#if defined(__clang__) || defined(__GNUC__)
#if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG
// clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
// and gcc stupidly generates a vfmadd132ps instruction,
// so let's force it to generate a vfmadd231ps instruction since the most common use case is to accumulate
@ -150,7 +150,7 @@ template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f&
#endif
}
template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
#if defined(__clang__) || defined(__GNUC__)
#if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG
// see above
Packet4d res = c;
__asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));

View File

@ -7,23 +7,21 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_COMPLEX_ALTIVEC_H
#define EIGEN_COMPLEX_ALTIVEC_H
#ifndef EIGEN_COMPLEX32_ALTIVEC_H
#define EIGEN_COMPLEX32_ALTIVEC_H
namespace Eigen {
namespace internal {
static Packet4ui p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_ZERO_);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
static Packet16uc p16uc_COMPLEX_RE = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
static Packet16uc p16uc_COMPLEX_IM = vec_sld(p16uc_DUPLICATE, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
static Packet16uc p16uc_COMPLEX_REV = vec_sld(p16uc_REVERSE, p16uc_REVERSE, 8);//{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
static Packet16uc p16uc_COMPLEX_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8);//{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
static Packet16uc p16uc_PSET_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_COMPLEX_RE, (Packet4ui)p16uc_COMPLEX_IM);//{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
static Packet16uc p16uc_PSET_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_COMPLEX_RE, (Packet4ui)p16uc_COMPLEX_IM);//{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
static Packet16uc p16uc_COMPLEX_MASK16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8);//{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
static Packet16uc p16uc_COMPLEX_TRANSPOSE_0 = vec_add(p16uc_PSET_HI, p16uc_COMPLEX_MASK16);//{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
static Packet16uc p16uc_COMPLEX_TRANSPOSE_1 = vec_add(p16uc_PSET_LO, p16uc_COMPLEX_MASK16);//{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
#ifdef _BIG_ENDIAN
static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 };
#else
static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 };
static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
#endif
//---------- float ----------
struct Packet2cf
@ -65,7 +63,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<flo
res.v = pload<Packet4f>((const float *)&from);
else
res.v = ploadu<Packet4f>((const float *)&from);
res.v = vec_perm(res.v, res.v, p16uc_PSET_HI);
res.v = vec_perm(res.v, res.v, p16uc_PSET64_HI);
return res;
}
@ -95,16 +93,16 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con
Packet4f v1, v2;
// Permute and multiply the real parts of a and b
v1 = vec_perm(a.v, a.v, p16uc_COMPLEX_RE);
v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD);
// Get the imaginary parts of a
v2 = vec_perm(a.v, a.v, p16uc_COMPLEX_IM);
v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN);
// multiply a_re * b
v1 = vec_madd(v1, b.v, p4f_ZERO);
// multiply a_im * b and get the conjugate result
v2 = vec_madd(v2, b.v, p4f_ZERO);
v2 = (Packet4f) vec_xor((Packet4ui)v2, p4ui_CONJ_XOR);
// permute back to a proper order
v2 = vec_perm(v2, v2, p16uc_COMPLEX_REV);
v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV);
return Packet2cf(vec_add(v1, v2));
}
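A scalar model of one lane of the vectorized product above (our own sketch; pmul_model is a hypothetical name): v1 holds a_re * b, v2 holds a_im * b with its imaginary half sign-flipped by CONJ_XOR and the pair swapped by COMPLEX32_REV, and their sum is a*b.
#include <complex>
static std::complex<float> pmul_model(std::complex<float> a, std::complex<float> b) {
  float v1_re = a.real() * b.real();
  float v1_im = a.real() * b.imag();
  float v2_re = -(a.imag() * b.imag()); // sign flipped, then swapped into the real slot
  float v2_im =   a.imag() * b.real();  // swapped into the imaginary slot
  return std::complex<float>(v1_re + v2_re, v1_im + v2_im);
}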
@ -138,7 +136,7 @@ template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Pack
template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
{
Packet4f rev_a;
rev_a = vec_perm(a.v, a.v, p16uc_COMPLEX_REV2);
rev_a = vec_perm(a.v, a.v, p16uc_COMPLEX32_REV2);
return Packet2cf(rev_a);
}
@ -153,9 +151,13 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packe
template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
{
Packet4f b1, b2;
#ifdef _BIG_ENDIAN
b1 = (Packet4f) vec_sld(vecs[0].v, vecs[1].v, 8);
b2 = (Packet4f) vec_sld(vecs[1].v, vecs[0].v, 8);
#else
b1 = (Packet4f) vec_sld(vecs[1].v, vecs[0].v, 8);
b2 = (Packet4f) vec_sld(vecs[0].v, vecs[1].v, 8);
#endif
b2 = (Packet4f) vec_sld(b2, b2, 8);
b2 = padd(b1, b2);
@ -179,7 +181,11 @@ struct palign_impl<Offset,Packet2cf>
{
if (Offset==1)
{
#ifdef _BIG_ENDIAN
first.v = vec_sld(first.v, second.v, 8);
#else
first.v = vec_sld(second.v, first.v, 8);
#endif
}
}
};
@ -222,23 +228,203 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, con
// TODO optimize it for AltiVec
Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
Packet4f s = vec_madd(b.v, b.v, p4f_ZERO);
return Packet2cf(pdiv(res.v, vec_add(s,vec_perm(s, s, p16uc_COMPLEX_REV))));
return Packet2cf(pdiv(res.v, vec_add(s,vec_perm(s, s, p16uc_COMPLEX32_REV))));
}
template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x)
{
return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX_REV));
return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX32_REV));
}
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
{
Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_COMPLEX_TRANSPOSE_0);
kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_COMPLEX_TRANSPOSE_1);
Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
kernel.packet[0].v = tmp;
}
//---------- double ----------
#ifdef __VSX__
struct Packet1cd
{
EIGEN_STRONG_INLINE Packet1cd() {}
EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
Packet2d v;
};
template<> struct packet_traits<std::complex<double> > : default_packet_traits
{
typedef Packet1cd type;
typedef Packet1cd half;
enum {
Vectorizable = 1,
AlignedOnScalar = 0,
size = 1,
HasHalfPacket = 0,
HasAdd = 1,
HasSub = 1,
HasMul = 1,
HasDiv = 1,
HasNegate = 1,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
HasMax = 0,
HasSetLinear = 0
};
};
template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; typedef Packet1cd half; };
template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from)
{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, DenseIndex stride)
{
std::complex<double> EIGEN_ALIGN16 af[2];
af[0] = from[0*stride];
af[1] = from[1*stride];
return pload<Packet1cd>(af);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, DenseIndex stride)
{
std::complex<double> EIGEN_ALIGN16 af[2];
pstore<std::complex<double> >(af, from);
to[0*stride] = af[0];
to[1*stride] = af[1];
}
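Both pgather and pscatter above stage the strided scalars through a small 16-byte-aligned buffer so the packet itself moves with a single aligned load or store. A portable sketch of the same pattern (hypothetical helper, not Eigen API):

#include <cstddef>

template <typename T, int N>
void gather_strided(const T* from, std::ptrdiff_t stride, T (&packet)[N])
{
  alignas(16) T buf[N];             // plays the role of the EIGEN_ALIGN16 af[] buffer
  for (int i = 0; i < N; ++i)
    buf[i] = from[i * stride];      // scalar strided reads
  for (int i = 0; i < N; ++i)
    packet[i] = buf[i];             // stands in for the single aligned pload
}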
template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_add(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_sub(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2)); }
template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
Packet2d a_re, a_im, v1, v2;
// Permute and multiply the real parts of a and b
a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI);
// Get the imaginary parts of a
a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO);
// multiply a_re * b
v1 = vec_madd(a_re, b.v, p2d_ZERO);
// multiply a_im * b and get the conjugate result
v2 = vec_madd(a_im, b.v, p2d_ZERO);
v2 = (Packet2d) vec_sld((Packet4ui)v2, (Packet4ui)v2, 8);
v2 = (Packet2d) vec_xor((Packet2d)v2, (Packet2d) p2ul_CONJ_XOR1);
return Packet1cd(vec_add(v1, v2));
}
template<> EIGEN_STRONG_INLINE Packet1cd pand <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd por <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); }
template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from)
{
return pset1<Packet1cd>(*from);
}
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { vec_dstt((long *)addr, DST_CTRL(2,2,32), DST_CHAN); }
template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
{
std::complex<double> EIGEN_ALIGN16 res[2];
pstore<std::complex<double> >(res, a);
return res[0];
}
template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a)
{
return pfirst(a);
}
template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)
{
return vecs[0];
}
template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
{
return pfirst(a);
}
template<int Offset>
struct palign_impl<Offset,Packet1cd>
{
static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
{
// FIXME is it sure we never have to align a Packet1cd?
// Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16-byte boundary...
}
};
template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
{
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
{
return internal::pmul(a, pconj(b));
}
};
template<> struct conj_helper<Packet1cd, Packet1cd, true,false>
{
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
{
return internal::pmul(pconj(a), b);
}
};
template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
{
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
{
return pconj(internal::pmul(a, b));
}
};
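The three conj_helper specializations select which operand gets conjugated before the product, and pmadd is simply that product added to the accumulator. In scalar terms (illustrative):

#include <complex>

typedef std::complex<double> cd;
cd pmul_fc(cd a, cd b) { return a * std::conj(b); }  // conj_helper<..., false, true>
cd pmul_cf(cd a, cd b) { return std::conj(a) * b; }  // conj_helper<..., true, false>
cd pmul_cc(cd a, cd b) { return std::conj(a * b); }  // conj_helper<..., true, true>:
                                                     // conj(a)*conj(b) == conj(a*b)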
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
// TODO optimize it for AltiVec
Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_);
return Packet1cd(pdiv(res.v, vec_add(s,vec_perm(s, s, p16uc_COMPLEX32_REV))));
}
EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
{
return Packet1cd(preverse(Packet2d(x.v)));
}
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
{
Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
kernel.packet[0].v = tmp;
}
#endif // __VSX__
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_COMPLEX_ALTIVEC_H
#endif // EIGEN_COMPLEX32_ALTIVEC_H

View File

@ -18,17 +18,17 @@ namespace internal {
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
#endif
#ifndef EIGEN_HAS_FUSED_MADD
#define EIGEN_HAS_FUSED_MADD 1
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
#ifndef EIGEN_HAS_FUSE_CJMADD
#define EIGEN_HAS_FUSE_CJMADD 1
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
#endif
// NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
#endif
typedef __vector float Packet4f;
@ -50,22 +50,20 @@ typedef __vector unsigned char Packet16uc;
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
Packet4f p4f_##NAME = pset1<Packet4f>(X)
#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int>(X))
#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
Packet4i p4i_##NAME = pset1<Packet4i>(X)
#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
Packet2d p2d_##NAME = pset1<Packet2d>(X)
#define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \
Packet2l p2l_##NAME = pset1<Packet2l>(X)
#define DST_CHAN 1
#define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
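A worked example of the control word built by DST_CTRL for the vec_dst* prefetches, assuming the usual AltiVec data-stream encoding (block size in bits 24-28, block count in bits 16-23, byte stride in the low 16 bits):

// DST_CTRL(2,2,32) == (2 << 24) | (2 << 16) | 32 == 0x02020020
static_assert(((2 << 24) | (2 << 16) | 32) == 0x02020020,
              "the (size, count, stride) fields land in disjoint bit ranges");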
// Define global static constants:
static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
static Packet16uc p16uc_REVERSE = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0); //{ 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}
static Packet16uc p16uc_DUPLICATE = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7};
// These constants are endian-agnostic
static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1}
@ -74,6 +72,50 @@ static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 };
static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
// Mask alignment
#ifdef __PPC64__
#define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0
#else
#define _EIGEN_MASK_ALIGNMENT 0xfffffff0
#endif
#define _EIGEN_ALIGNED_PTR(x) ((ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)
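_EIGEN_ALIGNED_PTR rounds an address down to a 16-byte boundary; together with the "(long)from & 15" offset used by the VSX loads and stores below, it splits any pointer into an aligned base plus a small byte offset. A minimal check (plain C++, illustrative):

#include <cassert>
#include <cstdint>

int main()
{
  alignas(16) unsigned char buf[32];
  unsigned char* p = buf + 5;                                     // deliberately misaligned
  std::uintptr_t base = (std::uintptr_t)p & ~(std::uintptr_t)15;  // _EIGEN_ALIGNED_PTR(p)
  std::uintptr_t off  = (std::uintptr_t)p & 15;                   // the "(long)p & 15" part
  assert(base % 16 == 0 && base + off == (std::uintptr_t)p);
}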
// Handle endianness properly while loading constants
// Define global static constants:
#ifdef _BIG_ENDIAN
static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
#else
static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO, 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
#endif // _BIG_ENDIAN
static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
static Packet16uc p16uc_TRANSPOSE64_HI = vec_add(p16uc_PSET64_HI, p16uc_HALF64_0_16); //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0_16); //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
#ifdef _BIG_ENDIAN
static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
#else
static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_PSET64_HI, p16uc_PSET64_LO, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
#endif // _BIG_ENDIAN
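All of the p16uc_* constants above are vec_perm selectors: each of the 16 result bytes picks one byte out of the 32-byte concatenation of the two source vectors (indices 0-15 from the first, 16-31 from the second). A scalar model, using the TRANSPOSE64_HI selector from the comment above (illustrative):

#include <array>

std::array<unsigned char, 16> perm(const std::array<unsigned char, 16>& a,
                                   const std::array<unsigned char, 16>& b,
                                   const std::array<unsigned char, 16>& sel)
{
  std::array<unsigned char, 16> r = {};
  for (int i = 0; i < 16; ++i)
    r[i] = sel[i] < 16 ? a[sel[i]] : b[sel[i] - 16];
  return r;
}
// With sel = p16uc_TRANSPOSE64_HI = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23},
// the result is bytes 0-7 of a followed by bytes 0-7 of b, i.e. element 0 of each
// Packet2d -- the first row of the 2x2 transpose used by ptranspose below.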
template<> struct packet_traits<float> : default_packet_traits
{
typedef Packet4f type;
@ -105,9 +147,22 @@ template<> struct packet_traits<int> : default_packet_traits
};
};
template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; typedef Packet4f half; };
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; typedef Packet4i half; };
/*
inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v)
{
union {
Packet16uc v;
unsigned char n[16];
} vt;
vt.v = v;
for (int i=0; i< 16; i++)
s << (int)vt.n[i] << ", ";
return s;
}
inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
{
union {
@ -140,7 +195,7 @@ inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
return s;
}
/*
inline std::ostream & operator <<(std::ostream & s, const Packetbi & v)
{
union {
@ -150,14 +205,21 @@ inline std::ostream & operator <<(std::ostream & s, const Packetbi & v)
vt.v = v;
s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
return s;
}
*/
}*/
// Need to define them first or we get specialization after instantiation errors
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
// Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
float EIGEN_ALIGN16 af[4];
af[0] = from;
Packet4f vc = vec_ld(0, af);
Packet4f vc = pload<Packet4f>(af);
vc = vec_splat(vc, 0);
return vc;
}
@ -165,17 +227,15 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
int EIGEN_ALIGN16 ai[4];
ai[0] = from;
Packet4i vc = vec_ld(0, ai);
Packet4i vc = pload<Packet4i>(ai);
vc = vec_splat(vc, 0);
return vc;
}
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet4f>(const float *a,
Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
{
a3 = vec_ld(0,a);
a3 = pload<Packet4f>(a);
a0 = vec_splat(a3, 0);
a1 = vec_splat(a3, 1);
a2 = vec_splat(a3, 2);
@ -185,7 +245,7 @@ template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet4i>(const int *a,
Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
{
a3 = vec_ld(0,a);
a3 = pload<Packet4i>(a);
a0 = vec_splat(a3, 0);
a1 = vec_splat(a3, 1);
a2 = vec_splat(a3, 2);
@ -199,7 +259,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const floa
af[1] = from[1*stride];
af[2] = from[2*stride];
af[3] = from[3*stride];
return vec_ld(0, af);
return pload<Packet4f>(af);
}
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, DenseIndex stride)
{
@ -208,12 +268,12 @@ template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* f
ai[1] = from[1*stride];
ai[2] = from[2*stride];
ai[3] = from[3*stride];
return vec_ld(0, ai);
return pload<Packet4i>(ai);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, DenseIndex stride)
{
float EIGEN_ALIGN16 af[4];
vec_st(from, 0, af);
pstore<float>(af, from);
to[0*stride] = af[0];
to[1*stride] = af[1];
to[2*stride] = af[2];
@ -222,7 +282,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, co
template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, DenseIndex stride)
{
int EIGEN_ALIGN16 ai[4];
vec_st(from, 0, ai);
pstore<int>((int *)ai, from);
to[0*stride] = ai[0];
to[1*stride] = ai[1];
to[2*stride] = ai[2];
@ -283,7 +343,8 @@ template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const
*/
template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
{
Packet4f t, y_0, y_1, res;
#ifndef __VSX__ // VSX actually provides a div instruction
Packet4f t, y_0, y_1;
// Altivec does not offer a divide instruction, we have to do a reciprocal approximation
y_0 = vec_re(b);
@ -292,8 +353,10 @@ template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const
t = vec_nmsub(y_0, b, p4f_ONE);
y_1 = vec_madd(y_0, t, y_0);
res = vec_madd(a, y_1, p4f_ZERO);
return res;
return vec_madd(a, y_1, p4f_ZERO);
#else
return vec_div(a, b);
#endif
}
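The non-VSX branch above refines the roughly 12-bit vec_re reciprocal estimate with one Newton-Raphson step, y1 = y0 + y0*(1 - b*y0), before multiplying by a. A scalar sketch of the same refinement (illustrative):

#include <cassert>
#include <cmath>

float divide_nr(float a, float b, float y0)  // y0: rough estimate of 1/b
{
  float t  = 1.0f - b * y0;   // t   = vec_nmsub(y_0, b, p4f_ONE)
  float y1 = y0 + y0 * t;     // y_1 = vec_madd(y_0, t, y_0)
  return a * y1;              // vec_madd(a, y_1, p4f_ZERO)
}

int main()
{
  float r = divide_nr(1.0f, 3.0f, 0.33f);      // crude seed, one refinement
  assert(std::fabs(r - 1.0f / 3.0f) < 1e-4f);  // the error roughly squares per step
}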
template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
@ -311,7 +374,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const
template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); }
template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
@ -324,13 +386,10 @@ template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const
template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); }
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
#ifdef _BIG_ENDIAN
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
{
EIGEN_DEBUG_ALIGNED_LOAD
// Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
Packet16uc MSQ, LSQ;
Packet16uc mask;
MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword
@ -350,25 +409,36 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
mask = vec_lvsl(0, from); // create the permute mask
return (Packet4i) vec_perm(MSQ, LSQ, mask); // align the data
}
#else
// We also need to redefine little endian loading of Packet4i/Packet4f using VSX
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
{
EIGEN_DEBUG_UNALIGNED_LOAD
return (Packet4i) vec_vsx_ld((long)from & 15, (const int*) _EIGEN_ALIGNED_PTR(from));
}
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
{
EIGEN_DEBUG_UNALIGNED_LOAD
return (Packet4f) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from));
}
#endif
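The big-endian path reads the two aligned quadwords straddling `from` (aligned vec_ld never faults) and merges them with a vec_lvsl-derived permute, while the VSX path hands vec_vsx_ld the offset/aligned-base pair directly. A scalar model of the big-endian merge (illustrative, not Eigen API):

#include <cstdint>

void load_unaligned16(const unsigned char* from, unsigned char out[16])
{
  std::uintptr_t p = (std::uintptr_t)from;
  const unsigned char* msq = (const unsigned char*)(p & ~(std::uintptr_t)15);        // vec_ld(0, from)
  const unsigned char* lsq = (const unsigned char*)((p + 15) & ~(std::uintptr_t)15); // vec_ld(15, from)
  unsigned off = (unsigned)(p & 15);   // this is what the vec_lvsl mask encodes
  for (int i = 0; i < 16; ++i)         // vec_perm(MSQ, LSQ, mask)
    out[i] = off + i < 16 ? msq[off + i] : lsq[off + i - 16];
}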
template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
{
Packet4f p;
if((ptrdiff_t(from) % 16) == 0) p = pload<Packet4f>(from);
else p = ploadu<Packet4f>(from);
return vec_perm(p, p, p16uc_DUPLICATE);
return vec_perm(p, p, p16uc_DUPLICATE32_HI);
}
template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
{
Packet4i p;
if((ptrdiff_t(from) % 16) == 0) p = pload<Packet4i>(from);
else p = ploadu<Packet4i>(from);
return vec_perm(p, p, p16uc_DUPLICATE);
return vec_perm(p, p, p16uc_DUPLICATE32_HI);
}
template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
#ifdef _BIG_ENDIAN
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
{
EIGEN_DEBUG_UNALIGNED_STORE
@ -405,15 +475,30 @@ template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& f
vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first
vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part
}
#else
// We also need to redefine little endian storing of Packet4i/Packet4f using VSX
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from)
{
EIGEN_DEBUG_UNALIGNED_STORE
vec_vsx_st(from, (long)to & 15, (int*) _EIGEN_ALIGNED_PTR(to));
}
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
{
EIGEN_DEBUG_UNALIGNED_STORE
vec_vsx_st(from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to));
}
#endif
#ifndef __VSX__
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); }
#endif
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return (Packet4f)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE); }
template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return (Packet4i)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE); }
template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return (Packet4f)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE32); }
template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return (Packet4i)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE32); }
template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); }
template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); }
@ -460,7 +545,11 @@ template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
{
Packet4i sum;
sum = vec_sums(a, p4i_ZERO);
#ifdef _BIG_ENDIAN
sum = vec_sld(sum, p4i_ZERO, 12);
#else
sum = vec_sld(p4i_ZERO, sum, 4);
#endif
return pfirst(sum);
}
@ -547,8 +636,25 @@ struct palign_impl<Offset,Packet4f>
{
static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
{
if (Offset!=0)
first = vec_sld(first, second, Offset*4);
#ifdef _BIG_ENDIAN
switch (Offset % 4) {
case 1:
first = vec_sld(first, second, 4); break;
case 2:
first = vec_sld(first, second, 8); break;
case 3:
first = vec_sld(first, second, 12); break;
}
#else
switch (Offset % 4) {
case 1:
first = vec_sld(second, first, 12); break;
case 2:
first = vec_sld(second, first, 8); break;
case 3:
first = vec_sld(second, first, 4); break;
}
#endif
}
};
@ -557,8 +663,25 @@ struct palign_impl<Offset,Packet4i>
{
static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
{
if (Offset!=0)
first = vec_sld(first, second, Offset*4);
#ifdef _BIG_ENDIAN
switch (Offset % 4) {
case 1:
first = vec_sld(first, second, 4); break;
case 2:
first = vec_sld(first, second, 8); break;
case 3:
first = vec_sld(first, second, 12); break;
}
#else
switch (Offset % 4) {
case 1:
first = vec_sld(second, first, 12); break;
case 2:
first = vec_sld(second, first, 8); break;
case 3:
first = vec_sld(second, first, 4); break;
}
#endif
}
};
@ -588,6 +711,222 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) {
kernel.packet[3] = vec_mergel(t1, t3);
}
//---------- double ----------
#ifdef __VSX__
typedef __vector double Packet2d;
typedef __vector unsigned long long Packet2ul;
typedef __vector long long Packet2l;
static Packet2l p2l_ZERO = (Packet2l) p4i_ZERO;
static Packet2d p2d_ONE = { 1.0, 1.0 };
static Packet2d p2d_ZERO = (Packet2d) p4f_ZERO;
static Packet2d p2d_ZERO_ = { -0.0, -0.0 };
#ifdef _BIG_ENDIAN
static Packet2d p2d_COUNTDOWN = (Packet2d) vec_sld((Packet16uc) p2d_ZERO, (Packet16uc) p2d_ONE, 8);
#else
static Packet2d p2d_COUNTDOWN = (Packet2d) vec_sld((Packet16uc) p2d_ONE, (Packet16uc) p2d_ZERO, 8);
#endif
static EIGEN_STRONG_INLINE Packet2d vec_splat_dbl(Packet2d& a, int index)
{
switch (index) {
case 0:
return (Packet2d) vec_perm(a, a, p16uc_PSET64_HI);
case 1:
return (Packet2d) vec_perm(a, a, p16uc_PSET64_LO);
}
return a;
}
template<> struct packet_traits<double> : default_packet_traits
{
typedef Packet2d type;
typedef Packet2d half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size=2,
HasHalfPacket = 0,
HasDiv = 1,
HasExp = 0,
HasSqrt = 0
};
};
template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; typedef Packet2d half; };
inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
{
union {
Packet2d v;
double n[2];
} vt;
vt.v = v;
s << vt.n[0] << ", " << vt.n[1];
return s;
}
// Need to define them first or we get specialization after instantiation errors
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return (Packet2d) vec_ld(0, (const float *) from); } //FIXME
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st((Packet4f)from, 0, (float *)to); }
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
double EIGEN_ALIGN16 af[2];
af[0] = from;
Packet2d vc = pload<Packet2d>(af);
vc = vec_splat_dbl(vc, 0);
return vc;
}
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet2d>(const double *a,
Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
{
a1 = pload<Packet2d>(a);
a0 = vec_splat_dbl(a1, 0);
a1 = vec_splat_dbl(a1, 1);
a3 = pload<Packet2d>(a+2);
a2 = vec_splat_dbl(a3, 0);
a3 = vec_splat_dbl(a3, 1);
}
template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, DenseIndex stride)
{
double EIGEN_ALIGN16 af[2];
af[0] = from[0*stride];
af[1] = from[1*stride];
return pload<Packet2d>(af);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, DenseIndex stride)
{
double EIGEN_ALIGN16 af[2];
pstore<double>(af, from);
to[0*stride] = af[0];
to[1*stride] = af[1];
}
template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a) { return vec_add(pset1<Packet2d>(a), p2d_COUNTDOWN); }
template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_add(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_sub(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return psub<Packet2d>(p2d_ZERO, a); }
template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_madd(a,b,p2d_ZERO); }
template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_div(a,b); }
// for some weird reasons, it has to be overloaded for packets of integers
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }
template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); }
template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
{
EIGEN_DEBUG_UNALIGNED_LOAD
return (Packet2d) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from));
}
template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
{
Packet2d p;
if((ptrdiff_t(from) % 16) == 0) p = pload<Packet2d>(from);
else p = ploadu<Packet2d>(from);
return vec_perm(p, p, p16uc_PSET64_HI);
}
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from)
{
EIGEN_DEBUG_UNALIGNED_STORE
vec_vsx_st((Packet4f)from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to));
}
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { vec_dstt((const float *) addr, DST_CTRL(2,2,32), DST_CHAN); }
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; }
template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return (Packet2d)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE64); }
template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); }
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
{
Packet2d b, sum;
b = (Packet2d) vec_sld((Packet4ui) a, (Packet4ui)a, 8);
sum = vec_add(a, b);
return pfirst(sum);
}
template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
{
Packet2d v[2], sum;
v[0] = vec_add(vecs[0], (Packet2d) vec_sld((Packet4ui) vecs[0], (Packet4ui) vecs[0], 8));
v[1] = vec_add(vecs[1], (Packet2d) vec_sld((Packet4ui) vecs[1], (Packet4ui) vecs[1], 8));
#ifdef _BIG_ENDIAN
sum = (Packet2d) vec_sld((Packet4ui) v[0], (Packet4ui) v[1], 8);
#else
sum = (Packet2d) vec_sld((Packet4ui) v[1], (Packet4ui) v[0], 8);
#endif
return sum;
}
// Other reduction functions:
// mul
template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
{
return pfirst(pmul(a, (Packet2d)vec_sld((Packet4ui) a, (Packet4ui) a, 8)));
}
// min
template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
{
return pfirst(vec_min(a, (Packet2d) vec_sld((Packet4ui) a, (Packet4ui) a, 8)));
}
// max
template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
{
return pfirst(vec_max(a, (Packet2d) vec_sld((Packet4ui) a, (Packet4ui) a, 8)));
}
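Each Packet2d reduction above pairs the vector with itself rotated by 8 bytes, so both lanes end up holding x0 op x1, and then extracts lane 0. In scalar terms (illustrative):

#include <array>

double predux2(const std::array<double, 2>& a)
{
  const std::array<double, 2> b = { a[1], a[0] };               // vec_sld(a, a, 8): swap lanes
  const std::array<double, 2> s = { a[0] + b[0], a[1] + b[1] }; // both lanes hold x0 + x1
  return s[0];                                                  // pfirst: lane 0
}
// predux_mul/predux_min/predux_max use the same rotate, with *, min, max in place of +.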
template<int Offset>
struct palign_impl<Offset,Packet2d>
{
static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
{
if (Offset == 1)
#ifdef _BIG_ENDIAN
first = (Packet2d) vec_sld((Packet4ui) first, (Packet4ui) second, 8);
#else
first = (Packet2d) vec_sld((Packet4ui) second, (Packet4ui) first, 8);
#endif
}
};
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet2d,2>& kernel) {
Packet2d t0, t1;
t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI);
t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO);
kernel.packet[0] = t0;
kernel.packet[1] = t1;
}
#endif // __VSX__
} // end namespace internal
} // end namespace Eigen

View File

@ -33,6 +33,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
Vectorizable = 1,
AlignedOnScalar = 1,
size = 2,
HasHalfPacket = 0,
HasAdd = 1,
HasSub = 1,
@ -88,7 +89,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con
template<> EIGEN_STRONG_INLINE Packet2cf pand <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
}
template<> EIGEN_STRONG_INLINE Packet2cf por <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
@ -252,7 +253,7 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
// TODO optimize it for AltiVec
// TODO optimize it for NEON
Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
Packet4f s, rev_s;
@ -265,11 +266,198 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, con
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet2cf,2>& kernel) {
float32x4_t tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v));
Packet4f tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v));
kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v), vget_low_f32(kernel.packet[1].v));
kernel.packet[1].v = tmp;
}
//---------- double ----------
#if EIGEN_ARCH_ARM64
static uint64x2_t p2ul_CONJ_XOR = EIGEN_INIT_NEON_PACKET2(0x0, 0x8000000000000000);
struct Packet1cd
{
EIGEN_STRONG_INLINE Packet1cd() {}
EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
Packet2d v;
};
template<> struct packet_traits<std::complex<double> > : default_packet_traits
{
typedef Packet1cd type;
typedef Packet1cd half;
enum {
Vectorizable = 1,
AlignedOnScalar = 0,
size = 1,
HasHalfPacket = 0,
HasAdd = 1,
HasSub = 1,
HasMul = 1,
HasDiv = 1,
HasNegate = 1,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
HasMax = 0,
HasSetLinear = 0
};
};
template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; typedef Packet1cd half; };
template<> EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from)
{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(padd<Packet2d>(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(psub<Packet2d>(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate<Packet2d>(a.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR))); }
template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
Packet2d v1, v2;
// Get the real values of a
v1 = vdupq_lane_f64(vget_low_f64(a.v), 0);
// Get the imaginary values of a
v2 = vdupq_lane_f64(vget_high_f64(a.v), 0);
// Multiply the real a with b
v1 = vmulq_f64(v1, b.v);
// Multiply the imag a with b
v2 = vmulq_f64(v2, b.v);
// Conjugate v2
v2 = vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(v2), p2ul_CONJ_XOR));
// Swap real/imag elements in v2.
v2 = preverse<Packet2d>(v2);
// Add and return the result
return Packet1cd(vaddq_f64(v1, v2));
}
template<> EIGEN_STRONG_INLINE Packet1cd pand <Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v))));
}
template<> EIGEN_STRONG_INLINE Packet1cd por <Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
return Packet1cd(vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v))));
}
template<> EIGEN_STRONG_INLINE Packet1cd pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v))));
}
template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v))));
}
template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { EIGEN_ARM_PREFETCH((double *)addr); }
template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, DenseIndex stride)
{
Packet2d res;
res = vsetq_lane_f64(std::real(from[0*stride]), res, 0);
res = vsetq_lane_f64(std::imag(from[0*stride]), res, 1);
return Packet1cd(res);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, DenseIndex stride)
{
to[stride*0] = std::complex<double>(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1));
}
template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
{
std::complex<double> EIGEN_ALIGN16 res;
pstore<std::complex<double> >(&res, a);
return res;
}
template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs) { return vecs[0]; }
template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
template<int Offset>
struct palign_impl<Offset,Packet1cd>
{
static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
{
// FIXME is it sure we never have to align a Packet1cd?
// Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16-byte boundary...
}
};
template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
{
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
{
return internal::pmul(a, pconj(b));
}
};
template<> struct conj_helper<Packet1cd, Packet1cd, true,false>
{
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
{
return internal::pmul(pconj(a), b);
}
};
template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
{
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
{
return pconj(internal::pmul(a, b));
}
};
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
// TODO optimize it for NEON
Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
Packet2d s = pmul<Packet2d>(b.v, b.v);
Packet2d rev_s = preverse<Packet2d>(s);
return Packet1cd(pdiv(res.v, padd<Packet2d>(s,rev_s)));
}
EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
{
return Packet1cd(preverse(Packet2d(x.v)));
}
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
{
Packet2d tmp = vcombine_f64(vget_high_f64(kernel.packet[0].v), vget_high_f64(kernel.packet[1].v));
kernel.packet[0].v = vcombine_f64(vget_low_f64(kernel.packet[0].v), vget_low_f64(kernel.packet[1].v));
kernel.packet[1].v = tmp;
}
#endif // EIGEN_ARCH_ARM64
} // end namespace internal

View File

@ -20,14 +20,24 @@ namespace internal {
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
#endif
// FIXME NEON has 16 quad registers, but since the current register allocator
// is so bad, it is much better to reduce it to 8
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 8
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
#endif
typedef float32x2_t Packet2f;
typedef float32x4_t Packet4f;
typedef int32x4_t Packet4i;
typedef int32x2_t Packet2i;
typedef uint32x4_t Packet4ui;
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
@ -39,7 +49,7 @@ typedef uint32x4_t Packet4ui;
#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
const Packet4i p4i_##NAME = pset1<Packet4i>(X)
#if defined(__llvm__) && !defined(__clang__)
#if EIGEN_COMP_LLVM && !EIGEN_COMP_CLANG
//Special treatment for Apple's llvm-gcc, its NEON packet types are unions
#define EIGEN_INIT_NEON_PACKET2(X, Y) {{X, Y}}
#define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {{X, Y, Z, W}}
@ -52,11 +62,11 @@ typedef uint32x4_t Packet4ui;
// arm64 does have the pld instruction. If available, let's trust the __builtin_prefetch built-in function
// which is available on LLVM and GCC (at least)
#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || defined(__GNUC__)
#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
#define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
#elif defined __pld
#define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
#elif !defined(__aarch64__)
#elif !EIGEN_ARCH_ARM64
#define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ( " pld [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
#else
// by default no explicit prefetching
@ -66,11 +76,12 @@ typedef uint32x4_t Packet4ui;
template<> struct packet_traits<float> : default_packet_traits
{
typedef Packet4f type;
typedef Packet4f half;
typedef Packet2f half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 4,
HasHalfPacket=1,
HasDiv = 1,
// FIXME check the Has*
@ -84,16 +95,17 @@ template<> struct packet_traits<float> : default_packet_traits
template<> struct packet_traits<int> : default_packet_traits
{
typedef Packet4i type;
typedef Packet4i half;
typedef Packet2i half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size=4
size=4,
HasHalfPacket=1
// FIXME check the Has*
};
};
#if EIGEN_GNUC_AT_MOST(4,4) && !defined(__llvm__)
#if EIGEN_GNUC_AT_MOST(4,4) && !EIGEN_COMP_LLVM
// workaround gcc 4.2, 4.3 and 4.4 compilation issue
EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); }
EIGEN_STRONG_INLINE float32x2_t vld1_f32 (const float* x) { return ::vld1_f32 ((const float32_t*)x); }
@ -136,6 +148,9 @@ template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const
template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
{
#if EIGEN_ARCH_ARM64
return vdivq_f32(a,b);
#else
Packet4f inv, restep, div;
// NEON does not offer a divide instruction, we have to do a reciprocal approximation
@ -154,14 +169,27 @@ template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const
div = vmulq_f32(a, inv);
return div;
#endif
}
template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
{ eigen_assert(false && "packet integer division are not supported by NEON");
return pset1<Packet4i>(0);
}
// for some weird reasons, it has to be overloaded for packets of integers
#ifdef __ARM_FEATURE_FMA
// See bug 936.
// FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4.
// FMA is a true fused multiply-add i.e. only 1 rounding at the end, no intermediate rounding.
// MLA is not fused i.e. does 2 roundings.
// In addition to giving better accuracy, FMA also gives better performance here on a Krait (Nexus 4):
// MLA: 10 GFlop/s ; FMA: 12 GFlops/s.
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vfmaq_f32(c,a,b); }
#else
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vmlaq_f32(c,a,b); }
#endif
// No FMA instruction for int, so use MLA unconditionally.
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); }
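The FMA/MLA distinction described in the comment above is observable in portable code: std::fma rounds once at the end, while a*b + c rounds the product first. A minimal sketch:

#include <cmath>
#include <cstdio>

int main()
{
  float a   = 1.0f + std::ldexp(1.0f, -12);  // 1 + 2^-12, exactly representable
  float mla = a * a - 1.0f;                  // two roundings: a*a rounds away the 2^-24 term
  float fma = std::fma(a, a, -1.0f);         // one rounding: the 2^-24 term survives
  std::printf("mla=%a fma=%a\n", mla, fma);  // the two results differ in the low bits
  return 0;
}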
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); }
@ -472,6 +500,193 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) {
kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1]));
}
//---------- double ----------
#if EIGEN_ARCH_ARM64
#if (EIGEN_COMP_GNUC_STRICT && defined(__ANDROID__)) || defined(__apple_build_version__)
// Bug 907: workaround missing declarations of the following two functions in the ADK
__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
vreinterpretq_u64_f64 (float64x2_t __a)
{
return (uint64x2_t) __a;
}
__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
vreinterpretq_f64_u64 (uint64x2_t __a)
{
return (float64x2_t) __a;
}
#endif
typedef float64x2_t Packet2d;
typedef float64x1_t Packet1d;
template<> struct packet_traits<double> : default_packet_traits
{
typedef Packet2d type;
typedef Packet1d half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 2,
HasHalfPacket=1,
HasDiv = 1,
// FIXME check the Has*
HasSin = 0,
HasCos = 0,
HasLog = 0,
HasExp = 0,
HasSqrt = 0
};
};
template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; typedef Packet2d half; };
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return vdupq_n_f64(from); }
template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a)
{
Packet2d countdown = EIGEN_INIT_NEON_PACKET2(0, 1);
return vaddq_f64(pset1<Packet2d>(a), countdown);
}
template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vsubq_f64(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return vnegq_f64(a); }
template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmulq_f64(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); }
#ifdef __ARM_FEATURE_FMA
// See bug 936. See above comment about FMA for float.
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vfmaq_f64(c,a,b); }
#else
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vmlaq_f64(c,a,b); }
#endif
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmaxq_f64(a,b); }
// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b)
{
return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
}
template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b)
{
return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
}
template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b)
{
return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
}
template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b)
{
return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
}
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); }
template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); }
template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
{
return vld1q_dup_f64(from);
}
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to, from); }
template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, DenseIndex stride)
{
Packet2d res;
res = vsetq_lane_f64(from[0*stride], res, 0);
res = vsetq_lane_f64(from[1*stride], res, 1);
return res;
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, DenseIndex stride)
{
to[stride*0] = vgetq_lane_f64(from, 0);
to[stride*1] = vgetq_lane_f64(from, 1);
}
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ARM_PREFETCH(addr); }
// FIXME only store the 2 first elements ?
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(a, 0); }
template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); }
template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); }
#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
// workaround ICE, see bug 907
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) + vget_high_f64(a))[0]; }
#else
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); }
#endif
template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
{
float64x2_t trn1, trn2;
// NEON zip performs interleaving of the supplied vectors.
// We perform two interleaves in a row to acquire the transposed vector
trn1 = vzip1q_f64(vecs[0], vecs[1]);
trn2 = vzip2q_f64(vecs[0], vecs[1]);
// Do the addition of the resulting vectors
return vaddq_f64(trn1, trn2);
}
// Other reduction functions:
// mul
#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) * vget_high_f64(a))[0]; }
#else
template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); }
#endif
// min
template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpminq_f64(a, a), 0); }
// max
template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpmaxq_f64(a, a), 0); }
// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing spurious compilation errors,
// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
#define PALIGN_NEON(Offset,Type,Command) \
template<>\
struct palign_impl<Offset,Type>\
{\
EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\
{\
if (Offset!=0)\
first = Command(first, second, Offset);\
}\
};\
PALIGN_NEON(0,Packet2d,vextq_f64)
PALIGN_NEON(1,Packet2d,vextq_f64)
#undef PALIGN_NEON
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet2d,2>& kernel) {
float64x2_t trn1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]);
float64x2_t trn2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]);
kernel.packet[0] = trn1;
kernel.packet[1] = trn2;
}
#endif // EIGEN_ARCH_ARM64
} // end namespace internal
} // end namespace Eigen

View File

@ -52,7 +52,7 @@ Packet4f plog<Packet4f>(const Packet4f& _x)
Packet4i emm0;
Packet4f invalid_mask = _mm_cmplt_ps(x, _mm_setzero_ps());
Packet4f invalid_mask = _mm_cmpnge_ps(x, _mm_setzero_ps()); // "not greater or equal" is true if x is NaN
Packet4f iszero_mask = _mm_cmpeq_ps(x, _mm_setzero_ps());
x = pmax(x, p4f_min_norm_pos); /* cut off denormalized stuff */
@ -167,7 +167,7 @@ Packet4f pexp<Packet4f>(const Packet4f& _x)
emm0 = _mm_cvttps_epi32(fx);
emm0 = _mm_add_epi32(emm0, p4i_0x7f);
emm0 = _mm_slli_epi32(emm0, 23);
return pmul(y, Packet4f(_mm_castsi128_ps(emm0)));
return pmax(pmul(y, Packet4f(_mm_castsi128_ps(emm0))), _x);
}
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet2d pexp<Packet2d>(const Packet2d& _x)
@ -241,7 +241,7 @@ Packet2d pexp<Packet2d>(const Packet2d& _x)
emm0 = _mm_add_epi32(emm0, p4i_1023_0);
emm0 = _mm_slli_epi32(emm0, 20);
emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3));
return pmul(x, Packet2d(_mm_castsi128_pd(emm0)));
return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x);
}
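The trailing pmax in both pexp overloads appears to rely on SSE max semantics (maxps returns its second operand on an unordered comparison), so a NaN or +inf input propagates to the result instead of being mangled by the exponent-bias arithmetic; this matches the _mm_cmpnge_ps NaN fix in plog above. A minimal x86 demonstration, assuming SSE2 is available:

#include <cassert>
#include <cmath>
#include <emmintrin.h>

int main()
{
  __m128 one = _mm_set1_ps(1.0f);
  __m128 nan = _mm_set1_ps(std::nanf(""));
  // maxps returns its second operand when the comparison is unordered:
  float r = _mm_cvtss_f32(_mm_max_ps(one, nan));
  assert(std::isnan(r));  // so pmax(result, _x) lets a NaN (or inf) _x through
}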
/* evaluation of 4 sines at onces, using SSE2 intrinsics.

View File

@ -22,13 +22,13 @@ namespace internal {
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
#endif
#ifdef EIGEN_VECTORIZE_FMA
#ifndef EIGEN_HAS_FUSED_MADD
#define EIGEN_HAS_FUSED_MADD 1
#ifdef __FMA__
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1
#endif
#endif
#if defined EIGEN_VECTORIZE_AVX && defined __GNUC__ && !(defined __clang__ || defined __INTEL_COMPILER)
#if defined EIGEN_VECTORIZE_AVX && EIGEN_COMP_GNUC_STRICT
// With GCC's default ABI version, a __m128 or __m256 are the same types and therefore we cannot
// have overloads for both types without linking error.
// One solution is to increase ABI version using -fabi-version=4 (or greater).
@ -147,7 +147,7 @@ template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}
template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; typedef Packet2d half; };
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; typedef Packet4i half; };
#if defined(_MSC_VER) && (_MSC_VER==1500)
#if EIGEN_COMP_MSVC==1500
// Workaround MSVC 9 internal compiler error.
// TODO: It has been detected with win64 builds (amd64), so let's check whether it also happens in 32bits+SSE mode
// TODO: let's check whether there does not exist a better fix, like adding a pset0() function. (it crashed on pset1(0)).
@ -165,7 +165,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { re
// Using inline assembly is also not an option because then gcc fails to reorder properly the instructions.
// Therefore, we introduced the pload1 functions to be used in product kernels for which bug 203 does not apply.
// Also note that with AVX, we want it to generate a vbroadcastss.
#if (defined __GNUC__) && (!defined __INTEL_COMPILER) && (!defined __clang__) && (!defined __AVX__)
#if EIGEN_COMP_GNUC_STRICT && (!defined __AVX__)
template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) {
return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0);
}
@ -282,10 +282,10 @@ template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { E
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
#if defined(_MSC_VER)
#if EIGEN_COMP_MSVC
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
EIGEN_DEBUG_UNALIGNED_LOAD
#if (_MSC_VER==1600)
#if (EIGEN_COMP_MSVC==1600)
// NOTE Some version of MSVC10 generates bad code when using _mm_loadu_ps
// (i.e., it does not generate an unaligned load!!
// TODO On most architectures this version should also be faster than a single _mm_loadu_ps
@ -307,11 +307,11 @@ template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { E
// TODO: do the same for MSVC (ICC is compatible)
// NOTE: with the code below, MSVC's compiler crashes!
#if defined(__GNUC__) && (defined(__i386__) || (defined(__x86_64) && EIGEN_GNUC_AT_LEAST(4, 8)))
#if EIGEN_COMP_GNUC && (EIGEN_ARCH_i386 || (EIGEN_ARCH_x86_64 && EIGEN_GNUC_AT_LEAST(4, 8)))
// bug 195: gcc/i386 emits weird x87 fldl/fstpl instructions for _mm_load_sd
#define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
#define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 1
#elif defined(__clang__)
#elif EIGEN_COMP_CLANG
// bug 201: Segfaults in __mm_loadh_pd with clang 2.8
#define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
#define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 0
@ -439,13 +439,13 @@ template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_p
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
#endif
#if defined(_MSC_VER) && defined(_WIN64) && !defined(__INTEL_COMPILER)
#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
// Direct access to the struct members fixed bug #62.
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return a.m128_f32[0]; }
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return a.m128d_f64[0]; }
template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
#elif defined(_MSC_VER) && !defined(__INTEL_COMPILER)
#elif EIGEN_COMP_MSVC_STRICT
// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; }
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; }
@ -680,7 +680,7 @@ template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
#endif // EIGEN_VECTORIZE_SSE4_1
}
#if (defined __GNUC__)
#if EIGEN_COMP_GNUC
// template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
// {
// Packet4f res = b;

View File

@ -31,7 +31,7 @@ template<typename Scalar>
struct functor_traits<assign_op<Scalar> > {
enum {
Cost = NumTraits<Scalar>::ReadCost,
PacketAccess = packet_traits<Scalar>::IsVectorized
PacketAccess = packet_traits<Scalar>::Vectorizable
};
};
@ -73,7 +73,7 @@ template<typename Scalar>
struct functor_traits<sub_assign_op<Scalar> > {
enum {
Cost = NumTraits<Scalar>::ReadCost + NumTraits<Scalar>::AddCost,
PacketAccess = packet_traits<Scalar>::HasAdd
PacketAccess = packet_traits<Scalar>::HasSub
};
};
@ -81,22 +81,24 @@ struct functor_traits<sub_assign_op<Scalar> > {
* \brief Template functor for scalar/packet assignment with multiplication
*
*/
template<typename Scalar> struct mul_assign_op {
template<typename DstScalar, typename SrcScalar=DstScalar>
struct mul_assign_op {
EIGEN_EMPTY_STRUCT_CTOR(mul_assign_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { a *= b; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a *= b; }
template<int Alignment, typename Packet>
EIGEN_STRONG_INLINE void assignPacket(Scalar* a, const Packet& b) const
{ internal::pstoret<Scalar,Packet,Alignment>(a,internal::pmul(internal::ploadt<Packet,Alignment>(a),b)); }
EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const
{ internal::pstoret<DstScalar,Packet,Alignment>(a,internal::pmul(internal::ploadt<Packet,Alignment>(a),b)); }
};
template<typename Scalar>
struct functor_traits<mul_assign_op<Scalar> > {
template<typename DstScalar, typename SrcScalar>
struct functor_traits<mul_assign_op<DstScalar,SrcScalar> > {
enum {
Cost = NumTraits<Scalar>::ReadCost + NumTraits<Scalar>::MulCost,
PacketAccess = packet_traits<Scalar>::HasMul
Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::MulCost,
PacketAccess = is_same<DstScalar,SrcScalar>::value && packet_traits<DstScalar>::HasMul
};
};
template<typename DstScalar,typename SrcScalar> struct functor_is_product_like<mul_assign_op<DstScalar,SrcScalar> > { enum { ret = 1 }; };
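A toy model of the assignment-functor protocol these traits describe (names here are illustrative, not Eigen's): the evaluation loop only ever calls assignCoeff, so one loop implements =, +=, -=, or *= depending on which functor is passed in.

#include <cstddef>
struct toy_mul_assign {
  template <typename Dst, typename Src>
  void assignCoeff(Dst& a, const Src& b) const { a *= b; } // the *= of mul_assign_op
};
template <typename Func, typename T>
void cwise_assign(T* dst, const T* src, std::size_t n, Func f) {
  for (std::size_t i = 0; i < n; ++i)
    f.assignCoeff(dst[i], src[i]); // a vectorized path would call assignPacket instead
}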
/** \internal
* \brief Template functor for scalar/packet assignment with dividing
@ -115,13 +117,13 @@ template<typename Scalar>
struct functor_traits<div_assign_op<Scalar> > {
enum {
Cost = NumTraits<Scalar>::ReadCost + NumTraits<Scalar>::MulCost,
PacketAccess = packet_traits<Scalar>::HasMul
PacketAccess = packet_traits<Scalar>::HasDiv
};
};
/** \internal
* \brief Template functor for scalar/packet assignment with swaping
* \brief Template functor for scalar/packet assignment with swapping
*
* It works as follows. For a non-vectorized evaluation loop, we have:
* for(i) func(A.coeffRef(i), B.coeff(i));
@ -140,8 +142,13 @@ template<typename Scalar> struct swap_assign_op {
EIGEN_EMPTY_STRUCT_CTOR(swap_assign_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const
{
#ifdef __CUDACC__
// FIXME is there some kind of cuda::swap?
Scalar t=b; const_cast<Scalar&>(b)=a; a=t;
#else
using std::swap;
swap(a,const_cast<Scalar&>(b));
#endif
}
template<int LhsAlignment, int RhsAlignment, typename Packet>
@ -156,7 +163,7 @@ template<typename Scalar>
struct functor_traits<swap_assign_op<Scalar> > {
enum {
Cost = 3 * NumTraits<Scalar>::ReadCost,
PacketAccess = packet_traits<Scalar>::IsVectorized
PacketAccess = packet_traits<Scalar>::Vectorizable
};
};
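The __CUDACC__ branch above is just the classic three-assignment swap; as a standalone hedged sketch:

// Device-safe fallback when std::swap is unavailable (e.g. in CUDA device code).
template <typename Scalar>
void manual_swap(Scalar& a, Scalar& b) {
  Scalar t = b; // same temporary-based pattern as the functor's CUDA branch
  b = a;
  a = t;
}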

View File

@ -115,7 +115,7 @@ struct functor_traits<scalar_conj_product_op<LhsScalar,RhsScalar> > {
*/
template<typename Scalar> struct scalar_min_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_min_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { EIGEN_USING_STD_MATH(min); return (min)(a, b); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return numext::mini(a, b); }
template<typename Packet>
EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
{ return internal::pmin(a,b); }
@ -138,7 +138,7 @@ struct functor_traits<scalar_min_op<Scalar> > {
*/
template<typename Scalar> struct scalar_max_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_max_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { EIGEN_USING_STD_MATH(max); return (max)(a, b); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return numext::maxi(a, b); }
template<typename Packet>
EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
{ return internal::pmax(a,b); }
@ -164,8 +164,6 @@ template<typename Scalar> struct scalar_hypot_op {
// typedef typename NumTraits<Scalar>::Real result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& _x, const Scalar& _y) const
{
EIGEN_USING_STD_MATH(max);
EIGEN_USING_STD_MATH(min);
using std::sqrt;
Scalar p, qp;
if(_x>_y)
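The excerpt is cut off here, but the usual point of the p/qp pair is to factor out the larger magnitude so the intermediate square cannot overflow. A sketch of that idea (not Eigen's exact code):

#include <cmath>
#include <algorithm>
double hypot_sketch(double x, double y) {
  double ax = std::abs(x), ay = std::abs(y);
  double p  = std::max(ax, ay);
  if (p == 0.0) return 0.0;         // guard the 0/0 case
  double qp = std::min(ax, ay) / p; // qp <= 1, so qp*qp cannot overflow
  return p * std::sqrt(1.0 + qp * qp);
}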

View File

@ -1,452 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_COEFFBASED_PRODUCT_H
#define EIGEN_COEFFBASED_PRODUCT_H
namespace Eigen {
namespace internal {
/*********************************************************************************
* Coefficient based product implementation.
* It is designed for the following use cases:
* - small fixed sizes
* - lazy products
*********************************************************************************/
/* Since all the dimensions of the product are small, here we can rely
* on the generic Assign mechanism to evaluate the product per coeff (or packet).
*
* Note that here the inner-loops should always be unrolled.
*/
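In scalar form, "evaluating the product per coeff" means each result coefficient is an independent inner product; a sketch for any matrix-like type exposing operator() and cols():

template <typename Mat>
double product_coeff(const Mat& lhs, const Mat& rhs, int row, int col) {
  double res = lhs(row, 0) * rhs(0, col);
  for (int k = 1; k < lhs.cols(); ++k) // the loop the unrollers below peel away
    res += lhs(row, k) * rhs(k, col);
  return res;
}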
template<int Traversal, int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
struct product_coeff_impl;
template<int StorageOrder, int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct product_packet_impl;
template<typename LhsNested, typename RhsNested, int NestingFlags>
struct traits<CoeffBasedProduct<LhsNested,RhsNested,NestingFlags> >
{
typedef MatrixXpr XprKind;
typedef typename remove_all<LhsNested>::type _LhsNested;
typedef typename remove_all<RhsNested>::type _RhsNested;
typedef typename scalar_product_traits<typename _LhsNested::Scalar, typename _RhsNested::Scalar>::ReturnType Scalar;
typedef typename promote_storage_type<typename traits<_LhsNested>::StorageKind,
typename traits<_RhsNested>::StorageKind>::ret StorageKind;
typedef typename promote_index_type<typename traits<_LhsNested>::Index,
typename traits<_RhsNested>::Index>::type Index;
enum {
LhsCoeffReadCost = _LhsNested::CoeffReadCost,
RhsCoeffReadCost = _RhsNested::CoeffReadCost,
LhsFlags = _LhsNested::Flags,
RhsFlags = _RhsNested::Flags,
RowsAtCompileTime = _LhsNested::RowsAtCompileTime,
ColsAtCompileTime = _RhsNested::ColsAtCompileTime,
InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(_LhsNested::ColsAtCompileTime, _RhsNested::RowsAtCompileTime),
MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime,
MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime,
LhsRowMajor = LhsFlags & RowMajorBit,
RhsRowMajor = RhsFlags & RowMajorBit,
SameType = is_same<typename _LhsNested::Scalar,typename _RhsNested::Scalar>::value,
CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit)
&& (ColsAtCompileTime == Dynamic
|| ( (ColsAtCompileTime % packet_traits<Scalar>::size) == 0
&& (RhsFlags&AlignedBit)
)
),
CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit)
&& (RowsAtCompileTime == Dynamic
|| ( (RowsAtCompileTime % packet_traits<Scalar>::size) == 0
&& (LhsFlags&AlignedBit)
)
),
EvalToRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
: (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
: (RhsRowMajor && !CanVectorizeLhs),
Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & ~RowMajorBit)
| (EvalToRowMajor ? RowMajorBit : 0)
| NestingFlags
| (CanVectorizeLhs ? (LhsFlags & AlignedBit) : 0)
| (CanVectorizeRhs ? (RhsFlags & AlignedBit) : 0)
// TODO enable vectorization for mixed types
| (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0),
CoeffReadCost = InnerSize == Dynamic ? Dynamic
: InnerSize * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost)
+ (InnerSize - 1) * NumTraits<Scalar>::AddCost,
/* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
* of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner
* loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect
* the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI.
*/
CanVectorizeInner = SameType
&& LhsRowMajor
&& (!RhsRowMajor)
&& (LhsFlags & RhsFlags & ActualPacketAccessBit)
&& (LhsFlags & RhsFlags & AlignedBit)
&& (InnerSize % packet_traits<Scalar>::size == 0)
};
};
} // end namespace internal
template<typename LhsNested, typename RhsNested, int NestingFlags>
class CoeffBasedProduct
: internal::no_assignment_operator,
public MatrixBase<CoeffBasedProduct<LhsNested, RhsNested, NestingFlags> >
{
public:
typedef MatrixBase<CoeffBasedProduct> Base;
EIGEN_DENSE_PUBLIC_INTERFACE(CoeffBasedProduct)
typedef typename Base::PlainObject PlainObject;
private:
typedef typename internal::traits<CoeffBasedProduct>::_LhsNested _LhsNested;
typedef typename internal::traits<CoeffBasedProduct>::_RhsNested _RhsNested;
enum {
PacketSize = internal::packet_traits<Scalar>::size,
InnerSize = internal::traits<CoeffBasedProduct>::InnerSize,
Unroll = CoeffReadCost != Dynamic && CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
CanVectorizeInner = internal::traits<CoeffBasedProduct>::CanVectorizeInner
};
typedef internal::product_coeff_impl<CanVectorizeInner ? InnerVectorizedTraversal : DefaultTraversal,
Unroll ? InnerSize-1 : Dynamic,
_LhsNested, _RhsNested, Scalar> ScalarCoeffImpl;
typedef CoeffBasedProduct<LhsNested,RhsNested,NestByRefBit> LazyCoeffBasedProductType;
public:
EIGEN_DEVICE_FUNC
inline CoeffBasedProduct(const CoeffBasedProduct& other)
: Base(), m_lhs(other.m_lhs), m_rhs(other.m_rhs)
{}
template<typename Lhs, typename Rhs>
EIGEN_DEVICE_FUNC
inline CoeffBasedProduct(const Lhs& lhs, const Rhs& rhs)
: m_lhs(lhs), m_rhs(rhs)
{
// we don't allow taking products of matrices of different real types, as that wouldn't be vectorizable.
// We still allow to mix T and complex<T>.
EIGEN_STATIC_ASSERT((internal::scalar_product_traits<typename Lhs::RealScalar, typename Rhs::RealScalar>::Defined),
YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
eigen_assert(lhs.cols() == rhs.rows()
&& "invalid matrix product"
&& "if you wanted a coeff-wise or a dot product use the respective explicit functions");
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
{
Scalar res;
ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
return res;
}
/* Allow index-based non-packet access. It is impossible though to allow index-based packed access,
* which is why we don't set the LinearAccessBit.
*/
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
{
Scalar res;
const Index row = RowsAtCompileTime == 1 ? 0 : index;
const Index col = RowsAtCompileTime == 1 ? index : 0;
ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
return res;
}
template<int LoadMode>
EIGEN_STRONG_INLINE const PacketScalar packet(Index row, Index col) const
{
PacketScalar res;
internal::product_packet_impl<Flags&RowMajorBit ? RowMajor : ColMajor,
Unroll ? InnerSize-1 : Dynamic,
_LhsNested, _RhsNested, PacketScalar, LoadMode>
::run(row, col, m_lhs, m_rhs, res);
return res;
}
// Implicit conversion to the nested type (trigger the evaluation of the product)
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE operator const PlainObject& () const
{
m_result.lazyAssign(*this);
return m_result;
}
EIGEN_DEVICE_FUNC const _LhsNested& lhs() const { return m_lhs; }
EIGEN_DEVICE_FUNC const _RhsNested& rhs() const { return m_rhs; }
EIGEN_DEVICE_FUNC
const Diagonal<const LazyCoeffBasedProductType,0> diagonal() const
{ return reinterpret_cast<const LazyCoeffBasedProductType&>(*this); }
template<int DiagonalIndex>
EIGEN_DEVICE_FUNC
const Diagonal<const LazyCoeffBasedProductType,DiagonalIndex> diagonal() const
{ return reinterpret_cast<const LazyCoeffBasedProductType&>(*this); }
EIGEN_DEVICE_FUNC
const Diagonal<const LazyCoeffBasedProductType,Dynamic> diagonal(Index index) const
{ return reinterpret_cast<const LazyCoeffBasedProductType&>(*this).diagonal(index); }
protected:
typename internal::add_const_on_value_type<LhsNested>::type m_lhs;
typename internal::add_const_on_value_type<RhsNested>::type m_rhs;
mutable PlainObject m_result;
};
namespace internal {
// here we need to overload the nested rule for products
// such that the nested type is a const reference to a plain matrix
template<typename Lhs, typename Rhs, int N, typename PlainObject>
struct nested<CoeffBasedProduct<Lhs,Rhs,EvalBeforeNestingBit|EvalBeforeAssigningBit>, N, PlainObject>
{
typedef PlainObject const& type;
};
/***************************************************************************
* Normal product .coeff() implementation (with meta-unrolling)
***************************************************************************/
/**************************************
*** Scalar path - no vectorization ***
**************************************/
template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
struct product_coeff_impl<DefaultTraversal, UnrollingIndex, Lhs, Rhs, RetScalar>
{
typedef typename Lhs::Index Index;
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
{
product_coeff_impl<DefaultTraversal, UnrollingIndex-1, Lhs, Rhs, RetScalar>::run(row, col, lhs, rhs, res);
res += lhs.coeff(row, UnrollingIndex) * rhs.coeff(UnrollingIndex, col);
}
};
template<typename Lhs, typename Rhs, typename RetScalar>
struct product_coeff_impl<DefaultTraversal, 0, Lhs, Rhs, RetScalar>
{
typedef typename Lhs::Index Index;
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
{
res = lhs.coeff(row, 0) * rhs.coeff(0, col);
}
};
template<typename Lhs, typename Rhs, typename RetScalar>
struct product_coeff_impl<DefaultTraversal, Dynamic, Lhs, Rhs, RetScalar>
{
typedef typename Lhs::Index Index;
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar& res)
{
eigen_assert(lhs.cols()>0 && "you are using a non-initialized matrix");
res = lhs.coeff(row, 0) * rhs.coeff(0, col);
for(Index i = 1; i < lhs.cols(); ++i)
res += lhs.coeff(row, i) * rhs.coeff(i, col);
}
};
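Stripped of the Eigen machinery, the unrolled scalar path above is the classic compile-time recursion (a sketch with Index fixed to int):

template <int K, typename Lhs, typename Rhs>
struct dot_unroller {
  static double run(int row, int col, const Lhs& l, const Rhs& r) {
    // accumulate terms 0..K-1 first, then add term K
    return dot_unroller<K-1, Lhs, Rhs>::run(row, col, l, r) + l(row, K) * r(K, col);
  }
};
template <typename Lhs, typename Rhs>
struct dot_unroller<0, Lhs, Rhs> { // base case ends the recursion
  static double run(int row, int col, const Lhs& l, const Rhs& r) {
    return l(row, 0) * r(0, col);
  }
};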
/*******************************************
*** Scalar path with inner vectorization ***
*******************************************/
template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet>
struct product_coeff_vectorized_unroller
{
typedef typename Lhs::Index Index;
enum { PacketSize = packet_traits<typename Lhs::Scalar>::size };
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
{
product_coeff_vectorized_unroller<UnrollingIndex-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres);
pres = padd(pres, pmul( lhs.template packet<Aligned>(row, UnrollingIndex) , rhs.template packet<Aligned>(UnrollingIndex, col) ));
}
};
template<typename Lhs, typename Rhs, typename Packet>
struct product_coeff_vectorized_unroller<0, Lhs, Rhs, Packet>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
{
pres = pmul(lhs.template packet<Aligned>(row, 0) , rhs.template packet<Aligned>(0, col));
}
};
template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
struct product_coeff_impl<InnerVectorizedTraversal, UnrollingIndex, Lhs, Rhs, RetScalar>
{
typedef typename Lhs::PacketScalar Packet;
typedef typename Lhs::Index Index;
enum { PacketSize = packet_traits<typename Lhs::Scalar>::size };
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
{
Packet pres;
product_coeff_vectorized_unroller<UnrollingIndex+1-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres);
res = predux(pres);
}
};
template<typename Lhs, typename Rhs, int LhsRows = Lhs::RowsAtCompileTime, int RhsCols = Rhs::ColsAtCompileTime>
struct product_coeff_vectorized_dyn_selector
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
{
res = lhs.row(row).transpose().cwiseProduct(rhs.col(col)).sum();
}
};
// NOTE the following 3 specializations exist because taking .col(0) on a vector is a bit slower
// NOTE maybe they are now useless since we have a specialization for Block<Matrix>
template<typename Lhs, typename Rhs, int RhsCols>
struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,RhsCols>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index /*row*/, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
{
res = lhs.transpose().cwiseProduct(rhs.col(col)).sum();
}
};
template<typename Lhs, typename Rhs, int LhsRows>
struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,LhsRows,1>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
{
res = lhs.row(row).transpose().cwiseProduct(rhs).sum();
}
};
template<typename Lhs, typename Rhs>
struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,1>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
{
res = lhs.transpose().cwiseProduct(rhs).sum();
}
};
template<typename Lhs, typename Rhs, typename RetScalar>
struct product_coeff_impl<InnerVectorizedTraversal, Dynamic, Lhs, Rhs, RetScalar>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
{
product_coeff_vectorized_dyn_selector<Lhs,Rhs>::run(row, col, lhs, rhs, res);
}
};
/*******************
*** Packet path ***
*******************/
template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
{
product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, res);
res = pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex)), rhs.template packet<LoadMode>(UnrollingIndex, col), res);
}
};
template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
{
product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, res);
res = pmadd(lhs.template packet<LoadMode>(row, UnrollingIndex), pset1<Packet>(rhs.coeff(UnrollingIndex, col)), res);
}
};
template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
{
res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
}
};
template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
{
res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
}
};
template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res)
{
eigen_assert(lhs.cols()>0 && "you are using a non-initialized matrix");
res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
for(Index i = 1; i < lhs.cols(); ++i)
res = pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res);
}
};
template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
{
typedef typename Lhs::Index Index;
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res)
{
eigen_assert(lhs.cols()>0 && "you are using a non-initialized matrix");
res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
for(Index i = 1; i < lhs.cols(); ++i)
res = pmadd(lhs.template packet<LoadMode>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
}
};
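One step of the column-major packet path, reduced to raw SSE (a sketch assuming 4-float packets): the rhs coefficient is broadcast with pset1 and multiplied into a whole packet of the lhs column.

#include <xmmintrin.h>
static inline __m128 packet_step(const float* lhsCol, float rhsCoeff, __m128 acc) {
  __m128 a = _mm_loadu_ps(lhsCol);          // lhs.template packet<LoadMode>(row, k)
  __m128 b = _mm_set1_ps(rhsCoeff);         // pset1(rhs.coeff(k, col))
  return _mm_add_ps(acc, _mm_mul_ps(a, b)); // pmadd(a, b, acc) without FMA
}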
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_COEFFBASED_PRODUCT_H

View File

@ -158,8 +158,8 @@ inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, i
computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n, num_threads);
}
#ifdef EIGEN_HAS_FUSE_CJMADD
#define MADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C);
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
#define CJMADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C);
#else
// FIXME (a bit overkill maybe ?)
@ -184,8 +184,8 @@ inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, i
gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
}
#define MADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T);
// #define MADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T);
#define CJMADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T);
// #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T);
#endif
/* Vectorization logic
@ -220,7 +220,7 @@ public:
nr = 4,
// register block size along the M direction (currently, this one cannot be modified)
#if defined(EIGEN_HAS_FUSED_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC)
#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
// we assume 16 registers
mr = 3*LhsPacketSize,
#else
@ -286,7 +286,7 @@ public:
// let gcc allocate the register in which to store the result of the pmul
// (in the case where there is no FMA) gcc fails to figure out how to avoid
// spilling register.
#ifdef EIGEN_HAS_FUSED_MADD
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
EIGEN_UNUSED_VARIABLE(tmp);
c = pmadd(a,b,c);
#else
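The two madd flavours the #ifdef selects between, side by side (an illustrative AVX sketch; the fused path needs FMA support): with a single-instruction madd the temporary disappears, while the split path keeps an explicit tmp so the compiler has a register for the pmul result.

#include <immintrin.h>
static inline __m256 madd_fused(__m256 a, __m256 b, __m256 c) {
  return _mm256_fmadd_ps(a, b, c); // one instruction, no temporary needed
}
static inline __m256 madd_split(__m256 a, __m256 b, __m256 c, __m256& tmp) {
  tmp = _mm256_mul_ps(a, b);       // explicit home for the product
  return _mm256_add_ps(c, tmp);
}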
@ -328,7 +328,7 @@ public:
NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
nr = 4,
#if defined(EIGEN_HAS_FUSED_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC)
#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
// we assume 16 registers
mr = 3*LhsPacketSize,
#else
@ -391,7 +391,7 @@ public:
EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
{
#ifdef EIGEN_HAS_FUSED_MADD
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
EIGEN_UNUSED_VARIABLE(tmp);
c.v = pmadd(a.v,b,c.v);
#else
@ -675,7 +675,7 @@ public:
EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
{
#ifdef EIGEN_HAS_FUSED_MADD
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
EIGEN_UNUSED_VARIABLE(tmp);
c.v = pmadd(a,b.v,c.v);
#else
@ -801,31 +801,36 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
for(Index k=0; k<peeled_kc; k+=pk)
{
EIGEN_ASM_COMMENT("begin gegp micro kernel 3p x 4");
EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
RhsPacket B_0, T0;
LhsPacket A2;
#define EIGEN_GEBGP_ONESTEP(K) \
internal::prefetch(blA+(3*K+16)*LhsProgress); \
traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
traits.loadRhs(&blB[(0+4*K)*RhsProgress], B_0); \
traits.madd(A0, B_0, C0, T0); \
traits.madd(A1, B_0, C4, T0); \
traits.madd(A2, B_0, C8, B_0); \
traits.loadRhs(&blB[1+4*K*RhsProgress], B_0); \
traits.madd(A0, B_0, C1, T0); \
traits.madd(A1, B_0, C5, T0); \
traits.madd(A2, B_0, C9, B_0); \
traits.loadRhs(&blB[2+4*K*RhsProgress], B_0); \
traits.madd(A0, B_0, C2, T0); \
traits.madd(A1, B_0, C6, T0); \
traits.madd(A2, B_0, C10, B_0); \
traits.loadRhs(&blB[3+4*K*RhsProgress], B_0); \
traits.madd(A0, B_0, C3 , T0); \
traits.madd(A1, B_0, C7, T0); \
traits.madd(A2, B_0, C11, B_0)
do { \
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
internal::prefetch(blA+(3*K+16)*LhsProgress); \
traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
traits.loadRhs(&blB[(0+4*K)*RhsProgress], B_0); \
traits.madd(A0, B_0, C0, T0); \
traits.madd(A1, B_0, C4, T0); \
traits.madd(A2, B_0, C8, B_0); \
traits.loadRhs(&blB[1+4*K*RhsProgress], B_0); \
traits.madd(A0, B_0, C1, T0); \
traits.madd(A1, B_0, C5, T0); \
traits.madd(A2, B_0, C9, B_0); \
traits.loadRhs(&blB[2+4*K*RhsProgress], B_0); \
traits.madd(A0, B_0, C2, T0); \
traits.madd(A1, B_0, C6, T0); \
traits.madd(A2, B_0, C10, B_0); \
traits.loadRhs(&blB[3+4*K*RhsProgress], B_0); \
traits.madd(A0, B_0, C3 , T0); \
traits.madd(A1, B_0, C7, T0); \
traits.madd(A2, B_0, C11, B_0); \
EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
} while(false)
internal::prefetch(blB+(48+0));
EIGEN_GEBGP_ONESTEP(0);
@ -840,6 +845,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
blB += pk*4*RhsProgress;
blA += pk*3*Traits::LhsProgress;
EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4");
}
// process remaining peeled loop
for(Index k=peeled_kc; k<depth; k++)
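Two things are going on in the rewritten macro above: the asm comments pin each step against reordering (the bug 935 workaround), and the do { ... } while(false) wrapper turns the multi-statement body into a single statement. The idiom in miniature:

#include <cstdio>
#define ONESTEP(K)                      \
  do {                                  \
    std::printf("step %d\n", (K));      \
    /* more statements could follow */  \
  } while (false)
int main() {
  if (true) ONESTEP(0); else ONESTEP(1); // parses cleanly as one statement per branch
}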
@ -918,16 +925,21 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
for(Index k=0; k<peeled_kc; k+=pk)
{
EIGEN_ASM_COMMENT("begin gegp micro kernel 3p x 1");
EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
RhsPacket B_0;
#define EIGEN_GEBGP_ONESTEP(K) \
traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
traits.madd(A0, B_0, C0, B_0); \
traits.madd(A1, B_0, C4, B_0); \
traits.madd(A2, B_0, C8, B_0)
do { \
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
traits.madd(A0, B_0, C0, B_0); \
traits.madd(A1, B_0, C4, B_0); \
traits.madd(A2, B_0, C8, B_0); \
EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
} while(false)
EIGEN_GEBGP_ONESTEP(0);
EIGEN_GEBGP_ONESTEP(1);
@ -940,6 +952,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
blB += pk*RhsProgress;
blA += pk*3*Traits::LhsProgress;
EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
}
// process remaining peeled loop
@ -1005,22 +1019,27 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
for(Index k=0; k<peeled_kc; k+=pk)
{
EIGEN_ASM_COMMENT("begin gegp micro kernel 2pX4");
EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
RhsPacket B_0, B1, B2, B3, T0;
#define EIGEN_GEBGP_ONESTEP(K) \
traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
traits.madd(A0, B_0, C0, T0); \
traits.madd(A1, B_0, C4, B_0); \
traits.madd(A0, B1, C1, T0); \
traits.madd(A1, B1, C5, B1); \
traits.madd(A0, B2, C2, T0); \
traits.madd(A1, B2, C6, B2); \
traits.madd(A0, B3, C3, T0); \
traits.madd(A1, B3, C7, B3)
do { \
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
traits.madd(A0, B_0, C0, T0); \
traits.madd(A1, B_0, C4, B_0); \
traits.madd(A0, B1, C1, T0); \
traits.madd(A1, B1, C5, B1); \
traits.madd(A0, B2, C2, T0); \
traits.madd(A1, B2, C6, B2); \
traits.madd(A0, B3, C3, T0); \
traits.madd(A1, B3, C7, B3); \
EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
} while(false)
internal::prefetch(blB+(48+0));
EIGEN_GEBGP_ONESTEP(0);
EIGEN_GEBGP_ONESTEP(1);
@ -1034,6 +1053,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
blB += pk*4*RhsProgress;
blA += pk*(2*Traits::LhsProgress);
EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4");
}
// process remaining peeled loop
for(Index k=peeled_kc; k<depth; k++)
@ -1096,15 +1117,20 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
for(Index k=0; k<peeled_kc; k+=pk)
{
EIGEN_ASM_COMMENT("begin gegp micro kernel 2p x 1");
EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
RhsPacket B_0, B1;
#define EIGEN_GEBGP_ONESTEP(K) \
traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
traits.madd(A0, B_0, C0, B1); \
traits.madd(A1, B_0, C4, B_0)
do { \
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
traits.madd(A0, B_0, C0, B1); \
traits.madd(A1, B_0, C4, B_0); \
EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
} while(false)
EIGEN_GEBGP_ONESTEP(0);
EIGEN_GEBGP_ONESTEP(1);
@ -1117,6 +1143,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
blB += pk*RhsProgress;
blA += pk*2*Traits::LhsProgress;
EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
}
// process remaining peeled loop
@ -1179,16 +1207,21 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
for(Index k=0; k<peeled_kc; k+=pk)
{
EIGEN_ASM_COMMENT("begin gegp micro kernel 1pX4");
EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4");
RhsPacket B_0, B1, B2, B3;
#define EIGEN_GEBGP_ONESTEP(K) \
traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
traits.madd(A0, B_0, C0, B_0); \
traits.madd(A0, B1, C1, B1); \
traits.madd(A0, B2, C2, B2); \
traits.madd(A0, B3, C3, B3);
do { \
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4"); \
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
traits.madd(A0, B_0, C0, B_0); \
traits.madd(A0, B1, C1, B1); \
traits.madd(A0, B2, C2, B2); \
traits.madd(A0, B3, C3, B3); \
EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \
} while(false)
internal::prefetch(blB+(48+0));
EIGEN_GEBGP_ONESTEP(0);
@ -1203,6 +1236,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
blB += pk*4*RhsProgress;
blA += pk*1*LhsProgress;
EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4");
}
// process remaining peeled loop
for(Index k=peeled_kc; k<depth; k++)
@ -1251,14 +1286,19 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
for(Index k=0; k<peeled_kc; k+=pk)
{
EIGEN_ASM_COMMENT("begin gegp micro kernel 2p x 1");
EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
RhsPacket B_0;
#define EIGEN_GEBGP_ONESTEP(K) \
traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
traits.madd(A0, B_0, C0, B_0); \
do { \
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
traits.madd(A0, B_0, C0, B_0); \
EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
} while(false);
EIGEN_GEBGP_ONESTEP(0);
EIGEN_GEBGP_ONESTEP(1);
EIGEN_GEBGP_ONESTEP(2);
@ -1270,6 +1310,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
blB += pk*RhsProgress;
blA += pk*1*Traits::LhsProgress;
EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
}
// process remaining peeled loop
@ -1402,14 +1444,14 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
B_0 = blB[0];
B_1 = blB[1];
MADD(cj,A0,B_0,C0, B_0);
MADD(cj,A0,B_1,C1, B_1);
CJMADD(cj,A0,B_0,C0, B_0);
CJMADD(cj,A0,B_1,C1, B_1);
B_0 = blB[2];
B_1 = blB[3];
MADD(cj,A0,B_0,C2, B_0);
MADD(cj,A0,B_1,C3, B_1);
CJMADD(cj,A0,B_0,C2, B_0);
CJMADD(cj,A0,B_1,C3, B_1);
blB += 4;
}
res(i, j2 + 0) += alpha * C0;
@ -1434,7 +1476,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
{
LhsScalar A0 = blA[k];
RhsScalar B_0 = blB[k];
MADD(cj, A0, B_0, C0, B_0);
CJMADD(cj, A0, B_0, C0, B_0);
}
res(i, j2) += alpha * C0;
}

View File

@ -228,8 +228,8 @@ struct gemm_functor
cols = m_rhs.cols();
Gemm::run(rows, cols, m_lhs.cols(),
/*(const Scalar*)*/&m_lhs.coeffRef(row,0), m_lhs.outerStride(),
/*(const Scalar*)*/&m_rhs.coeffRef(0,col), m_rhs.outerStride(),
&m_lhs.coeffRef(row,0), m_lhs.outerStride(),
&m_rhs.coeffRef(0,col), m_rhs.outerStride(),
(Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(),
m_actualAlpha, m_blocking, info);
}
@ -379,84 +379,92 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
} // end namespace internal
namespace internal {
template<typename Lhs, typename Rhs>
class GeneralProduct<Lhs, Rhs, GemmProduct>
: public ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs>
struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
: generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct> >
{
enum {
MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(Lhs::MaxColsAtCompileTime,Rhs::MaxRowsAtCompileTime)
};
public:
EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct)
typedef typename Lhs::Scalar LhsScalar;
typedef typename Rhs::Scalar RhsScalar;
typedef Scalar ResScalar;
GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
typedef typename Product<Lhs,Rhs>::Index Index;
typedef typename Lhs::Scalar LhsScalar;
typedef typename Rhs::Scalar RhsScalar;
typedef internal::blas_traits<Lhs> LhsBlasTraits;
typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
typedef typename internal::remove_all<ActualLhsType>::type ActualLhsTypeCleaned;
typedef internal::blas_traits<Rhs> RhsBlasTraits;
typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
enum {
MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(Lhs::MaxColsAtCompileTime,Rhs::MaxRowsAtCompileTime)
};
typedef generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode> lazyproduct;
template<typename Dst>
static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
lazyproduct::evalTo(dst, lhs, rhs);
else
{
typedef internal::scalar_product_op<LhsScalar,RhsScalar> BinOp;
EIGEN_CHECK_BINARY_COMPATIBILIY(BinOp,LhsScalar,RhsScalar);
}
template<typename Dest>
inline void evalTo(Dest& dst) const
{
if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0)
dst.noalias() = m_lhs .lazyProduct( m_rhs );
else
{
dst.setZero();
scaleAndAddTo(dst,Scalar(1));
}
dst.setZero();
scaleAndAddTo(dst, lhs, rhs, Scalar(1));
}
}
template<typename Dest>
inline void addTo(Dest& dst) const
{
if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0)
dst.noalias() += m_lhs .lazyProduct( m_rhs );
else
scaleAndAddTo(dst,Scalar(1));
}
template<typename Dst>
static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
lazyproduct::addTo(dst, lhs, rhs);
else
scaleAndAddTo(dst,lhs, rhs, Scalar(1));
}
template<typename Dest>
inline void subTo(Dest& dst) const
{
if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0)
dst.noalias() -= m_lhs .lazyProduct( m_rhs );
else
scaleAndAddTo(dst,Scalar(-1));
}
template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
{
eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
template<typename Dst>
static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
lazyproduct::subTo(dst, lhs, rhs);
else
scaleAndAddTo(dst, lhs, rhs, Scalar(-1));
}
template<typename Dest>
static void scaleAndAddTo(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha)
{
eigen_assert(dst.rows()==a_lhs.rows() && dst.cols()==a_rhs.cols());
typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
* RhsBlasTraits::extractScalarFactor(m_rhs);
Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs)
* RhsBlasTraits::extractScalarFactor(a_rhs);
typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,LhsScalar,RhsScalar,
Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType;
typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,LhsScalar,RhsScalar,
Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType;
typedef internal::gemm_functor<
Scalar, Index,
internal::general_matrix_matrix_product<
Index,
LhsScalar, (_ActualLhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate),
RhsScalar, (_ActualRhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate),
(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>,
_ActualLhsType, _ActualRhsType, Dest, BlockingType> GemmFunctor;
typedef internal::gemm_functor<
Scalar, Index,
internal::general_matrix_matrix_product<
Index,
LhsScalar, (ActualLhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate),
RhsScalar, (ActualRhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate),
(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>,
ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType> GemmFunctor;
BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), this->rows(), this->cols(), Dest::Flags&RowMajorBit);
}
BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>
(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), Dest::Flags&RowMajorBit);
}
};
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_GENERAL_MATRIX_MATRIX_H
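All three entry points above (evalTo/addTo/subTo) share one dispatch rule; as a hedged sketch using the size-20 threshold from the excerpt:

template <typename Dst, typename Lhs, typename Rhs>
void gemm_dispatch(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
  if ((rhs.rows() + dst.rows() + dst.cols()) < 20 && rhs.rows() > 0)
    dst.noalias() = lhs.lazyProduct(rhs); // tiny product: coefficient-based path
  else {
    dst.setZero();                        // big product: blocked GEMM kernel
    // scaleAndAddTo(dst, lhs, rhs, Scalar(1)); // accumulates alpha * lhs * rhs
  }
}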

View File

@ -20,7 +20,7 @@ namespace internal {
/**********************************************************************
* This file implements a general A * B product while
* evaluating only one triangular part of the product.
* This is more general version of self adjoint product (C += A A^T)
* This is a more general version of self adjoint product (C += A A^T)
* as the level 3 SYRK Blas routine.
**********************************************************************/
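In scalar form, "evaluating only one triangular part" means the kernel never writes the other half; an Upper-case sketch (not the blocked implementation):

template <typename Mat>
void product_to_upper(Mat& C, const Mat& A, const Mat& B, double alpha) {
  for (int j = 0; j < C.cols(); ++j)
    for (int i = 0; i <= j; ++i) { // only i <= j: the strictly lower part is untouched
      double s = 0;
      for (int k = 0; k < A.cols(); ++k)
        s += A(i, k) * B(k, j);
      C(i, j) += alpha * s;        // SYRK-like rank update when B == A^T
    }
}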
@ -270,14 +270,14 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
};
template<typename MatrixType, unsigned int UpLo>
template<typename ProductDerived, typename _Lhs, typename _Rhs>
TriangularView<MatrixType,UpLo>& TriangularView<MatrixType,UpLo>::assignProduct(const ProductBase<ProductDerived, _Lhs,_Rhs>& prod, const Scalar& alpha)
template<typename ProductType>
TriangularView<MatrixType,UpLo>& TriangularViewImpl<MatrixType,UpLo,Dense>::_assignProduct(const ProductType& prod, const Scalar& alpha)
{
eigen_assert(m_matrix.rows() == prod.rows() && m_matrix.cols() == prod.cols());
general_product_to_triangular_selector<MatrixType, ProductDerived, UpLo, (_Lhs::ColsAtCompileTime==1) || (_Rhs::RowsAtCompileTime==1)>::run(m_matrix.const_cast_derived(), prod.derived(), alpha);
eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols());
return *this;
general_product_to_triangular_selector<MatrixType, ProductType, UpLo, internal::traits<ProductType>::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha);
return derived();
}
} // end namespace Eigen

View File

@ -129,7 +129,7 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
Index blockRows = (rows / threads);
blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr;
GemmParallelInfo<Index>* info = new GemmParallelInfo<Index>[threads];
ei_declare_aligned_stack_constructed_variable(GemmParallelInfo<Index>,info,threads,0);
#pragma omp parallel num_threads(threads)
{
@ -146,8 +146,6 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
if(transpose) func(c0, actualBlockCols, 0, rows, info);
else func(0, rows, c0, actualBlockCols, info);
}
delete[] info;
#endif
}
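The row-blocking arithmetic above, in isolation (a sketch; func and mr stand in for the functor and the kernel's register-block height): each thread gets a slice rounded down to a multiple of mr, and the last thread absorbs the remainder.

#include <omp.h>
void parallel_rows(long rows, int threads, int mr) {
  long blockRows = (rows / threads) / mr * mr; // multiple of mr, as in the excerpt
  #pragma omp parallel num_threads(threads)
  {
    int i = omp_get_thread_num();
    long r0 = i * blockRows;
    long actualRows = (i + 1 == threads) ? rows - r0 : blockRows;
    (void)actualRows; // func(r0, actualRows, 0, cols, info) would run here
  }
}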

View File

@ -469,55 +469,54 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
***************************************************************************/
namespace internal {
template<typename Lhs, int LhsMode, typename Rhs, int RhsMode>
struct traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false> >
: traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs> >
{};
}
template<typename Lhs, int LhsMode, typename Rhs, int RhsMode>
struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>
: public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs >
struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,RhsMode,false>
{
EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)
SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
typedef typename Product<Lhs,Rhs>::Index Index;
typedef internal::blas_traits<Lhs> LhsBlasTraits;
typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
typedef internal::blas_traits<Rhs> RhsBlasTraits;
typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
enum {
LhsIsUpper = (LhsMode&(Upper|Lower))==Upper,
LhsIsSelfAdjoint = (LhsMode&SelfAdjoint)==SelfAdjoint,
RhsIsUpper = (RhsMode&(Upper|Lower))==Upper,
RhsIsSelfAdjoint = (RhsMode&SelfAdjoint)==SelfAdjoint
};
template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
template<typename Dest>
static void run(Dest &dst, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
{
eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
eigen_assert(dst.rows()==a_lhs.rows() && dst.cols()==a_rhs.cols());
typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
* RhsBlasTraits::extractScalarFactor(m_rhs);
Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs)
* RhsBlasTraits::extractScalarFactor(a_rhs);
internal::product_selfadjoint_matrix<Scalar, Index,
EIGEN_LOGICAL_XOR(LhsIsUpper,
internal::traits<Lhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint,
EIGEN_LOGICAL_XOR(LhsIsUpper,internal::traits<Lhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint,
NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)),
EIGEN_LOGICAL_XOR(RhsIsUpper,
internal::traits<Rhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint,
EIGEN_LOGICAL_XOR(RhsIsUpper,internal::traits<Rhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint,
NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)),
internal::traits<Dest>::Flags&RowMajorBit ? RowMajor : ColMajor>
::run(
lhs.rows(), rhs.cols(), // sizes
&lhs.coeffRef(0,0), lhs.outerStride(), // lhs info
&rhs.coeffRef(0,0), rhs.outerStride(), // rhs info
&dst.coeffRef(0,0), dst.outerStride(), // result info
actualAlpha // alpha
lhs.rows(), rhs.cols(), // sizes
&lhs.coeffRef(0,0), lhs.outerStride(), // lhs info
&rhs.coeffRef(0,0), rhs.outerStride(), // rhs info
&dst.coeffRef(0,0), dst.outerStride(), // result info
actualAlpha // alpha
);
}
};
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_H

View File

@ -169,45 +169,45 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
***************************************************************************/
namespace internal {
template<typename Lhs, int LhsMode, typename Rhs>
struct traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true> >
: traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs> >
{};
}
template<typename Lhs, int LhsMode, typename Rhs>
struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>
: public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs >
struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,0,true>
{
EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
typedef typename Product<Lhs,Rhs>::Index Index;
typedef internal::blas_traits<Lhs> LhsBlasTraits;
typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
typedef typename internal::remove_all<ActualLhsType>::type ActualLhsTypeCleaned;
typedef internal::blas_traits<Rhs> RhsBlasTraits;
typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
enum {
LhsUpLo = LhsMode&(Upper|Lower)
};
enum { LhsUpLo = LhsMode&(Upper|Lower) };
SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
template<typename Dest>
static void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
{
typedef typename Dest::Scalar ResScalar;
typedef typename Base::RhsScalar RhsScalar;
typedef typename Rhs::Scalar RhsScalar;
typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
eigen_assert(dest.rows()==m_lhs.rows() && dest.cols()==m_rhs.cols());
eigen_assert(dest.rows()==a_lhs.rows() && dest.cols()==a_rhs.cols());
typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
* RhsBlasTraits::extractScalarFactor(m_rhs);
Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs)
* RhsBlasTraits::extractScalarFactor(a_rhs);
enum {
EvalToDest = (Dest::InnerStrideAtCompileTime==1),
UseRhs = (_ActualRhsType::InnerStrideAtCompileTime==1)
UseRhs = (ActualRhsTypeCleaned::InnerStrideAtCompileTime==1)
};
internal::gemv_static_vector_if<ResScalar,Dest::SizeAtCompileTime,Dest::MaxSizeAtCompileTime,!EvalToDest> static_dest;
internal::gemv_static_vector_if<RhsScalar,_ActualRhsType::SizeAtCompileTime,_ActualRhsType::MaxSizeAtCompileTime,!UseRhs> static_rhs;
internal::gemv_static_vector_if<RhsScalar,ActualRhsTypeCleaned::SizeAtCompileTime,ActualRhsTypeCleaned::MaxSizeAtCompileTime,!UseRhs> static_rhs;
ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
EvalToDest ? dest.data() : static_dest.data());
@ -230,11 +230,12 @@ struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>
Index size = rhs.size();
EIGEN_DENSE_STORAGE_CTOR_PLUGIN
#endif
Map<typename _ActualRhsType::PlainObject>(actualRhsPtr, rhs.size()) = rhs;
Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, rhs.size()) = rhs;
}
internal::selfadjoint_matrix_vector_product<Scalar, Index, (internal::traits<_ActualLhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)>::run
internal::selfadjoint_matrix_vector_product<Scalar, Index, (internal::traits<ActualLhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor,
int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)>::run
(
lhs.rows(), // size
&lhs.coeffRef(0,0), lhs.outerStride(), // lhs info
@ -248,34 +249,24 @@ struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>
}
};
namespace internal {
template<typename Lhs, typename Rhs, int RhsMode>
struct traits<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false> >
: traits<ProductBase<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>, Lhs, Rhs> >
{};
}
template<typename Lhs, typename Rhs, int RhsMode>
struct SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>
: public ProductBase<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>, Lhs, Rhs >
struct selfadjoint_product_impl<Lhs,0,true,Rhs,RhsMode,false>
{
EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
enum { RhsUpLo = RhsMode&(Upper|Lower) };
enum {
RhsUpLo = RhsMode&(Upper|Lower)
};
SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
template<typename Dest>
static void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
{
// let's simply transpose the product
Transpose<Dest> destT(dest);
SelfadjointProductMatrix<Transpose<const Rhs>, int(RhsUpLo)==Upper ? Lower : Upper, false,
Transpose<const Lhs>, 0, true>(m_rhs.transpose(), m_lhs.transpose()).scaleAndAddTo(destT, alpha);
selfadjoint_product_impl<Transpose<const Rhs>, int(RhsUpLo)==Upper ? Lower : Upper, false,
Transpose<const Lhs>, 0, true>::run(destT, a_rhs.transpose(), a_lhs.transpose(), alpha);
}
};
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_H

View File

@ -79,11 +79,11 @@ SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
if (IsRowMajor)
actualAlpha = numext::conj(actualAlpha);
internal::selfadjoint_rank2_update_selector<Scalar, Index,
typename internal::remove_all<typename internal::conj_expr_if<IsRowMajor ^ UBlasTraits::NeedToConjugate,_ActualUType>::type>::type,
typename internal::remove_all<typename internal::conj_expr_if<IsRowMajor ^ VBlasTraits::NeedToConjugate,_ActualVType>::type>::type,
typedef typename internal::remove_all<typename internal::conj_expr_if<IsRowMajor ^ UBlasTraits::NeedToConjugate,_ActualUType>::type>::type UType;
typedef typename internal::remove_all<typename internal::conj_expr_if<IsRowMajor ^ VBlasTraits::NeedToConjugate,_ActualVType>::type>::type VType;
internal::selfadjoint_rank2_update_selector<Scalar, Index, UType, VType,
(IsRowMajor ? int(UpLo==Upper ? Lower : Upper) : UpLo)>
::run(_expression().const_cast_derived().data(),_expression().outerStride(),actualU,actualV,actualAlpha);
::run(_expression().const_cast_derived().data(),_expression().outerStride(),UType(actualU),VType(actualV),actualAlpha);
return *this;
}

View File

@ -380,28 +380,29 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
* Wrapper to product_triangular_matrix_matrix
***************************************************************************/
template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
struct traits<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false> >
: traits<ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>, Lhs, Rhs> >
{};
} // end namespace internal
namespace internal {
template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
struct TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
: public ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>, Lhs, Rhs >
struct triangular_product_impl<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
{
EIGEN_PRODUCT_PUBLIC_INTERFACE(TriangularProduct)
TriangularProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
template<typename Dest> static void run(Dest& dst, const Lhs &a_lhs, const Rhs &a_rhs, const typename Dest::Scalar& alpha)
{
typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
typedef typename Dest::Index Index;
typedef typename Dest::Scalar Scalar;
typedef internal::blas_traits<Lhs> LhsBlasTraits;
typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
typedef typename internal::remove_all<ActualLhsType>::type ActualLhsTypeCleaned;
typedef internal::blas_traits<Rhs> RhsBlasTraits;
typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
* RhsBlasTraits::extractScalarFactor(m_rhs);
Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs)
* RhsBlasTraits::extractScalarFactor(a_rhs);
typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,4> BlockingType;
@ -416,19 +417,21 @@ struct TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
internal::product_triangular_matrix_matrix<Scalar, Index,
Mode, LhsIsTriangular,
(internal::traits<_ActualLhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
(internal::traits<_ActualRhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
(internal::traits<ActualLhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
(internal::traits<ActualRhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
(internal::traits<Dest >::Flags&RowMajorBit) ? RowMajor : ColMajor>
::run(
stripedRows, stripedCols, stripedDepth, // sizes
&lhs.coeffRef(0,0), lhs.outerStride(), // lhs info
&rhs.coeffRef(0,0), rhs.outerStride(), // rhs info
&lhs.coeffRef(0,0), lhs.outerStride(), // lhs info
&rhs.coeffRef(0,0), rhs.outerStride(), // rhs info
&dst.coeffRef(0,0), dst.outerStride(), // result info
actualAlpha, blocking
);
}
};
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_H

Some files were not shown because too many files have changed in this diff.