Merged latest updates from trunk

2025-09-26 00:03:14 +08:00 · 2016-10-05 18:48:55 -07:00 · 2016-10-05 18:48:55 -07:00 · 78b569f685
commit 78b569f685
parent 9c2b6c049b 4387433acf
474 changed files with 33986 additions and 6837 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,4 +1,4 @@
-project(Eigen)
+project(Eigen3)
 cmake_minimum_required(VERSION 2.8.5)
@ -8,6 +8,11 @@ if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR})
  message(FATAL_ERROR "In-source builds not allowed. Please make a new directory (called a build directory) and run CMake from there. You may need to remove CMakeCache.txt. ")
 endif()
 # Alias Eigen_*_DIR to Eigen3_*_DIR:
 set(Eigen_SOURCE_DIR ${Eigen3_SOURCE_DIR})
 set(Eigen_BINARY_DIR ${Eigen3_BINARY_DIR})
 # guard against bad build-type strings
 if (NOT CMAKE_BUILD_TYPE)
@ -93,9 +98,11 @@ else()
 endif()
 option(EIGEN_BUILD_BTL "Build benchmark suite" OFF)
-if(NOT WIN32)
+
 # Disable pkgconfig only for native Windows builds
 if(NOT WIN32 OR NOT CMAKE_HOST_SYSTEM_NAME MATCHES Windows)
  option(EIGEN_BUILD_PKGCONFIG "Build pkg-config .pc file for Eigen" ON)
-endif(NOT WIN32)
+endif()
 set(CMAKE_INCLUDE_CURRENT_DIR ON)
@ -120,7 +127,7 @@ endmacro(ei_add_cxx_compiler_flag)
 if(NOT MSVC)
  # We assume that other compilers are partly compatible with GNUCC
-  # clang outputs some warnings for unknwon flags that are not caught by check_cxx_compiler_flag
+  # clang outputs some warnings for unknown flags that are not caught by check_cxx_compiler_flag
  # adding -Werror turns such warnings into errors
  check_cxx_compiler_flag("-Werror" COMPILER_SUPPORT_WERROR)
  if(COMPILER_SUPPORT_WERROR)
@ -142,8 +149,11 @@ if(NOT MSVC)
  ei_add_cxx_compiler_flag("-Wwrite-strings")
  ei_add_cxx_compiler_flag("-Wformat-security")
  ei_add_cxx_compiler_flag("-Wshorten-64-to-32")
  ei_add_cxx_compiler_flag("-Wlogical-op")
  ei_add_cxx_compiler_flag("-Wenum-conversion")
  ei_add_cxx_compiler_flag("-Wc++11-extensions")
  ei_add_cxx_compiler_flag("-Wdouble-promotion")
 #  ei_add_cxx_compiler_flag("-Wconversion")
  # -Wshadow is insanely too strict with gcc, hopefully it will become usable with gcc 6
  # if(NOT CMAKE_COMPILER_IS_GNUCXX OR (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "5.0.0"))
@ -159,7 +169,7 @@ if(NOT MSVC)
  ei_add_cxx_compiler_flag("-fno-common")
  ei_add_cxx_compiler_flag("-fstrict-aliasing")
  ei_add_cxx_compiler_flag("-wd981")                    # disable ICC's "operands are evaluated in unspecified order" remark
-  ei_add_cxx_compiler_flag("-wd2304")                   # disbale ICC's "warning #2304: non-explicit constructor with single argument may cause implicit type conversion" produced by -Wnon-virtual-dtor
+  ei_add_cxx_compiler_flag("-wd2304")                   # disable ICC's "warning #2304: non-explicit constructor with single argument may cause implicit type conversion" produced by -Wnon-virtual-dtor
  # The -ansi flag must be added last, otherwise it is also used as a linker flag by check_cxx_compiler_flag making it fail
@ -402,7 +412,7 @@ if(EIGEN_BUILD_PKGCONFIG)
    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/eigen3.pc
        DESTINATION ${PKGCONFIG_INSTALL_DIR}
        )
-endif(EIGEN_BUILD_PKGCONFIG)
+endif()
 add_subdirectory(Eigen)
--- a/Eigen/CMakeLists.txt
+++ b/Eigen/CMakeLists.txt
@ -16,4 +16,4 @@ install(FILES
  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen COMPONENT Devel
  )
-add_subdirectory(src)
+install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen COMPONENT Devel FILES_MATCHING PATTERN "*.h")
--- a/Eigen/Cholesky
+++ b/Eigen/Cholesky
@ -31,7 +31,8 @@
 #include "src/Cholesky/LLT.h"
 #include "src/Cholesky/LDLT.h"
 #ifdef EIGEN_USE_LAPACKE
-#include "src/Cholesky/LLT_MKL.h"
+#include "src/misc/lapacke.h"
 #include "src/Cholesky/LLT_LAPACKE.h"
 #endif
 #include "src/Core/util/ReenableStupidWarnings.h"
--- a/Eigen/Core
+++ b/Eigen/Core
@ -164,6 +164,7 @@
      #if EIGEN_COMP_ICC >= 1110
        #include <immintrin.h>
      #else
        #include <mmintrin.h>
        #include <emmintrin.h>
        #include <xmmintrin.h>
        #ifdef  EIGEN_VECTORIZE_SSE3
@ -259,6 +260,11 @@
 // for min/max:
 #include <algorithm>
 // for std::is_nothrow_move_assignable
 #ifdef EIGEN_INCLUDE_TYPE_TRAITS
 #include <type_traits>
 #endif
 // for outputting debug info
 #ifdef EIGEN_DEBUG_ASSIGN
 #include <iostream>
@ -332,8 +338,8 @@ using std::ptrdiff_t;
 #include "src/Core/NumTraits.h"
 #include "src/Core/MathFunctions.h"
 #include "src/Core/SpecialFunctions.h"
 #include "src/Core/GenericPacketMath.h"
 #include "src/Core/MathFunctionsImpl.h"
 #if defined EIGEN_VECTORIZE_AVX512
  #include "src/Core/arch/SSE/PacketMath.h"
@ -368,23 +374,29 @@ using std::ptrdiff_t;
  #include "src/Core/arch/ZVector/Complex.h"
 #endif
 // Half float support
 #include "src/Core/arch/CUDA/Half.h"
 #include "src/Core/arch/CUDA/PacketMathHalf.h"
 #include "src/Core/arch/CUDA/TypeCasting.h"
 #if defined EIGEN_VECTORIZE_CUDA
  #include "src/Core/arch/CUDA/PacketMath.h"
  #include "src/Core/arch/CUDA/PacketMathHalf.h"
  #include "src/Core/arch/CUDA/MathFunctions.h"
  #include "src/Core/arch/CUDA/TypeCasting.h"
 #endif
 #include "src/Core/arch/Default/Settings.h"
 #include "src/Core/functors/TernaryFunctors.h"
 #include "src/Core/functors/BinaryFunctors.h"
 #include "src/Core/functors/UnaryFunctors.h"
 #include "src/Core/functors/NullaryFunctors.h"
 #include "src/Core/functors/StlFunctors.h"
 #include "src/Core/functors/AssignmentFunctors.h"
 // Specialized functors to enable the processing of complex numbers
 // on CUDA devices
 #include "src/Core/arch/CUDA/Complex.h"
 #include "src/Core/DenseCoeffsBase.h"
 #include "src/Core/DenseBase.h"
 #include "src/Core/MatrixBase.h"
@ -411,6 +423,7 @@ using std::ptrdiff_t;
 #include "src/Core/PlainObjectBase.h"
 #include "src/Core/Matrix.h"
 #include "src/Core/Array.h"
 #include "src/Core/CwiseTernaryOp.h"
 #include "src/Core/CwiseBinaryOp.h"
 #include "src/Core/CwiseUnaryOp.h"
 #include "src/Core/CwiseNullaryOp.h"
--- a/Eigen/Eigenvalues
+++ b/Eigen/Eigenvalues
@ -32,6 +32,7 @@
  * \endcode
  */
 #include "src/misc/RealSvd2x2.h"
 #include "src/Eigenvalues/Tridiagonalization.h"
 #include "src/Eigenvalues/RealSchur.h"
 #include "src/Eigenvalues/EigenSolver.h"
@ -44,9 +45,10 @@
 #include "src/Eigenvalues/GeneralizedEigenSolver.h"
 #include "src/Eigenvalues/MatrixBaseEigenvalues.h"
 #ifdef EIGEN_USE_LAPACKE
-#include "src/Eigenvalues/RealSchur_MKL.h"
+#include "src/misc/lapacke.h"
-#include "src/Eigenvalues/ComplexSchur_MKL.h"
+#include "src/Eigenvalues/RealSchur_LAPACKE.h"
-#include "src/Eigenvalues/SelfAdjointEigenSolver_MKL.h"
+#include "src/Eigenvalues/ComplexSchur_LAPACKE.h"
 #include "src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h"
 #endif
 #include "src/Core/util/ReenableStupidWarnings.h"
--- a/Eigen/LU
+++ b/Eigen/LU
@ -28,7 +28,8 @@
 #include "src/LU/FullPivLU.h"
 #include "src/LU/PartialPivLU.h"
 #ifdef EIGEN_USE_LAPACKE
-#include "src/LU/PartialPivLU_MKL.h"
+#include "src/misc/lapacke.h"
 #include "src/LU/PartialPivLU_LAPACKE.h"
 #endif
 #include "src/LU/Determinant.h"
 #include "src/LU/InverseImpl.h"
--- a/Eigen/QR
+++ b/Eigen/QR
@ -36,8 +36,9 @@
 #include "src/QR/ColPivHouseholderQR.h"
 #include "src/QR/CompleteOrthogonalDecomposition.h"
 #ifdef EIGEN_USE_LAPACKE
-#include "src/QR/HouseholderQR_MKL.h"
+#include "src/misc/lapacke.h"
-#include "src/QR/ColPivHouseholderQR_MKL.h"
+#include "src/QR/HouseholderQR_LAPACKE.h"
 #include "src/QR/ColPivHouseholderQR_LAPACKE.h"
 #endif
 #include "src/Core/util/ReenableStupidWarnings.h"
--- a/Eigen/SVD
+++ b/Eigen/SVD
@ -31,12 +31,14 @@
  * \endcode
  */
 #include "src/misc/RealSvd2x2.h"
 #include "src/SVD/UpperBidiagonalization.h"
 #include "src/SVD/SVDBase.h"
 #include "src/SVD/JacobiSVD.h"
 #include "src/SVD/BDCSVD.h"
 #if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT)
-#include "src/SVD/JacobiSVD_MKL.h"
+#include "src/misc/lapacke.h"
 #include "src/SVD/JacobiSVD_LAPACKE.h"
 #endif
 #include "src/Core/util/ReenableStupidWarnings.h"
--- a/Eigen/SuperLUSupport
+++ b/Eigen/SuperLUSupport
@ -43,7 +43,7 @@ namespace Eigen { struct SluMatrix; }
  * - class SuperLU: a supernodal sequential LU factorization.
  * - class SuperILU: a supernodal sequential incomplete LU factorization (to be used as a preconditioner for iterative methods).
  *
-  * \warning This wrapper is only for the 4.x versions of SuperLU. The 3.x and 5.x versions are not supported.
+  * \warning This wrapper requires at least version 4.0 of SuperLU. The 3.x versions are not supported.
  *
  * \warning When including this module, you have to use SUPERLU_EMPTY instead of EMPTY which is no longer defined because it is too polluting.
  *
--- a/Eigen/src/CMakeLists.txt
+++ b/Eigen/src/CMakeLists.txt
@ -1,7 +0,0 @@
 file(GLOB Eigen_src_subdirectories "*")
 escape_string_as_regex(ESCAPED_CMAKE_CURRENT_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
 foreach(f ${Eigen_src_subdirectories})
  if(NOT f MATCHES "\\.txt" AND NOT f MATCHES "${ESCAPED_CMAKE_CURRENT_SOURCE_DIR}/[.].+" )
    add_subdirectory(${f})
  endif()
 endforeach()
--- a/Eigen/src/Cholesky/CMakeLists.txt
+++ b/Eigen/src/Cholesky/CMakeLists.txt
@ -1,6 +0,0 @@
 FILE(GLOB Eigen_Cholesky_SRCS "*.h")
 INSTALL(FILES
  ${Eigen_Cholesky_SRCS}
  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Cholesky COMPONENT Devel
  )
--- a/Eigen/src/Cholesky/LDLT.h
+++ b/Eigen/src/Cholesky/LDLT.h
@ -43,6 +43,8 @@ namespace internal {
  * Remember that Cholesky decompositions are not rank-revealing. Also, do not use a Cholesky
  * decomposition to determine whether a system of equations has a solution.
  *
  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
  * 
  * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt(), class LLT
  */
 template<typename _MatrixType, int _UpLo> class LDLT
@ -52,7 +54,6 @@ template<typename _MatrixType, int _UpLo> class LDLT
    enum {
      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
      Options = MatrixType::Options & ~RowMajorBit, // these are the options for the TmpMatrixType, we need a ColMajor matrix here!
      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
      UpLo = _UpLo
@ -61,7 +62,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
    typedef typename MatrixType::StorageIndex StorageIndex;
-    typedef Matrix<Scalar, RowsAtCompileTime, 1, Options, MaxRowsAtCompileTime, 1> TmpMatrixType;
+    typedef Matrix<Scalar, RowsAtCompileTime, 1, 0, MaxRowsAtCompileTime, 1> TmpMatrixType;
    typedef Transpositions<RowsAtCompileTime, MaxRowsAtCompileTime> TranspositionType;
    typedef PermutationMatrix<RowsAtCompileTime, MaxRowsAtCompileTime> PermutationType;
@ -97,6 +98,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
    /** \brief Constructor with decomposition
      *
      * This calculates the decomposition for the input \a matrix.
      *
      * \sa LDLT(Index size)
      */
    template<typename InputType>
@ -110,6 +112,23 @@ template<typename _MatrixType, int _UpLo> class LDLT
      compute(matrix.derived());
    }
    /** \brief Constructs a LDLT factorization from a given matrix
      *
      * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when \c MatrixType is an Eigen::Ref.
      *
      * \sa LDLT(const EigenBase&)
      */
    template<typename InputType>
    explicit LDLT(EigenBase<InputType>& matrix)
      : m_matrix(matrix.derived()),
        m_transpositions(matrix.rows()),
        m_temporary(matrix.rows()),
        m_sign(internal::ZeroSign),
        m_isInitialized(false)
    {
      compute(matrix.derived());
    }
    /** Clear any existing decomposition
     * \sa rankUpdate(w,sigma)
     */
@ -234,7 +253,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
    ComputationInfo info() const
    {
      eigen_assert(m_isInitialized && "LDLT is not initialized.");
-      return Success;
+      return m_info;
    }
    #ifndef EIGEN_PARSED_BY_DOXYGEN
@ -262,6 +281,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
    TmpMatrixType m_temporary;
    internal::SignMatrix m_sign;
    bool m_isInitialized;
    ComputationInfo m_info;
 };
 namespace internal {
@ -279,6 +299,8 @@ template<> struct ldlt_inplace<Lower>
    typedef typename TranspositionType::StorageIndex IndexType;
    eigen_assert(mat.rows()==mat.cols());
    const Index size = mat.rows();
    bool found_zero_pivot = false;
    bool ret = true;
    if (size <= 1)
    {
@ -337,9 +359,27 @@ template<> struct ldlt_inplace<Lower>
      // we should only make sure that we do not introduce INF or NaN values.
      // Remark that LAPACK also uses 0 as the cutoff value.
      RealScalar realAkk = numext::real(mat.coeffRef(k,k));
-      if((rs>0) && (abs(realAkk) > RealScalar(0)))
+      bool pivot_is_valid = (abs(realAkk) > RealScalar(0));
      if(k==0 && !pivot_is_valid)
      {
        // The entire diagonal is zero, there is nothing more to do
        // except filling the transpositions, and checking whether the matrix is zero.
        sign = ZeroSign;
        for(Index j = 0; j<size; ++j)
        {
          transpositions.coeffRef(j) = IndexType(j);
          ret = ret && (mat.col(j).tail(size-j-1).array()==Scalar(0)).all();
        }
        return ret;
      }
      if((rs>0) && pivot_is_valid)
        A21 /= realAkk;
      if(found_zero_pivot && pivot_is_valid) ret = false; // factorization failed
      else if(!pivot_is_valid) found_zero_pivot = true;
      if (sign == PositiveSemiDef) {
        if (realAkk < static_cast<RealScalar>(0)) sign = Indefinite;
      } else if (sign == NegativeSemiDef) {
@ -350,7 +390,7 @@ template<> struct ldlt_inplace<Lower>
      }
    }
-    return true;
+    return ret;
  }
  // Reference for the algorithm: Davis and Hager, "Multiple Rank
@ -474,7 +514,7 @@ LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::compute(const EigenBase<InputTyp
  m_temporary.resize(size);
  m_sign = internal::ZeroSign;
-  internal::ldlt_inplace<UpLo>::unblocked(m_matrix, m_transpositions, m_temporary, m_sign);
+  m_info = internal::ldlt_inplace<UpLo>::unblocked(m_matrix, m_transpositions, m_temporary, m_sign) ? Success : NumericalIssue;
  m_isInitialized = true;
  return *this;
@ -602,7 +642,6 @@ MatrixType LDLT<MatrixType,_UpLo>::reconstructedMatrix() const
  return res;
 }
 #ifndef __CUDACC__
 /** \cholesky_module
  * \returns the Cholesky decomposition with full pivoting without square root of \c *this
  * \sa MatrixBase::ldlt()
@ -624,7 +663,6 @@ MatrixBase<Derived>::ldlt() const
 {
  return LDLT<PlainObject>(derived());
 }
 #endif // __CUDACC__
 } // end namespace Eigen
--- a/Eigen/src/Cholesky/LLT.h
+++ b/Eigen/src/Cholesky/LLT.h
@ -41,6 +41,8 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
  * Example: \include LLT_example.cpp
  * Output: \verbinclude LLT_example.out
  *
  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
  *
  * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT
  */
 /* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH)
@ -54,7 +56,6 @@ template<typename _MatrixType, int _UpLo> class LLT
    enum {
      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
      Options = MatrixType::Options,
      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
    };
    typedef typename MatrixType::Scalar Scalar;
@ -95,6 +96,21 @@ template<typename _MatrixType, int _UpLo> class LLT
      compute(matrix.derived());
    }
    /** \brief Constructs an LLT factorization from a given matrix
      *
      * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when
      * \c MatrixType is an Eigen::Ref.
      *
      * \sa LLT(const EigenBase&)
      */
    template<typename InputType>
    explicit LLT(EigenBase<InputType>& matrix)
      : m_matrix(matrix.derived()),
        m_isInitialized(false)
    {
      compute(matrix.derived());
    }
    /** \returns a view of the upper triangular matrix U */
    inline typename Traits::MatrixU matrixU() const
    {
@ -491,7 +507,6 @@ MatrixType LLT<MatrixType,_UpLo>::reconstructedMatrix() const
  return matrixL() * matrixL().adjoint().toDenseMatrix();
 }
 #ifndef __CUDACC__
 /** \cholesky_module
  * \returns the LLT decomposition of \c *this
  * \sa SelfAdjointView::llt()
@ -513,7 +528,6 @@ SelfAdjointView<MatrixType, UpLo>::llt() const
 {
  return LLT<PlainObject,UpLo>(m_matrix);
 }
 #endif // __CUDACC__
 } // end namespace Eigen
--- a/Eigen/src/Cholesky/LLT_LAPACKE.h
+++ b/Eigen/src/Cholesky/LLT_LAPACKE.h
@ -25,25 +25,22 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to LAPACKe
 *     LLt decomposition based on LAPACKE_?potrf function.
 ********************************************************************************
 */
-#ifndef EIGEN_LLT_MKL_H
+#ifndef EIGEN_LLT_LAPACKE_H
-#define EIGEN_LLT_MKL_H
+#define EIGEN_LLT_LAPACKE_H
 #include "Eigen/src/Core/util/MKL_support.h"
 #include <iostream>
 namespace Eigen { 
 namespace internal {
-template<typename Scalar> struct mkl_llt;
+template<typename Scalar> struct lapacke_llt;
-#define EIGEN_MKL_LLT(EIGTYPE, MKLTYPE, MKLPREFIX) \
+#define EIGEN_LAPACKE_LLT(EIGTYPE, BLASTYPE, LAPACKE_PREFIX) \
-template<> struct mkl_llt<EIGTYPE> \
+template<> struct lapacke_llt<EIGTYPE> \
 { \
  template<typename MatrixType> \
  static inline Index potrf(MatrixType& m, char uplo) \
@ -53,13 +50,13 @@ template<> struct mkl_llt<EIGTYPE> \
    EIGTYPE* a; \
    eigen_assert(m.rows()==m.cols()); \
    /* Set up parameters for ?potrf */ \
-    size = m.rows(); \
+    size = convert_index<lapack_int>(m.rows()); \
    StorageOrder = MatrixType::Flags&RowMajorBit?RowMajor:ColMajor; \
    matrix_order = StorageOrder==RowMajor ? LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR; \
    a = &(m.coeffRef(0,0)); \
-    lda = m.outerStride(); \
+    lda = convert_index<lapack_int>(m.outerStride()); \
 \
-    info = LAPACKE_##MKLPREFIX##potrf( matrix_order, uplo, size, (MKLTYPE*)a, lda ); \
+    info = LAPACKE_##LAPACKE_PREFIX##potrf( matrix_order, uplo, size, (BLASTYPE*)a, lda ); \
    info = (info==0) ? -1 : info>0 ? info-1 : size; \
    return info; \
  } \
@ -69,7 +66,7 @@ template<> struct llt_inplace<EIGTYPE, Lower> \
  template<typename MatrixType> \
  static Index blocked(MatrixType& m) \
  { \
-    return mkl_llt<EIGTYPE>::potrf(m, 'L'); \
+    return lapacke_llt<EIGTYPE>::potrf(m, 'L'); \
  } \
  template<typename MatrixType, typename VectorType> \
  static Index rankUpdate(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma) \
@ -80,7 +77,7 @@ template<> struct llt_inplace<EIGTYPE, Upper> \
  template<typename MatrixType> \
  static Index blocked(MatrixType& m) \
  { \
-    return mkl_llt<EIGTYPE>::potrf(m, 'U'); \
+    return lapacke_llt<EIGTYPE>::potrf(m, 'U'); \
  } \
  template<typename MatrixType, typename VectorType> \
  static Index rankUpdate(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma) \
@ -90,13 +87,13 @@ template<> struct llt_inplace<EIGTYPE, Upper> \
  } \
 };
-EIGEN_MKL_LLT(double, double, d)
+EIGEN_LAPACKE_LLT(double, double, d)
-EIGEN_MKL_LLT(float, float, s)
+EIGEN_LAPACKE_LLT(float, float, s)
-EIGEN_MKL_LLT(dcomplex, MKL_Complex16, z)
+EIGEN_LAPACKE_LLT(dcomplex, lapack_complex_double, z)
-EIGEN_MKL_LLT(scomplex, MKL_Complex8, c)
+EIGEN_LAPACKE_LLT(scomplex, lapack_complex_float, c)
 } // end namespace internal
 } // end namespace Eigen
-#endif // EIGEN_LLT_MKL_H
+#endif // EIGEN_LLT_LAPACKE_H
--- a/Eigen/src/CholmodSupport/CMakeLists.txt
+++ b/Eigen/src/CholmodSupport/CMakeLists.txt
@ -1,6 +0,0 @@
 FILE(GLOB Eigen_CholmodSupport_SRCS "*.h")
 INSTALL(FILES 
  ${Eigen_CholmodSupport_SRCS}
  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/CholmodSupport COMPONENT Devel
  )
--- a/Eigen/src/Core/Array.h
+++ b/Eigen/src/Core/Array.h
@ -37,7 +37,7 @@ struct traits<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > : tra
  * storage layout.
  *
  * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_ARRAY_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_ARRAY_PLUGIN.
  *
  * \sa \blank \ref TutorialArrayClass, \ref TopicClassHierarchy
  */
@ -147,9 +147,9 @@ class Array
    }
 #endif
-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+#if EIGEN_HAS_RVALUE_REFERENCES
    EIGEN_DEVICE_FUNC
-    Array(Array&& other)
+    Array(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible<Scalar>::value)
      : Base(std::move(other))
    {
      Base::_check_template_params();
@ -157,7 +157,7 @@ class Array
        Base::_set_noalias(other);
    }
    EIGEN_DEVICE_FUNC
-    Array& operator=(Array&& other)
+    Array& operator=(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value)
    {
      other.swap(*this);
      return *this;
--- a/Eigen/src/Core/ArrayBase.h
+++ b/Eigen/src/Core/ArrayBase.h
@ -32,7 +32,7 @@ template<typename ExpressionType> class MatrixWrapper;
  * \tparam Derived is the derived type, e.g., an array or an expression type.
  *
  * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_ARRAYBASE_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_ARRAYBASE_PLUGIN.
  *
  * \sa class MatrixBase, \ref TopicClassHierarchy
  */
@ -52,8 +52,6 @@ template<typename Derived> class ArrayBase
    typedef typename NumTraits<Scalar>::Real RealScalar;
    typedef DenseBase<Derived> Base;
    using Base::operator*;
    using Base::operator/;
    using Base::RowsAtCompileTime;
    using Base::ColsAtCompileTime;
    using Base::SizeAtCompileTime;
@ -89,6 +87,7 @@ template<typename Derived> class ArrayBase
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::ArrayBase
 #define EIGEN_DOC_UNARY_ADDONS(X,Y)
 #   include "../plugins/CommonCwiseUnaryOps.h"
 #   include "../plugins/MatrixCwiseUnaryOps.h"
 #   include "../plugins/ArrayCwiseUnaryOps.h"
@ -99,6 +98,7 @@ template<typename Derived> class ArrayBase
 #     include EIGEN_ARRAYBASE_PLUGIN
 #   endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
 #undef EIGEN_DOC_UNARY_ADDONS
    /** Special case of the template operator=, in order to prevent the compiler
      * from generating a default operator= (issue hit with g++ 4.1)
@ -178,7 +178,7 @@ template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived> &other)
 {
-  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar>());
+  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
  return derived();
 }
@ -191,7 +191,7 @@ template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other)
 {
-  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar>());
+  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
  return derived();
 }
@ -217,7 +217,7 @@ template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator/=(const ArrayBase<OtherDerived>& other)
 {
-  call_assignment(derived(), other.derived(), internal::div_assign_op<Scalar>());
+  call_assignment(derived(), other.derived(), internal::div_assign_op<Scalar,typename OtherDerived::Scalar>());
  return derived();
 }
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@ -75,23 +75,24 @@ private:
    DstIsRowMajor = DstFlags&RowMajorBit,
    SrcIsRowMajor = SrcFlags&RowMajorBit,
    StorageOrdersAgree = (int(DstIsRowMajor) == int(SrcIsRowMajor)),
-    MightVectorize = StorageOrdersAgree
+    MightVectorize = bool(StorageOrdersAgree)
                  && (int(DstFlags) & int(SrcFlags) & ActualPacketAccessBit)
-                  && (functor_traits<AssignFunc>::PacketAccess),
+                  && bool(functor_traits<AssignFunc>::PacketAccess),
    MayInnerVectorize  = MightVectorize
                       && int(InnerSize)!=Dynamic && int(InnerSize)%int(InnerPacketSize)==0
                       && int(OuterStride)!=Dynamic && int(OuterStride)%int(InnerPacketSize)==0
-                       && int(JointAlignment)>=int(InnerRequiredAlignment),
+                       && (EIGEN_UNALIGNED_VECTORIZE  || int(JointAlignment)>=int(InnerRequiredAlignment)),
-    MayLinearize = StorageOrdersAgree && (int(DstFlags) & int(SrcFlags) & LinearAccessBit),
+    MayLinearize = bool(StorageOrdersAgree) && (int(DstFlags) & int(SrcFlags) & LinearAccessBit),
-    MayLinearVectorize = MightVectorize && MayLinearize && DstHasDirectAccess
+    MayLinearVectorize = bool(MightVectorize) && MayLinearize && DstHasDirectAccess
-                       && ((int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic),
+                       && (EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic),
      /* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
         so it's only good for large enough sizes. */
-    MaySliceVectorize  = MightVectorize && DstHasDirectAccess
+    MaySliceVectorize  = bool(MightVectorize) && bool(DstHasDirectAccess)
-                       && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=3*InnerPacketSize)
+                       && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=(EIGEN_UNALIGNED_VECTORIZE?InnerPacketSize:(3*InnerPacketSize)))
      /* slice vectorization can be slow, so we only want it if the slices are big, which is
         indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block
-         in a fixed-size matrix */
+         in a fixed-size matrix
         However, with EIGEN_UNALIGNED_VECTORIZE and unrolling, slice vectorization is still worth it */
  };
 public:
@ -116,9 +117,9 @@ private:
                        : 1,
    UnrollingLimit      = EIGEN_UNROLLING_LIMIT * ActualPacketSize,
    MayUnrollCompletely = int(Dst::SizeAtCompileTime) != Dynamic
-                       && int(Dst::SizeAtCompileTime) * int(SrcEvaluator::CoeffReadCost) <= int(UnrollingLimit),
+                       && int(Dst::SizeAtCompileTime) * (int(DstEvaluator::CoeffReadCost)+int(SrcEvaluator::CoeffReadCost)) <= int(UnrollingLimit),
    MayUnrollInner      = int(InnerSize) != Dynamic
-                       && int(InnerSize) * int(SrcEvaluator::CoeffReadCost) <= int(UnrollingLimit)
+                       && int(InnerSize) * (int(DstEvaluator::CoeffReadCost)+int(SrcEvaluator::CoeffReadCost)) <= int(UnrollingLimit)
  };
 public:
@ -130,11 +131,17 @@ public:
                                             : int(NoUnrolling)
                  )
              : int(Traversal) == int(LinearVectorizedTraversal)
-                ? ( bool(MayUnrollCompletely) && (int(DstAlignment)>=int(LinearRequiredAlignment)) ? int(CompleteUnrolling)
+                ? ( bool(MayUnrollCompletely) && ( EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)))
                          ? int(CompleteUnrolling)
                          : int(NoUnrolling) )
              : int(Traversal) == int(LinearTraversal)
                ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling) 
                                              : int(NoUnrolling) )
 #if EIGEN_UNALIGNED_VECTORIZE
              : int(Traversal) == int(SliceVectorizedTraversal)
                ? ( bool(MayUnrollInner) ? int(InnerUnrolling)
                                         : int(NoUnrolling) )
 #endif
              : int(NoUnrolling)
  };
@ -156,6 +163,7 @@ public:
    EIGEN_DEBUG_VAR(InnerMaxSize)
    EIGEN_DEBUG_VAR(LinearPacketSize)
    EIGEN_DEBUG_VAR(InnerPacketSize)
    EIGEN_DEBUG_VAR(ActualPacketSize)
    EIGEN_DEBUG_VAR(StorageOrdersAgree)
    EIGEN_DEBUG_VAR(MightVectorize)
    EIGEN_DEBUG_VAR(MayLinearize)
@ -163,6 +171,7 @@ public:
    EIGEN_DEBUG_VAR(MayLinearVectorize)
    EIGEN_DEBUG_VAR(MaySliceVectorize)
    std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl;
    EIGEN_DEBUG_VAR(SrcEvaluator::CoeffReadCost)
    EIGEN_DEBUG_VAR(UnrollingLimit)
    EIGEN_DEBUG_VAR(MayUnrollCompletely)
    EIGEN_DEBUG_VAR(MayUnrollInner)
@ -256,13 +265,13 @@ struct copy_using_evaluator_innervec_CompleteUnrolling
  enum {
    outer = Index / DstXprType::InnerSizeAtCompileTime,
    inner = Index % DstXprType::InnerSizeAtCompileTime,
-    JointAlignment = Kernel::AssignmentTraits::JointAlignment,
+    SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
-    DefaultAlignment = unpacket_traits<PacketType>::alignment
+    DstAlignment = Kernel::AssignmentTraits::DstAlignment
  };
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
-    kernel.template assignPacketByOuterInner<DefaultAlignment, JointAlignment, PacketType>(outer, inner);
+    kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, inner);
    enum { NextIndex = Index + unpacket_traits<PacketType>::size };
    copy_using_evaluator_innervec_CompleteUnrolling<Kernel, NextIndex, Stop>::run(kernel);
  }
@ -274,23 +283,20 @@ struct copy_using_evaluator_innervec_CompleteUnrolling<Kernel, Stop, Stop>
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { }
 };
-template<typename Kernel, int Index_, int Stop>
+template<typename Kernel, int Index_, int Stop, int SrcAlignment, int DstAlignment>
 struct copy_using_evaluator_innervec_InnerUnrolling
 {
  typedef typename Kernel::PacketType PacketType;
  enum {
    DefaultAlignment = unpacket_traits<PacketType>::alignment
  };
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel, Index outer)
  {
-    kernel.template assignPacketByOuterInner<DefaultAlignment, DefaultAlignment, PacketType>(outer, Index_);
+    kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, Index_);
    enum { NextIndex = Index_ + unpacket_traits<PacketType>::size };
-    copy_using_evaluator_innervec_InnerUnrolling<Kernel, NextIndex, Stop>::run(kernel, outer);
+    copy_using_evaluator_innervec_InnerUnrolling<Kernel, NextIndex, Stop, SrcAlignment, DstAlignment>::run(kernel, outer);
  }
 };
-template<typename Kernel, int Stop>
+template<typename Kernel, int Stop, int SrcAlignment, int DstAlignment>
-struct copy_using_evaluator_innervec_InnerUnrolling<Kernel, Stop, Stop>
+struct copy_using_evaluator_innervec_InnerUnrolling<Kernel, Stop, Stop, SrcAlignment, DstAlignment>
 {
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &, Index) { }
 };
@ -419,9 +425,10 @@ struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, CompleteUnrollin
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
    typedef typename Kernel::PacketType PacketType;
    enum { size = DstXprType::SizeAtCompileTime,
-           packetSize = packet_traits<typename Kernel::Scalar>::size,
+           packetSize =unpacket_traits<PacketType>::size,
           alignedSize = (size/packetSize)*packetSize };
    copy_using_evaluator_innervec_CompleteUnrolling<Kernel, 0, alignedSize>::run(kernel);
@ -438,7 +445,8 @@ struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, NoUnrolling>
 {
  typedef typename Kernel::PacketType PacketType;
  enum {
-    DefaultAlignment = unpacket_traits<PacketType>::alignment
+    SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
    DstAlignment = Kernel::AssignmentTraits::DstAlignment
  };
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
@ -447,7 +455,7 @@ struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, NoUnrolling>
    const Index packetSize = unpacket_traits<PacketType>::size;
    for(Index outer = 0; outer < outerSize; ++outer)
      for(Index inner = 0; inner < innerSize; inner+=packetSize)
-        kernel.template assignPacketByOuterInner<DefaultAlignment, DefaultAlignment, PacketType>(outer, inner);
+        kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, inner);
  }
 };
@ -467,9 +475,11 @@ struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, InnerUnrolling>
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
    typedef typename Kernel::AssignmentTraits Traits;
    const Index outerSize = kernel.outerSize();
    for(Index outer = 0; outer < outerSize; ++outer)
-      copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, DstXprType::InnerSizeAtCompileTime>::run(kernel, outer);
+      copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, DstXprType::InnerSizeAtCompileTime,
                                                   Traits::SrcAlignment, Traits::DstAlignment>::run(kernel, outer);
  }
 };
@ -518,7 +528,7 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
                               : int(Kernel::AssignmentTraits::DstAlignment)
    };
    const Scalar *dst_ptr = &kernel.dstEvaluator().coeffRef(0,0);
-    if((!bool(dstIsAligned)) && (size_t(dst_ptr) % sizeof(Scalar))>0)
+    if((!bool(dstIsAligned)) && (UIntPtr(dst_ptr) % sizeof(Scalar))>0)
    {
      // the pointer is not even aligned on a scalar boundary, so packet alignment is not possible
      return dense_assignment_loop<Kernel,DefaultTraversal,NoUnrolling>::run(kernel);
@ -549,6 +559,29 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
  }
 };
 #if EIGEN_UNALIGNED_VECTORIZE
 // Slice-vectorized traversal with the inner dimension unrolled at compile time.
 // For every outer index, the largest multiple of the packet size that fits in
 // the inner dimension is copied with packet assignments; the remaining
 // coefficients are copied by the scalar inner-unrolling helper.
 template<typename Kernel>
 struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, InnerUnrolling>
 {
  EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel)
  {
    typedef typename Kernel::PacketType PacketType;
    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
    enum {
      InnerLength = DstXprType::InnerSizeAtCompileTime,
      CoeffsPerPacket = unpacket_traits<PacketType>::size,
      // Inner length rounded down to a whole number of packets.
      PacketEnd = (InnerLength/CoeffsPerPacket)*CoeffsPerPacket
    };
    // Both alignment arguments are 0 here (presumably "unaligned", in line with
    // the EIGEN_UNALIGNED_VECTORIZE guard around this specialization -- confirm).
    typedef copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, PacketEnd, 0, 0> PacketPart;
    typedef copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, PacketEnd, InnerLength> ScalarTail;
    for(Index outer = 0; outer < kernel.outerSize(); ++outer)
    {
      PacketPart::run(kernel, outer);
      ScalarTail::run(kernel, outer);
    }
  }
 };
 #endif
 /***************************************************************************
 * Part 4 : Generic dense assignment kernel
 ***************************************************************************/
@ -683,7 +716,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(const DstX
 template<typename DstXprType, typename SrcXprType>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src)
 {
-  call_dense_assignment_loop(dst, src, internal::assign_op<typename DstXprType::Scalar>());
+  call_dense_assignment_loop(dst, src, internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>());
 }
 /***************************************************************************
@ -705,7 +738,7 @@ template<> struct AssignmentKind<DenseShape,DenseShape> { typedef Dense2Dense Ki
 // This is the main assignment class
 template< typename DstXprType, typename SrcXprType, typename Functor,
          typename Kind = typename AssignmentKind< typename evaluator_traits<DstXprType>::Shape , typename evaluator_traits<SrcXprType>::Shape >::Kind,
-          typename Scalar = typename DstXprType::Scalar>
+          typename EnableIf = void>
 struct Assignment;
@ -718,13 +751,13 @@ template<typename Dst, typename Src>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 void call_assignment(Dst& dst, const Src& src)
 {
-  call_assignment(dst, src, internal::assign_op<typename Dst::Scalar>());
+  call_assignment(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
 }
 template<typename Dst, typename Src>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 void call_assignment(const Dst& dst, const Src& src)
 {
-  call_assignment(dst, src, internal::assign_op<typename Dst::Scalar>());
+  call_assignment(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
 }
 // Deal with "assume-aliasing"
@ -783,7 +816,7 @@ template<typename Dst, typename Src>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 void call_assignment_no_alias(Dst& dst, const Src& src)
 {
-  call_assignment_no_alias(dst, src, internal::assign_op<typename Dst::Scalar>());
+  call_assignment_no_alias(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
 }
 template<typename Dst, typename Src, typename Func>
@ -805,15 +838,17 @@ template<typename Dst, typename Src>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src)
 {
-  call_assignment_no_alias_no_transpose(dst, src, internal::assign_op<typename Dst::Scalar>());
+  call_assignment_no_alias_no_transpose(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
 }
 // forward declaration
 template<typename Dst, typename Src> void check_for_aliasing(const Dst &dst, const Src &src);
 // Generic Dense to Dense assignment
-template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
+// Note that the last template argument "Weak" is needed to make it possible to perform
-struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Scalar>
+// both partial specialization+SFINAE without ambiguous specialization
 template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak>
 struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Weak>
 {
  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
@ -830,11 +865,13 @@ struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Scalar>
 // Generic assignment through evalTo.
 // TODO: not sure we have to keep that one, but it helps porting current code to new evaluator mechanism.
-template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
+// Note that the last template argument "Weak" is needed to make it possible to perform
-struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Scalar>
+// both partial specialization+SFINAE without ambiguous specialization
 template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak>
 struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Weak>
 {
  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/)
+  static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
  {
    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
    src.evalTo(dst);
--- a/Eigen/src/Core/Assign_MKL.h
+++ b/Eigen/src/Core/Assign_MKL.h
@ -81,10 +81,10 @@ class vml_assign_traits
 #define EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE, VMLMODE)                                           \
  template< typename DstXprType, typename SrcXprNested>                                                                         \
-  struct Assignment<DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested>, assign_op<EIGENTYPE>,             \
+  struct Assignment<DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested>, assign_op<EIGENTYPE,EIGENTYPE>,   \
-                   Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml,EIGENTYPE>::type> {    \
+                   Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> {              \
    typedef CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested> SrcXprType;                                            \
-    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE> &/*func*/) {                             \
+    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &/*func*/) {                   \
      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());                                                       \
      if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) {                                              \
        VMLOP(dst.size(), (const VMLTYPE*)src.nestedExpression().data(),                                                        \
@ -138,22 +138,24 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(floor, Floor,  _)
 EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil,  Ceil,   _)
 #define EIGEN_MKL_VML_DECLARE_POW_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE, VMLMODE)                                           \
-  template< typename DstXprType, typename SrcXprNested>                                                                       \
+  template< typename DstXprType, typename SrcXprNested, typename Plain>                                                       \
-  struct Assignment<DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested>, assign_op<EIGENTYPE>,           \
+  struct Assignment<DstXprType, CwiseBinaryOp<scalar_##EIGENOP##_op<EIGENTYPE,EIGENTYPE>, SrcXprNested,                       \
-                   Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml,EIGENTYPE>::type> {  \
+                    const CwiseNullaryOp<internal::scalar_constant_op<EIGENTYPE>,Plain> >, assign_op<EIGENTYPE,EIGENTYPE>,    \
-    typedef CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested> SrcXprType;                                          \
+                   Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> {            \
-    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE> &/*func*/) {                           \
+    typedef CwiseBinaryOp<scalar_##EIGENOP##_op<EIGENTYPE,EIGENTYPE>, SrcXprNested,                                           \
                    const CwiseNullaryOp<internal::scalar_constant_op<EIGENTYPE>,Plain> > SrcXprType;                         \
    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &/*func*/) {                 \
      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());                                                     \
-      VMLTYPE exponent = reinterpret_cast<const VMLTYPE&>(src.functor().m_exponent);                                          \
+      VMLTYPE exponent = reinterpret_cast<const VMLTYPE&>(src.rhs().functor().m_other);                                       \
      if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal)                                              \
      {                                                                                                                       \
-        VMLOP( dst.size(), (const VMLTYPE*)src.nestedExpression().data(), exponent,                                           \
+        VMLOP( dst.size(), (const VMLTYPE*)src.lhs().data(), exponent,                                                        \
              (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE) );                                         \
      } else {                                                                                                                \
        const Index outerSize = dst.outerSize();                                                                              \
        for(Index outer = 0; outer < outerSize; ++outer) {                                                                    \
-          const EIGENTYPE *src_ptr = src.IsRowMajor ? &(src.nestedExpression().coeffRef(outer,0)) :                           \
+          const EIGENTYPE *src_ptr = src.IsRowMajor ? &(src.lhs().coeffRef(outer,0)) :                                        \
-                                                      &(src.nestedExpression().coeffRef(0, outer));                           \
+                                                      &(src.lhs().coeffRef(0, outer));                                        \
          EIGENTYPE *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer));                         \
          VMLOP( dst.innerSize(), (const VMLTYPE*)src_ptr, exponent,                                                          \
                 (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE));                                          \
--- a/Eigen/src/Core/CMakeLists.txt
+++ b/Eigen/src/Core/CMakeLists.txt
@ -1,11 +0,0 @@
 FILE(GLOB Eigen_Core_SRCS "*.h")
 INSTALL(FILES
  ${Eigen_Core_SRCS}
  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core COMPONENT Devel
  )
 ADD_SUBDIRECTORY(products)
 ADD_SUBDIRECTORY(util)
 ADD_SUBDIRECTORY(arch)
 ADD_SUBDIRECTORY(functors)
--- a/Eigen/src/Core/CommaInitializer.h
+++ b/Eigen/src/Core/CommaInitializer.h
@ -80,9 +80,7 @@ struct CommaInitializer
  EIGEN_DEVICE_FUNC
  CommaInitializer& operator,(const DenseBase<OtherDerived>& other)
  {
-    if(other.cols()==0 || other.rows()==0)
+    if (m_col==m_xpr.cols() && (other.cols()!=0 || other.rows()!=m_currentBlockRows))
      return *this;
    if (m_col==m_xpr.cols())
    {
      m_row+=m_currentBlockRows;
      m_col = 0;
@ -90,15 +88,11 @@ struct CommaInitializer
      eigen_assert(m_row+m_currentBlockRows<=m_xpr.rows()
        && "Too many rows passed to comma initializer (operator<<)");
    }
-    eigen_assert(m_col<m_xpr.cols()
+    eigen_assert((m_col + other.cols() <= m_xpr.cols())
      && "Too many coefficients passed to comma initializer (operator<<)");
    eigen_assert(m_currentBlockRows==other.rows());
-    if (OtherDerived::SizeAtCompileTime != Dynamic)
+    m_xpr.template block<OtherDerived::RowsAtCompileTime, OtherDerived::ColsAtCompileTime>
-      m_xpr.template block<OtherDerived::RowsAtCompileTime != Dynamic ? OtherDerived::RowsAtCompileTime : 1,
+                    (m_row, m_col, other.rows(), other.cols()) = other;
                              OtherDerived::ColsAtCompileTime != Dynamic ? OtherDerived::ColsAtCompileTime : 1>
                    (m_row, m_col) = other;
    else
      m_xpr.block(m_row, m_col, other.rows(), other.cols()) = other;
    m_col += other.cols();
    return *this;
  }
@ -109,9 +103,7 @@ struct CommaInitializer
  EIGEN_EXCEPTION_SPEC(Eigen::eigen_assert_exception)
 #endif
  {
-    eigen_assert((m_row+m_currentBlockRows) == m_xpr.rows()
+      finished();
         && m_col == m_xpr.cols()
         && "Too few coefficients passed to comma initializer (operator<<)");
  }
  /** \returns the built matrix once all its coefficients have been set.
@ -122,7 +114,12 @@ struct CommaInitializer
    * \endcode
    */
  EIGEN_DEVICE_FUNC
-  inline XprType& finished() { return m_xpr; }
+  inline XprType& finished() {
      eigen_assert(((m_row+m_currentBlockRows) == m_xpr.rows() || m_xpr.cols() == 0)
           && m_col == m_xpr.cols()
           && "Too few coefficients passed to comma initializer (operator<<)");
      return m_xpr;
  }
  XprType& m_xpr;           // target expression
  Index m_row;              // current row id
--- a/Eigen/src/Core/ConditionEstimator.h
+++ b/Eigen/src/Core/ConditionEstimator.h
@ -32,33 +32,6 @@ struct rcond_compute_sign<Vector, Vector, false> {
  }
 };
 /** \brief Reciprocal condition number estimator.
  *
  * Computing a decomposition of a dense matrix takes O(n^3) operations, while
  * this method estimates the condition number quickly and reliably in O(n^2)
  * operations.
  *
  * \returns an estimate of the reciprocal condition number
  * (1 / (||matrix||_1 * ||inv(matrix)||_1)) of matrix, given ||matrix||_1 and
  * its decomposition. Supports the following decompositions: FullPivLU,
  * PartialPivLU, LDLT, and LLT.
  *
  * \sa FullPivLU, PartialPivLU, LDLT, LLT.
  */
 template <typename Decomposition>
 typename Decomposition::RealScalar
 rcond_estimate_helper(typename Decomposition::RealScalar matrix_norm, const Decomposition& dec)
 {
  typedef typename Decomposition::RealScalar RealScalar;
  eigen_assert(dec.rows() == dec.cols());
  if (dec.rows() == 0)              return RealScalar(1);
  if (matrix_norm == RealScalar(0)) return RealScalar(0);
  if (dec.rows() == 1)              return RealScalar(1);
  const RealScalar inverse_matrix_norm = rcond_invmatrix_L1_norm_estimate(dec);
  return (inverse_matrix_norm == RealScalar(0) ? RealScalar(0)
                                               : (RealScalar(1) / inverse_matrix_norm) / matrix_norm);
 }
 /**
  * \returns an estimate of ||inv(matrix)||_1 given a decomposition of
  * \a matrix that implements .solve() and .adjoint().solve() methods.
@ -94,7 +67,15 @@ typename Decomposition::RealScalar rcond_invmatrix_L1_norm_estimate(const Decomp
  if (n == 0)
    return 0;
  // Disable Index to float conversion warning
 #ifdef __INTEL_COMPILER
  #pragma warning push
  #pragma warning ( disable : 2259 )
 #endif
  Vector v = dec.solve(Vector::Ones(n) / Scalar(n));
 #ifdef __INTEL_COMPILER
  #pragma warning pop
 #endif
  // lower_bound is a lower bound on
  //   ||inv(matrix)||_1  = sup_v ||inv(matrix) v||_1 / ||v||_1
@ -151,7 +132,8 @@ typename Decomposition::RealScalar rcond_invmatrix_L1_norm_estimate(const Decomp
  // Hager's algorithm to vastly underestimate ||matrix||_1.
  Scalar alternating_sign(RealScalar(1));
  for (Index i = 0; i < n; ++i) {
-    v[i] = alternating_sign * (RealScalar(1) + (RealScalar(i) / (RealScalar(n - 1))));
+    // The static_cast is needed when Scalar is a complex and RealScalar implements expression templates
    v[i] = alternating_sign * static_cast<RealScalar>(RealScalar(1) + (RealScalar(i) / (RealScalar(n - 1))));
    alternating_sign = -alternating_sign;
  }
  v = dec.solve(v);
@ -159,6 +141,33 @@ typename Decomposition::RealScalar rcond_invmatrix_L1_norm_estimate(const Decomp
  return numext::maxi(lower_bound, alternate_lower_bound);
 }
 /** \brief Reciprocal condition number estimator.
  *
  * Computing a decomposition of a dense matrix takes O(n^3) operations, while
  * this method estimates the condition number quickly and reliably in O(n^2)
  * operations.
  *
  * \returns an estimate of the reciprocal condition number
  * (1 / (||matrix||_1 * ||inv(matrix)||_1)) of matrix, given ||matrix||_1 and
  * its decomposition. Supports the following decompositions: FullPivLU,
  * PartialPivLU, LDLT, and LLT.
  *
  * \sa FullPivLU, PartialPivLU, LDLT, LLT.
  */
 template <typename Decomposition>
 typename Decomposition::RealScalar
 rcond_estimate_helper(typename Decomposition::RealScalar matrix_norm, const Decomposition& dec)
 {
  typedef typename Decomposition::RealScalar RealScalar;
  // Only square decompositions are supported.
  eigen_assert(dec.rows() == dec.cols());

  // Cases that are answered without running the estimator.
  if (dec.rows() == 0)
    return RealScalar(1);
  if (matrix_norm == RealScalar(0))
    return RealScalar(0);
  if (dec.rows() == 1)
    return RealScalar(1);

  // General case: combine the supplied norm with the estimated norm of the inverse.
  const RealScalar inv_norm_estimate = rcond_invmatrix_L1_norm_estimate(dec);
  if (inv_norm_estimate == RealScalar(0))
    return RealScalar(0);
  return (RealScalar(1) / inv_norm_estimate) / matrix_norm;
 }
 }  // namespace internal
 }  // namespace Eigen
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@ -41,10 +41,19 @@ template<> struct storage_kind_to_shape<TranspositionsStorage>  { typedef Transp
 // We currently distinguish the following kind of evaluators:
 // - unary_evaluator    for expressions taking only one argument (CwiseUnaryOp, CwiseUnaryView, Transpose, MatrixWrapper, ArrayWrapper, Reverse, Replicate)
 // - binary_evaluator   for expression taking two arguments (CwiseBinaryOp)
 // - ternary_evaluator   for expression taking three arguments (CwiseTernaryOp)
 // - product_evaluator  for linear algebra products (Product); special case of binary_evaluator because it requires additional tags for dispatching.
 // - mapbase_evaluator  for Map, Block, Ref
 // - block_evaluator    for Block (special dispatching to a mapbase_evaluator or unary_evaluator)
 // Forward declaration of the evaluator used for three-argument expressions.
 // Besides the expression type T, it carries the evaluator Kind and the Scalar
 // type of each of T's three arguments (T::Arg1, T::Arg2, T::Arg3) as defaulted
 // template parameters, so that partial specializations can dispatch on them.
 template< typename T,
          typename Arg1Kind   = typename evaluator_traits<typename T::Arg1>::Kind,
          typename Arg2Kind   = typename evaluator_traits<typename T::Arg2>::Kind,
          typename Arg3Kind   = typename evaluator_traits<typename T::Arg3>::Kind,
          typename Arg1Scalar = typename traits<typename T::Arg1>::Scalar,
          typename Arg2Scalar = typename traits<typename T::Arg2>::Scalar,
          typename Arg3Scalar = typename traits<typename T::Arg3>::Scalar> struct ternary_evaluator;
 template< typename T,
          typename LhsKind   = typename evaluator_traits<typename T::Lhs>::Kind,
          typename RhsKind   = typename evaluator_traits<typename T::Rhs>::Kind,
@ -328,6 +337,120 @@ protected:
 // Like Matrix and Array, this is not really a unary expression, so we directly specialize evaluator.
 // Likewise, there is no need for more sophisticated dispatching here.
 // Adapter through which a nullary functor is invoked with index arguments.
 // The three boolean parameters default to the has_nullary_operator,
 // has_unary_operator and has_binary_operator traits of NullaryOp; partial
 // specializations on these flags adapt the call to what the functor provides.
 // This primary template forwards the indices unchanged:
 //   (op, i, j) -> op(i,j)    and    (op, i) -> op(i),
 // and likewise for the packet variants, which call op.packetOp<T>(...).
 template<typename Scalar,typename NullaryOp,
         bool has_nullary = has_nullary_operator<NullaryOp>::value,
         bool has_unary   = has_unary_operator<NullaryOp>::value,
         bool has_binary  = has_binary_operator<NullaryOp>::value>
 struct nullary_wrapper
 {
  // Coefficient access with two indices.
  template <typename IndexType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i, IndexType j) const { return op(i,j); }
  // Coefficient access with a single (linear) index.
  template <typename IndexType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i) const { return op(i); }
  // Packet access; T is the packet type requested by the caller.
  template <typename T, typename IndexType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i, IndexType j) const { return op.template packetOp<T>(i,j); }
  template <typename T, typename IndexType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i) const { return op.template packetOp<T>(i); }
 };
 // Specialization for has_nullary=true, has_unary=false, has_binary=false:
 // the functor is called without arguments. The index parameters are unnamed,
 // default to 0 and are ignored, so both the one-index and two-index call forms
 // resolve to op() / op.packetOp<T>().
 template<typename Scalar,typename NullaryOp>
 struct nullary_wrapper<Scalar,NullaryOp,true,false,false>
 {
  template <typename IndexType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType=0, IndexType=0) const { return op(); }
  template <typename T, typename IndexType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType=0, IndexType=0) const { return op.template packetOp<T>(); }
 };
 // Specialization for has_nullary=false, has_unary=false, has_binary=true:
 // the functor is always called with two indices. The second index defaults
 // to 0, so a one-index call (op, i) is forwarded as op(i,0).
 template<typename Scalar,typename NullaryOp>
 struct nullary_wrapper<Scalar,NullaryOp,false,false,true>
 {
  template <typename IndexType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i, IndexType j=0) const { return op(i,j); }
  template <typename T, typename IndexType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i, IndexType j=0) const { return op.template packetOp<T>(i,j); }
 };
 // We need the following specialization for vector-only functors assigned to a runtime vector,
 // for instance, using linspace and assigning a RowVectorXd to a MatrixXd or even a row of a MatrixXd.
 // In this case, i==0 and j is used for the actual iteration.
 // Specialization for has_nullary=false, has_unary=true, has_binary=false:
 // the functor only accepts a single index. Two-index calls are collapsed to
 // one index by passing i+j, which is only meaningful when at least one of the
 // two indices is zero (checked with eigen_assert).
 template<typename Scalar,typename NullaryOp>
 struct nullary_wrapper<Scalar,NullaryOp,false,true,false>
 {
  // Two-index coefficient access: forwards op(i+j).
  template <typename IndexType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i, IndexType j) const {
    eigen_assert(i==0 || j==0);
    return op(i+j);
  }
  // Two-index packet access: forwards op.packetOp<T>(i+j).
  template <typename T, typename IndexType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i, IndexType j) const {
    eigen_assert(i==0 || j==0);
    return op.template packetOp<T>(i+j);
  }

  // One-index access is forwarded unchanged.
  template <typename IndexType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i) const { return op(i); }
  template <typename T, typename IndexType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i) const { return op.template packetOp<T>(i); }
 };
 // Specialization for the case where none of the three has_*_operator traits
 // is true: the wrapper is left empty and therefore offers neither operator()
 // nor packetOp.
 template<typename Scalar,typename NullaryOp>
 struct nullary_wrapper<Scalar,NullaryOp,false,false,false> {};
 #if 0 && EIGEN_COMP_MSVC>0
 // Disable this ugly workaround. This is now handled in traits<Ref>::match,
 // but this piece of code might still come in handy if some other weird compilation
 // errors pop up again.
 // MSVC exhibits a weird compilation error when
 // compiling:
 //    Eigen::MatrixXf A = MatrixXf::Random(3,3);
 //    Ref<const MatrixXf> R = 2.f*A;
 // and that has_*ary_operator<scalar_constant_op<float>> have not been instantiated yet.
 // The "problem" is that evaluator<2.f*A> is instantiated by traits<Ref>::match<2.f*A>
 // and at that time has_*ary_operator<T> returns true regardless of T.
 // Then nullary_wrapper is badly instantiated as nullary_wrapper<.,.,true,true,true>.
 // The trick is thus to defer the proper instantiation of nullary_wrapper when coeff(),
 // and packet() are really instantiated as implemented below:
 // This is a simple wrapper around Index to enforce the re-instantiation of
 // has_*ary_operator when needed.
 // Type convertible from and to T. Both members are only declared (no bodies
 // are given here); the type is used purely as a template argument.
 template<typename T> struct nullary_wrapper_workaround_msvc {
  nullary_wrapper_workaround_msvc(const T&);
  operator T()const;
 };
 // Specialization for the case where all three has_*_operator traits evaluated
 // to true. Each member re-evaluates the three traits with
 // nullary_wrapper_workaround_msvc<IndexType> as an additional template argument
 // and forwards the call to the nullary_wrapper selected by those results.
 template<typename Scalar,typename NullaryOp>
 struct nullary_wrapper<Scalar,NullaryOp,true,true,true>
 {
  // Two-index coefficient access.
  template <typename IndexType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i, IndexType j) const {
    return nullary_wrapper<Scalar,NullaryOp,
    has_nullary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
    has_unary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
    has_binary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value>().operator()(op,i,j);
  }
  // One-index coefficient access.
  template <typename IndexType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i) const {
    return nullary_wrapper<Scalar,NullaryOp,
    has_nullary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
    has_unary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
    has_binary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value>().operator()(op,i);
  }
  // Two-index packet access.
  template <typename T, typename IndexType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i, IndexType j) const {
    return nullary_wrapper<Scalar,NullaryOp,
    has_nullary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
    has_unary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
    has_binary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value>().template packetOp<T>(op,i,j);
  }
  // One-index packet access.
  template <typename T, typename IndexType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i) const {
    return nullary_wrapper<Scalar,NullaryOp,
    has_nullary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
    has_unary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
    has_binary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value>().template packetOp<T>(op,i);
  }
 };
 #endif // MSVC workaround
 template<typename NullaryOp, typename PlainObjectType>
 struct evaluator<CwiseNullaryOp<NullaryOp,PlainObjectType> >
  : evaluator_base<CwiseNullaryOp<NullaryOp,PlainObjectType> >
@ -347,41 +470,44 @@ struct evaluator<CwiseNullaryOp<NullaryOp,PlainObjectType> >
  };
  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& n)
-    : m_functor(n.functor()) 
+    : m_functor(n.functor()), m_wrapper()
  {
    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
  }
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  template <typename IndexType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  CoeffReturnType coeff(Index row, Index col) const
+  CoeffReturnType coeff(IndexType row, IndexType col) const
  {
-    return m_functor(row, col);
+    return m_wrapper(m_functor, row, col);
  }
  template <typename IndexType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  CoeffReturnType coeff(Index index) const
+  CoeffReturnType coeff(IndexType index) const
  {
-    return m_functor(index);
+    return m_wrapper(m_functor,index);
  }
-  template<int LoadMode, typename PacketType>
+  template<int LoadMode, typename PacketType, typename IndexType>
  EIGEN_STRONG_INLINE
-  PacketType packet(Index row, Index col) const
+  PacketType packet(IndexType row, IndexType col) const
  {
-    return m_functor.template packetOp<Index,PacketType>(row, col);
+    return m_wrapper.template packetOp<PacketType>(m_functor, row, col);
  }
-  template<int LoadMode, typename PacketType>
+  template<int LoadMode, typename PacketType, typename IndexType>
  EIGEN_STRONG_INLINE
-  PacketType packet(Index index) const
+  PacketType packet(IndexType index) const
  {
-    return m_functor.template packetOp<Index,PacketType>(index);
+    return m_wrapper.template packetOp<PacketType>(m_functor, index);
  }
 protected:
  const NullaryOp m_functor;
  const internal::nullary_wrapper<CoeffReturnType,NullaryOp> m_wrapper;
 };
 // -------------------- CwiseUnaryOp --------------------
@ -442,6 +568,96 @@ protected:
  evaluator<ArgType> m_argImpl;
 };
 // -------------------- CwiseTernaryOp --------------------
 // this is a ternary expression
 // Evaluator entry point for CwiseTernaryOp expressions. It adds no logic of
 // its own: it derives from ternary_evaluator and forwards the expression to
 // the base-class constructor.
 template<typename TernaryOp, typename Arg1, typename Arg2, typename Arg3>
 struct evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> >
  : public ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> >
 {
  typedef CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> XprType;
  typedef ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> > Base;
  // Builds the evaluator by handing the expression to the ternary_evaluator base.
  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {}
 };
 // Index-based evaluator of a coefficient-wise ternary expression.
 // It stores a copy of the functor and one evaluator per argument; every
 // coefficient (or packet) is produced by applying the functor to the matching
 // coefficients (or packets) of the three arguments.
 template<typename TernaryOp, typename Arg1, typename Arg2, typename Arg3>
 struct ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>, IndexBased, IndexBased>
  : evaluator_base<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> >
 {
  typedef CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> XprType;
  enum {
    // Cost of one coefficient: reading each of the three arguments plus the functor itself.
    CoeffReadCost = evaluator<Arg1>::CoeffReadCost + evaluator<Arg2>::CoeffReadCost + evaluator<Arg3>::CoeffReadCost + functor_traits<TernaryOp>::Cost,
    Arg1Flags = evaluator<Arg1>::Flags,
    Arg2Flags = evaluator<Arg2>::Flags,
    Arg3Flags = evaluator<Arg3>::Flags,
    // True when Arg2 and Arg3 both have the same Scalar type as Arg1.
    SameType = is_same<typename Arg1::Scalar,typename Arg2::Scalar>::value && is_same<typename Arg1::Scalar,typename Arg3::Scalar>::value,
    // True when Arg2 and Arg3 both have the same RowMajorBit as Arg1.
    StorageOrdersAgree = (int(Arg1Flags)&RowMajorBit)==(int(Arg2Flags)&RowMajorBit) && (int(Arg1Flags)&RowMajorBit)==(int(Arg3Flags)&RowMajorBit),
    // Keeps the hereditary bits set in any argument. LinearAccessBit and
    // PacketAccessBit survive only if they are set in all three arguments and
    // the storage orders agree; PacketAccessBit additionally requires a
    // packet-capable functor and identical scalar types.
    Flags0 = (int(Arg1Flags) | int(Arg2Flags) | int(Arg3Flags)) & (
        HereditaryBits
        | (int(Arg1Flags) & int(Arg2Flags) & int(Arg3Flags) &
           ( (StorageOrdersAgree ? LinearAccessBit : 0)
           | (functor_traits<TernaryOp>::PacketAccess && StorageOrdersAgree && SameType ? PacketAccessBit : 0)
           )
        )
     ),
    // The storage order (RowMajorBit) of the result is the one of Arg1.
    Flags = (Flags0 & ~RowMajorBit) | (Arg1Flags & RowMajorBit),
    // The expression is only as aligned as its least aligned argument.
    Alignment = EIGEN_PLAIN_ENUM_MIN(
        EIGEN_PLAIN_ENUM_MIN(evaluator<Arg1>::Alignment, evaluator<Arg2>::Alignment),
        evaluator<Arg3>::Alignment)
  };
  // Copies the functor from the expression and builds an evaluator for each of
  // its three arguments.
  EIGEN_DEVICE_FUNC explicit ternary_evaluator(const XprType& xpr)
    : m_functor(xpr.functor()),
      m_arg1Impl(xpr.arg1()), 
      m_arg2Impl(xpr.arg2()), 
      m_arg3Impl(xpr.arg3())  
  {
    // Cost checks on the functor cost and on the total read cost
    // (macro defined elsewhere; its exact behavior is not visible here).
    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<TernaryOp>::Cost);
    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
  }
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  // Returns the functor applied to the (row, col) coefficient of each argument.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  CoeffReturnType coeff(Index row, Index col) const
  {
    return m_functor(m_arg1Impl.coeff(row, col), m_arg2Impl.coeff(row, col), m_arg3Impl.coeff(row, col));
  }
  // Linear-index variant: the functor applied to coefficient `index` of each argument.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  CoeffReturnType coeff(Index index) const
  {
    return m_functor(m_arg1Impl.coeff(index), m_arg2Impl.coeff(index), m_arg3Impl.coeff(index));
  }
  // Packet variant: loads one packet at (row, col) from each argument with the
  // requested LoadMode and combines them through the functor's packetOp.
  template<int LoadMode, typename PacketType>
  EIGEN_STRONG_INLINE
  PacketType packet(Index row, Index col) const
  {
    return m_functor.packetOp(m_arg1Impl.template packet<LoadMode,PacketType>(row, col),
                              m_arg2Impl.template packet<LoadMode,PacketType>(row, col),
                              m_arg3Impl.template packet<LoadMode,PacketType>(row, col));
  }
  // Linear-index packet variant of the above.
  template<int LoadMode, typename PacketType>
  EIGEN_STRONG_INLINE
  PacketType packet(Index index) const
  {
    return m_functor.packetOp(m_arg1Impl.template packet<LoadMode,PacketType>(index),
                              m_arg2Impl.template packet<LoadMode,PacketType>(index),
                              m_arg3Impl.template packet<LoadMode,PacketType>(index));
  }
 protected:
  const TernaryOp m_functor;   // copy of the expression's functor
  evaluator<Arg1> m_arg1Impl;  // evaluator of the first argument
  evaluator<Arg2> m_arg2Impl;  // evaluator of the second argument
  evaluator<Arg3> m_arg3Impl;  // evaluator of the third argument
 };
 // -------------------- CwiseBinaryOp --------------------
 // this is a binary expression
@ -604,7 +820,8 @@ struct mapbase_evaluator : evaluator_base<Derived>
  EIGEN_DEVICE_FUNC explicit mapbase_evaluator(const XprType& map)
    : m_data(const_cast<PointerType>(map.data())),
-      m_xpr(map)
+      m_innerStride(map.innerStride()),
      m_outerStride(map.outerStride())
  {
    EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(evaluator<Derived>::Flags&PacketAccessBit, internal::inner_stride_at_compile_time<Derived>::ret==1),
                        PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
@ -614,32 +831,32 @@ struct mapbase_evaluator : evaluator_base<Derived>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  CoeffReturnType coeff(Index row, Index col) const
  {
-    return m_data[col * m_xpr.colStride() + row * m_xpr.rowStride()];
+    return m_data[col * colStride() + row * rowStride()];
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  CoeffReturnType coeff(Index index) const
  {
-    return m_data[index * m_xpr.innerStride()];
+    return m_data[index * m_innerStride.value()];
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  Scalar& coeffRef(Index row, Index col)
  {
-    return m_data[col * m_xpr.colStride() + row * m_xpr.rowStride()];
+    return m_data[col * colStride() + row * rowStride()];
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  Scalar& coeffRef(Index index)
  {
-    return m_data[index * m_xpr.innerStride()];
+    return m_data[index * m_innerStride.value()];
  }
  template<int LoadMode, typename PacketType>
  EIGEN_STRONG_INLINE
  PacketType packet(Index row, Index col) const
  {
-    PointerType ptr = m_data + row * m_xpr.rowStride() + col * m_xpr.colStride();
+    PointerType ptr = m_data + row * rowStride() + col * colStride();
    return internal::ploadt<PacketType, LoadMode>(ptr);
  }
@ -647,14 +864,14 @@ struct mapbase_evaluator : evaluator_base<Derived>
  EIGEN_STRONG_INLINE
  PacketType packet(Index index) const
  {
-    return internal::ploadt<PacketType, LoadMode>(m_data + index * m_xpr.innerStride());
+    return internal::ploadt<PacketType, LoadMode>(m_data + index * m_innerStride.value());
  }
  template<int StoreMode, typename PacketType>
  EIGEN_STRONG_INLINE
  void writePacket(Index row, Index col, const PacketType& x)
  {
-    PointerType ptr = m_data + row * m_xpr.rowStride() + col * m_xpr.colStride();
+    PointerType ptr = m_data + row * rowStride() + col * colStride();
    return internal::pstoret<Scalar, PacketType, StoreMode>(ptr, x);
  }
@ -662,12 +879,17 @@ struct mapbase_evaluator : evaluator_base<Derived>
  EIGEN_STRONG_INLINE
  void writePacket(Index index, const PacketType& x)
  {
-    internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_xpr.innerStride(), x);
+    internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x);
  }
 protected:
  EIGEN_DEVICE_FUNC
  inline Index rowStride() const { return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value(); }
  EIGEN_DEVICE_FUNC
  inline Index colStride() const { return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value(); }
  PointerType m_data;
-  const XprType& m_xpr;
+  const internal::variable_if_dynamic<Index, XprType::InnerStrideAtCompileTime> m_innerStride;
  const internal::variable_if_dynamic<Index, XprType::OuterStrideAtCompileTime> m_outerStride;
 };
 template<typename PlainObjectType, int MapOptions, typename StrideType> 
@ -755,9 +977,7 @@ struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
    OuterStrideAtCompileTime = HasSameStorageOrderAsArgType
                             ? int(outer_stride_at_compile_time<ArgType>::ret)
                             : int(inner_stride_at_compile_time<ArgType>::ret),
-    MaskPacketAccessBit = (InnerSize == Dynamic || (InnerSize % packet_traits<Scalar>::size) == 0)
+    MaskPacketAccessBit = (InnerStrideAtCompileTime == 1) ? PacketAccessBit : 0,
                       && (InnerStrideAtCompileTime == 1)
                        ? PacketAccessBit : 0,
    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator<ArgType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0,    
    FlagsRowMajorBit = XprType::Flags&RowMajorBit,
@ -884,7 +1104,7 @@ struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /* HasDirectAc
    : mapbase_evaluator<XprType, typename XprType::PlainObject>(block) 
  {
    // TODO: for the 3.3 release, this should be turned to an internal assertion, but let's keep it as is for the beta lifetime
-    eigen_assert(((size_t(block.data()) % EIGEN_PLAIN_ENUM_MAX(1,evaluator<XprType>::Alignment)) == 0) && "data is not aligned");
+    eigen_assert(((internal::UIntPtr(block.data()) % EIGEN_PLAIN_ENUM_MAX(1,evaluator<XprType>::Alignment)) == 0) && "data is not aligned");
  }
 };
@ -1325,7 +1545,7 @@ struct evaluator<Diagonal<ArgType, DiagIndex> >
  enum {
    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
-    Flags = (unsigned int)evaluator<ArgType>::Flags & (HereditaryBits | LinearAccessBit | DirectAccessBit) & ~RowMajorBit,
+    Flags = (unsigned int)(evaluator<ArgType>::Flags & (HereditaryBits | DirectAccessBit) & ~RowMajorBit) | LinearAccessBit,
    Alignment = 0
  };
--- a/Eigen/src/Core/CwiseBinaryOp.h
+++ b/Eigen/src/Core/CwiseBinaryOp.h
@ -160,7 +160,7 @@ template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived &
 MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
 {
-  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar>());
+  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
  return derived();
 }
@ -173,7 +173,7 @@ template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived &
 MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
 {
-  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar>());
+  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
  return derived();
 }
--- a/Eigen/src/Core/CwiseNullaryOp.h
+++ b/Eigen/src/Core/CwiseNullaryOp.h
@ -20,7 +20,8 @@ struct traits<CwiseNullaryOp<NullaryOp, PlainObjectType> > : traits<PlainObjectT
    Flags = traits<PlainObjectType>::Flags & RowMajorBit
  };
 };
-}
+
 } // namespace internal
 /** \class CwiseNullaryOp
  * \ingroup Core_Module
@ -37,7 +38,23 @@ struct traits<CwiseNullaryOp<NullaryOp, PlainObjectType> > : traits<PlainObjectT
  * However, if you want to write a function returning such an expression, you
  * will need to use this class.
  *
-  * \sa class CwiseUnaryOp, class CwiseBinaryOp, DenseBase::NullaryExpr()
+  * The functor NullaryOp must expose one of the following methods:
    <table class="manual">
    <tr            ><td>\c operator()() </td><td>if the procedural generation does not depend on the coefficient entries (e.g., random numbers)</td></tr>
    <tr class="alt"><td>\c operator()(Index i)</td><td>if the procedural generation makes sense for vectors only and depends on the coefficient index \c i (e.g., linspace) </td></tr>
    <tr            ><td>\c operator()(Index i,Index j)</td><td>if the procedural generation depends on the matrix coordinates \c i, \c j (e.g., to generate a checkerboard with 0 and 1)</td></tr>
    </table>
  * It is also possible to expose the last two operators if the generation makes sense for matrices but can be optimized for vectors.
  *
  * See DenseBase::NullaryExpr(Index,const CustomNullaryOp&) for an example binding
  * C++11 random number generators.
  *
  * A nullary expression can also be used to implement custom sophisticated matrix manipulations
  * that cannot be covered by the existing set of natively supported matrix manipulations.
  * See this \ref TopicCustomizing_NullaryExpr "page" for some examples and additional explanations
  * on the behavior of CwiseNullaryOp.
  *
  * \sa class CwiseUnaryOp, class CwiseBinaryOp, DenseBase::NullaryExpr
  */
 template<typename NullaryOp, typename PlainObjectType>
 class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp, PlainObjectType> >::type, internal::no_assignment_operator
@ -62,30 +79,6 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Index cols() const { return m_cols.value(); }
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE const Scalar coeff(Index rowId, Index colId) const
    {
      return m_functor(rowId, colId);
    }
    template<int LoadMode>
    EIGEN_STRONG_INLINE PacketScalar packet(Index rowId, Index colId) const
    {
      return m_functor.packetOp(rowId, colId);
    }
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
    {
      return m_functor(index);
    }
    template<int LoadMode>
    EIGEN_STRONG_INLINE PacketScalar packet(Index index) const
    {
      return m_functor.packetOp(index);
    }
    /** \returns the functor representing the nullary operation */
    EIGEN_DEVICE_FUNC
    const NullaryOp& functor() const { return m_functor; }
@ -227,7 +220,7 @@ DenseBase<Derived>::Constant(const Scalar& value)
  *
  * The function generates 'size' equally spaced values in the closed interval [low,high].
  * This particular version of LinSpaced() uses sequential access, i.e. vector access is
-  * assumed to be a(0), a(1), ..., a(size). This assumption allows for better vectorization
+  * assumed to be a(0), a(1), ..., a(size-1). This assumption allows for better vectorization
  * and yields faster code than the random access version.
  *
  * When size is set to 1, a vector of length 1 containing 'high' is returned.
@ -396,7 +389,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, con
 /**
  * \brief Sets a linearly spaced vector.
  *
-  * The function fill *this with equally spaced values in the closed interval [low,high].
+  * The function fills *this with equally spaced values in the closed interval [low,high].
  * When size is set to 1, a vector of length 1 containing 'high' is returned.
  *
  * \only_for_vectors
--- a/Eigen/src/Core/CwiseTernaryOp.h
+++ b/Eigen/src/Core/CwiseTernaryOp.h
@ -0,0 +1,197 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
 // Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
 // Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #ifndef EIGEN_CWISE_TERNARY_OP_H
 #define EIGEN_CWISE_TERNARY_OP_H
 namespace Eigen {
 namespace internal {
 // Compile-time traits of a CwiseTernaryOp expression. Sizes, expression kind,
 // storage kind, storage index and storage order are all taken from the first
 // argument; the scalar type is the result type of the functor.
 template <typename TernaryOp, typename Arg1, typename Arg2, typename Arg3>
 struct traits<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> > {
  // we must not inherit from traits<Arg1> since it has
  // the potential to cause problems with MSVC
  typedef typename remove_all<Arg1>::type Ancestor;
  typedef typename traits<Ancestor>::XprKind XprKind;
  // Compile-time dimensions are copied from the first argument.
  enum {
    RowsAtCompileTime = traits<Ancestor>::RowsAtCompileTime,
    ColsAtCompileTime = traits<Ancestor>::ColsAtCompileTime,
    MaxRowsAtCompileTime = traits<Ancestor>::MaxRowsAtCompileTime,
    MaxColsAtCompileTime = traits<Ancestor>::MaxColsAtCompileTime
  };
  // even though we require Arg1, Arg2, and Arg3 to have the same scalar type
  // (see CwiseTernaryOp constructor),
  // we still want to handle the case when the result type is different.
  // NOTE(review): the CwiseTernaryOp constructor in this file only checks
  // sizes and storage kinds; confirm where the scalar-type requirement is
  // actually enforced.
  typedef typename result_of<TernaryOp(
      const typename Arg1::Scalar&, const typename Arg2::Scalar&,
      const typename Arg3::Scalar&)>::type Scalar;
  typedef typename internal::traits<Arg1>::StorageKind StorageKind;
  typedef typename internal::traits<Arg1>::StorageIndex StorageIndex;
  typedef typename Arg1::Nested Arg1Nested;
  typedef typename Arg2::Nested Arg2Nested;
  typedef typename Arg3::Nested Arg3Nested;
  typedef typename remove_reference<Arg1Nested>::type _Arg1Nested;
  typedef typename remove_reference<Arg2Nested>::type _Arg2Nested;
  typedef typename remove_reference<Arg3Nested>::type _Arg3Nested;
  // Only the storage order (RowMajorBit) of the first argument is exposed as a flag here.
  enum { Flags = _Arg1Nested::Flags & RowMajorBit };
 };
 }  // end namespace internal
 template <typename TernaryOp, typename Arg1, typename Arg2, typename Arg3,
          typename StorageKind>
 class CwiseTernaryOpImpl;
 /** \class CwiseTernaryOp
   * \ingroup Core_Module
   *
   * \brief Generic expression where a coefficient-wise ternary operator is
   * applied to three expressions
   *
   * \tparam TernaryOp template functor implementing the operator
   * \tparam Arg1Type the type of the first argument
   * \tparam Arg2Type the type of the second argument
   * \tparam Arg3Type the type of the third argument
   *
   * This class represents an expression where a coefficient-wise ternary
   * operator is applied to three expressions.
   * It is the return type of ternary operators, by which we mean only those
   * ternary operators where all three arguments are Eigen expressions.
   * For example, the return type of betainc(matrix1, matrix2, matrix3) is a
   * CwiseTernaryOp.
   *
   * Most of the time, this is the only way that it is used, so you typically
   * don't have to name CwiseTernaryOp types explicitly.
   *
   * \sa MatrixBase::ternaryExpr(const MatrixBase<Argument2> &, const
   * MatrixBase<Argument3> &, const CustomTernaryOp &) const, class CwiseBinaryOp,
   * class CwiseUnaryOp, class CwiseNullaryOp
   */
 template <typename TernaryOp, typename Arg1Type, typename Arg2Type,
          typename Arg3Type>
 class CwiseTernaryOp : public CwiseTernaryOpImpl<
                           TernaryOp, Arg1Type, Arg2Type, Arg3Type,
                           typename internal::traits<Arg1Type>::StorageKind>,
                       internal::no_assignment_operator
 {
 public:
  // Argument types with all qualifiers/references removed.
  typedef typename internal::remove_all<Arg1Type>::type Arg1;
  typedef typename internal::remove_all<Arg2Type>::type Arg2;
  typedef typename internal::remove_all<Arg3Type>::type Arg3;
  typedef typename CwiseTernaryOpImpl<
      TernaryOp, Arg1Type, Arg2Type, Arg3Type,
      typename internal::traits<Arg1Type>::StorageKind>::Base Base;
  EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseTernaryOp)
  // Types used to store the three arguments inside the expression
  // (selected by internal::ref_selector), and the same types without reference.
  typedef typename internal::ref_selector<Arg1Type>::type Arg1Nested;
  typedef typename internal::ref_selector<Arg2Type>::type Arg2Nested;
  typedef typename internal::ref_selector<Arg3Type>::type Arg3Nested;
  typedef typename internal::remove_reference<Arg1Nested>::type _Arg1Nested;
  typedef typename internal::remove_reference<Arg2Nested>::type _Arg2Nested;
  typedef typename internal::remove_reference<Arg3Nested>::type _Arg3Nested;
  /** Builds the expression from its three arguments and the functor.
    *
    * \param a1 first argument
    * \param a2 second argument
    * \param a3 third argument
    * \param func functor applied coefficient-wise (default-constructed if omitted)
    *
    * Statically asserts that the compile-time sizes and the storage kinds of
    * \a a2 and \a a3 match those of \a a1, and asserts at runtime that all
    * three arguments have the same number of rows and columns.
    */
  EIGEN_DEVICE_FUNC
  EIGEN_STRONG_INLINE CwiseTernaryOp(const Arg1& a1, const Arg2& a2,
                                     const Arg3& a3,
                                     const TernaryOp& func = TernaryOp())
      : m_arg1(a1), m_arg2(a2), m_arg3(a3), m_functor(func) {
    // require the sizes to match
    EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Arg1, Arg2)
    EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Arg1, Arg3)
    // The storage kinds should match
    EIGEN_STATIC_ASSERT((internal::is_same<
                         typename internal::traits<Arg1Type>::StorageKind,
                         typename internal::traits<Arg2Type>::StorageKind>::value),
                        STORAGE_KIND_MUST_MATCH)
    EIGEN_STATIC_ASSERT((internal::is_same<
                         typename internal::traits<Arg1Type>::StorageKind,
                         typename internal::traits<Arg3Type>::StorageKind>::value),
                        STORAGE_KIND_MUST_MATCH)
    // runtime check: all three arguments must have identical dimensions
    eigen_assert(a1.rows() == a2.rows() && a1.cols() == a2.cols() &&
                 a1.rows() == a3.rows() && a1.cols() == a3.cols());
  }
  /** \returns the number of rows of the expression */
  EIGEN_DEVICE_FUNC
  EIGEN_STRONG_INLINE Index rows() const {
    // return the fixed size type if available to enable compile time
    // optimizations: when the first and second arguments both have a Dynamic
    // row count, query the third; otherwise, when the first and third are both
    // Dynamic, query the second; in every other case query the first.
    if (internal::traits<typename internal::remove_all<Arg1Nested>::type>::
                RowsAtCompileTime == Dynamic &&
        internal::traits<typename internal::remove_all<Arg2Nested>::type>::
                RowsAtCompileTime == Dynamic)
      return m_arg3.rows();
    else if (internal::traits<typename internal::remove_all<Arg1Nested>::type>::
                     RowsAtCompileTime == Dynamic &&
             internal::traits<typename internal::remove_all<Arg3Nested>::type>::
                     RowsAtCompileTime == Dynamic)
      return m_arg2.rows();
    else
      return m_arg1.rows();
  }
  /** \returns the number of columns of the expression */
  EIGEN_DEVICE_FUNC
  EIGEN_STRONG_INLINE Index cols() const {
    // return the fixed size type if available to enable compile time
    // optimizations (same selection scheme as rows(), applied to columns)
    if (internal::traits<typename internal::remove_all<Arg1Nested>::type>::
                ColsAtCompileTime == Dynamic &&
        internal::traits<typename internal::remove_all<Arg2Nested>::type>::
                ColsAtCompileTime == Dynamic)
      return m_arg3.cols();
    else if (internal::traits<typename internal::remove_all<Arg1Nested>::type>::
                     ColsAtCompileTime == Dynamic &&
             internal::traits<typename internal::remove_all<Arg3Nested>::type>::
                     ColsAtCompileTime == Dynamic)
      return m_arg2.cols();
    else
      return m_arg1.cols();
  }
  /** \returns the first argument nested expression */
  EIGEN_DEVICE_FUNC
  const _Arg1Nested& arg1() const { return m_arg1; }
  /** \returns the second argument nested expression */
  EIGEN_DEVICE_FUNC
  const _Arg2Nested& arg2() const { return m_arg2; }
  /** \returns the third argument nested expression */
  EIGEN_DEVICE_FUNC
  const _Arg3Nested& arg3() const { return m_arg3; }
  /** \returns the functor representing the ternary operation */
  EIGEN_DEVICE_FUNC
  const TernaryOp& functor() const { return m_functor; }
 protected:
  Arg1Nested m_arg1;          // first argument, stored as selected by ref_selector
  Arg2Nested m_arg2;          // second argument
  Arg3Nested m_arg3;          // third argument
  const TernaryOp m_functor;  // copy of the functor passed to the constructor
 };
 // Generic API dispatcher
 // Base class of CwiseTernaryOp, parameterized on the storage kind. This
 // generic version only derives from internal::generic_xpr_base<...>::type
 // and re-exports that type as Base; it adds no members of its own.
 template <typename TernaryOp, typename Arg1, typename Arg2, typename Arg3,
          typename StorageKind>
 class CwiseTernaryOpImpl
    : public internal::generic_xpr_base<
          CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> >::type {
 public:
  typedef typename internal::generic_xpr_base<
      CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> >::type Base;
 };
 }  // end namespace Eigen
 #endif  // EIGEN_CWISE_TERNARY_OP_H
--- a/Eigen/src/Core/DenseBase.h
+++ b/Eigen/src/Core/DenseBase.h
@ -34,17 +34,15 @@ static inline void check_DenseIndex_is_signed() {
  * \tparam Derived is the derived type, e.g., a matrix type or an expression.
  *
  * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_DENSEBASE_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_DENSEBASE_PLUGIN.
  *
  * \sa \blank \ref TopicClassHierarchy
  */
 template<typename Derived> class DenseBase
 #ifndef EIGEN_PARSED_BY_DOXYGEN
  : public internal::special_scalar_op_base<Derived, typename internal::traits<Derived>::Scalar,
                                            typename NumTraits<typename internal::traits<Derived>::Scalar>::Real,
                                            DenseCoeffsBase<Derived> >
 #else
  : public DenseCoeffsBase<Derived>
 #else
  : public DenseCoeffsBase<Derived,DirectWriteAccessors>
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 {
  public:
@ -73,10 +71,8 @@ template<typename Derived> class DenseBase
    typedef Scalar value_type;
    typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef internal::special_scalar_op_base<Derived,Scalar,RealScalar, DenseCoeffsBase<Derived> > Base;
+    typedef DenseCoeffsBase<Derived> Base;
    using Base::operator*;
    using Base::operator/;
    using Base::derived;
    using Base::const_cast_derived;
    using Base::rows;
@ -562,12 +558,15 @@ template<typename Derived> class DenseBase
    EIGEN_DEVICE_FUNC void reverseInPlace();
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::DenseBase
 #define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 #define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND)
 #   include "../plugins/BlockMethods.h"
 #   ifdef EIGEN_DENSEBASE_PLUGIN
 #     include EIGEN_DENSEBASE_PLUGIN
 #   endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
-
+#undef EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 #undef EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF
    // disable the use of evalTo for dense objects with a nice compilation error
    template<typename Dest>
--- a/Eigen/src/Core/DenseStorage.h
+++ b/Eigen/src/Core/DenseStorage.h
@ -67,13 +67,13 @@ struct plain_array
  template<typename PtrType>
  EIGEN_ALWAYS_INLINE PtrType eigen_unaligned_array_assert_workaround_gcc47(PtrType array) { return array; }
  #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \
-    eigen_assert((reinterpret_cast<size_t>(eigen_unaligned_array_assert_workaround_gcc47(array)) & (sizemask)) == 0 \
+    eigen_assert((internal::UIntPtr(eigen_unaligned_array_assert_workaround_gcc47(array)) & (sizemask)) == 0 \
              && "this assertion is explained here: " \
              "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \
              " **** READ THIS WEB PAGE !!! ****");
 #else
  #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \
-    eigen_assert((reinterpret_cast<size_t>(array) & (sizemask)) == 0 \
+    eigen_assert((internal::UIntPtr(array) & (sizemask)) == 0 \
              && "this assertion is explained here: " \
              "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \
              " **** READ THIS WEB PAGE !!! ****");
@ -362,9 +362,9 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
      }
      return *this;
    }
-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+#if EIGEN_HAS_RVALUE_REFERENCES
    EIGEN_DEVICE_FUNC
-    DenseStorage(DenseStorage&& other)
+    DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT
      : m_data(std::move(other.m_data))
      , m_rows(std::move(other.m_rows))
      , m_cols(std::move(other.m_cols))
@ -374,7 +374,7 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
      other.m_cols = 0;
    }
    EIGEN_DEVICE_FUNC
-    DenseStorage& operator=(DenseStorage&& other)
+    DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
    {
      using std::swap;
      swap(m_data, other.m_data);
@ -441,9 +441,9 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
      }
      return *this;
    }    
-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+#if EIGEN_HAS_RVALUE_REFERENCES
    EIGEN_DEVICE_FUNC
-    DenseStorage(DenseStorage&& other)
+    DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT
      : m_data(std::move(other.m_data))
      , m_cols(std::move(other.m_cols))
    {
@ -451,7 +451,7 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
      other.m_cols = 0;
    }
    EIGEN_DEVICE_FUNC
-    DenseStorage& operator=(DenseStorage&& other)
+    DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
    {
      using std::swap;
      swap(m_data, other.m_data);
@ -514,9 +514,9 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
      }
      return *this;
    }    
-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+#if EIGEN_HAS_RVALUE_REFERENCES
    EIGEN_DEVICE_FUNC
-    DenseStorage(DenseStorage&& other)
+    DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT
      : m_data(std::move(other.m_data))
      , m_rows(std::move(other.m_rows))
    {
@ -524,7 +524,7 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
      other.m_rows = 0;
    }
    EIGEN_DEVICE_FUNC
-    DenseStorage& operator=(DenseStorage&& other)
+    DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
    {
      using std::swap;
      swap(m_data, other.m_data);
--- a/Eigen/src/Core/DiagonalMatrix.h
+++ b/Eigen/src/Core/DiagonalMatrix.h
@ -71,18 +71,17 @@ class DiagonalBase : public EigenBase<Derived>
      return InverseReturnType(diagonal().cwiseInverse());
    }
    typedef DiagonalWrapper<const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DiagonalVectorType> > ScalarMultipleReturnType;
    EIGEN_DEVICE_FUNC
-    inline const ScalarMultipleReturnType
+    inline const DiagonalWrapper<const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DiagonalVectorType,Scalar,product) >
    operator*(const Scalar& scalar) const
    {
-      return ScalarMultipleReturnType(diagonal() * scalar);
+      return DiagonalWrapper<const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DiagonalVectorType,Scalar,product) >(diagonal() * scalar);
    }
    EIGEN_DEVICE_FUNC
-    friend inline const ScalarMultipleReturnType
+    friend inline const DiagonalWrapper<const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,DiagonalVectorType,product) >
    operator*(const Scalar& scalar, const DiagonalBase& other)
    {
-      return ScalarMultipleReturnType(other.diagonal() * scalar);
+      return DiagonalWrapper<const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,DiagonalVectorType,product) >(scalar * other.diagonal());
    }
 };
@ -317,19 +316,19 @@ struct Diagonal2Dense {};
 template<> struct AssignmentKind<DenseShape,DiagonalShape> { typedef Diagonal2Dense Kind; };
 // Diagonal matrix to Dense assignment
-template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
+template< typename DstXprType, typename SrcXprType, typename Functor>
-struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Dense, Scalar>
+struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Dense>
 {
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
  {
    dst.setZero();
    dst.diagonal() = src.diagonal();
  }
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar> &/*func*/)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
  { dst.diagonal() += src.diagonal(); }
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar> &/*func*/)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
  { dst.diagonal() -= src.diagonal(); }
 };
--- a/Eigen/src/Core/Dot.h
+++ b/Eigen/src/Core/Dot.h
@ -28,22 +28,24 @@ template<typename T, typename U,
 >
 struct dot_nocheck
 {
-  typedef typename scalar_product_traits<typename traits<T>::Scalar,typename traits<U>::Scalar>::ReturnType ResScalar;
+  typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod;
  typedef typename conj_prod::result_type ResScalar;
  EIGEN_DEVICE_FUNC
  static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
  {
-    return a.template binaryExpr<scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> >(b).sum();
+    return a.template binaryExpr<conj_prod>(b).sum();
  }
 };
 template<typename T, typename U>
 struct dot_nocheck<T, U, true>
 {
-  typedef typename scalar_product_traits<typename traits<T>::Scalar,typename traits<U>::Scalar>::ReturnType ResScalar;
+  typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod;
  typedef typename conj_prod::result_type ResScalar;
  EIGEN_DEVICE_FUNC
  static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
  {
-    return a.transpose().template binaryExpr<scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> >(b).sum();
+    return a.transpose().template binaryExpr<conj_prod>(b).sum();
  }
 };
@ -62,7 +64,7 @@ struct dot_nocheck<T, U, true>
 template<typename Derived>
 template<typename OtherDerived>
 EIGEN_DEVICE_FUNC
-typename internal::scalar_product_traits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
+typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
 MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@ -227,9 +229,12 @@ struct lpNorm_selector<Derived, 2>
 template<typename Derived>
 struct lpNorm_selector<Derived, Infinity>
 {
  typedef typename NumTraits<typename traits<Derived>::Scalar>::Real RealScalar;
  EIGEN_DEVICE_FUNC
-  static inline typename NumTraits<typename traits<Derived>::Scalar>::Real run(const MatrixBase<Derived>& m)
+  static inline RealScalar run(const MatrixBase<Derived>& m)
  {
    if(Derived::SizeAtCompileTime==0 || (Derived::SizeAtCompileTime==Dynamic && m.size()==0))
      return RealScalar(0);
    return m.cwiseAbs().maxCoeff();
  }
 };
@ -240,6 +245,8 @@ struct lpNorm_selector<Derived, Infinity>
  *          of the coefficients of \c *this. If \a p is the special value \a Eigen::Infinity, this function returns the \f$ \ell^\infty \f$
  *          norm, that is the maximum of the absolute values of the coefficients of \c *this.
  *
  * In all cases, if \c *this is empty, then the value 0 is returned.
  *
  * \note For matrices, this function does not compute the <a href="https://en.wikipedia.org/wiki/Operator_norm">operator-norm</a>. That is, if \c *this is a matrix, then its coefficients are interpreted as a 1D vector. Nonetheless, you can easily compute the 1-norm and \f$\infty\f$-norm matrix operator norms using \link TutorialReductionsVisitorsBroadcastingReductionsNorm partial reductions \endlink.
  *
  * \sa norm()
--- a/Eigen/src/Core/EigenBase.h
+++ b/Eigen/src/Core/EigenBase.h
@ -138,7 +138,7 @@ template<typename Derived>
 template<typename OtherDerived>
 Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)
 {
-  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar>());
+  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
  return derived();
 }
@ -146,7 +146,7 @@ template<typename Derived>
 template<typename OtherDerived>
 Derived& DenseBase<Derived>::operator-=(const EigenBase<OtherDerived> &other)
 {
-  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar>());
+  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
  return derived();
 }
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h
@ -159,20 +159,20 @@ struct gemv_static_vector_if<Scalar,Size,Dynamic,true>
 template<typename Scalar,int Size,int MaxSize>
 struct gemv_static_vector_if<Scalar,Size,MaxSize,true>
 {
  #if EIGEN_MAX_STATIC_ALIGN_BYTES!=0
  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize),0> m_data;
  EIGEN_STRONG_INLINE Scalar* data() { return m_data.array; }
  #else
  // Some architectures cannot align on the stack,
  // => let's manually enforce alignment by allocating more data and return the address of the first aligned element.
  enum {
    ForceAlignment  = internal::packet_traits<Scalar>::Vectorizable,
    PacketSize      = internal::packet_traits<Scalar>::size
  };
-  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?PacketSize:0),0> m_data;
+  #if EIGEN_MAX_STATIC_ALIGN_BYTES!=0
  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize),0,EIGEN_PLAIN_ENUM_MIN(AlignedMax,PacketSize)> m_data;
  EIGEN_STRONG_INLINE Scalar* data() { return m_data.array; }
  #else
  // Some architectures cannot align on the stack,
  // => let's manually enforce alignment by allocating more data and return the address of the first aligned element.
  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?EIGEN_MAX_ALIGN_BYTES:0),0> m_data;
  EIGEN_STRONG_INLINE Scalar* data() {
    return ForceAlignment
-            ? reinterpret_cast<Scalar*>((reinterpret_cast<size_t>(m_data.array) & ~(size_t(EIGEN_MAX_ALIGN_BYTES-1))) + EIGEN_MAX_ALIGN_BYTES)
+            ? reinterpret_cast<Scalar*>((internal::UIntPtr(m_data.array) & ~(std::size_t(EIGEN_MAX_ALIGN_BYTES-1))) + EIGEN_MAX_ALIGN_BYTES)
            : m_data.array;
  }
  #endif
@ -207,7 +207,7 @@ template<> struct gemv_dense_selector<OnTheRight,ColMajor,true>
    typedef internal::blas_traits<Rhs> RhsBlasTraits;
    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
-    typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+    typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
    ActualLhsType actualLhs = LhsBlasTraits::extract(lhs);
    ActualRhsType actualRhs = RhsBlasTraits::extract(rhs);
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@ -62,6 +62,7 @@ struct default_packet_traits
    HasRsqrt  = 0,
    HasExp    = 0,
    HasLog    = 0,
    HasLog1p  = 0,
    HasLog10  = 0,
    HasPow    = 0,
@ -82,6 +83,7 @@ struct default_packet_traits
    HasErfc = 0,
    HasIGamma = 0,
    HasIGammac = 0,
    HasBetaInc = 0,
    HasRound  = 0,
    HasFloor  = 0,
@ -304,7 +306,7 @@ template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* a
  // 32-bit pointer operand constraint for inlined asm
  asm(" prefetch.L1 [ %1 ];" : "=r"(addr) : "r"(addr));
 #endif
-#elif !EIGEN_COMP_MSVC
+#elif (!EIGEN_COMP_MSVC) && (EIGEN_COMP_GNUC || EIGEN_COMP_CLANG || EIGEN_COMP_ICC)
  __builtin_prefetch(addr);
 #endif
 }
@ -346,22 +348,6 @@ template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Pack
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a)
 { return a; }
 template<size_t offset, typename Packet>
 struct protate_impl
 {
  // Empty so attempts to use this unimplemented path will fail to compile.
  // Only specializations of this template should be used.
 };
 /** \internal \returns a packet with the coefficients rotated to the right in little-endian convention,
  * by the given offset, e.g. for offset == 1:
  *     (packet[3], packet[2], packet[1], packet[0]) becomes (packet[0], packet[3], packet[2], packet[1])
  */
 template<size_t offset, typename Packet> EIGEN_DEVICE_FUNC inline Packet protate(const Packet& a)
 {
  return offset ? protate_impl<offset, Packet>::run(a) : a;
 }
 /** \internal \returns \a a with real and imaginary part flipped (for complex type only) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a)
 {
@ -419,6 +405,10 @@ Packet pexp(const Packet& a) { using std::exp; return exp(a); }
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet plog(const Packet& a) { using std::log; return log(a); }
 /** \internal \returns the log1p of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet plog1p(const Packet& a) { return numext::log1p(a); }
 /** \internal \returns the log10 of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet plog10(const Packet& a) { using std::log10; return log10(a); }
@ -445,38 +435,6 @@ Packet pfloor(const Packet& a) { using numext::floor; return floor(a); }
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pceil(const Packet& a) { using numext::ceil; return ceil(a); }
 /** \internal \returns the ln(|gamma(\a a)|) (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet plgamma(const Packet& a) { using numext::lgamma; return lgamma(a); }
 /** \internal \returns the derivative of lgamma, psi(\a a) (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pdigamma(const Packet& a) { using numext::digamma; return digamma(a); }
 /** \internal \returns the zeta function of two arguments (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pzeta(const Packet& x, const Packet& q) { using numext::zeta; return zeta(x, q); }
 /** \internal \returns the polygamma function (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet ppolygamma(const Packet& n, const Packet& x) { using numext::polygamma; return polygamma(n, x); }
 /** \internal \returns the erf(\a a) (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet perf(const Packet& a) { using numext::erf; return erf(a); }
 /** \internal \returns the erfc(\a a) (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet perfc(const Packet& a) { using numext::erfc; return erfc(a); }
 /** \internal \returns the incomplete gamma function igamma(\a a, \a x) */
 template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 Packet pigamma(const Packet& a, const Packet& x) { using numext::igamma; return igamma(a, x); }
 /** \internal \returns the complementary incomplete gamma function igammac(\a a, \a x) */
 template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 Packet pigammac(const Packet& a, const Packet& x) { using numext::igammac; return igammac(a, x); }
 /***************************************************************************
 * The following functions might not have to be overwritten for vectorized types
 ***************************************************************************/
--- a/Eigen/src/Core/GlobalFunctions.h
+++ b/Eigen/src/Core/GlobalFunctions.h
@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2010-2012 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2010-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2010 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@ -11,13 +11,30 @@
 #ifndef EIGEN_GLOBAL_FUNCTIONS_H
 #define EIGEN_GLOBAL_FUNCTIONS_H
-#define EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(NAME,FUNCTOR) \
+#ifdef EIGEN_PARSED_BY_DOXYGEN
 #define EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(NAME,FUNCTOR,DOC_OP,DOC_DETAILS) \
  /** \returns an expression of the coefficient-wise DOC_OP of \a x
    DOC_DETAILS
    \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_##NAME">Math functions</a>, class CwiseUnaryOp
    */ \
  template<typename Derived> \
  inline const Eigen::CwiseUnaryOp<Eigen::internal::FUNCTOR<typename Derived::Scalar>, const Derived> \
  NAME(const Eigen::ArrayBase<Derived>& x);
 #else
 #define EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(NAME,FUNCTOR,DOC_OP,DOC_DETAILS) \
  template<typename Derived> \
  inline const Eigen::CwiseUnaryOp<Eigen::internal::FUNCTOR<typename Derived::Scalar>, const Derived> \
  (NAME)(const Eigen::ArrayBase<Derived>& x) { \
    return Eigen::CwiseUnaryOp<Eigen::internal::FUNCTOR<typename Derived::Scalar>, const Derived>(x.derived()); \
  }
 #endif // EIGEN_PARSED_BY_DOXYGEN
 #define EIGEN_ARRAY_DECLARE_GLOBAL_EIGEN_UNARY(NAME,FUNCTOR) \
  \
  template<typename Derived> \
@ -36,47 +53,68 @@
 namespace Eigen
 {
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(real,scalar_real_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(real,scalar_real_op,real part,\sa ArrayBase::real)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(imag,scalar_imag_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(imag,scalar_imag_op,imaginary part,\sa ArrayBase::imag)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(conj,scalar_conjugate_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(conj,scalar_conjugate_op,complex conjugate,\sa ArrayBase::conjugate)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(inverse,scalar_inverse_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(inverse,scalar_inverse_op,inverse,\sa ArrayBase::inverse)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sin,scalar_sin_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sin,scalar_sin_op,sine,\sa ArrayBase::sin)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cos,scalar_cos_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cos,scalar_cos_op,cosine,\sa ArrayBase::cos)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tan,scalar_tan_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tan,scalar_tan_op,tangent,\sa ArrayBase::tan)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atan,scalar_atan_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atan,scalar_atan_op,arc-tangent,\sa ArrayBase::atan)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asin,scalar_asin_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asin,scalar_asin_op,arc-sine,\sa ArrayBase::asin)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acos,scalar_acos_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acos,scalar_acos_op,arc-cosine,\sa ArrayBase::acos)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh,scalar_sinh_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh,scalar_sinh_op,hyperbolic sine,\sa ArrayBase::sinh)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op,hyperbolic cosine,\sa ArrayBase::cosh)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op,hyperbolic tangent,\sa ArrayBase::tanh)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op,natural logarithm of the gamma function,\sa ArrayBase::lgamma)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op,derivative of lgamma,\sa ArrayBase::digamma)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(zeta,scalar_zeta_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf,scalar_erf_op,error function,\sa ArrayBase::erf)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(polygamma,scalar_polygamma_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erfc,scalar_erfc_op,complementary error function,\sa ArrayBase::erfc)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf,scalar_erf_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op,exponential,\sa ArrayBase::exp)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erfc,scalar_erfc_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op,natural logarithm,\sa Eigen::log10 DOXCOMMA ArrayBase::log)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log1p,scalar_log1p_op,natural logarithm of 1 plus the value,\sa ArrayBase::log1p)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log10,scalar_log10_op,base 10 logarithm,\sa Eigen::log DOXCOMMA ArrayBase::log10)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log10,scalar_log10_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs,scalar_abs_op,absolute value,\sa ArrayBase::abs DOXCOMMA MatrixBase::cwiseAbs)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs,scalar_abs_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs2,scalar_abs2_op,squared absolute value,\sa ArrayBase::abs2 DOXCOMMA MatrixBase::cwiseAbs2)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs2,scalar_abs2_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(arg,scalar_arg_op,complex argument,\sa ArrayBase::arg)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(arg,scalar_arg_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sqrt,scalar_sqrt_op,square root,\sa ArrayBase::sqrt DOXCOMMA MatrixBase::cwiseSqrt)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sqrt,scalar_sqrt_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(rsqrt,scalar_rsqrt_op,reciprocal square root,\sa ArrayBase::rsqrt)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(square,scalar_square_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(square,scalar_square_op,square (power 2),\sa Eigen::abs2 DOXCOMMA Eigen::pow DOXCOMMA ArrayBase::square)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cube,scalar_cube_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cube,scalar_cube_op,cube (power 3),\sa Eigen::pow DOXCOMMA ArrayBase::cube)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(round,scalar_round_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(round,scalar_round_op,nearest integer,\sa Eigen::floor DOXCOMMA Eigen::ceil DOXCOMMA ArrayBase::round)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(floor,scalar_floor_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(floor,scalar_floor_op,nearest integer not greater than the given value,\sa Eigen::ceil DOXCOMMA ArrayBase::floor)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(ceil,scalar_ceil_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(ceil,scalar_ceil_op,nearest integer not less than the given value,\sa Eigen::floor DOXCOMMA ArrayBase::ceil)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isnan,scalar_isnan_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isnan,scalar_isnan_op,not-a-number test,\sa Eigen::isinf DOXCOMMA Eigen::isfinite DOXCOMMA ArrayBase::isnan)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isinf,scalar_isinf_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isinf,scalar_isinf_op,infinite value test,\sa Eigen::isnan DOXCOMMA Eigen::isfinite DOXCOMMA ArrayBase::isinf)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isfinite,scalar_isfinite_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isfinite,scalar_isfinite_op,finite value test,\sa Eigen::isinf DOXCOMMA Eigen::isnan DOXCOMMA ArrayBase::isfinite)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sign,scalar_sign_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sign,scalar_sign_op,sign (or 0),\sa ArrayBase::sign)
  /** \returns an expression of the coefficient-wise power of \a x to the given constant \a exponent.
    *
    * \tparam ScalarExponent is the scalar type of \a exponent. It must be compatible with the scalar type of the given expression (\c Derived::Scalar).
    *
    * \sa ArrayBase::pow()
    *
    * \relates ArrayBase
    */
 #ifdef EIGEN_PARSED_BY_DOXYGEN
  template<typename Derived,typename ScalarExponent>
  inline const CwiseBinaryOp<internal::scalar_pow_op<Derived::Scalar,ScalarExponent>,Derived,Constant<ScalarExponent> >
  pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent);
 #else
  template<typename Derived,typename ScalarExponent>
  inline typename internal::enable_if<   !(internal::is_same<typename Derived::Scalar,ScalarExponent>::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent),
          const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,ScalarExponent,pow) >::type
  pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent) {
    return x.derived().pow(exponent);
  }
  template<typename Derived>
-  inline const Eigen::CwiseUnaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar>, const Derived>
+  inline const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename Derived::Scalar,pow)
  pow(const Eigen::ArrayBase<Derived>& x, const typename Derived::Scalar& exponent) {
    return x.derived().pow(exponent);
  }
 #endif
  /** \returns an expression of the coefficient-wise power of \a x to the given array of \a exponents.
    *
@ -86,12 +124,14 @@ namespace Eigen
    * Output: \verbinclude Cwise_array_power_array.out
    * 
    * \sa ArrayBase::pow()
    *
    * \relates ArrayBase
    */
  template<typename Derived,typename ExponentDerived>
-  inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_binary_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>
+  inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>
  pow(const Eigen::ArrayBase<Derived>& x, const Eigen::ArrayBase<ExponentDerived>& exponents) 
  {
-    return Eigen::CwiseBinaryOp<Eigen::internal::scalar_binary_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>(
+    return Eigen::CwiseBinaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>(
      x.derived(),
      exponents.derived()
    );
@ -100,66 +140,39 @@ namespace Eigen
  /** \returns an expression of the coefficient-wise power of the scalar \a x to the given array of \a exponents.
    *
    * This function computes the coefficient-wise power between a scalar and an array of exponents.
-    * Beaware that the scalar type of the input scalar \a x and the exponents \a exponents must be the same.
+    *
    * \tparam Scalar is the scalar type of \a x. It must be compatible with the scalar type of the given array expression (\c Derived::Scalar).
    *
    * Example: \include Cwise_scalar_power_array.cpp
    * Output: \verbinclude Cwise_scalar_power_array.out
    * 
    * \sa ArrayBase::pow()
    *
    * \relates ArrayBase
    */
 #ifdef EIGEN_PARSED_BY_DOXYGEN
  template<typename Scalar,typename Derived>
  inline const CwiseBinaryOp<internal::scalar_pow_op<Scalar,Derived::Scalar>,Constant<Scalar>,Derived>
  pow(const Scalar& x,const Eigen::ArrayBase<Derived>& x);
 #else
  template<typename Scalar, typename Derived>
  inline typename internal::enable_if<   !(internal::is_same<typename Derived::Scalar,Scalar>::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar),
          const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow) >::type
  pow(const Scalar& x, const Eigen::ArrayBase<Derived>& exponents)
  {
    return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow)(
            typename internal::plain_constant_type<Derived,Scalar>::type(exponents.rows(), exponents.cols(), x), exponents.derived() );
  }
  template<typename Derived>
-  inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_binary_pow_op<typename Derived::Scalar, typename Derived::Scalar>, const typename Derived::ConstantReturnType, const Derived>
+  inline const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow)
  pow(const typename Derived::Scalar& x, const Eigen::ArrayBase<Derived>& exponents)
  {
-    typename Derived::ConstantReturnType constant_x(exponents.rows(), exponents.cols(), x);
+    return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow)(
-    return Eigen::CwiseBinaryOp<Eigen::internal::scalar_binary_pow_op<typename Derived::Scalar, typename Derived::Scalar>, const typename Derived::ConstantReturnType, const Derived>(
+      typename internal::plain_constant_type<Derived,typename Derived::Scalar>::type(exponents.rows(), exponents.cols(), x), exponents.derived() );
      constant_x,
      exponents.derived()
    );
  }
 #endif
  /**
  * \brief Component-wise division of a scalar by array elements.
  **/
  template <typename Derived>
  inline const Eigen::CwiseUnaryOp<Eigen::internal::scalar_inverse_mult_op<typename Derived::Scalar>, const Derived>
    operator/(const typename Derived::Scalar& s, const Eigen::ArrayBase<Derived>& a)
  {
    return Eigen::CwiseUnaryOp<Eigen::internal::scalar_inverse_mult_op<typename Derived::Scalar>, const Derived>(
      a.derived(),
      Eigen::internal::scalar_inverse_mult_op<typename Derived::Scalar>(s)  
    );
  }
  /** \returns an expression of the coefficient-wise igamma(\a a, \a x) to the given arrays.
    *
    * This function computes the coefficient-wise incomplete gamma function.
    *
    */
  template<typename Derived,typename ExponentDerived>
  inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
  igamma(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x) 
  {
    return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>(
      a.derived(),
      x.derived()
    );
  }
  /** \returns an expression of the coefficient-wise igammac(\a a, \a x) to the given arrays.
    *
    * This function computes the coefficient-wise complementary incomplete gamma function.
    *
    */
  template<typename Derived,typename ExponentDerived>
  inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
  igammac(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x) 
  {
    return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>(
      a.derived(),
      x.derived()
    );
  }
  namespace internal
  {
--- a/Eigen/src/Core/IO.h
+++ b/Eigen/src/Core/IO.h
@ -125,31 +125,17 @@ DenseBase<Derived>::format(const IOFormat& fmt) const
 namespace internal {
-template<typename Scalar, bool IsInteger>
+// NOTE: This helper is kept for backward compatibility with previous code specializing
-struct significant_decimals_default_impl
+//       this internal::significant_decimals_impl structure. In the future we should directly
-{
+//       call digits10() which has been introduced in July 2016 in 3.3.
  typedef typename NumTraits<Scalar>::Real RealScalar;
  static inline int run()
  {
    using std::ceil;
    using std::log;
    return cast<RealScalar,int>(ceil(-log(NumTraits<RealScalar>::epsilon())/log(RealScalar(10))));
  }
 };
 template<typename Scalar>
 struct significant_decimals_default_impl<Scalar, true>
 {
  static inline int run()
  {
    return 0;
  }
 };
 template<typename Scalar>
 struct significant_decimals_impl
-  : significant_decimals_default_impl<Scalar, NumTraits<Scalar>::IsInteger>
+{
-{};
+  static inline int run()
  {
    return NumTraits<Scalar>::digits10();
  }
 };
 /** \internal
  * print the matrix \a _m to the output stream \a s using the output format \a fmt */
--- a/Eigen/src/Core/Inverse.h
+++ b/Eigen/src/Core/Inverse.h
@ -50,7 +50,7 @@ public:
  typedef typename internal::ref_selector<Inverse>::type Nested;
  typedef typename internal::remove_all<XprType>::type NestedExpression;
-  explicit Inverse(const XprType &xpr)
+  explicit EIGEN_DEVICE_FUNC Inverse(const XprType &xpr)
    : m_xpr(xpr)
  {}
--- a/Eigen/src/Core/MapBase.h
+++ b/Eigen/src/Core/MapBase.h
@ -17,10 +17,20 @@
 namespace Eigen { 
-/** \class MapBase
+/** \ingroup Core_Module
  * \ingroup Core_Module
  *
-  * \brief Base class for Map and Block expression with direct access
+  * \brief Base class for dense Map and Block expression with direct access
  *
  * This base class provides the const low-level accessors (e.g. coeff, coeffRef) of dense
  * Map and Block objects with direct access.
  * Typical users do not have to directly deal with this class.
  *
  * This class can be extended through the plugin macro \c EIGEN_MAPBASE_PLUGIN.
  * See \link TopicCustomizing_Plugins customizing Eigen \endlink for details.
  *
  * The \c Derived class has to provide the following two methods describing the memory layout:
  *  \code Index innerStride() const; \endcode
  *  \code Index outerStride() const; \endcode
  *
  * \sa class Map, class Block
  */
@ -75,7 +85,9 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
    typedef typename Base::CoeffReturnType CoeffReturnType;
    /** \copydoc DenseBase::rows() */
    EIGEN_DEVICE_FUNC inline Index rows() const { return m_rows.value(); }
    /** \copydoc DenseBase::cols() */
    EIGEN_DEVICE_FUNC inline Index cols() const { return m_cols.value(); }
    /** Returns a pointer to the first coefficient of the matrix or vector.
@ -86,12 +98,14 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
      */
    EIGEN_DEVICE_FUNC inline const Scalar* data() const { return m_data; }
    /** \copydoc PlainObjectBase::coeff(Index,Index) const */
    EIGEN_DEVICE_FUNC
    inline const Scalar& coeff(Index rowId, Index colId) const
    {
      return m_data[colId * colStride() + rowId * rowStride()];
    }
    /** \copydoc PlainObjectBase::coeff(Index) const */
    EIGEN_DEVICE_FUNC
    inline const Scalar& coeff(Index index) const
    {
@ -99,12 +113,14 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
      return m_data[index * innerStride()];
    }
    /** \copydoc PlainObjectBase::coeffRef(Index,Index) const */
    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index rowId, Index colId) const
    {
      return this->m_data[colId * colStride() + rowId * rowStride()];
    }
    /** \copydoc PlainObjectBase::coeffRef(Index) const */
    EIGEN_DEVICE_FUNC
    inline const Scalar& coeffRef(Index index) const
    {
@ -112,6 +128,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
      return this->m_data[index * innerStride()];
    }
    /** \internal */
    template<int LoadMode>
    inline PacketScalar packet(Index rowId, Index colId) const
    {
@ -119,6 +136,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
               (m_data + (colId * colStride() + rowId * rowStride()));
    }
    /** \internal */
    template<int LoadMode>
    inline PacketScalar packet(Index index) const
    {
@ -126,6 +144,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
      return internal::ploadt<PacketScalar, LoadMode>(m_data + index * innerStride());
    }
    /** \internal Constructor for fixed size matrices or vectors */
    EIGEN_DEVICE_FUNC
    explicit inline MapBase(PointerType dataPtr) : m_data(dataPtr), m_rows(RowsAtCompileTime), m_cols(ColsAtCompileTime)
    {
@ -133,6 +152,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
      checkSanity<Derived>();
    }
    /** \internal Constructor for dynamically sized vectors */
    EIGEN_DEVICE_FUNC
    inline MapBase(PointerType dataPtr, Index vecSize)
            : m_data(dataPtr),
@ -145,6 +165,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
      checkSanity<Derived>();
    }
    /** \internal Constructor for dynamically sized matrices */
    EIGEN_DEVICE_FUNC
    inline MapBase(PointerType dataPtr, Index rows, Index cols)
            : m_data(dataPtr), m_rows(rows), m_cols(cols)
@ -166,7 +187,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
    void checkSanity(typename internal::enable_if<(internal::traits<T>::Alignment>0),void*>::type = 0) const
    {
 #if EIGEN_MAX_ALIGN_BYTES>0
-      eigen_assert((   ((size_t(m_data) % internal::traits<Derived>::Alignment) == 0)
+      eigen_assert((   ((internal::UIntPtr(m_data) % internal::traits<Derived>::Alignment) == 0)
                    || (cols() * rows() * innerStride() * sizeof(Scalar)) < internal::traits<Derived>::Alignment ) && "data is not aligned");
 #endif
    }
@ -181,6 +202,16 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
    const internal::variable_if_dynamic<Index, ColsAtCompileTime> m_cols;
 };
 /** \ingroup Core_Module
  *
  * \brief Base class for non-const dense Map and Block expression with direct access
  *
  * This base class provides the non-const low-level accessors (e.g. coeff and coeffRef) of
  * dense Map and Block objects with direct access.
  * It inherits MapBase<Derived, ReadOnlyAccessors> which defines the const variant for reading specific entries.
  *
  * \sa class Map, class Block
  */
 template<typename Derived> class MapBase<Derived, WriteAccessors>
  : public MapBase<Derived, ReadOnlyAccessors>
 {
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@ -11,7 +11,9 @@
 #define EIGEN_MATHFUNCTIONS_H
 // source: http://www.geom.uiuc.edu/~huberty/math5337/groupe/digits.html
-#define EIGEN_PI 3.141592653589793238462643383279502884197169399375105820974944592307816406
+// TODO this should better be moved to NumTraits
 #define EIGEN_PI 3.141592653589793238462643383279502884197169399375105820974944592307816406L
 namespace Eigen {
@ -95,6 +97,19 @@ struct real_default_impl<Scalar,true>
 template<typename Scalar> struct real_impl : real_default_impl<Scalar> {};
 #ifdef __CUDA_ARCH__
 template<typename T>
 struct real_impl<std::complex<T> >
 {
  typedef T RealScalar;
  EIGEN_DEVICE_FUNC
  static inline T run(const std::complex<T>& x)
  {
    return x.real();
  }
 };
 #endif
 template<typename Scalar>
 struct real_retval
 {
@ -130,6 +145,19 @@ struct imag_default_impl<Scalar,true>
 template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {};
 #ifdef __CUDA_ARCH__
 template<typename T>
 struct imag_impl<std::complex<T> >
 {
  typedef T RealScalar;
  EIGEN_DEVICE_FUNC
  static inline T run(const std::complex<T>& x)
  {
    return x.imag();
  }
 };
 #endif
 template<typename Scalar>
 struct imag_retval
 {
@ -457,30 +485,33 @@ struct arg_retval
 /****************************************************************************
 * Implementation of log1p                                                   *
 ****************************************************************************/
-template<typename Scalar, bool isComplex = NumTraits<Scalar>::IsComplex >
+
-struct log1p_impl
+namespace std_fallback {
-{
+  // fallback log1p implementation in case there is no log1p(Scalar) function in namespace of Scalar,
-  static inline Scalar run(const Scalar& x)
+  // or that there is no suitable std::log1p function available
-  {
+  template<typename Scalar>
  EIGEN_DEVICE_FUNC inline Scalar log1p(const Scalar& x) {
    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
    typedef typename NumTraits<Scalar>::Real RealScalar;
    EIGEN_USING_STD_MATH(log);
    Scalar x1p = RealScalar(1) + x;
    return ( x1p == Scalar(1) ) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) );
  }
-};
+}
 #if EIGEN_HAS_CXX11_MATH
 template<typename Scalar>
-struct log1p_impl<Scalar, false> {
+struct log1p_impl {
  static inline Scalar run(const Scalar& x)
  {
    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
    #if EIGEN_HAS_CXX11_MATH
    using std::log1p;
    #endif
    using std_fallback::log1p;
    return log1p(x);
  }
 };
-#endif
+
 template<typename Scalar>
 struct log1p_retval
@ -492,24 +523,26 @@ struct log1p_retval
 * Implementation of pow                                                  *
 ****************************************************************************/
-template<typename Scalar, bool IsInteger>
+template<typename ScalarX,typename ScalarY, bool IsInteger = NumTraits<ScalarX>::IsInteger&&NumTraits<ScalarY>::IsInteger>
-struct pow_default_impl
+struct pow_impl
 {
-  typedef Scalar retval;
+  //typedef Scalar retval;
-  static EIGEN_DEVICE_FUNC inline Scalar run(const Scalar& x, const Scalar& y)
+  typedef typename ScalarBinaryOpTraits<ScalarX,ScalarY,internal::scalar_pow_op<ScalarX,ScalarY> >::ReturnType result_type;
  static EIGEN_DEVICE_FUNC inline result_type run(const ScalarX& x, const ScalarY& y)
  {
    EIGEN_USING_STD_MATH(pow);
    return pow(x, y);
  }
 };
-template<typename Scalar>
+template<typename ScalarX,typename ScalarY>
-struct pow_default_impl<Scalar, true>
+struct pow_impl<ScalarX,ScalarY, true>
 {
-  static EIGEN_DEVICE_FUNC inline Scalar run(Scalar x, Scalar y)
+  typedef ScalarX result_type;
  static EIGEN_DEVICE_FUNC inline ScalarX run(ScalarX x, ScalarY y)
  {
-    Scalar res(1);
+    ScalarX res(1);
-    eigen_assert(!NumTraits<Scalar>::IsSigned || y >= 0);
+    eigen_assert(!NumTraits<ScalarY>::IsSigned || y >= 0);
    if(y & 1) res *= x;
    y >>= 1;
    while(y)
@ -522,15 +555,6 @@ struct pow_default_impl<Scalar, true>
  }
 };
 template<typename Scalar>
 struct pow_impl : pow_default_impl<Scalar, NumTraits<Scalar>::IsInteger> {};
 template<typename Scalar>
 struct pow_retval
 {
  typedef Scalar type;
 };
 /****************************************************************************
 * Implementation of random                                               *
 ****************************************************************************/
@ -620,16 +644,18 @@ struct random_default_impl<Scalar, false, true>
    typedef typename conditional<NumTraits<Scalar>::IsSigned,std::ptrdiff_t,std::size_t>::type ScalarX;
    if(y<x)
      return x;
    // the following difference might overflow on a 32 bits system,
    // but since y>=x the result converted to an unsigned long is still correct.
    std::size_t range = ScalarX(y)-ScalarX(x);
    std::size_t offset = 0;
    // rejection sampling
-    std::size_t divisor    = (range+RAND_MAX-1)/(range+1);
+    std::size_t divisor = 1;
-    std::size_t multiplier = (range+RAND_MAX-1)/std::size_t(RAND_MAX);
+    std::size_t multiplier = 1;
-
+    if(range<RAND_MAX) divisor = (std::size_t(RAND_MAX)+1)/(range+1);
    else               multiplier = 1 + range/(std::size_t(RAND_MAX)+1);
    do {
-      offset = ( (std::size_t(std::rand()) * multiplier) / divisor );
+      offset = (std::size_t(std::rand()) * multiplier) / divisor;
    } while (offset > range);
    return Scalar(ScalarX(x) + offset);
  }
@ -790,6 +816,8 @@ template<typename T> EIGEN_DEVICE_FUNC bool isfinite_impl(const std::complex<T>&
 template<typename T> EIGEN_DEVICE_FUNC bool isnan_impl(const std::complex<T>& x);
 template<typename T> EIGEN_DEVICE_FUNC bool isinf_impl(const std::complex<T>& x);
 template<typename T> T generic_fast_tanh_float(const T& a_x);
 } // end namespace internal
 /****************************************************************************
@ -825,7 +853,7 @@ template<>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y)
 {
-  return fmin(x, y);
+  return fminf(x, y);
 }
 template<typename T>
 EIGEN_DEVICE_FUNC
@ -837,7 +865,7 @@ template<>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y)
 {
-  return fmax(x, y);
+  return fmaxf(x, y);
 }
 #endif
@ -926,11 +954,19 @@ inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x)
  return EIGEN_MATHFUNC_IMPL(log1p, Scalar)::run(x);
 }
-template<typename Scalar>
+#ifdef __CUDACC__
 // Explicit specialization for float: forwards to the global-namespace ::log1pf.
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float log1p(const float &x) { return ::log1pf(x); }
 // Explicit specialization for double: forwards to the global-namespace ::log1p.
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 double log1p(const double &x) { return ::log1p(x); }
 #endif
 template<typename ScalarX,typename ScalarY>
 EIGEN_DEVICE_FUNC
-inline EIGEN_MATHFUNC_RETVAL(pow, Scalar) pow(const Scalar& x, const Scalar& y)
+inline typename internal::pow_impl<ScalarX,ScalarY>::result_type pow(const ScalarX& x, const ScalarY& y)
 {
-  return EIGEN_MATHFUNC_IMPL(pow, Scalar)::run(x, y);
+  return internal::pow_impl<ScalarX,ScalarY>::run(x, y);
 }
 template<typename T> EIGEN_DEVICE_FUNC bool (isnan)   (const T &x) { return internal::isnan_impl(x); }
@ -1036,6 +1072,16 @@ float abs(const float &x) { return ::fabsf(x); }
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 double abs(const double &x) { return ::fabs(x); }
 // Explicit specialization for std::complex<float>: the magnitude is computed
 // as ::hypotf(real, imag) and returned as a float.
 template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float abs(const std::complex<float>& x) {
  return ::hypotf(x.real(), x.imag());
 }
 // Explicit specialization for std::complex<double>: the magnitude is computed
 // as ::hypot(real, imag) and returned as a double.
 template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 double abs(const std::complex<double>& x) {
  return ::hypot(x.real(), x.imag());
 }
 #endif
 template<typename T>
@ -1181,6 +1227,11 @@ T tanh(const T &x) {
  return tanh(x);
 }
 // Non-template float overload of tanh that forwards to
 // internal::generic_fast_tanh_float. It is only compiled when __CUDACC__ is
 // not defined and EIGEN_FAST_MATH evaluates to a non-zero value.
 #if (!defined(__CUDACC__)) && EIGEN_FAST_MATH
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float tanh(float x) { return internal::generic_fast_tanh_float(x); }
 #endif
 #ifdef __CUDACC__
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float tanh(const float &x) { return ::tanhf(x); }
@ -1192,7 +1243,7 @@ double tanh(const double &x) { return ::tanh(x); }
 template <typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T fmod(const T& a, const T& b) {
-  EIGEN_USING_STD_MATH(floor);
+  EIGEN_USING_STD_MATH(fmod);
  return fmod(a, b);
 }
@ -1287,11 +1338,12 @@ template<typename Scalar>
 struct scalar_fuzzy_default_impl<Scalar, true, false>
 {
  typedef typename NumTraits<Scalar>::Real RealScalar;
-  template<typename OtherScalar>
+  template<typename OtherScalar> EIGEN_DEVICE_FUNC
  static inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y, const RealScalar& prec)
  {
    return numext::abs2(x) <= numext::abs2(y) * prec * prec;
  }
  EIGEN_DEVICE_FUNC
  static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar& prec)
  {
    return numext::abs2(x - y) <= numext::mini(numext::abs2(x), numext::abs2(y)) * prec * prec;
--- a/Eigen/src/Core/MathFunctionsImpl.h
+++ b/Eigen/src/Core/MathFunctionsImpl.h
@ -0,0 +1,78 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
 // Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
 // Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #ifndef EIGEN_MATHFUNCTIONSIMPL_H
 #define EIGEN_MATHFUNCTIONSIMPL_H
 namespace Eigen {
 namespace internal {
 /** \internal \returns the hyperbolic tan of \a a (coeff-wise)
    Doesn't do anything fancy, just a 13/6-degree rational interpolant which
    is accurate up to a couple of ulp in the range [-9, 9], outside of which
    the tanh(x) = +/-1.
    This implementation works on both scalars and packets.
 */
 // Evaluates tanh as a ratio p(x)/q(x) of two polynomials, after clamping the
 // input to [-9, 9]. Every arithmetic step goes through the generic helpers
 // pset1, pmin, pmax, pmul, pmadd and pdiv, so the same body is used for any T
 // for which those helpers are available.
 template<typename T>
 T generic_fast_tanh_float(const T& a_x)
 {
  // Clamp the inputs to the range [-9, 9] since anything outside
  // this range is +/-1.0f in single-precision.
  const T plus_9 = pset1<T>(9.f);
  const T minus_9 = pset1<T>(-9.f);
  // NOTE GCC prior to 6.3 might improperly optimize this max/min
  //      step such that if a_x is nan, x will be either 9 or -9,
  //      and tanh will return 1 or -1 instead of nan.
  //      This is supposed to be fixed in gcc6.3,
  //      see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
  const T x = pmax(minus_9,pmin(plus_9,a_x));
  // The monomial coefficients of the numerator polynomial (odd).
  const T alpha_1 = pset1<T>(4.89352455891786e-03f);
  const T alpha_3 = pset1<T>(6.37261928875436e-04f);
  const T alpha_5 = pset1<T>(1.48572235717979e-05f);
  const T alpha_7 = pset1<T>(5.12229709037114e-08f);
  const T alpha_9 = pset1<T>(-8.60467152213735e-11f);
  const T alpha_11 = pset1<T>(2.00018790482477e-13f);
  const T alpha_13 = pset1<T>(-2.76076847742355e-16f);
  // The monomial coefficients of the denominator polynomial (even).
  const T beta_0 = pset1<T>(4.89352518554385e-03f);
  const T beta_2 = pset1<T>(2.26843463243900e-03f);
  const T beta_4 = pset1<T>(1.18534705686654e-04f);
  const T beta_6 = pset1<T>(1.19825839466702e-06f);
  // Since the polynomials are odd/even, we need x^2.
  const T x2 = pmul(x, x);
  // Evaluate the numerator polynomial p.
  // NOTE(review): presumably pmadd(a,b,c) computes a*b+c, so the chain below
  // accumulates from the highest coefficient (alpha_13) down to alpha_1 in
  // powers of x^2 -- confirm against the pmadd definition.
  T p = pmadd(x2, alpha_13, alpha_11);
  p = pmadd(x2, p, alpha_9);
  p = pmadd(x2, p, alpha_7);
  p = pmadd(x2, p, alpha_5);
  p = pmadd(x2, p, alpha_3);
  p = pmadd(x2, p, alpha_1);
  // The final multiplication by x makes the numerator odd in x.
  p = pmul(x, p);
  // Evaluate the denominator polynomial q.
  T q = pmadd(x2, beta_6, beta_4);
  q = pmadd(x2, q, beta_2);
  q = pmadd(x2, q, beta_0);
  // Divide the numerator by the denominator.
  return pdiv(p, q);
 }
 } // end namespace internal
 } // end namespace Eigen
 #endif // EIGEN_MATHFUNCTIONSIMPL_H
--- a/Eigen/src/Core/Matrix.h
+++ b/Eigen/src/Core/Matrix.h
@ -27,7 +27,7 @@ private:
      default_alignment = compute_default_alignment<_Scalar,max_size>::value,
      actual_alignment = ((_Options&DontAlign)==0) ? default_alignment : 0,
      required_alignment = unpacket_traits<PacketScalar>::alignment,
-      packet_access_bit = packet_traits<_Scalar>::Vectorizable && (actual_alignment>=required_alignment) ? PacketAccessBit : 0
+      packet_access_bit = (packet_traits<_Scalar>::Vectorizable && (EIGEN_UNALIGNED_VECTORIZE || (actual_alignment>=required_alignment))) ? PacketAccessBit : 0
    };
 public:
@ -106,7 +106,7 @@ public:
  * \endcode
  *
  * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_MATRIX_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_MATRIX_PLUGIN.
  *
  * <i><b>Some notes:</b></i>
  *
@ -268,9 +268,9 @@ class Matrix
      : Base(internal::constructor_without_unaligned_array_assert())
    { Base::_check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED }
-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+#if EIGEN_HAS_RVALUE_REFERENCES
    EIGEN_DEVICE_FUNC
-    Matrix(Matrix&& other)
+    Matrix(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible<Scalar>::value)
      : Base(std::move(other))
    {
      Base::_check_template_params();
@ -278,7 +278,7 @@ class Matrix
        Base::_set_noalias(other);
    }
    EIGEN_DEVICE_FUNC
-    Matrix& operator=(Matrix&& other)
+    Matrix& operator=(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value)
    {
      other.swap(*this);
      return *this;
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@ -41,7 +41,7 @@ namespace Eigen {
  * \endcode
  *
  * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_MATRIXBASE_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_MATRIXBASE_PLUGIN.
  *
  * \sa \blank \ref TopicClassHierarchy
  */
@ -80,8 +80,6 @@ template<typename Derived> class MatrixBase
    using Base::operator-=;
    using Base::operator*=;
    using Base::operator/=;
    using Base::operator*;
    using Base::operator/;
    typedef typename Base::CoeffReturnType CoeffReturnType;
    typedef typename Base::ConstTransposeReturnType ConstTransposeReturnType;
@ -100,7 +98,7 @@ template<typename Derived> class MatrixBase
    /** \returns the size of the main diagonal, which is min(rows(),cols()).
      * \sa rows(), cols(), SizeAtCompileTime. */
    EIGEN_DEVICE_FUNC
-    inline Index diagonalSize() const { return (std::min)(rows(),cols()); }
+    inline Index diagonalSize() const { return (numext::mini)(rows(),cols()); }
    typedef typename Base::PlainObject PlainObject;
@ -123,6 +121,7 @@ template<typename Derived> class MatrixBase
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::MatrixBase
 #define EIGEN_DOC_UNARY_ADDONS(X,Y)
 #   include "../plugins/CommonCwiseUnaryOps.h"
 #   include "../plugins/CommonCwiseBinaryOps.h"
 #   include "../plugins/MatrixCwiseUnaryOps.h"
@ -131,6 +130,7 @@ template<typename Derived> class MatrixBase
 #     include EIGEN_MATRIXBASE_PLUGIN
 #   endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
 #undef EIGEN_DOC_UNARY_ADDONS
    /** Special case of the template operator=, in order to prevent the compiler
      * from generating a default operator= (issue hit with g++ 4.1)
@ -195,7 +195,7 @@ template<typename Derived> class MatrixBase
    template<typename OtherDerived>
    EIGEN_DEVICE_FUNC
-    typename internal::scalar_product_traits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
+    typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
    dot(const MatrixBase<OtherDerived>& other) const;
    EIGEN_DEVICE_FUNC RealScalar squaredNorm() const;
@ -330,15 +330,11 @@ template<typename Derived> class MatrixBase
 /////////// LU module ///////////
    EIGEN_DEVICE_FUNC
    inline const FullPivLU<PlainObject> fullPivLu() const;
    EIGEN_DEVICE_FUNC
    inline const PartialPivLU<PlainObject> partialPivLu() const;
    EIGEN_DEVICE_FUNC
    inline const PartialPivLU<PlainObject> lu() const;
    EIGEN_DEVICE_FUNC
    inline const Inverse<Derived> inverse() const;
    template<typename ResultType>
@ -383,7 +379,7 @@ template<typename Derived> class MatrixBase
    #ifndef EIGEN_PARSED_BY_DOXYGEN
    /// \internal helper struct to form the return type of the cross product
    template<typename OtherDerived> struct cross_product_return_type {
-      typedef typename internal::scalar_product_traits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType Scalar;
+      typedef typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType Scalar;
      typedef Matrix<Scalar,MatrixBase::RowsAtCompileTime,MatrixBase::ColsAtCompileTime> type;
    };
    #endif // EIGEN_PARSED_BY_DOXYGEN
@ -405,7 +401,6 @@ template<typename Derived> class MatrixBase
    inline Matrix<Scalar,3,1> eulerAngles(Index a0, Index a1, Index a2) const;
    inline ScalarMultipleReturnType operator*(const UniformScaling<Scalar>& s) const;
    // put this as separate enum value to work around possible GCC 4.3 bug (?)
    enum { HomogeneousReturnTypeDirection = ColsAtCompileTime==1&&RowsAtCompileTime==1 ? ((internal::traits<Derived>::Flags&RowMajorBit)==RowMajorBit ? Horizontal : Vertical)
                                          : ColsAtCompileTime==1 ? Vertical : Horizontal };
@ -418,8 +413,7 @@ template<typename Derived> class MatrixBase
    typedef Block<const Derived,
                  internal::traits<Derived>::ColsAtCompileTime==1 ? SizeMinusOne : 1,
                  internal::traits<Derived>::ColsAtCompileTime==1 ? 1 : SizeMinusOne> ConstStartMinusOne;
-    typedef CwiseUnaryOp<internal::scalar_quotient1_op<typename internal::traits<Derived>::Scalar>,
+    typedef EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(ConstStartMinusOne,Scalar,quotient) HNormalizedReturnType;
                const ConstStartMinusOne > HNormalizedReturnType;
    inline const HNormalizedReturnType hnormalized() const;
--- a/Eigen/src/Core/NoAlias.h
+++ b/Eigen/src/Core/NoAlias.h
@ -39,7 +39,7 @@ class NoAlias
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE ExpressionType& operator=(const StorageBase<OtherDerived>& other)
    {
-      call_assignment_no_alias(m_expression, other.derived(), internal::assign_op<Scalar>());
+      call_assignment_no_alias(m_expression, other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
      return m_expression;
    }
@ -47,7 +47,7 @@ class NoAlias
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE ExpressionType& operator+=(const StorageBase<OtherDerived>& other)
    {
-      call_assignment_no_alias(m_expression, other.derived(), internal::add_assign_op<Scalar>());
+      call_assignment_no_alias(m_expression, other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
      return m_expression;
    }
@ -55,7 +55,7 @@ class NoAlias
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE ExpressionType& operator-=(const StorageBase<OtherDerived>& other)
    {
-      call_assignment_no_alias(m_expression, other.derived(), internal::sub_assign_op<Scalar>());
+      call_assignment_no_alias(m_expression, other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
      return m_expression;
    }
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@ -12,6 +12,37 @@
 namespace Eigen {
 namespace internal {
 // default implementation of digits10(), based on numeric_limits if specialized,
 // 0 for integer types, and ceil(-log10(epsilon())) otherwise.
 // Primary template. use_numeric_limits defaults to
 // std::numeric_limits<T>::is_specialized and is_integer defaults to
 // NumTraits<T>::IsInteger. This generic version reports
 // std::numeric_limits<T>::digits10; the <T,false,false> and <T,false,true>
 // cases are handled by dedicated specializations.
 template< typename T,
          bool use_numeric_limits = std::numeric_limits<T>::is_specialized,
          bool is_integer = NumTraits<T>::IsInteger>
 struct default_digits10_impl
 {
  // Returns std::numeric_limits<T>::digits10.
  static int run() { return std::numeric_limits<T>::digits10; }
 };
 // Specialization selected when use_numeric_limits is false and is_integer is
 // false: the digit count is derived from NumTraits<Real>::epsilon().
 template<typename T>
 struct default_digits10_impl<T,false,false> // Floating point
 {
  // Returns ceil(-log10(epsilon)) converted to int, where epsilon is taken
  // from NumTraits of the Real type associated with T.
  static int run() {
    // using-declarations followed by unqualified calls: std::log10/std::ceil
    // are candidates, alongside any overloads found by argument-dependent lookup.
    using std::log10;
    using std::ceil;
    typedef typename NumTraits<T>::Real Real;
    return int(ceil(-log10(NumTraits<Real>::epsilon())));
  }
 };
 // Specialization selected when use_numeric_limits is false and is_integer is
 // true: always reports 0 digits.
 template<typename T>
 struct default_digits10_impl<T,false,true> // Integer
 {
  // Returns 0.
  static int run() { return 0; }
 };
 } // end namespace internal
 /** \class NumTraits
  * \ingroup Core_Module
  *
@ -22,14 +53,16 @@ namespace Eigen {
  * This class stores enums, typedefs and static methods giving information about a numeric type.
  *
  * The provided data consists of:
-  * \li A typedef \a Real, giving the "real part" type of \a T. If \a T is already real,
+  * \li A typedef \c Real, giving the "real part" type of \a T. If \a T is already real,
-  *     then \a Real is just a typedef to \a T. If \a T is \c std::complex<U> then \a Real
+  *     then \c Real is just a typedef to \a T. If \a T is \c std::complex<U> then \c Real
  *     is a typedef to \a U.
-  * \li A typedef \a NonInteger, giving the type that should be used for operations producing non-integral values,
+  * \li A typedef \c NonInteger, giving the type that should be used for operations producing non-integral values,
  *     such as quotients, square roots, etc. If \a T is a floating-point type, then this typedef just gives
  *     \a T again. Note however that many Eigen functions such as internal::sqrt simply refuse to
  *     take integers. Outside of a few cases, Eigen doesn't do automatic type promotion. Thus, this typedef is
  *     only intended as a helper for code that needs to explicitly promote types.
  * \li A typedef \c Literal giving the type to use for numeric literals such as "2" or "0.5". For instance, for \c std::complex<U>, Literal is defined as \c U.
  *     Of course, this type must be fully compatible with \a T. In doubt, just use \a T here.
  * \li A typedef \a Nested giving the type to use to nest a value inside of the expression tree. If you don't know what
  *     this means, just use \a T here.
  * \li An enum value \a IsComplex. It is equal to 1 if \a T is a \c std::complex
@ -42,10 +75,14 @@ namespace Eigen {
  * \li An enum value \a IsSigned. It is equal to \c 1 if \a T is a signed type and to 0 if \a T is unsigned.
  * \li An enum value \a RequireInitialization. It is equal to \c 1 if the constructor of the numeric type \a T must
  *     be called, and to 0 if it is safe not to call it. Default is 0 if \a T is an arithmetic type, and 1 otherwise.
-  * \li An epsilon() function which, unlike std::numeric_limits::epsilon(), returns a \a Real instead of a \a T.
+  * \li An epsilon() function which, unlike <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/epsilon">std::numeric_limits::epsilon()</a>,
  *     returns a \a Real instead of a \a T.
  * \li A dummy_precision() function returning a weak epsilon value. It is mainly used as a default
  *     value by the fuzzy comparison operators.
  * \li highest() and lowest() functions returning the highest and lowest possible values respectively.
  * \li digits10() function returning the number of decimal digits that can be represented without change. This is
  *     the analogue of <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/digits10">std::numeric_limits<T>::digits10</a>
  *     which is used as the default implementation if specialized.
  */
 template<typename T> struct GenericNumTraits
@ -60,23 +97,6 @@ template<typename T> struct GenericNumTraits
    MulCost = 1
  };
  // Division is messy but important, because it is expensive and throughput
  // varies significantly. The following numbers are based on min division
  // throughput on Haswell.
  template<bool Vectorized>
  struct Div {
    enum {
 #ifdef EIGEN_VECTORIZE_AVX
      AVX = true,
 #else
      AVX = false,
 #endif
      Cost = IsInteger ? (sizeof(T) == 8 ? (IsSigned ? 24 : 21) : (IsSigned ? 8 : 9)):
          Vectorized ? (sizeof(T) == 8 ? (AVX ? 16 : 8) : (AVX ? 14 : 7)) : 8
    };
  };
  typedef T Real;
  typedef typename internal::conditional<
                     IsInteger,
@ -84,12 +104,20 @@ template<typename T> struct GenericNumTraits
                     T
                   >::type NonInteger;
  typedef T Nested;
  typedef T Literal;
  // Returns numext::numeric_limits<T>::epsilon(), with Real as the return type.
  EIGEN_DEVICE_FUNC
  static inline Real epsilon()
  {
    return numext::numeric_limits<T>::epsilon();
  }
  // Returns the value computed by internal::default_digits10_impl<T>::run().
  EIGEN_DEVICE_FUNC
  static inline int digits10()
  {
    return internal::default_digits10_impl<T>::run();
  }
  EIGEN_DEVICE_FUNC
  static inline Real dummy_precision()
  {
@ -145,6 +173,7 @@ template<typename _Real> struct NumTraits<std::complex<_Real> >
  : GenericNumTraits<std::complex<_Real> >
 {
  typedef _Real Real;
  typedef typename NumTraits<_Real>::Literal Literal;
  enum {
    IsComplex = 1,
    RequireInitialization = NumTraits<_Real>::RequireInitialization,
@ -157,6 +186,8 @@ template<typename _Real> struct NumTraits<std::complex<_Real> >
  static inline Real epsilon() { return NumTraits<Real>::epsilon(); }
  EIGEN_DEVICE_FUNC
  static inline Real dummy_precision() { return NumTraits<Real>::dummy_precision(); }
  EIGEN_DEVICE_FUNC
  static inline int digits10() { return NumTraits<Real>::digits10(); }
 };
 template<typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
@ -168,6 +199,7 @@ struct NumTraits<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
  typedef typename NumTraits<Scalar>::NonInteger NonIntegerScalar;
  typedef Array<NonIntegerScalar, Rows, Cols, Options, MaxRows, MaxCols> NonInteger;
  typedef ArrayType & Nested;
  typedef typename NumTraits<Scalar>::Literal Literal;
  enum {
    IsComplex = NumTraits<Scalar>::IsComplex,
@ -185,6 +217,30 @@ struct NumTraits<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
  static inline RealScalar dummy_precision() { return NumTraits<RealScalar>::dummy_precision(); }
 };
 // NumTraits specialization for std::string, built on top of
 // GenericNumTraits<std::string>.
 template<> struct NumTraits<std::string>
  : GenericNumTraits<std::string>
 {
  enum {
    // Flags that the constructor of std::string must be called.
    RequireInitialization = 1,
    // All operation costs are set to HugeCost.
    ReadCost = HugeCost,
    AddCost  = HugeCost,
    MulCost  = HugeCost
  };
  // Always reports 0 digits for strings.
  static inline int digits10() { return 0; }
 private:
  // The functions below are declared private and no definition is visible
  // here, which makes them inaccessible from outside this struct.
  // NOTE(review): presumably intentional, because these numeric notions have
  // no meaning for std::string -- confirm.
  static inline std::string epsilon();
  static inline std::string dummy_precision();
  static inline std::string lowest();
  static inline std::string highest();
  static inline std::string infinity();
  static inline std::string quiet_NaN();
 };
 // Empty specialization for void to allow template specialization based on NumTraits<T>::Real with T==void and SFINAE.
 template<> struct NumTraits<void> {};
 } // end namespace Eigen
 #endif // EIGEN_NUMTRAITS_H
--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h
@ -59,33 +59,34 @@ template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers> struct m
 } // end namespace internal
 /** \class PlainObjectBase
  * \ingroup Core_Module
  * \brief %Dense storage base class for matrices and arrays.
  *
  * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_PLAINOBJECTBASE_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_PLAINOBJECTBASE_PLUGIN.
  *
  * \sa \ref TopicClassHierarchy
  */
 #ifdef EIGEN_PARSED_BY_DOXYGEN
-namespace internal {
+namespace doxygen {
 // this is a workaround to doxygen not being able to understand the inheritance logic
 // when it is hidden by the dense_xpr_base helper struct.
 /** This class is just a workaround for Doxygen and it does not actually exist. */
-template<typename Derived> struct dense_xpr_base_dispatcher_for_doxygen;// : public MatrixBase<Derived> {};
+template<typename Derived> struct dense_xpr_base_dispatcher;
 /** This class is just a workaround for Doxygen and it does not actually exist. */
 template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-struct dense_xpr_base_dispatcher_for_doxygen<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
+struct dense_xpr_base_dispatcher<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
    : public MatrixBase<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > {};
 /** This class is just a workaround for Doxygen and it does not actually exist. */
 template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-struct dense_xpr_base_dispatcher_for_doxygen<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
+struct dense_xpr_base_dispatcher<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
    : public ArrayBase<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > {};
-} // namespace internal
+} // namespace doxygen
 template<typename Derived>
-class PlainObjectBase : public internal::dense_xpr_base_dispatcher_for_doxygen<Derived>
+class PlainObjectBase : public doxygen::dense_xpr_base_dispatcher<Derived>
 #else
 template<typename Derived>
 class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
@ -145,6 +146,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Index cols() const { return m_storage.cols(); }
    /** This is an overloaded version of DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index,Index) const
      * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
      *
      * See DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index,Index) const for details. */
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE const Scalar& coeff(Index rowId, Index colId) const
    {
@ -154,12 +159,20 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
        return m_storage.data()[rowId + colId * m_storage.rows()];
    }
    /** This is an overloaded version of DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index) const
      * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
      *
      * See DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index) const for details. */
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const
    {
      return m_storage.data()[index];
    }
    /** This is an overloaded version of DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index,Index) const
      * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
      *
      * See DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index,Index) const for details. */
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Scalar& coeffRef(Index rowId, Index colId)
    {
@ -169,12 +182,18 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
        return m_storage.data()[rowId + colId * m_storage.rows()];
    }
    /** This is an overloaded version of DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index) const
      * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
      *
      * See DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index) const for details. */
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
    {
      return m_storage.data()[index];
    }
    /** This is the const version of coeffRef(Index,Index) which is thus a synonym of coeff(Index,Index).
      * It is provided for convenience. */
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE const Scalar& coeffRef(Index rowId, Index colId) const
    {
@ -184,6 +203,8 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
        return m_storage.data()[rowId + colId * m_storage.rows()];
    }
    /** This is the const version of coeffRef(Index) which is thus a synonym of coeff(Index).
      * It is provided for convenience. */
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE const Scalar& coeffRef(Index index) const
    {
@ -471,15 +492,15 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    }
 #endif
-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+#if EIGEN_HAS_RVALUE_REFERENCES
    EIGEN_DEVICE_FUNC
-    PlainObjectBase(PlainObjectBase&& other)
+    PlainObjectBase(PlainObjectBase&& other) EIGEN_NOEXCEPT
      : m_storage( std::move(other.m_storage) )
    {
    }
    EIGEN_DEVICE_FUNC
-    PlainObjectBase& operator=(PlainObjectBase&& other)
+    PlainObjectBase& operator=(PlainObjectBase&& other) EIGEN_NOEXCEPT
    {
      using std::swap;
      swap(m_storage, other.m_storage);
@ -697,7 +718,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      //_resize_to_match(other);
      // the 'false' below means to enforce lazy evaluation. We don't use lazyAssign() because
      // it wouldn't allow to copy a row-vector into a column-vector.
-      internal::call_assignment_no_alias(this->derived(), other.derived(), internal::assign_op<Scalar>());
+      internal::call_assignment_no_alias(this->derived(), other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
      return this->derived();
    }
@ -713,11 +734,11 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
    template<typename T0, typename T1>
    EIGEN_DEVICE_FUNC 
-    EIGEN_STRONG_INLINE void _init2(const Scalar& val0, const Scalar& val1, typename internal::enable_if<Base::SizeAtCompileTime==2,T0>::type* = 0)
+    EIGEN_STRONG_INLINE void _init2(const T0& val0, const T1& val1, typename internal::enable_if<Base::SizeAtCompileTime==2,T0>::type* = 0)
    {
      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 2)
-      m_storage.data()[0] = val0;
+      m_storage.data()[0] = Scalar(val0);
-      m_storage.data()[1] = val1;
+      m_storage.data()[1] = Scalar(val1);
    }
    template<typename T0, typename T1>
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@ -16,39 +16,6 @@ template<typename Lhs, typename Rhs, int Option, typename StorageKind> class Pro
 namespace internal {
 // Determine the scalar of Product<Lhs, Rhs>. This is normally the same as Lhs::Scalar times
 // Rhs::Scalar, but product with permutation matrices inherit the scalar of the other factor.
 // Generic case: the result scalar is whatever scalar_product_traits reports as the
 // ReturnType for the pair (Lhs::Scalar, Rhs::Scalar). The two shape parameters default
 // to the shapes given by evaluator_traits and exist only to select the specializations below.
 template<typename Lhs, typename Rhs, typename LhsShape = typename evaluator_traits<Lhs>::Shape, 
         typename RhsShape = typename evaluator_traits<Rhs>::Shape >
 struct product_result_scalar
 {
  typedef typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
 };
 // Left factor has PermutationShape: the product takes the scalar type of the right factor.
 template<typename Lhs, typename Rhs, typename RhsShape>
 struct product_result_scalar<Lhs, Rhs, PermutationShape, RhsShape>
 {
  typedef typename Rhs::Scalar Scalar;
 };
 // Right factor has PermutationShape: the product takes the scalar type of the left factor.
 template<typename Lhs, typename Rhs, typename LhsShape>
  struct product_result_scalar<Lhs, Rhs, LhsShape, PermutationShape>
 {
  typedef typename Lhs::Scalar Scalar;
 };
 // Left factor has TranspositionsShape: the product takes the scalar type of the right factor.
 template<typename Lhs, typename Rhs, typename RhsShape>
 struct product_result_scalar<Lhs, Rhs, TranspositionsShape, RhsShape>
 {
  typedef typename Rhs::Scalar Scalar;
 };
 // Right factor has TranspositionsShape: the product takes the scalar type of the left factor.
 template<typename Lhs, typename Rhs, typename LhsShape>
  struct product_result_scalar<Lhs, Rhs, LhsShape, TranspositionsShape>
 {
  typedef typename Lhs::Scalar Scalar;
 };
 template<typename Lhs, typename Rhs, int Option>
 struct traits<Product<Lhs, Rhs, Option> >
 {
@ -59,7 +26,7 @@ struct traits<Product<Lhs, Rhs, Option> >
  typedef MatrixXpr XprKind;
-  typedef typename product_result_scalar<LhsCleaned,RhsCleaned>::Scalar Scalar;
+  typedef typename ScalarBinaryOpTraits<typename traits<LhsCleaned>::Scalar, typename traits<RhsCleaned>::Scalar>::ReturnType Scalar;
  typedef typename product_promote_storage_type<typename LhsTraits::StorageKind,
                                                typename RhsTraits::StorageKind,
                                                internal::product_type<Lhs,Rhs>::ret>::ret StorageKind;
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@ -35,22 +35,28 @@ struct evaluator<Product<Lhs, Rhs, Options> >
  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {}
 };
-// Catch scalar * ( A * B ) and transform it to (A*scalar) * B
+// Catch "scalar * ( A * B )" and transform it to "(A*scalar) * B"
 // TODO we should apply that rule only if that's really helpful
-template<typename Lhs, typename Rhs, typename Scalar>
+template<typename Lhs, typename Rhs, typename Scalar1, typename Scalar2, typename Plain1>
-struct evaluator_assume_aliasing<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,  const Product<Lhs, Rhs, DefaultProduct>  > >
+struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
                                               const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>,
                                               const Product<Lhs, Rhs, DefaultProduct> > >
 {
  static const bool value = true;
 };
-template<typename Lhs, typename Rhs, typename Scalar>
+template<typename Lhs, typename Rhs, typename Scalar1, typename Scalar2, typename Plain1>
-struct evaluator<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,  const Product<Lhs, Rhs, DefaultProduct>  > > 
+struct evaluator<CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
- : public evaluator<Product<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,const Lhs>, Rhs, DefaultProduct> >
+                               const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>,
                               const Product<Lhs, Rhs, DefaultProduct> > >
 : public evaluator<Product<EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar1,Lhs,product), Rhs, DefaultProduct> >
 {
-  typedef CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Product<Lhs, Rhs, DefaultProduct> > XprType;
+  typedef CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
-  typedef evaluator<Product<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,const Lhs>, Rhs, DefaultProduct> > Base;
+                               const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>,
                               const Product<Lhs, Rhs, DefaultProduct> > XprType;
  typedef evaluator<Product<EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar1,Lhs,product), Rhs, DefaultProduct> > Base;
  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
-    : Base(xpr.functor().m_other * xpr.nestedExpression().lhs() * xpr.nestedExpression().rhs())
+    : Base(xpr.lhs().functor().m_other * xpr.rhs().lhs() * xpr.rhs().rhs())
  {}
 };
@ -122,13 +128,17 @@ protected:
  PlainObject m_result;
 };
 // The following three shortcuts are enabled only if the scalar types match exactly.
 // TODO: we could enable them for different scalar types when the product is not vectorized.
 // Dense = Product
 template< typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
-struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::assign_op<Scalar>, Dense2Dense,
+struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::assign_op<Scalar,Scalar>, Dense2Dense,
-  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct),Scalar>::type>
+  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
 {
  typedef Product<Lhs,Rhs,Options> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  static EIGEN_STRONG_INLINE
  void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
  {
    // FIXME shall we handle nested_eval here?
    generic_product_impl<Lhs, Rhs>::evalTo(dst, src.lhs(), src.rhs());
@ -137,11 +147,12 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::assign_op<Scal
 // Dense += Product
 template< typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
-struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::add_assign_op<Scalar>, Dense2Dense,
+struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::add_assign_op<Scalar,Scalar>, Dense2Dense,
-  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct),Scalar>::type>
+  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
 {
  typedef Product<Lhs,Rhs,Options> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar> &)
+  static EIGEN_STRONG_INLINE
  void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,Scalar> &)
  {
    // FIXME shall we handle nested_eval here?
    generic_product_impl<Lhs, Rhs>::addTo(dst, src.lhs(), src.rhs());
@ -150,11 +161,12 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::add_assign_op<
 // Dense -= Product
 template< typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
-struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::sub_assign_op<Scalar>, Dense2Dense,
+struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::sub_assign_op<Scalar,Scalar>, Dense2Dense,
-  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct),Scalar>::type>
+  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
 {
  typedef Product<Lhs,Rhs,Options> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar> &)
+  static EIGEN_STRONG_INLINE
  void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,Scalar> &)
  {
    // FIXME shall we handle nested_eval here?
    generic_product_impl<Lhs, Rhs>::subTo(dst, src.lhs(), src.rhs());
@ -165,55 +177,57 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::sub_assign_op<
 // Dense ?= scalar * Product
 // TODO we should apply that rule only if that's really helpful
 // for instance, this is not good for inner products
-template< typename DstXprType, typename Lhs, typename Rhs, typename AssignFunc, typename Scalar, typename ScalarBis>
+template< typename DstXprType, typename Lhs, typename Rhs, typename AssignFunc, typename Scalar, typename ScalarBis, typename Plain>
-struct Assignment<DstXprType, CwiseUnaryOp<internal::scalar_multiple_op<ScalarBis>,
+struct Assignment<DstXprType, CwiseBinaryOp<internal::scalar_product_op<ScalarBis,Scalar>, const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>,Plain>,
-                                           const Product<Lhs,Rhs,DefaultProduct> >, AssignFunc, Dense2Dense, Scalar>
+                                           const Product<Lhs,Rhs,DefaultProduct> >, AssignFunc, Dense2Dense>
 {
-  typedef CwiseUnaryOp<internal::scalar_multiple_op<ScalarBis>,
+  typedef CwiseBinaryOp<internal::scalar_product_op<ScalarBis,Scalar>,
                        const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>,Plain>,
                        const Product<Lhs,Rhs,DefaultProduct> > SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const AssignFunc& func)
+  static EIGEN_STRONG_INLINE
  void run(DstXprType &dst, const SrcXprType &src, const AssignFunc& func)
  {
-    call_assignment_no_alias(dst, (src.functor().m_other * src.nestedExpression().lhs())*src.nestedExpression().rhs(), func);
+    call_assignment_no_alias(dst, (src.lhs().functor().m_other * src.rhs().lhs())*src.rhs().rhs(), func);
  }
 };
 //----------------------------------------
 // Catch "Dense ?= xpr + Product<>" expression to save one temporary
 // FIXME we could probably enable these rules for any product, i.e., not only Dense and DefaultProduct
 // TODO enable it for "Dense ?= xpr - Product<>" as well.
 template<typename OtherXpr, typename Lhs, typename Rhs>
-struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_sum_op<typename OtherXpr::Scalar>, const OtherXpr,
+struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_sum_op<typename OtherXpr::Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, const OtherXpr,
                                               const Product<Lhs,Rhs,DefaultProduct> >, DenseShape > {
  static const bool value = true;
 };
-template<typename DstXprType, typename OtherXpr, typename ProductType, typename Scalar, typename Func1, typename Func2>
+template<typename DstXprType, typename OtherXpr, typename ProductType, typename Func1, typename Func2>
-struct assignment_from_xpr_plus_product
+struct assignment_from_xpr_op_product
 {
-  typedef CwiseBinaryOp<internal::scalar_sum_op<Scalar>, const OtherXpr, const ProductType> SrcXprType;
+  template<typename SrcXprType, typename InitialFunc>
-  static void run(DstXprType &dst, const SrcXprType &src, const Func1& func)
+  static EIGEN_STRONG_INLINE
  void run(DstXprType &dst, const SrcXprType &src, const InitialFunc& /*func*/)
  {
-    call_assignment_no_alias(dst, src.lhs(), func);
+    call_assignment_no_alias(dst, src.lhs(), Func1());
    call_assignment_no_alias(dst, src.rhs(), Func2());
  }
 };
-template< typename DstXprType, typename OtherXpr, typename Lhs, typename Rhs, typename Scalar>
+#define EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(ASSIGN_OP,BINOP,ASSIGN_OP2) \
-struct Assignment<DstXprType, CwiseBinaryOp<internal::scalar_sum_op<Scalar>, const OtherXpr,
+  template< typename DstXprType, typename OtherXpr, typename Lhs, typename Rhs, typename DstScalar, typename SrcScalar, typename OtherScalar,typename ProdScalar> \
-                                           const Product<Lhs,Rhs,DefaultProduct> >, internal::assign_op<Scalar>, Dense2Dense>
+  struct Assignment<DstXprType, CwiseBinaryOp<internal::BINOP<OtherScalar,ProdScalar>, const OtherXpr, \
-  : assignment_from_xpr_plus_product<DstXprType, OtherXpr, Product<Lhs,Rhs,DefaultProduct>, Scalar, internal::assign_op<Scalar>, internal::add_assign_op<Scalar> >
+                                            const Product<Lhs,Rhs,DefaultProduct> >, internal::ASSIGN_OP<DstScalar,SrcScalar>, Dense2Dense> \
-{};
+    : assignment_from_xpr_op_product<DstXprType, OtherXpr, Product<Lhs,Rhs,DefaultProduct>, internal::ASSIGN_OP<DstScalar,OtherScalar>, internal::ASSIGN_OP2<DstScalar,ProdScalar> > \
-template< typename DstXprType, typename OtherXpr, typename Lhs, typename Rhs, typename Scalar>
+  {}
-struct Assignment<DstXprType, CwiseBinaryOp<internal::scalar_sum_op<Scalar>, const OtherXpr,
+
-                                           const Product<Lhs,Rhs,DefaultProduct> >, internal::add_assign_op<Scalar>, Dense2Dense>
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(assign_op,    scalar_sum_op,add_assign_op);
-  : assignment_from_xpr_plus_product<DstXprType, OtherXpr, Product<Lhs,Rhs,DefaultProduct>, Scalar, internal::add_assign_op<Scalar>, internal::add_assign_op<Scalar> >
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(add_assign_op,scalar_sum_op,add_assign_op);
-{};
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(sub_assign_op,scalar_sum_op,sub_assign_op);
-template< typename DstXprType, typename OtherXpr, typename Lhs, typename Rhs, typename Scalar>
+
-struct Assignment<DstXprType, CwiseBinaryOp<internal::scalar_sum_op<Scalar>, const OtherXpr,
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(assign_op,    scalar_difference_op,sub_assign_op);
-                                           const Product<Lhs,Rhs,DefaultProduct> >, internal::sub_assign_op<Scalar>, Dense2Dense>
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(add_assign_op,scalar_difference_op,sub_assign_op);
-  : assignment_from_xpr_plus_product<DstXprType, OtherXpr, Product<Lhs,Rhs,DefaultProduct>, Scalar, internal::sub_assign_op<Scalar>, internal::sub_assign_op<Scalar> >
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(sub_assign_op,scalar_difference_op,add_assign_op);
-{};
+
 //----------------------------------------
 template<typename Lhs, typename Rhs>
@ -243,7 +257,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
 // Column major result
 template<typename Dst, typename Lhs, typename Rhs, typename Func>
-EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
+void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
 {
  evaluator<Rhs> rhsEval(rhs);
  typename nested_eval<Lhs,Rhs::SizeAtCompileTime>::type actual_lhs(lhs);
@ -251,12 +265,12 @@ EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, cons
  // FIXME not very good if rhs is real and lhs complex while alpha is real too
  const Index cols = dst.cols();
  for (Index j=0; j<cols; ++j)
-    func(dst.col(j), rhsEval.coeff(0,j) * actual_lhs);
+    func(dst.col(j), rhsEval.coeff(Index(0),j) * actual_lhs);
 }
 // Row major result
 template<typename Dst, typename Lhs, typename Rhs, typename Func>
-EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
+void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
 {
  evaluator<Lhs> lhsEval(lhs);
  typename nested_eval<Rhs,Lhs::SizeAtCompileTime>::type actual_rhs(rhs);
@ -264,7 +278,7 @@ EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, cons
  // FIXME not very good if lhs is real and rhs complex while alpha is real too
  const Index rows = dst.rows();
  for (Index i=0; i<rows; ++i)
-    func(dst.row(i), lhsEval.coeff(i,0) * actual_rhs);
+    func(dst.row(i), lhsEval.coeff(i,Index(0)) * actual_rhs);
 }
 template<typename Lhs, typename Rhs>
@ -319,19 +333,19 @@ struct generic_product_impl_base
  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
  template<typename Dst>
-  static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  { dst.setZero(); scaleAndAddTo(dst, lhs, rhs, Scalar(1)); }
  template<typename Dst>
-  static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  { scaleAndAddTo(dst,lhs, rhs, Scalar(1)); }
  template<typename Dst>
-  static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  { scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); }
  template<typename Dst>
-  static void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
  { Derived::scaleAndAddTo(dst,lhs,rhs,alpha); }
 };
@ -345,7 +359,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemvProduct>
  typedef typename internal::conditional<int(Side)==OnTheRight,Lhs,Rhs>::type MatrixType;
  template<typename Dest>
-  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  static EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
  {
    internal::gemv_dense_selector<Side,
                            (int(MatrixType::Flags)&RowMajorBit) ? RowMajor : ColMajor,
@ -360,25 +374,25 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
  template<typename Dst>
-  static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  {
    // Same as: dst.noalias() = lhs.lazyProduct(rhs);
    // but easier on the compiler side
-    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op<Scalar>());
+    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op<typename Dst::Scalar,Scalar>());
  }
  template<typename Dst>
-  static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  {
    // dst.noalias() += lhs.lazyProduct(rhs);
-    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::add_assign_op<Scalar>());
+    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::add_assign_op<typename Dst::Scalar,Scalar>());
  }
  template<typename Dst>
-  static inline void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  {
    // dst.noalias() -= lhs.lazyProduct(rhs);
-    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<Scalar>());
+    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<typename Dst::Scalar,Scalar>());
  }
 //   template<typename Dst>
@ -423,6 +437,18 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
    EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::MulCost);
    EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::AddCost);
    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
 #if 0
    std::cerr << "LhsOuterStrideBytes=  " << LhsOuterStrideBytes << "\n";
    std::cerr << "RhsOuterStrideBytes=  " << RhsOuterStrideBytes << "\n";
    std::cerr << "LhsAlignment=         " << LhsAlignment << "\n";
    std::cerr << "RhsAlignment=         " << RhsAlignment << "\n";
    std::cerr << "CanVectorizeLhs=      " << CanVectorizeLhs << "\n";
    std::cerr << "CanVectorizeRhs=      " << CanVectorizeRhs << "\n";
    std::cerr << "CanVectorizeInner=    " << CanVectorizeInner << "\n";
    std::cerr << "EvalToRowMajor=       " << EvalToRowMajor << "\n";
    std::cerr << "Alignment=            " << Alignment << "\n";
    std::cerr << "Flags=                " << Flags << "\n";
 #endif
  }
  // Everything below here is taken from CoeffBasedProduct.h
@ -473,15 +499,12 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
    SameType = is_same<typename LhsNestedCleaned::Scalar,typename RhsNestedCleaned::Scalar>::value,
-    CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit)
+    CanVectorizeRhs = bool(RhsRowMajor) && (RhsFlags & PacketAccessBit) && (ColsAtCompileTime!=1),
-                    && (ColsAtCompileTime == Dynamic || ((ColsAtCompileTime % RhsVecPacketSize) == 0) ),
+    CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit) && (RowsAtCompileTime!=1),
    CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit)
                    && (RowsAtCompileTime == Dynamic || ((RowsAtCompileTime % LhsVecPacketSize) == 0) ),
    EvalToRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
                    : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
-                    : (RhsRowMajor && !CanVectorizeLhs),
+                    : (bool(RhsRowMajor) && !CanVectorizeLhs),
    Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & ~RowMajorBit)
          | (EvalToRowMajor ? RowMajorBit : 0)
@ -492,8 +515,8 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
    LhsOuterStrideBytes = int(LhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename LhsNestedCleaned::Scalar)),
    RhsOuterStrideBytes = int(RhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename RhsNestedCleaned::Scalar)),
-    Alignment = CanVectorizeLhs ? (LhsOuterStrideBytes<0 || (int(LhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,LhsAlignment))!=0 ? 0 : LhsAlignment)
+    Alignment = bool(CanVectorizeLhs) ? (LhsOuterStrideBytes<=0 || (int(LhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,LhsAlignment))!=0 ? 0 : LhsAlignment)
-              : CanVectorizeRhs ? (RhsOuterStrideBytes<0 || (int(RhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,RhsAlignment))!=0 ? 0 : RhsAlignment)
+              : bool(CanVectorizeRhs) ? (RhsOuterStrideBytes<=0 || (int(RhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,RhsAlignment))!=0 ? 0 : RhsAlignment)
              : 0,
    /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
@ -519,8 +542,8 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
   */
  EIGEN_DEVICE_FUNC const CoeffReturnType coeff(Index index) const
  {
-    const Index row = RowsAtCompileTime == 1 ? 0 : index;
+    const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? 0 : index;
-    const Index col = RowsAtCompileTime == 1 ? index : 0;
+    const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? index : 0;
    return (m_lhs.row(row).transpose().cwiseProduct( m_rhs.col(col) )).sum();
  }
@ -538,8 +561,8 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
  template<int LoadMode, typename PacketType>
  const PacketType packet(Index index) const
  {
-    const Index row = RowsAtCompileTime == 1 ? 0 : index;
+    const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? 0 : index;
-    const Index col = RowsAtCompileTime == 1 ? index : 0;
+    const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? index : 0;
    return packet<LoadMode,PacketType>(row,col);
  }
@ -579,7 +602,7 @@ struct etor_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, Load
  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
  {
    etor_product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
-    res =  pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex-1)), rhs.template packet<LoadMode,Packet>(UnrollingIndex-1, col), res);
+    res =  pmadd(pset1<Packet>(lhs.coeff(row, Index(UnrollingIndex-1))), rhs.template packet<LoadMode,Packet>(Index(UnrollingIndex-1), col), res);
  }
 };
@ -589,7 +612,7 @@ struct etor_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, Load
  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
  {
    etor_product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
-    res =  pmadd(lhs.template packet<LoadMode,Packet>(row, UnrollingIndex-1), pset1<Packet>(rhs.coeff(UnrollingIndex-1, col)), res);
+    res =  pmadd(lhs.template packet<LoadMode,Packet>(row, Index(UnrollingIndex-1)), pset1<Packet>(rhs.coeff(Index(UnrollingIndex-1), col)), res);
  }
 };
@ -598,7 +621,7 @@ struct etor_product_packet_impl<RowMajor, 1, Lhs, Rhs, Packet, LoadMode>
 {
  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
  {
-    res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode,Packet>(0, col));
+    res = pmul(pset1<Packet>(lhs.coeff(row, Index(0))),rhs.template packet<LoadMode,Packet>(Index(0), col));
  }
 };
@ -607,7 +630,7 @@ struct etor_product_packet_impl<ColMajor, 1, Lhs, Rhs, Packet, LoadMode>
 {
  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
  {
-    res = pmul(lhs.template packet<LoadMode,Packet>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
+    res = pmul(lhs.template packet<LoadMode,Packet>(row, Index(0)), pset1<Packet>(rhs.coeff(Index(0), col)));
  }
 };
@ -616,7 +639,7 @@ struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
 {
  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
  {
-    res = pset1<Packet>(0);
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
  }
 };
@ -625,7 +648,7 @@ struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
 {
  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
  {
-    res = pset1<Packet>(0);
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
  }
 };
@ -634,7 +657,7 @@ struct etor_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
 {
  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
  {
-    res = pset1<Packet>(0);
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
    for(Index i = 0; i < innerDim; ++i)
      res =  pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode,Packet>(i, col), res);
  }
@ -645,7 +668,7 @@ struct etor_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
 {
  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
  {
-    res = pset1<Packet>(0);
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
    for(Index i = 0; i < innerDim; ++i)
      res =  pmadd(lhs.template packet<LoadMode,Packet>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
  }
@ -730,7 +753,7 @@ template<typename MatrixType, typename DiagonalType, typename Derived, int Produ
 struct diagonal_product_evaluator_base
  : evaluator_base<Derived>
 {
-   typedef typename scalar_product_traits<typename MatrixType::Scalar, typename DiagonalType::Scalar>::ReturnType Scalar;
+   typedef typename ScalarBinaryOpTraits<typename MatrixType::Scalar, typename DiagonalType::Scalar>::ReturnType Scalar;
 public:
  enum {
    CoeffReadCost = NumTraits<Scalar>::MulCost + evaluator<MatrixType>::CoeffReadCost + evaluator<DiagonalType>::CoeffReadCost,
--- a/Eigen/src/Core/Random.h
+++ b/Eigen/src/Core/Random.h
@ -16,8 +16,7 @@ namespace internal {
 template<typename Scalar> struct scalar_random_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_random_op)
-  template<typename Index>
+  inline const Scalar operator() () const { return random<Scalar>(); }
  inline const Scalar operator() (Index, Index = 0) const { return random<Scalar>(); }
 };
 template<typename Scalar>
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@ -38,8 +38,8 @@ public:
  enum {
    MightVectorize = (int(Derived::Flags)&ActualPacketAccessBit)
                  && (functor_traits<Func>::PacketAccess),
-    MayLinearVectorize = MightVectorize && (int(Derived::Flags)&LinearAccessBit),
+    MayLinearVectorize = bool(MightVectorize) && (int(Derived::Flags)&LinearAccessBit),
-    MaySliceVectorize  = MightVectorize && int(InnerMaxSize)>=3*PacketSize
+    MaySliceVectorize  = bool(MightVectorize) && int(InnerMaxSize)>=3*PacketSize
  };
 public:
@ -425,7 +425,7 @@ template<typename Derived>
 EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::minCoeff() const
 {
-  return derived().redux(Eigen::internal::scalar_min_op<Scalar>());
+  return derived().redux(Eigen::internal::scalar_min_op<Scalar,Scalar>());
 }
 /** \returns the maximum of all coefficients of \c *this.
@ -435,10 +435,12 @@ template<typename Derived>
 EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::maxCoeff() const
 {
-  return derived().redux(Eigen::internal::scalar_max_op<Scalar>());
+  return derived().redux(Eigen::internal::scalar_max_op<Scalar,Scalar>());
 }
-/** \returns the sum of all coefficients of *this
+/** \returns the sum of all coefficients of \c *this
  *
  * If \c *this is empty, then the value 0 is returned.
  *
  * \sa trace(), prod(), mean()
  */
@ -448,7 +450,7 @@ DenseBase<Derived>::sum() const
 {
  if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
    return Scalar(0);
-  return derived().redux(Eigen::internal::scalar_sum_op<Scalar>());
+  return derived().redux(Eigen::internal::scalar_sum_op<Scalar,Scalar>());
 }
 /** \returns the mean of all coefficients of *this
@ -459,7 +461,14 @@ template<typename Derived>
 EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::mean() const
 {
-  return Scalar(derived().redux(Eigen::internal::scalar_sum_op<Scalar>())) / Scalar(this->size());
+#ifdef __INTEL_COMPILER
  #pragma warning push
  #pragma warning ( disable : 2259 )
 #endif
  return Scalar(derived().redux(Eigen::internal::scalar_sum_op<Scalar,Scalar>())) / Scalar(this->size());
 #ifdef __INTEL_COMPILER
  #pragma warning pop
 #endif
 }
 /** \returns the product of all coefficients of *this
--- a/Eigen/src/Core/Ref.h
+++ b/Eigen/src/Core/Ref.h
@ -35,7 +35,13 @@ struct traits<Ref<_PlainObjectType, _Options, _StrideType> >
                      || (int(StrideType::InnerStrideAtCompileTime)==0 && int(Derived::InnerStrideAtCompileTime)==1),
      OuterStrideMatch = Derived::IsVectorAtCompileTime
                      || int(StrideType::OuterStrideAtCompileTime)==int(Dynamic) || int(StrideType::OuterStrideAtCompileTime)==int(Derived::OuterStrideAtCompileTime),
-      AlignmentMatch = (int(traits<PlainObjectType>::Alignment)==int(Unaligned)) || (int(evaluator<Derived>::Alignment) >= int(Alignment)), // FIXME the first condition is not very clear, it should be replaced by the required alignment
+      // NOTE, this indirection of evaluator<Derived>::Alignment is needed
      // to workaround a very strange bug in MSVC related to the instantiation
      // of has_*ary_operator in evaluator<CwiseNullaryOp>.
      // This line is surprisingly very sensitive. For instance, simply adding parentheses
      // as "DerivedAlignment = (int(evaluator<Derived>::Alignment))," will make MSVC fail...
      DerivedAlignment = int(evaluator<Derived>::Alignment),
      AlignmentMatch = (int(traits<PlainObjectType>::Alignment)==int(Unaligned)) || (DerivedAlignment >= int(Alignment)), // FIXME the first condition is not very clear, it should be replaced by the required alignment
      ScalarTypeMatch = internal::is_same<typename PlainObjectType::Scalar, typename Derived::Scalar>::value,
      MatchAtCompileTime = HasDirectAccess && StorageOrderMatch && InnerStrideMatch && OuterStrideMatch && AlignmentMatch && ScalarTypeMatch
    };
@ -262,7 +268,7 @@ template<typename TPlainObjectType, int Options, typename StrideType> class Ref<
    template<typename Expression>
    EIGEN_DEVICE_FUNC void construct(const Expression& expr, internal::false_type)
    {
-      internal::call_assignment_no_alias(m_object,expr,internal::assign_op<Scalar>());
+      internal::call_assignment_no_alias(m_object,expr,internal::assign_op<Scalar,Scalar>());
      Base::construct(m_object);
    }
--- a/Eigen/src/Core/SelfAdjointView.h
+++ b/Eigen/src/Core/SelfAdjointView.h
@ -55,6 +55,7 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
    typedef TriangularBase<SelfAdjointView> Base;
    typedef typename internal::traits<SelfAdjointView>::MatrixTypeNested MatrixTypeNested;
    typedef typename internal::traits<SelfAdjointView>::MatrixTypeNestedCleaned MatrixTypeNestedCleaned;
    typedef MatrixTypeNestedCleaned NestedExpression;
    /** \brief The type of coefficients in this matrix */
    typedef typename internal::traits<SelfAdjointView>::Scalar Scalar; 
@ -128,7 +129,7 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
    }
    friend EIGEN_DEVICE_FUNC
-    const SelfAdjointView<const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,MatrixType>,UpLo>
+    const SelfAdjointView<const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,MatrixType,product),UpLo>
    operator*(const Scalar& s, const SelfAdjointView& mat)
    {
      return (s*mat.nestedExpression()).template selfadjointView<UpLo>();
@ -162,6 +163,41 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
    EIGEN_DEVICE_FUNC
    SelfAdjointView& rankUpdate(const MatrixBase<DerivedU>& u, const Scalar& alpha = Scalar(1));
    /** \returns an expression of a triangular view extracted from the current selfadjoint view of a given triangular part
      *
      * The parameter \a TriMode can have the following values: \c #Upper, \c #StrictlyUpper, \c #UnitUpper,
      * \c #Lower, \c #StrictlyLower, \c #UnitLower.
      *
      * If \c TriMode references the same triangular part as \c *this, then this method simply returns a \c TriangularView of the nested expression,
      * otherwise, the nested expression is first transposed, thus returning a \c TriangularView<Transpose<MatrixType>> object.
      *
      * \sa MatrixBase::triangularView(), class TriangularView
      */
    template<unsigned int TriMode>
    EIGEN_DEVICE_FUNC
    typename internal::conditional<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)),
                                   TriangularView<MatrixType,TriMode>,
                                   TriangularView<typename MatrixType::AdjointReturnType,TriMode> >::type
    triangularView() const
    {
      // The compile-time test (TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)) selects one of two paths:
      //  - same half:     tmp1 and tmp2 are both MatrixType&, chained from m_matrix;
      //  - opposite half: tmp1 is a ConstTransposeReturnType built from m_matrix, and
      //                   tmp2 is an AdjointReturnType built from tmp1.
      // NOTE(review): the two-step construction presumably mirrors how AdjointReturnType
      // wraps a transpose expression -- confirm against MatrixBase.
      typename internal::conditional<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)), MatrixType&, typename MatrixType::ConstTransposeReturnType>::type tmp1(m_matrix);
      typename internal::conditional<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)), MatrixType&, typename MatrixType::AdjointReturnType>::type tmp2(tmp1);
      // The returned view type is chosen by the same condition as the declared return type above.
      return typename internal::conditional<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)),
                                   TriangularView<MatrixType,TriMode>,
                                   TriangularView<typename MatrixType::AdjointReturnType,TriMode> >::type(tmp2);
    }
    /** \returns a const expression of the main diagonal of the matrix \c *this
      *
      * This method simply returns the diagonal of the nested expression, thus by-passing the SelfAdjointView decorator.
      *
      * \sa MatrixBase::diagonal(), class Diagonal */
    EIGEN_DEVICE_FUNC
    typename MatrixType::ConstDiagonalReturnType diagonal() const
    {
      return typename MatrixType::ConstDiagonalReturnType(m_matrix);
    }
 /////////// Cholesky module ///////////
    const LLT<PlainObject, UpLo> llt() const;
--- a/Eigen/src/Core/SelfCwiseBinaryOp.h
+++ b/Eigen/src/Core/SelfCwiseBinaryOp.h
@ -12,11 +12,13 @@
 namespace Eigen { 
 // TODO generalize the scalar type of 'other'
 template<typename Derived>
 EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const Scalar& other)
 {
  typedef typename Derived::PlainObject PlainObject;
-  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op<Scalar>());
+  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op<Scalar,Scalar>());
  return derived();
 }
@ -24,7 +26,7 @@ template<typename Derived>
 EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator+=(const Scalar& other)
 {
  typedef typename Derived::PlainObject PlainObject;
-  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::add_assign_op<Scalar>());
+  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::add_assign_op<Scalar,Scalar>());
  return derived();
 }
@ -32,7 +34,7 @@ template<typename Derived>
 EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator-=(const Scalar& other)
 {
  typedef typename Derived::PlainObject PlainObject;
-  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op<Scalar>());
+  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op<Scalar,Scalar>());
  return derived();
 }
@ -40,7 +42,7 @@ template<typename Derived>
 EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const Scalar& other)
 {
  typedef typename Derived::PlainObject PlainObject;
-  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op<Scalar>());
+  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op<Scalar,Scalar>());
  return derived();
 }
--- a/Eigen/src/Core/Solve.h
+++ b/Eigen/src/Core/Solve.h
@ -134,10 +134,10 @@ protected:
 // Specialization for "dst = dec.solve(rhs)"
 // NOTE we need to specialize it for Dense2Dense to avoid ambiguous specialization error and a Sparse2Sparse specialization must exist somewhere
 template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
-struct Assignment<DstXprType, Solve<DecType,RhsType>, internal::assign_op<Scalar>, Dense2Dense, Scalar>
+struct Assignment<DstXprType, Solve<DecType,RhsType>, internal::assign_op<Scalar,Scalar>, Dense2Dense>
 {
  typedef Solve<DecType,RhsType> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
  {
    // FIXME shall we resize dst here?
    src.dec()._solve_impl(src.rhs(), dst);
@ -146,10 +146,10 @@ struct Assignment<DstXprType, Solve<DecType,RhsType>, internal::assign_op<Scalar
 // Specialization for "dst = dec.transpose().solve(rhs)"
 template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
-struct Assignment<DstXprType, Solve<Transpose<const DecType>,RhsType>, internal::assign_op<Scalar>, Dense2Dense, Scalar>
+struct Assignment<DstXprType, Solve<Transpose<const DecType>,RhsType>, internal::assign_op<Scalar,Scalar>, Dense2Dense>
 {
  typedef Solve<Transpose<const DecType>,RhsType> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
  {
    src.dec().nestedExpression().template _solve_impl_transposed<false>(src.rhs(), dst);
  }
@ -157,10 +157,11 @@ struct Assignment<DstXprType, Solve<Transpose<const DecType>,RhsType>, internal:
 // Specialization for "dst = dec.adjoint().solve(rhs)"
 template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
-struct Assignment<DstXprType, Solve<CwiseUnaryOp<internal::scalar_conjugate_op<typename DecType::Scalar>, const Transpose<const DecType> >,RhsType>, internal::assign_op<Scalar>, Dense2Dense, Scalar>
+struct Assignment<DstXprType, Solve<CwiseUnaryOp<internal::scalar_conjugate_op<typename DecType::Scalar>, const Transpose<const DecType> >,RhsType>,
                  internal::assign_op<Scalar,Scalar>, Dense2Dense>
 {
  typedef Solve<CwiseUnaryOp<internal::scalar_conjugate_op<typename DecType::Scalar>, const Transpose<const DecType> >,RhsType> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
  {
    src.dec().nestedExpression().nestedExpression().template _solve_impl_transposed<true>(src.rhs(), dst);
  }
--- a/Eigen/src/Core/SolveTriangular.h
+++ b/Eigen/src/Core/SolveTriangular.h
@ -169,7 +169,7 @@ void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<Ot
  eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) );
  eigen_assert((!(Mode & ZeroDiag)) && bool(Mode & (Upper|Lower)));
-  enum { copy = internal::traits<OtherDerived>::Flags & RowMajorBit  && OtherDerived::IsVectorAtCompileTime };
+  enum { copy = (internal::traits<OtherDerived>::Flags & RowMajorBit)  && OtherDerived::IsVectorAtCompileTime && OtherDerived::SizeAtCompileTime!=1};
  typedef typename internal::conditional<copy,
    typename internal::plain_matrix_type_column_major<OtherDerived>::type, OtherDerived&>::type OtherCopy;
  OtherCopy otherCopy(other);
--- a/Eigen/src/Core/TriangularMatrix.h
+++ b/Eigen/src/Core/TriangularMatrix.h
@ -367,14 +367,14 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
    template<typename Other>
    EIGEN_DEVICE_FUNC
    TriangularViewType&  operator+=(const DenseBase<Other>& other) {
-      internal::call_assignment_no_alias(derived(), other.derived(), internal::add_assign_op<Scalar>());
+      internal::call_assignment_no_alias(derived(), other.derived(), internal::add_assign_op<Scalar,typename Other::Scalar>());
      return derived();
    }
    /** \sa MatrixBase::operator-=() */
    template<typename Other>
    EIGEN_DEVICE_FUNC
    TriangularViewType&  operator-=(const DenseBase<Other>& other) {
-      internal::call_assignment_no_alias(derived(), other.derived(), internal::sub_assign_op<Scalar>());
+      internal::call_assignment_no_alias(derived(), other.derived(), internal::sub_assign_op<Scalar,typename Other::Scalar>());
      return derived();
    }
@ -552,7 +552,7 @@ template<typename OtherDerived>
 inline TriangularView<MatrixType, Mode>&
 TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const MatrixBase<OtherDerived>& other)
 {
-  internal::call_assignment_no_alias(derived(), other.derived(), internal::assign_op<Scalar>());
+  internal::call_assignment_no_alias(derived(), other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
  return derived();
 }
@ -794,7 +794,7 @@ void call_triangular_assignment_loop(const DstXprType& dst, const SrcXprType& sr
  enum {
      unroll = DstXprType::SizeAtCompileTime != Dynamic
            && SrcEvaluatorType::CoeffReadCost < HugeCost
-            && DstXprType::SizeAtCompileTime * SrcEvaluatorType::CoeffReadCost / 2 <= EIGEN_UNROLLING_LIMIT
+            && DstXprType::SizeAtCompileTime * (DstEvaluatorType::CoeffReadCost+SrcEvaluatorType::CoeffReadCost) / 2 <= EIGEN_UNROLLING_LIMIT
    };
  triangular_assignment_loop<Kernel, Mode, unroll ? int(DstXprType::SizeAtCompileTime) : Dynamic, SetOpposite>::run(kernel);
@ -804,7 +804,7 @@ template<int Mode, bool SetOpposite, typename DstXprType, typename SrcXprType>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 void call_triangular_assignment_loop(const DstXprType& dst, const SrcXprType& src)
 {
-  call_triangular_assignment_loop<Mode,SetOpposite>(dst, src, internal::assign_op<typename DstXprType::Scalar>());
+  call_triangular_assignment_loop<Mode,SetOpposite>(dst, src, internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>());
 }
 template<> struct AssignmentKind<TriangularShape,TriangularShape> { typedef Triangular2Triangular Kind; };
@ -812,8 +812,8 @@ template<> struct AssignmentKind<DenseShape,TriangularShape>      { typedef Tria
 template<> struct AssignmentKind<TriangularShape,DenseShape>      { typedef Dense2Triangular      Kind; };
-template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
+template< typename DstXprType, typename SrcXprType, typename Functor>
-struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Triangular, Scalar>
+struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Triangular>
 {
  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
  {
@ -823,8 +823,8 @@ struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Triangular, Scalar
  }
 };
-template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
+template< typename DstXprType, typename SrcXprType, typename Functor>
-struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Dense, Scalar>
+struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Dense>
 {
  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
  {
@ -832,8 +832,8 @@ struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Dense, Scalar>
  }
 };
-template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
+template< typename DstXprType, typename SrcXprType, typename Functor>
-struct Assignment<DstXprType, SrcXprType, Functor, Dense2Triangular, Scalar>
+struct Assignment<DstXprType, SrcXprType, Functor, Dense2Triangular>
 {
  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
  {
@ -933,10 +933,10 @@ namespace internal {
 // Triangular = Product
 template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
-struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::assign_op<Scalar>, Dense2Triangular, Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::assign_op<Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, Dense2Triangular>
 {
  typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,typename SrcXprType::Scalar> &)
  {
    dst.setZero();
    dst._assignProduct(src, 1);
@ -945,10 +945,10 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::assign_
 // Triangular += Product
 template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
-struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::add_assign_op<Scalar>, Dense2Triangular, Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::add_assign_op<Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, Dense2Triangular>
 {
  typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar> &)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,typename SrcXprType::Scalar> &)
  {
    dst._assignProduct(src, 1);
  }
@ -956,10 +956,10 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::add_ass
 // Triangular -= Product
 template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
-struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::sub_assign_op<Scalar>, Dense2Triangular, Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::sub_assign_op<Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, Dense2Triangular>
 {
  typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar> &)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,typename SrcXprType::Scalar> &)
  {
    dst._assignProduct(src, -1);
  }
--- a/Eigen/src/Core/VectorwiseOp.h
+++ b/Eigen/src/Core/VectorwiseOp.h
@ -284,6 +284,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
    typedef typename ReturnType<internal::member_any>::Type AnyReturnType;
    typedef PartialReduxExpr<ExpressionType, internal::member_count<Index>, Direction> CountReturnType;
    typedef typename ReturnType<internal::member_prod>::Type ProdReturnType;
    typedef Reverse<const ExpressionType, Direction> ConstReverseReturnType;
    typedef Reverse<ExpressionType, Direction> ReverseReturnType;
    template<int p> struct LpNormReturnType {
@ -456,7 +457,15 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
      *
      * \sa DenseBase::reverse() */
    EIGEN_DEVICE_FUNC
-    const ReverseReturnType reverse() const
+    const ConstReverseReturnType reverse() const
    { return ConstReverseReturnType( _expression() ); }
    /** \returns a writable matrix expression
      * where each column (or row) is reversed.
      *
      * \sa reverse() const */
    EIGEN_DEVICE_FUNC
    ReverseReturnType reverse()
    { return ReverseReturnType( _expression() ); }
    typedef Replicate<ExpressionType,(isVertical?Dynamic:1),(isHorizontal?Dynamic:1)> ReplicateReturnType;
@ -540,7 +549,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
    /** Returns the expression of the sum of the vector \a other to each subvector of \c *this */
    template<typename OtherDerived> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
-    CwiseBinaryOp<internal::scalar_sum_op<Scalar>,
+    CwiseBinaryOp<internal::scalar_sum_op<Scalar,typename OtherDerived::Scalar>,
                  const ExpressionTypeNestedCleaned,
                  const typename ExtendedType<OtherDerived>::Type>
    operator+(const DenseBase<OtherDerived>& other) const
@ -553,7 +562,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
    /** Returns the expression of the difference between each subvector of \c *this and the vector \a other */
    template<typename OtherDerived>
    EIGEN_DEVICE_FUNC
-    CwiseBinaryOp<internal::scalar_difference_op<Scalar>,
+    CwiseBinaryOp<internal::scalar_difference_op<Scalar,typename OtherDerived::Scalar>,
                  const ExpressionTypeNestedCleaned,
                  const typename ExtendedType<OtherDerived>::Type>
    operator-(const DenseBase<OtherDerived>& other) const
--- a/Eigen/src/Core/arch/AVX/CMakeLists.txt
+++ b/Eigen/src/Core/arch/AVX/CMakeLists.txt
@ -1,6 +0,0 @@
 FILE(GLOB Eigen_Core_arch_AVX_SRCS "*.h")
 INSTALL(FILES
  ${Eigen_Core_arch_AVX_SRCS}
  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/AVX COMPONENT Devel
 )
--- a/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX/MathFunctions.h
@ -266,52 +266,10 @@ pexp<Packet8f>(const Packet8f& _x) {
 }
 // Hyperbolic Tangent function.
 // Doesn't do anything fancy, just a 13/6-degree rational interpolant which
 // is accurate up to a couple of ulp in the range [-9, 9], outside of which the
 // fl(tanh(x)) = +/-1.
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
-ptanh<Packet8f>(const Packet8f& _x) {
+ptanh<Packet8f>(const Packet8f& x) {
-  // Clamp the inputs to the range [-9, 9] since anything outside
+  return internal::generic_fast_tanh_float(x);
  // this range is +/-1.0f in single-precision.
  _EIGEN_DECLARE_CONST_Packet8f(plus_9, 9.0f);
  _EIGEN_DECLARE_CONST_Packet8f(minus_9, -9.0f);
  const Packet8f x = pmax(p8f_minus_9, pmin(p8f_plus_9, _x));
  // The monomial coefficients of the numerator polynomial (odd).
  _EIGEN_DECLARE_CONST_Packet8f(alpha_1, 4.89352455891786e-03f);
  _EIGEN_DECLARE_CONST_Packet8f(alpha_3, 6.37261928875436e-04f);
  _EIGEN_DECLARE_CONST_Packet8f(alpha_5, 1.48572235717979e-05f);
  _EIGEN_DECLARE_CONST_Packet8f(alpha_7, 5.12229709037114e-08f);
  _EIGEN_DECLARE_CONST_Packet8f(alpha_9, -8.60467152213735e-11f);
  _EIGEN_DECLARE_CONST_Packet8f(alpha_11, 2.00018790482477e-13f);
  _EIGEN_DECLARE_CONST_Packet8f(alpha_13, -2.76076847742355e-16f);
  // The monomial coefficients of the denominator polynomial (even).
  _EIGEN_DECLARE_CONST_Packet8f(beta_0, 4.89352518554385e-03f);
  _EIGEN_DECLARE_CONST_Packet8f(beta_2, 2.26843463243900e-03f);
  _EIGEN_DECLARE_CONST_Packet8f(beta_4, 1.18534705686654e-04f);
  _EIGEN_DECLARE_CONST_Packet8f(beta_6, 1.19825839466702e-06f);
  // Since the polynomials are odd/even, we need x^2.
  const Packet8f x2 = pmul(x, x);
  // Evaluate the numerator polynomial p.
  Packet8f p = pmadd(x2, p8f_alpha_13, p8f_alpha_11);
  p = pmadd(x2, p, p8f_alpha_9);
  p = pmadd(x2, p, p8f_alpha_7);
  p = pmadd(x2, p, p8f_alpha_5);
  p = pmadd(x2, p, p8f_alpha_3);
  p = pmadd(x2, p, p8f_alpha_1);
  p = pmul(x, p);
  // Evaluate the denominator polynomial p.
  Packet8f q = pmadd(x2, p8f_beta_6, p8f_beta_4);
  q = pmadd(x2, q, p8f_beta_2);
  q = pmadd(x2, q, p8f_beta_0);
  // Divide the numerator by the denominator.
  return pdiv(p, q);
 }
 template <>
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@ -97,6 +97,9 @@ template<> struct packet_traits<double> : default_packet_traits
 };
 #endif
 template<> struct scalar_div_cost<float,true> { enum { value = 14 }; };
 template<> struct scalar_div_cost<double,true> { enum { value = 16 }; };
 /* Proper support for integers is only provided by AVX2. In the meantime, we'll
   use SSE instructions and packets to deal with integers.
 template<> struct packet_traits<int>    : default_packet_traits
@ -156,7 +159,7 @@ template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, co
 #ifdef __FMA__
 template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
-#if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG
+#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) )
  // clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
  // and gcc stupidly generates a vfmadd132ps instruction,
  // so let's enforce it to generate a vfmadd231ps instruction since the most common use case is to accumulate
@ -169,7 +172,7 @@ template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f&
 #endif
 }
 template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
-#if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG
+#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) )
  // see above
  Packet4d res = c;
  __asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
--- a/Eigen/src/Core/arch/AltiVec/CMakeLists.txt
+++ b/Eigen/src/Core/arch/AltiVec/CMakeLists.txt
@ -1,6 +0,0 @@
 FILE(GLOB Eigen_Core_arch_AltiVec_SRCS "*.h")
 INSTALL(FILES
  ${Eigen_Core_arch_AltiVec_SRCS}
  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/AltiVec COMPONENT Devel
 )
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2010-2016 Konstantinos Margaritis <markos@freevec.org>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@ -15,18 +16,20 @@ namespace Eigen {
 namespace internal {
 static Packet4ui  p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_ZERO_);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
-#ifdef _BIG_ENDIAN
+#ifdef __VSX__
 #if defined(_BIG_ENDIAN)
 static Packet2ul  p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
 static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO,  (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 };
 #else
 static Packet2ul  p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO,  (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 };
 static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
 #endif
 #endif
 //---------- float ----------
 struct Packet2cf
 {
-  EIGEN_STRONG_INLINE Packet2cf() {}
+  EIGEN_STRONG_INLINE explicit Packet2cf() : v(p4f_ZERO) {}
  EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
  Packet4f  v;
 };
@ -39,6 +42,7 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 2,
    HasHalfPacket = 0,
    HasAdd    = 1,
    HasSub    = 1,
@ -49,6 +53,9 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
    HasAbs2   = 0,
    HasMin    = 0,
    HasMax    = 0,
 #ifdef __VSX__
    HasBlend  = 1,
 #endif
    HasSetLinear = 0
  };
 };
@ -58,7 +65,6 @@ template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type;
 template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
 {
  Packet2cf res;
  /* On AltiVec we cannot load 64-bit registers, so we have to take care of alignment */
  if((ptrdiff_t(&from) % 16) == 0)
    res.v = pload<Packet4f>((const float *)&from);
  else
@ -67,26 +73,32 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<flo
  return res;
 }
 template<> EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>*        from) { return Packet2cf(pload<Packet4f>((const float *) from)); }
 template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>*       from) { return Packet2cf(ploadu<Packet4f>((const float*) from)); }
 template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>*     from) { return pset1<Packet2cf>(*from); }
 template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { pstore((float*)to, from.v); }
 template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { pstoreu((float*)to, from.v); }
 template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
 {
  std::complex<float> EIGEN_ALIGN16 af[2];
  af[0] = from[0*stride];
  af[1] = from[1*stride];
-  return Packet2cf(vec_ld(0, (const float*)af));
+  return pload<Packet2cf>(af);
 }
 template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
 {
  std::complex<float> EIGEN_ALIGN16 af[2];
-  vec_st(from.v, 0, (float*)af);
+  pstore<std::complex<float> >((std::complex<float> *) af, from);
  to[0*stride] = af[0];
  to[1*stride] = af[1];
 }
-
+template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v + b.v); }
-template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_add(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v - b.v); }
 template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_sub(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf((Packet4f)vec_xor((Packet4ui)a.v, p4ui_CONJ_XOR)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR))); }
 template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
@ -100,30 +112,19 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con
  v1 = vec_madd(v1, b.v, p4f_ZERO);
  // multiply a_im * b and get the conjugate result
  v2 = vec_madd(v2, b.v, p4f_ZERO);
-  v2 = (Packet4f) vec_xor((Packet4ui)v2, p4ui_CONJ_XOR);
+  v2 = reinterpret_cast<Packet4f>(pxor(v2, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR)));
  // permute back to a proper order
  v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV);
-  return Packet2cf(vec_add(v1, v2));
+  return Packet2cf(padd<Packet4f>(v1, v2));
 }
-template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_and(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand<Packet4f>(a.v, b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_or(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por<Packet4f>(a.v, b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_xor(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor<Packet4f>(a.v, b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_and(a.v, vec_nor(b.v,b.v))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot<Packet4f>(a.v, b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr)    { EIGEN_PPC_PREFETCH(addr); }
 template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); }
 template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>*     from)
 {
  return pset1<Packet2cf>(*from);
 }
 template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
 template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
 template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { vec_dstt((float *)addr, DST_CTRL(2,2,32), DST_CHAN); }
 template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
 {
@ -143,23 +144,23 @@ template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
 template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
 {
  Packet4f b;
-  b = (Packet4f) vec_sld(a.v, a.v, 8);
+  b = vec_sld(a.v, a.v, 8);
-  b = padd(a.v, b);
+  b = padd<Packet4f>(a.v, b);
-  return pfirst(Packet2cf(b));
+  return pfirst<Packet2cf>(Packet2cf(b));
 }
 template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
 {
  Packet4f b1, b2;
 #ifdef _BIG_ENDIAN  
-  b1 = (Packet4f) vec_sld(vecs[0].v, vecs[1].v, 8);
+  b1 = vec_sld(vecs[0].v, vecs[1].v, 8);
-  b2 = (Packet4f) vec_sld(vecs[1].v, vecs[0].v, 8);
+  b2 = vec_sld(vecs[1].v, vecs[0].v, 8);
 #else
-  b1 = (Packet4f) vec_sld(vecs[1].v, vecs[0].v, 8);
+  b1 = vec_sld(vecs[1].v, vecs[0].v, 8);
-  b2 = (Packet4f) vec_sld(vecs[0].v, vecs[1].v, 8);
+  b2 = vec_sld(vecs[0].v, vecs[1].v, 8);
 #endif
-  b2 = (Packet4f) vec_sld(b2, b2, 8);
+  b2 = vec_sld(b2, b2, 8);
-  b2 = padd(b1, b2);
+  b2 = padd<Packet4f>(b1, b2);
  return Packet2cf(b2);
 }
@ -168,10 +169,10 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const P
 {
  Packet4f b;
  Packet2cf prod;
-  b = (Packet4f) vec_sld(a.v, a.v, 8);
+  b = vec_sld(a.v, a.v, 8);
-  prod = pmul(a, Packet2cf(b));
+  prod = pmul<Packet2cf>(a, Packet2cf(b));
-  return pfirst(prod);
+  return pfirst<Packet2cf>(prod);
 }
 template<int Offset>
@ -223,12 +224,30 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
  }
 };
 template<> struct conj_helper<Packet4f, Packet2cf, false,false>
 {
  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const
  { return padd(c, pmul(x,y)); }
  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
  { return Packet2cf(internal::pmul<Packet4f>(x, y.v)); }
 };
 template<> struct conj_helper<Packet2cf, Packet4f, false,false>
 {
  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
  { return padd(c, pmul(x,y)); }
  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
  { return Packet2cf(internal::pmul<Packet4f>(x.v, y)); }
 };
 template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
  // TODO optimize it for AltiVec
  Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a, b);
-  Packet4f s = vec_madd(b.v, b.v, p4f_ZERO);
+  Packet4f s = pmul<Packet4f>(b.v, b.v);
-  return Packet2cf(pdiv(res.v, vec_add(s,vec_perm(s, s, p16uc_COMPLEX32_REV))));
+  return Packet2cf(pdiv(res.v, padd<Packet4f>(s, vec_perm(s, s, p16uc_COMPLEX32_REV))));
 }
 template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x)
@ -243,6 +262,14 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
  kernel.packet[0].v = tmp;
 }
 #ifdef __VSX__
 template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
  Packet2cf result;
  result.v = reinterpret_cast<Packet4f>(pblend<Packet2d>(ifPacket, reinterpret_cast<Packet2d>(thenPacket.v), reinterpret_cast<Packet2d>(elsePacket.v)));
  return result;
 }
 #endif
 //---------- double ----------
 #ifdef __VSX__
 struct Packet1cd
@ -277,10 +304,10 @@ template<> struct packet_traits<std::complex<double> >  : default_packet_traits
 template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };
-template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { return Packet1cd(pload<Packet2d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { return Packet1cd(ploadu<Packet2d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { pstore((double*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { pstoreu((double*)to, from.v); }
 template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
 { /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
@ -300,10 +327,10 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1c
  to[1*stride] = af[1];
 }
-template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_add(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); }
-template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_sub(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); }
 template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR2))); }
 template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
@ -317,23 +344,20 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, con
  v1 = vec_madd(a_re, b.v, p2d_ZERO);
  // multiply a_im * b and get the conjugate result
  v2 = vec_madd(a_im, b.v, p2d_ZERO);
-  v2 = (Packet2d) vec_sld((Packet4ui)v2, (Packet4ui)v2, 8);
+  v2 = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(v2), reinterpret_cast<Packet4ui>(v2), 8));
-  v2 = (Packet2d) vec_xor((Packet2d)v2, (Packet2d) p2ul_CONJ_XOR1);
+  v2 = pxor(v2, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR1));
-  return Packet1cd(vec_add(v1, v2));
+  return Packet1cd(padd<Packet2d>(v1, v2));
 }
-template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pand(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(por(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pxor(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); }
+template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pandnot(a.v, b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>*     from)
+template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>*     from)  { return pset1<Packet1cd>(*from); }
 {
  return pset1<Packet1cd>(*from);
 }
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { vec_dstt((long *)addr, DST_CTRL(2,2,32), DST_CHAN); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr)    { EIGEN_PPC_PREFETCH(addr); }
 template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
 {
@ -345,20 +369,10 @@ template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Pac
 template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
-template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a)
+template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
-{
+template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)        { return vecs[0]; }
  return pfirst(a);
 }
-template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)
+template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
 {
  return vecs[0];
 }
 template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
 {
  return pfirst(a);
 }
 template<int Offset>
 struct palign_impl<Offset,Packet1cd>
@ -402,13 +416,30 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
    return pconj(internal::pmul(a, b));
  }
 };
 template<> struct conj_helper<Packet2d, Packet1cd, false,false>
 {
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const
  { return padd(c, pmul(x,y)); }
  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
  { return Packet1cd(internal::pmul<Packet2d>(x, y.v)); }
 };
 template<> struct conj_helper<Packet1cd, Packet2d, false,false>
 {
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
  { return padd(c, pmul(x,y)); }
  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
  { return Packet1cd(internal::pmul<Packet2d>(x.v, y)); }
 };
 template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
  // TODO optimize it for AltiVec
  Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
-  Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_);
+  Packet2d s = pmul<Packet2d>(b.v, b.v);
-  return Packet1cd(pdiv(res.v, vec_add(s,vec_perm(s, s, p16uc_REVERSE64))));
+  return Packet1cd(pdiv(res.v, padd<Packet2d>(s, vec_perm(s, s, p16uc_REVERSE64))));
 }
 EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
--- a/Eigen/src/Core/arch/AltiVec/MathFunctions.h
+++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
@ -3,6 +3,7 @@
 //
 // Copyright (C) 2007 Julien Pommier
 // Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2016 Konstantinos Margaritis <markos@freevec.org>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@ -19,38 +20,79 @@ namespace Eigen {
 namespace internal {
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+static _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
-Packet4f plog<Packet4f>(const Packet4f& _x)
+static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-{
+static _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
-  Packet4f x = _x;
+static _EIGEN_DECLARE_CONST_Packet4i(23, 23);
  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
  _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
  _EIGEN_DECLARE_CONST_Packet4i(23, 23);
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
 /* the smallest non denormalized float number */
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos,  0x00800000);
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos,  0x00800000);
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf,     0xff800000); // -1.f/0.f
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf,     0xff800000); // -1.f/0.f
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan,     0xffffffff);
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan,     0xffffffff);
 /* natural logarithm computed for 4 simultaneous float
  return NaN for x <= 0
 */
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
 static _EIGEN_DECLARE_CONST_Packet4f(exp_hi,  88.3762626647950f);
 static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
 static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
 static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
 static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
 static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
 static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
 static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
 static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
 static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
 static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
 #ifdef __VSX__
 static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
 static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
 static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
 static _EIGEN_DECLARE_CONST_Packet2d(exp_hi,  709.437);
 static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
 static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
 static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
 static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
 static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
 static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
 static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
 static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
 static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
 static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
 static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
 static Packet2l p2l_1023 = { 1023, 1023 };
 static Packet2ul p2ul_52 = { 52, 52 };
 #endif
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f plog<Packet4f>(const Packet4f& _x)
 {
  Packet4f x = _x;
  Packet4i emm0;
@ -112,36 +154,17 @@ template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f pexp<Packet4f>(const Packet4f& _x)
 {
  Packet4f x = _x;
  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
  _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
  _EIGEN_DECLARE_CONST_Packet4i(23, 23);
  _EIGEN_DECLARE_CONST_Packet4f(exp_hi,  88.3762626647950f);
  _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
  Packet4f tmp, fx;
  Packet4i emm0;
  // clamp x
-  x = vec_max(vec_min(x, p4f_exp_hi), p4f_exp_lo);
+  x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo);
-  /* express exp(x) as exp(g + n*log(2)) */
+  // express exp(x) as exp(g + n*log(2))
  fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half);
-  fx = vec_floor(fx);
+  fx = pfloor(fx);
  tmp = pmul(fx, p4f_cephes_exp_C1);
  Packet4f z = pmul(fx, p4f_cephes_exp_C2);
@ -171,14 +194,44 @@ Packet4f pexp<Packet4f>(const Packet4f& _x)
                 isnumber_mask);
 }
 #ifndef EIGEN_COMP_CLANG
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f prsqrt<Packet4f>(const Packet4f& x)
 {
  return  vec_rsqrt(x);
 }
 #endif
 #ifdef __VSX__
 #ifndef EIGEN_COMP_CLANG
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet2d prsqrt<Packet2d>(const Packet2d& x)
 {
  return  vec_rsqrt(x);
 }
 #endif
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f psqrt<Packet4f>(const Packet4f& x)
 {
  return  vec_sqrt(x);
 }
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet2d psqrt<Packet2d>(const Packet2d& x)
 {
  return  vec_sqrt(x);
 }
 // VSX support varies between different compilers and even different
 // versions of the same compiler.  For gcc version >= 4.9.3, we can use
 // vec_cts to efficiently convert Packet2d to Packet2l.  Otherwise, use
 // a slow version that works with older compilers. 
 // Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles
 // are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963
 static inline Packet2l ConvertToPacket2l(const Packet2d& x) {
-#if EIGEN_GNUC_AT_LEAST(5, 0) || \
+#if EIGEN_GNUC_AT_LEAST(5, 4) || \
-    (EIGEN_GNUC_AT(4, 9) && __GNUC_PATCHLEVEL__ >= 3)
+    (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1)
  return vec_cts(x, 0);    // TODO: check clang version.
 #else
  double tmp[2];
@ -194,36 +247,16 @@ Packet2d pexp<Packet2d>(const Packet2d& _x)
 {
  Packet2d x = _x;
  _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
  _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
  _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
  _EIGEN_DECLARE_CONST_Packet2d(exp_hi,  709.437);
  _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
  _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
  Packet2d tmp, fx;
  Packet2l emm0;
  // clamp x
  x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo);
  /* express exp(x) as exp(g + n*log(2)) */
  fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half);
-  fx = vec_floor(fx);
+  /* express exp(x) as exp(g + n*log(2)) */
  fx = pmadd(x, p2d_cephes_LOG2EF, p2d_half);
  fx = pfloor(fx);
  tmp = pmul(fx, p2d_cephes_exp_C1);
  Packet2d z = pmul(fx, p2d_cephes_exp_C2);
@ -249,9 +282,6 @@ Packet2d pexp<Packet2d>(const Packet2d& _x)
  emm0 = ConvertToPacket2l(fx);
 #ifdef __POWER8_VECTOR__ 
  static const Packet2l p2l_1023 = { 1023, 1023 };
  static const Packet2ul p2ul_52 = { 52, 52 };
  emm0 = vec_add(emm0, p2l_1023);
  emm0 = vec_sl(emm0, p2ul_52);
 #else
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2014 Konstantinos Margaritis <markos@freevec.org>
+// Copyright (C) 2008-2016 Konstantinos Margaritis <markos@freevec.org>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@ -42,7 +42,7 @@ typedef __vector unsigned char  Packet16uc;
 // and it doesn't really work to declare them global, so we define macros instead
 #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
-  Packet4f p4f_##NAME = (Packet4f) vec_splat_s32(X)
+  Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X))
 #define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
  Packet4i p4i_##NAME = vec_splat_s32(X)
@ -69,13 +69,13 @@ typedef __vector unsigned char  Packet16uc;
 // These constants are endian-agnostic
 static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
 static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
 #ifndef __VSX__
 static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1}
 static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
 #endif
 static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16}
 static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
 static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
 #ifndef __VSX__
 static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
 #endif
 static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
 static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
@ -96,7 +96,9 @@ static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
 // Define global static constants:
 #ifdef _BIG_ENDIAN
 static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
 #ifdef __VSX__
 static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
 #endif
 static Packet16uc p16uc_PSET32_WODD   = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
 static Packet16uc p16uc_PSET32_WEVEN  = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
 static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8);      //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
@ -110,8 +112,8 @@ static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc) vec_abs(p4i
 static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);     //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
 static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);     //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
-static Packet16uc p16uc_TRANSPOSE64_HI = vec_add(p16uc_PSET64_HI, p16uc_HALF64_0_16);                                         //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
+static Packet16uc p16uc_TRANSPOSE64_HI = p16uc_PSET64_HI + p16uc_HALF64_0_16;                                         //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
-static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0_16);                                         //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
+static Packet16uc p16uc_TRANSPOSE64_LO = p16uc_PSET64_LO + p16uc_HALF64_0_16;                                         //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
 static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8);                                         //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
@ -121,6 +123,12 @@ static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8
 static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_PSET64_HI, p16uc_PSET64_LO, 8);                                            //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
 #endif // _BIG_ENDIAN
 #if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
  #define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR);
 #else
  #define EIGEN_PPC_PREFETCH(ADDR) asm( "   dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
 #endif
 template<> struct packet_traits<float>  : default_packet_traits
 {
  typedef Packet4f type;
@ -129,15 +137,35 @@ template<> struct packet_traits<float>  : default_packet_traits
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=4,
-    HasHalfPacket=0,
+    HasHalfPacket = 1,
-    // FIXME check the Has*
+    HasAdd  = 1,
    HasSub  = 1,
    HasMul  = 1,
    HasDiv  = 1,
    HasMin  = 1,
    HasMax  = 1,
    HasAbs  = 1,
    HasSin  = 0,
    HasCos  = 0,
-    HasLog  = 1,
+    HasLog  = 0,
    HasExp  = 1,
-    HasSqrt = 0
+#ifdef __VSX__
    HasSqrt = 1,
 #if !EIGEN_COMP_CLANG
    HasRsqrt = 1,
 #else
    HasRsqrt = 0,
 #endif
 #else
    HasSqrt = 0,
    HasRsqrt = 0,
 #endif
    HasRound = 1,
    HasFloor = 1,
    HasCeil = 1,
    HasNegate = 1,
    HasBlend = 1
  };
 };
 template<> struct packet_traits<int>    : default_packet_traits
@ -145,10 +173,16 @@ template<> struct packet_traits<int>    : default_packet_traits
  typedef Packet4i type;
  typedef Packet4i half;
  enum {
    // FIXME check the Has*
    Vectorizable = 1,
    AlignedOnScalar = 1,
-    size=4
+    size = 4,
    HasHalfPacket = 0,
    HasAdd  = 1,
    HasSub  = 1,
    HasMul  = 1,
    HasDiv  = 0,
    HasBlend = 1
  };
 };
@ -200,41 +234,56 @@ inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
  return s;
 }
 /*
 inline std::ostream & operator <<(std::ostream & s, const Packetbi & v)
 {
  union {
    Packet4bi v;
    unsigned int n[4];
  } vt;
  vt.v = v;
  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
  return s;
 }*/
 // Need to define them first or we get specialization after instantiation errors
-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
+{
  EIGEN_DEBUG_ALIGNED_LOAD
 #ifdef __VSX__
  return vec_vsx_ld(0, from);
 #else
  return vec_ld(0, from);
 #endif
 }
-template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
+template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from)
-template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
+{
  EIGEN_DEBUG_ALIGNED_LOAD
 #ifdef __VSX__
  return vec_vsx_ld(0, from);
 #else
  return vec_ld(0, from);
 #endif
 }
 template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from)
 {
  EIGEN_DEBUG_ALIGNED_STORE
 #ifdef __VSX__
  vec_vsx_st(from, 0, to);
 #else
  vec_st(from, 0, to);
 #endif
 }
 template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from)
 {
  EIGEN_DEBUG_ALIGNED_STORE
 #ifdef __VSX__
  vec_vsx_st(from, 0, to);
 #else
  vec_st(from, 0, to);
 #endif
 }
 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) {
-  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
+  Packet4f v = {from, from, from, from};
-  float EIGEN_ALIGN16 af[4];
+  return v;
  af[0] = from;
  Packet4f vc = pload<Packet4f>(af);
  vc = vec_splat(vc, 0);
  return vc;
 }
 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)   {
-  int EIGEN_ALIGN16 ai[4];
+  Packet4i v = {from, from, from, from};
-  ai[0] = from;
+  return v;
  Packet4i vc = pload<Packet4i>(ai);
  vc = vec_splat(vc, 0);
  return vc;
 }
 template<> EIGEN_STRONG_INLINE void
 pbroadcast4<Packet4f>(const float *a,
@ -294,58 +343,24 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const
  to[3*stride] = ai[3];
 }
-template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return vec_add(pset1<Packet4f>(a), p4f_COUNTDOWN); }
+template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN; }
-template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a)   { return vec_add(pset1<Packet4i>(a), p4i_COUNTDOWN); }
+template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a)   { return pset1<Packet4i>(a) + p4i_COUNTDOWN; }
-template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_add(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return a + b; }
-template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_add(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return a + b; }
-template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_sub(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return a - b; }
-template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_sub(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return a - b; }
-template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return psub<Packet4f>(p4f_ZERO, a); }
+template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; }
-template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return psub<Packet4i>(p4i_ZERO, a); }
+template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; }
 template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
 template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
 template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_ZERO); }
-/* Commented out: it's actually slower than processing it scalar
+template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return a * b; }
 *
 template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
 {
  // Detailed in: http://freevec.org/content/32bit_signed_integer_multiplication_altivec
  //Set up constants, variables
  Packet4i a1, b1, bswap, low_prod, high_prod, prod, prod_, v1sel;
  // Get the absolute values
  a1  = vec_abs(a);
  b1  = vec_abs(b);
  // Get the signs using xor
  Packet4bi sgn = (Packet4bi) vec_cmplt(vec_xor(a, b), p4i_ZERO);
  // Do the multiplication for the asbolute values.
  bswap = (Packet4i) vec_rl((Packet4ui) b1, (Packet4ui) p4i_MINUS16 );
  low_prod = vec_mulo((Packet8i) a1, (Packet8i)b1);
  high_prod = vec_msum((Packet8i) a1, (Packet8i) bswap, p4i_ZERO);
  high_prod = (Packet4i) vec_sl((Packet4ui) high_prod, (Packet4ui) p4i_MINUS16);
  prod = vec_add( low_prod, high_prod );
  // NOR the product and select only the negative elements according to the sign mask
  prod_ = vec_nor(prod, prod);
  prod_ = vec_sel(p4i_ZERO, prod_, sgn);
  // Add 1 to the result to get the negative numbers
  v1sel = vec_sel(p4i_ZERO, p4i_ONE, sgn);
  prod_ = vec_add(prod_, v1sel);
  // Merge the results back to the final vector.
  prod = vec_sel(prod, prod_, sgn);
  return prod;
 }
 */
 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
 {
 #ifndef __VSX__  // VSX actually provides a div instruction
@ -371,7 +386,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co
 // for some weird reasons, it has to be overloaded for packets of integers
 template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); }
-template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
+template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; }
 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
@ -391,6 +406,10 @@ template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const
 template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
 template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); }
 template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return vec_round(a); }
 template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const  Packet4f& a) { return vec_ceil(a); }
 template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return vec_floor(a); }
 #ifdef _BIG_ENDIAN
 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
 {
@ -418,12 +437,12 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
 // We also need to redefine little endian loading of Packet4i/Packet4f using VSX
 template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
 {
-  EIGEN_DEBUG_ALIGNED_LOAD
+  EIGEN_DEBUG_UNALIGNED_LOAD
  return (Packet4i) vec_vsx_ld((long)from & 15, (const int*) _EIGEN_ALIGNED_PTR(from));
 }
 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
 {
-  EIGEN_DEBUG_ALIGNED_LOAD
+  EIGEN_DEBUG_UNALIGNED_LOAD
  return (Packet4f) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from));
 }
 #endif
@ -494,16 +513,19 @@ template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet4f&
 }
 #endif
-#ifndef __VSX__
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr)    { EIGEN_PPC_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr)    { EIGEN_PPC_PREFETCH(addr); }
 template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); }
 #endif
-template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
+template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }
-template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int   EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
+template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int   EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }
-template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return (Packet4f)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE32); }
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
-template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return (Packet4i)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE32); }
+{
  return reinterpret_cast<Packet4f>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
 }
 template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
 {
  return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32)); }
 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); }
 template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); }
@ -511,10 +533,10 @@ template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs
 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
 {
  Packet4f b, sum;
-  b   = (Packet4f) vec_sld(a, a, 8);
+  b   = vec_sld(a, a, 8);
-  sum = vec_add(a, b);
+  sum = a + b;
-  b   = (Packet4f) vec_sld(sum, sum, 4);
+  b   = vec_sld(sum, sum, 4);
-  sum = vec_add(sum, b);
+  sum += b;
  return pfirst(sum);
 }
@ -537,11 +559,11 @@ template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
  // Now do the summation:
  // Lines 0+1
-  sum[0] = vec_add(sum[0], sum[1]);
+  sum[0] = sum[0] + sum[1];
  // Lines 2+3
-  sum[1] = vec_add(sum[2], sum[3]);
+  sum[1] = sum[2] + sum[3];
  // Add the results
-  sum[0] = vec_add(sum[0], sum[1]);
+  sum[0] = sum[0] + sum[1];
  return sum[0];
 }
@ -577,11 +599,11 @@ template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
  // Now do the summation:
  // Lines 0+1
-  sum[0] = vec_add(sum[0], sum[1]);
+  sum[0] = sum[0] + sum[1];
  // Lines 2+3
-  sum[1] = vec_add(sum[2], sum[3]);
+  sum[1] = sum[2] + sum[3];
  // Add the results
-  sum[0] = vec_add(sum[0], sum[1]);
+  sum[0] = sum[0] + sum[1];
  return sum[0];
 }
@ -591,8 +613,8 @@ template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
 {
  Packet4f prod;
-  prod = pmul(a, (Packet4f)vec_sld(a, a, 8));
+  prod = pmul(a, vec_sld(a, a, 8));
-  return pfirst(pmul(prod, (Packet4f)vec_sld(prod, prod, 4)));
+  return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
 }
 template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
@ -716,33 +738,52 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) {
  kernel.packet[3] = vec_mergel(t1, t3);
 }
 // Per-lane blend of two 4 x int packets.
 // For each lane i, the result takes thenPacket[i] when ifPacket.select[i]
 // compares equal to the matching lane of p4i_ONE, and elsePacket[i] otherwise.
 // (p4i_ONE is defined outside this excerpt; presumably four lanes of integer 1 -- confirm.)
 template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
  // Copy the four selector entries into a vector so they can be compared lane-wise.
  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
  // vec_cmpeq produces a per-lane mask; it is reinterpreted as Packet4ui for use with vec_sel.
  Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE)));
  // vec_sel picks bits from its second argument where the mask is set, else from the first.
  return vec_sel(elsePacket, thenPacket, mask);
 }
 // Per-lane blend of two 4 x float packets.
 // For each lane i, the result takes thenPacket[i] when ifPacket.select[i]
 // compares equal to the matching lane of p4i_ONE, and elsePacket[i] otherwise.
 // (p4i_ONE is defined outside this excerpt; presumably four lanes of integer 1 -- confirm.)
 template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
  // Copy the four selector entries into a vector so they can be compared lane-wise.
  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
  // The comparison is done on integer lanes; only the final selection touches the float packets.
  Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE)));
  // vec_sel picks bits from its second argument where the mask is set, else from the first.
  return vec_sel(elsePacket, thenPacket, mask);
 }
 //---------- double ----------
 #ifdef __VSX__
 typedef __vector double              Packet2d;
 typedef __vector unsigned long long  Packet2ul;
 typedef __vector long long           Packet2l;
 #if EIGEN_COMP_CLANG
 typedef Packet2ul                    Packet2bl;
 #else
 typedef __vector __bool long         Packet2bl;
 #endif
-static Packet2l p2l_ZERO = (Packet2l) p4i_ZERO;
+static Packet2l  p2l_ONE  = { 1, 1 };
 static Packet2l  p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
 static Packet2d  p2d_ONE  = { 1.0, 1.0 }; 
-static Packet2d p2d_ZERO = (Packet2d) p4f_ZERO;
+static Packet2d  p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
 static Packet2d  p2d_ZERO_ = { -0.0, -0.0 };
 #ifdef _BIG_ENDIAN
-static Packet2d p2d_COUNTDOWN = (Packet2d) vec_sld((Packet16uc) p2d_ZERO, (Packet16uc) p2d_ONE, 8);
+static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8));
 #else
-static Packet2d p2d_COUNTDOWN = (Packet2d) vec_sld((Packet16uc) p2d_ONE, (Packet16uc) p2d_ZERO, 8);
+static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8));
 #endif
-static EIGEN_STRONG_INLINE Packet2d vec_splat_dbl(Packet2d& a, int index)
+template<int index> Packet2d vec_splat_dbl(Packet2d& a);
 template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<0>(Packet2d& a)
 {
-  switch (index) {
+  return reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_HI));
  case 0:
    return (Packet2d) vec_perm(a, a, p16uc_PSET64_HI);
  case 1:
    return (Packet2d) vec_perm(a, a, p16uc_PSET64_LO);
 }
-  return a;
+
 // Specialization of vec_splat_dbl for index 1: permutes `a` with itself using
 // the p16uc_PSET64_LO byte pattern and returns the result as a Packet2d.
 // NOTE(review): presumably this broadcasts the second double of `a` into both
 // lanes; p16uc_PSET64_LO is defined outside this excerpt -- confirm.
 template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<1>(Packet2d& a)
 {
  return reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_LO));
 }
 template<> struct packet_traits<double> : default_packet_traits
@ -753,16 +794,41 @@ template<> struct packet_traits<double> : default_packet_traits
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=2,
-    HasHalfPacket = 0,
+    HasHalfPacket = 1,
    HasAdd  = 1,
    HasSub  = 1,
    HasMul  = 1,
    HasDiv  = 1,
    HasMin  = 1,
    HasMax  = 1,
    HasAbs  = 1,
    HasSin  = 0,
    HasCos  = 0,
    HasLog  = 0,
    HasExp  = 1,
-    HasSqrt = 0
+    HasSqrt = 1,
    HasRsqrt = 1,
    HasRound = 1,
    HasFloor = 1,
    HasCeil = 1,
    HasNegate = 1,
    HasBlend = 1
  };
 };
 template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
 inline std::ostream & operator <<(std::ostream & s, const Packet2l & v)
 {
  union {
    Packet2l   v;
    int64_t n[2];
  } vt;
  vt.v = v;
  s << vt.n[0] << ", " << vt.n[1];
  return s;
 }
 inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
 {
@ -776,28 +842,43 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
 }
 // Need to define them first or we get specialization after instantiation errors
-template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return (Packet2d) vec_ld(0, (const float *) from); } //FIXME
+template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
 {
  EIGEN_DEBUG_ALIGNED_LOAD
 #ifdef __VSX__
  return vec_vsx_ld(0, from);
 #else
  return vec_ld(0, from);
 #endif
 }
-template<> EIGEN_STRONG_INLINE void pstore<double>(double*   to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st((Packet4f)from, 0, (float *)to); }
+template<> EIGEN_STRONG_INLINE void pstore<double>(double*   to, const Packet2d& from)
 {
  EIGEN_DEBUG_ALIGNED_STORE
 #ifdef __VSX__
  vec_vsx_st(from, 0, to);
 #else
  vec_st(from, 0, to);
 #endif
 }
 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double&  from) {
-  double EIGEN_ALIGN16 af[2];
+  Packet2d v = {from, from};
-  af[0] = from;
+  return v;
  Packet2d vc = pload<Packet2d>(af);
  vc = vec_splat_dbl(vc, 0);
  return vc;
 }
 template<> EIGEN_STRONG_INLINE void
 pbroadcast4<Packet2d>(const double *a,
                      Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
 {
  a1 = pload<Packet2d>(a);
-  a0 = vec_splat_dbl(a1, 0);
+  a0 = vec_splat_dbl<0>(a1);
-  a1 = vec_splat_dbl(a1, 1);
+  a1 = vec_splat_dbl<1>(a1);
  a3 = pload<Packet2d>(a+2);
-  a2 = vec_splat_dbl(a3, 0);
+  a2 = vec_splat_dbl<0>(a3);
-  a3 = vec_splat_dbl(a3, 1);
+  a3 = vec_splat_dbl<1>(a3);
 }
 template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
 {
  double EIGEN_ALIGN16 af[2];
@ -812,13 +893,14 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to,
  to[0*stride] = af[0];
  to[1*stride] = af[1];
 }
 template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return vec_add(pset1<Packet2d>(a), p2d_COUNTDOWN); }
-template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_add(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return pset1<Packet2d>(a) + p2d_COUNTDOWN; }
-template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_sub(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return a + b; }
-template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return psub<Packet2d>(p2d_ZERO, a); }
+template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return a - b; }
 template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return p2d_ZERO - a; }
 template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
@ -840,17 +922,22 @@ template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const
 template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
 template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); }
 template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const  Packet2d& a) { return vec_ceil(a); }
 template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
 template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
 {
  EIGEN_DEBUG_ALIGNED_LOAD
-  return (Packet2d) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from));
+  return (Packet2d) vec_vsx_ld((long)from & 15, (const double*) _EIGEN_ALIGNED_PTR(from));
 }
 template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*   from)
 {
  Packet2d p;
  if((ptrdiff_t(from) % 16) == 0)  p = pload<Packet2d>(from);
  else                             p = ploadu<Packet2d>(from);
-  return vec_perm(p, p, p16uc_PSET64_HI);
+  return vec_splat_dbl<0>(p);
 }
 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double*  to, const Packet2d& from)
@ -859,32 +946,34 @@ template<> EIGEN_STRONG_INLINE void pstoreu<double>(double*  to, const Packet2d&
  vec_vsx_st((Packet4f)from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to));
 }
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { vec_dstt((const float *) addr, DST_CTRL(2,2,32), DST_CHAN); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE double  pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE double  pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore<double>(x, a); return x[0]; }
 template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return (Packet2d)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE64); }
 template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
 {
  return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
 }
 template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); }
 template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
 {
  Packet2d b, sum;
-  b   = (Packet2d) vec_sld((Packet4ui) a, (Packet4ui)a, 8);
+  b   = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(a), reinterpret_cast<Packet4f>(a), 8));
-  sum = vec_add(a, b);
+  sum = a + b;
-  return pfirst(sum);
+  return pfirst<Packet2d>(sum);
 }
 template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
 {
  Packet2d v[2], sum;
-  v[0] = vec_add(vecs[0], (Packet2d) vec_sld((Packet4ui) vecs[0], (Packet4ui) vecs[0], 8));
+  v[0] = vecs[0] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[0]), reinterpret_cast<Packet4f>(vecs[0]), 8));
-  v[1] = vec_add(vecs[1], (Packet2d) vec_sld((Packet4ui) vecs[1], (Packet4ui) vecs[1], 8));
+  v[1] = vecs[1] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[1]), reinterpret_cast<Packet4f>(vecs[1]), 8));
 #ifdef _BIG_ENDIAN
- sum = (Packet2d) vec_sld((Packet4ui) v[0], (Packet4ui) v[1], 8);
+  sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[0]), reinterpret_cast<Packet4f>(v[1]), 8));
 #else
-  sum = (Packet2d) vec_sld((Packet4ui) v[1], (Packet4ui) v[0], 8);
+  sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[1]), reinterpret_cast<Packet4f>(v[0]), 8));
 #endif
  return sum;
@ -893,19 +982,19 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
 // mul
 template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
 {
-  return pfirst(pmul(a, (Packet2d)vec_sld((Packet4ui) a, (Packet4ui) a, 8)));
+  return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
 }
 // min
 template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
 {
-  return pfirst(vec_min(a, (Packet2d) vec_sld((Packet4ui) a, (Packet4ui) a, 8)));
+  return pfirst(pmin(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
 }
 // max
 template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
 {
-  return pfirst(vec_max(a, (Packet2d) vec_sld((Packet4ui) a, (Packet4ui) a, 8)));
+  return pfirst(pmax(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
 }
 template<int Offset>
@ -915,9 +1004,9 @@ struct palign_impl<Offset,Packet2d>
  {
    if (Offset == 1)
 #ifdef _BIG_ENDIAN
-      first = (Packet2d) vec_sld((Packet4ui) first, (Packet4ui) second, 8);
+      first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(first), reinterpret_cast<Packet4ui>(second), 8));
 #else
-      first = (Packet2d) vec_sld((Packet4ui) second, (Packet4ui) first, 8);
+      first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(second), reinterpret_cast<Packet4ui>(first), 8));
 #endif
  }
 };
@ -931,6 +1020,11 @@ ptranspose(PacketBlock<Packet2d,2>& kernel) {
  kernel.packet[1] = t1;
 }
 // Per-lane blend of two 2 x double packets.
 // For each lane i, the result takes thenPacket[i] when ifPacket.select[i]
 // matches the corresponding lane of p2l_ONE, and elsePacket[i] otherwise.
 template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
  // Copy the two selector entries into a 64-bit integer vector.
  Packet2l select = { ifPacket.select[0], ifPacket.select[1] };
  // NOTE(review): both operands are reinterpreted as Packet2d, so the equality test is a
  // floating-point compare applied to integer bit patterns -- confirm this is intended
  // rather than an integer compare.
  Packet2bl mask = vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE));
  // vec_sel picks bits from its second argument where the mask is set, else from the first.
  return vec_sel(elsePacket, thenPacket, mask);
 }
 #endif // __VSX__
 } // end namespace internal
--- a/Eigen/src/Core/arch/CMakeLists.txt
+++ b/Eigen/src/Core/arch/CMakeLists.txt
@ -1,10 +0,0 @@
 ADD_SUBDIRECTORY(AltiVec)
 ADD_SUBDIRECTORY(AVX)
 ADD_SUBDIRECTORY(AVX512)
 ADD_SUBDIRECTORY(CUDA)
 ADD_SUBDIRECTORY(Default)
 ADD_SUBDIRECTORY(NEON)
 ADD_SUBDIRECTORY(SSE)
--- a/Eigen/src/Core/arch/CUDA/CMakeLists.txt
+++ b/Eigen/src/Core/arch/CUDA/CMakeLists.txt
@ -1,6 +0,0 @@
 FILE(GLOB Eigen_Core_arch_CUDA_SRCS "*.h")
 INSTALL(FILES
  ${Eigen_Core_arch_CUDA_SRCS}
  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/CUDA COMPONENT Devel
 )
--- a/Eigen/src/Core/arch/CUDA/Complex.h
+++ b/Eigen/src/Core/arch/CUDA/Complex.h
@ -0,0 +1,103 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #ifndef EIGEN_COMPLEX_CUDA_H
 #define EIGEN_COMPLEX_CUDA_H
 // clang-format off
 namespace Eigen {
 namespace internal {
 #if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
 // Many std::complex methods such as operator+, operator-, operator* and
 // operator/ are not constexpr. Due to this, clang does not treat them as device
 // functions and thus Eigen functors making use of these operators fail to
 // compile. Here, we manually specialize these functors for complex types when
 // building for CUDA to avoid non-constexpr methods.
 // Sum
 // Complex addition functor. The sum is assembled from the real and imaginary
 // components, so std::complex::operator+ is never called.
 template<typename T> struct scalar_sum_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
  typedef std::complex<T> result_type;
  EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
  // Returns a + b, computed component-wise.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
    const T sum_real = numext::real(a) + numext::real(b);
    const T sum_imag = numext::imag(a) + numext::imag(b);
    return std::complex<T>(sum_real, sum_imag);
  }
 };
 // Non-const operand types forward to the const-qualified specialization.
 template<typename T> struct scalar_sum_op<std::complex<T>, std::complex<T> > : scalar_sum_op<const std::complex<T>, const std::complex<T> > {};
 // Difference
 // Complex subtraction functor. The difference is assembled from the real and
 // imaginary components, so std::complex::operator- is never called.
 template<typename T> struct scalar_difference_op<const std::complex<T>, const std::complex<T> >  : binary_op_base<const std::complex<T>, const std::complex<T> > {
  typedef std::complex<T> result_type;
  EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
  // Returns a - b, computed component-wise.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
    const T diff_real = numext::real(a) - numext::real(b);
    const T diff_imag = numext::imag(a) - numext::imag(b);
    return std::complex<T>(diff_real, diff_imag);
  }
 };
 // Non-const operand types forward to the const-qualified specialization.
 template<typename T> struct scalar_difference_op<std::complex<T>, std::complex<T> > : scalar_difference_op<const std::complex<T>, const std::complex<T> > {};
 // Product
 // Complex multiplication functor. The product is written out in terms of the
 // real and imaginary components, so std::complex::operator* is never called.
 template<typename T> struct scalar_product_op<const std::complex<T>, const std::complex<T> >  : binary_op_base<const std::complex<T>, const std::complex<T> > {
  enum {
    // Written as "> >" (with a space) to match the rest of this file and to
    // stay parseable when the translation unit is compiled in C++03 mode.
    Vectorizable = packet_traits<std::complex<T> >::HasMul
  };
  typedef typename std::complex<T> result_type;
  EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
  // Returns a * b using
  //   (ar + i*ai) * (br + i*bi) = (ar*br - ai*bi) + i*(ar*bi + ai*br).
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
    const T a_real = numext::real(a);
    const T a_imag = numext::imag(a);
    const T b_real = numext::real(b);
    const T b_imag = numext::imag(b);
    return std::complex<T>(a_real * b_real - a_imag * b_imag,
                           a_real * b_imag + a_imag * b_real);
  }
 };
 // Non-const operand types forward to the const-qualified specialization.
 template<typename T> struct scalar_product_op<std::complex<T>, std::complex<T> > : scalar_product_op<const std::complex<T>, const std::complex<T> > {};
 // Quotient
 // Complex division functor. The quotient is written out in terms of the real
 // and imaginary components, so std::complex::operator/ is never called.
 template<typename T> struct scalar_quotient_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
  enum {
    // Written as "> >" (with a space) to match the rest of this file and to
    // stay parseable when the translation unit is compiled in C++03 mode.
    Vectorizable = packet_traits<std::complex<T> >::HasDiv
  };
  typedef typename std::complex<T> result_type;
  EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
  // Returns a / b as a * conj(b) / |b|^2:
  //   real = (ar*br + ai*bi) / (br^2 + bi^2)
  //   imag = (ai*br - ar*bi) / (br^2 + bi^2)
  // NOTE: |b|^2 is formed directly, without any rescaling of b's components.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
    const T a_real = numext::real(a);
    const T a_imag = numext::imag(a);
    const T b_real = numext::real(b);
    const T b_imag = numext::imag(b);
    // Reciprocal of the squared magnitude of b, shared by both components.
    const T norm = T(1) / (b_real * b_real + b_imag * b_imag);
    return std::complex<T>((a_real * b_real + a_imag * b_imag) * norm,
                           (a_imag * b_real - a_real * b_imag) * norm);
  }
 };
 // Non-const operand types forward to the const-qualified specialization.
 template<typename T> struct scalar_quotient_op<std::complex<T>, std::complex<T> > : scalar_quotient_op<const std::complex<T>, const std::complex<T> > {};
 #endif
 } // end namespace internal
 } // end namespace Eigen
 #endif // EIGEN_COMPLEX_CUDA_H
--- a/Eigen/src/Core/arch/CUDA/Half.h
+++ b/Eigen/src/Core/arch/CUDA/Half.h
@ -1,11 +1,3 @@
 // Standard 16-bit float type, mostly useful for GPUs. Defines a new
 // class Eigen::half (inheriting from CUDA's __half struct) with
 // operator overloads such that it behaves basically as an arithmetic
 // type. It will be quite slow on CPUs (so it is recommended to stay
 // in fp32 for CPUs, except for simple parameter conversions, I/O
 // to disk and the likes), but fast on GPUs.
 //
 //
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
@ -32,6 +24,15 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // Standard 16-bit float type, mostly useful for GPUs. Defines a new
 // type Eigen::half (inheriting from CUDA's __half struct) with
 // operator overloads such that it behaves basically as an arithmetic
 // type. It will be quite slow on CPUs (so it is recommended to stay
 // in fp32 for CPUs, except for simple parameter conversions, I/O
 // to disk and the likes), but fast on GPUs.
 #ifndef EIGEN_HALF_CUDA_H
 #define EIGEN_HALF_CUDA_H
@ -42,92 +43,93 @@
 #endif
 namespace Eigen {
 struct half;
 namespace half_impl {
 #if !defined(EIGEN_HAS_CUDA_FP16)
 // Make our own __half definition that is similar to CUDA's.
 struct __half {
-  __half() {}
+  EIGEN_DEVICE_FUNC __half() {}
-  explicit __half(unsigned short raw) : x(raw) {}
+  explicit EIGEN_DEVICE_FUNC __half(unsigned short raw) : x(raw) {}
  unsigned short x;
 };
 #endif
-namespace Eigen {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x);
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff);
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h);
-namespace internal {
+struct half_base : public __half {
  EIGEN_DEVICE_FUNC half_base() {}
  EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half(h) {}
  EIGEN_DEVICE_FUNC half_base(const __half& h) : __half(h) {}
 };
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x);
+} // namespace half_impl
 static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff);
 static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h);
 } // end namespace internal
 // Class definition.
-struct half : public __half {
+struct half : public half_impl::half_base {
  #if !defined(EIGEN_HAS_CUDA_FP16)
    typedef half_impl::__half __half;
  #endif
  EIGEN_DEVICE_FUNC half() {}
-  EIGEN_DEVICE_FUNC half(const __half& h) : __half(h) {}
+  EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
-  EIGEN_DEVICE_FUNC half(const half& h) : __half(h) {}
+  EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
  explicit EIGEN_DEVICE_FUNC half(bool b)
-      : __half(internal::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
+      : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
-  explicit EIGEN_DEVICE_FUNC half(unsigned int ui)
+  template<class T>
-      : __half(internal::float_to_half_rtne(static_cast<float>(ui))) {}
+  explicit EIGEN_DEVICE_FUNC half(const T& val)
-  explicit EIGEN_DEVICE_FUNC half(int i)
+      : half_impl::half_base(half_impl::float_to_half_rtne(static_cast<float>(val))) {}
      : __half(internal::float_to_half_rtne(static_cast<float>(i))) {}
  explicit EIGEN_DEVICE_FUNC half(unsigned long ul)
      : __half(internal::float_to_half_rtne(static_cast<float>(ul))) {}
  explicit EIGEN_DEVICE_FUNC half(long l)
      : __half(internal::float_to_half_rtne(static_cast<float>(l))) {}
  explicit EIGEN_DEVICE_FUNC half(long long ll)
      : __half(internal::float_to_half_rtne(static_cast<float>(ll))) {}
  explicit EIGEN_DEVICE_FUNC half(unsigned long long ull)
      : __half(internal::float_to_half_rtne(static_cast<float>(ull))) {}
  explicit EIGEN_DEVICE_FUNC half(float f)
-      : __half(internal::float_to_half_rtne(f)) {}
+      : half_impl::half_base(half_impl::float_to_half_rtne(f)) {}
  explicit EIGEN_DEVICE_FUNC half(double d)
      : __half(internal::float_to_half_rtne(static_cast<float>(d))) {}
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(bool) const {
    // +0.0 and -0.0 become false, everything else becomes true.
    return (x & 0x7fff) != 0;
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(signed char) const {
-    return static_cast<signed char>(internal::half_to_float(*this));
+    return static_cast<signed char>(half_impl::half_to_float(*this));
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned char) const {
-    return static_cast<unsigned char>(internal::half_to_float(*this));
+    return static_cast<unsigned char>(half_impl::half_to_float(*this));
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(short) const {
-    return static_cast<short>(internal::half_to_float(*this));
+    return static_cast<short>(half_impl::half_to_float(*this));
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned short) const {
-    return static_cast<unsigned short>(internal::half_to_float(*this));
+    return static_cast<unsigned short>(half_impl::half_to_float(*this));
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(int) const {
-    return static_cast<int>(internal::half_to_float(*this));
+    return static_cast<int>(half_impl::half_to_float(*this));
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned int) const {
-    return static_cast<unsigned int>(internal::half_to_float(*this));
+    return static_cast<unsigned int>(half_impl::half_to_float(*this));
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long) const {
-    return static_cast<long>(internal::half_to_float(*this));
+    return static_cast<long>(half_impl::half_to_float(*this));
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long) const {
-    return static_cast<unsigned long>(internal::half_to_float(*this));
+    return static_cast<unsigned long>(half_impl::half_to_float(*this));
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long long) const {
-    return static_cast<long long>(internal::half_to_float(*this));
+    return static_cast<long long>(half_impl::half_to_float(*this));
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long long) const {
-    return static_cast<unsigned long long>(internal::half_to_float(*this));
+    return static_cast<unsigned long long>(half_to_float(*this));
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const {
-    return internal::half_to_float(*this);
+    return half_impl::half_to_float(*this);
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const {
-    return static_cast<double>(internal::half_to_float(*this));
+    return static_cast<double>(half_impl::half_to_float(*this));
  }
  EIGEN_DEVICE_FUNC half& operator=(const half& other) {
@ -136,6 +138,8 @@ struct half : public __half {
  }
 };
 namespace half_impl {
 #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
 // Intrinsics for native fp16 support. Note that on current hardware,
@ -200,55 +204,55 @@ __device__ bool operator >= (const half& a, const half& b) {
 // Definitions for CPUs and older CUDA, mostly working through conversion
 // to/from fp32.
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
  return half(float(a) + float(b));
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) {
  return half(float(a) * float(b));
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) {
  return half(float(a) - float(b));
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) {
  return half(float(a) / float(b));
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) {
  half result;
  result.x = a.x ^ 0x8000;
  return result;
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) {
  a = half(float(a) + float(b));
  return a;
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) {
  a = half(float(a) * float(b));
  return a;
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) {
  a = half(float(a) - float(b));
  return a;
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) {
  a = half(float(a) / float(b));
  return a;
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
  return float(a) == float(b);
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
  return float(a) != float(b);
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
  return float(a) < float(b);
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) {
  return float(a) <= float(b);
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) {
  return float(a) > float(b);
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) {
  return float(a) >= float(b);
 }
@ -256,8 +260,8 @@ static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, co
 // Division by an index. Do it in full float precision to avoid accuracy
 // issues in converting the denominator to half.
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
-  return Eigen::half(static_cast<float>(a) / static_cast<float>(b));
+  return half(static_cast<float>(a) / static_cast<float>(b));
 }
 // Conversion routines, including fallbacks for the host or older CUDA.
@ -265,9 +269,7 @@ static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Ind
 // these in hardware. If we need more performance on older/other CPUs, they are
 // also possible to vectorize directly.
-namespace internal {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) {
 static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) {
  __half h;
  h.x = x;
  return h;
@ -278,7 +280,7 @@ union FP32 {
  float f;
 };
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
 #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
  return __float2half(ff);
@ -333,7 +335,7 @@ static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff)
 #endif
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) {
 #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
  return __half2float(h);
@ -362,92 +364,69 @@ static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) {
 #endif
 }
-} // end namespace internal
+// --- standard functions ---
-// Traits.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) {
 namespace internal {
 template<> struct is_arithmetic<half> { enum { value = true }; };
 } // end namespace internal
 // Numeric traits for Eigen::half, layered on GenericNumTraits.
 // Every limit except dummy_precision() is produced from a raw 16-bit pattern
 // through internal::raw_uint16_to_half; the interpretations given below assume
 // the IEEE 754 binary16 layout (1 sign, 5 exponent, 10 mantissa bits).
 template<> struct NumTraits<Eigen::half>
     : GenericNumTraits<Eigen::half>
 {
   // Pattern 0x0800: exponent field 2, zero mantissa, i.e. 2^-13 in binary16.
   // NOTE(review): binary16 spacing at 1.0 is 2^-10 (0x1400); confirm that the
   // smaller value is intentional.
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() {
     return internal::raw_uint16_to_half(0x0800);
   }
   // Default comparison tolerance, converted from the float literal 1e-3f.
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half dummy_precision() { return half(1e-3f); }
   // Pattern 0x7bff: largest exponent below all-ones with a full mantissa
   // (the largest finite binary16 value).
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half highest() {
     return internal::raw_uint16_to_half(0x7bff);
   }
   // Pattern 0xfbff: same magnitude as highest() with the sign bit set.
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half lowest() {
     return internal::raw_uint16_to_half(0xfbff);
   }
   // Pattern 0x7c00: all-ones exponent, zero mantissa (positive infinity).
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half infinity() {
     return internal::raw_uint16_to_half(0x7c00);
   }
   // Pattern 0x7c01: all-ones exponent with only the lowest mantissa bit set (a NaN).
   // NOTE(review): the conventional binary16 quiet bit is 0x0200, so this pattern
   // may be classified as a signaling NaN on some hardware -- confirm.
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() {
     return internal::raw_uint16_to_half(0x7c01);
   }
 };
 // Infinity/NaN checks.
 namespace numext {
 static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const Eigen::half& a) {
  return (a.x & 0x7fff) == 0x7c00;
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const Eigen::half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
 #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
  return __hisnan(a);
 #else
  return (a.x & 0x7fff) > 0x7c00;
 #endif
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const Eigen::half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const half& a) {
-  return !(Eigen::numext::isinf)(a) && !(Eigen::numext::isnan)(a);
+  return !(isinf EIGEN_NOT_A_MACRO (a)) && !(isnan EIGEN_NOT_A_MACRO (a));
 }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half abs(const Eigen::half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
-  Eigen::half result;
+  half result;
  result.x = a.x & 0x7FFF;
  return result;
 }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exp(const Eigen::half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
-  return Eigen::half(::expf(float(a)));
+  return half(::expf(float(a)));
 }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half log(const Eigen::half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
-  return Eigen::half(::logf(float(a)));
+#if defined(EIGEN_HAS_CUDA_FP16) && defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
  return Eigen::half(::hlog(a));
 #else
  return half(::logf(float(a)));
 #endif
 }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrt(const Eigen::half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log1p(const half& a) {
-  return Eigen::half(::sqrtf(float(a)));
+  return half(numext::log1p(float(a)));
 }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half pow(const Eigen::half& a, const Eigen::half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
-  return Eigen::half(::powf(float(a), float(b)));
+  return half(::log10f(float(a)));
 }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sin(const Eigen::half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
-  return Eigen::half(::sinf(float(a)));
+  return half(::sqrtf(float(a)));
 }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half cos(const Eigen::half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) {
-  return Eigen::half(::cosf(float(a)));
+  return half(::powf(float(a), float(b)));
 }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half tan(const Eigen::half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sin(const half& a) {
-  return Eigen::half(::tanf(float(a)));
+  return half(::sinf(float(a)));
 }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half tanh(const Eigen::half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half cos(const half& a) {
-  return Eigen::half(::tanhf(float(a)));
+  return half(::cosf(float(a)));
 }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floor(const Eigen::half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tan(const half& a) {
-  return Eigen::half(::floorf(float(a)));
+  return half(::tanf(float(a)));
 }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceil(const Eigen::half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
-  return Eigen::half(::ceilf(float(a)));
+  return half(::tanhf(float(a)));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
  return half(::floorf(float(a)));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
  return half(::ceilf(float(a)));
 }
-template <> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half mini(const Eigen::half& a, const Eigen::half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
 #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
  return __hlt(b, a) ? b : a;
 #else
@ -456,7 +435,7 @@ template <> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half mini(const Eigen::
  return f2 < f1 ? b : a;
 #endif
 }
-template <> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half maxi(const Eigen::half& a, const Eigen::half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
 #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
  return __hlt(a, b) ? b : a;
 #else
@ -466,78 +445,89 @@ template <> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half maxi(const Eigen::
 #endif
 }
-#ifdef EIGEN_HAS_C99_MATH
+EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const half& v) {
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half lgamma(const Eigen::half& a) {
+  os << static_cast<float>(v);
-  return Eigen::half(Eigen::numext::lgamma(static_cast<float>(a)));
+  return os;
 }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half digamma(const Eigen::half& a) {
+
-  return Eigen::half(Eigen::numext::digamma(static_cast<float>(a)));
+} // end namespace half_impl
 // import Eigen::half_impl::half into Eigen namespace
 // using half_impl::half;
 namespace internal {
 template<>
 struct random_default_impl<half, false, false>
 {
  static inline half run(const half& x, const half& y)
  {
    return x + (y-x) * half(float(std::rand()) / float(RAND_MAX));
  }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half zeta(const Eigen::half& x, const Eigen::half& q) {
+  static inline half run()
-  return Eigen::half(Eigen::numext::zeta(static_cast<float>(x), static_cast<float>(q)));
+  {
    return run(half(-1.f), half(1.f));
  }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half polygamma(const Eigen::half& n, const Eigen::half& x) {
+};
-  return Eigen::half(Eigen::numext::polygamma(static_cast<float>(n), static_cast<float>(x)));
+
 template<> struct is_arithmetic<half> { enum { value = true }; };
 } // end namespace internal
 template<> struct NumTraits<Eigen::half>
    : GenericNumTraits<Eigen::half>
 {
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() {
    return half_impl::raw_uint16_to_half(0x0800);
  }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erf(const Eigen::half& a) {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half dummy_precision() { return Eigen::half(1e-2f); }
-  return Eigen::half(Eigen::numext::erf(static_cast<float>(a)));
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half highest() {
    return half_impl::raw_uint16_to_half(0x7bff);
  }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erfc(const Eigen::half& a) {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half lowest() {
-  return Eigen::half(Eigen::numext::erfc(static_cast<float>(a)));
+    return half_impl::raw_uint16_to_half(0xfbff);
  }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igamma(const Eigen::half& a, const Eigen::half& x) {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half infinity() {
-  return Eigen::half(Eigen::numext::igamma(static_cast<float>(a), static_cast<float>(x)));
+    return half_impl::raw_uint16_to_half(0x7c00);
  }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igammac(const Eigen::half& a, const Eigen::half& x) {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() {
-  return Eigen::half(Eigen::numext::igammac(static_cast<float>(a), static_cast<float>(x)));
+    return half_impl::raw_uint16_to_half(0x7c01);
  }
-#endif
+};
 } // end namespace numext
 } // end namespace Eigen
-// Standard mathematical functions and trancendentals.
+// C-like standard mathematical functions and transcendentals.
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half fabsh(const Eigen::half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half fabsh(const Eigen::half& a) {
  Eigen::half result;
  result.x = a.x & 0x7FFF;
  return result;
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
  return Eigen::half(::expf(float(a)));
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
 #if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
  return Eigen::half(::hlog(a));
 #else
  return Eigen::half(::logf(float(a)));
 #endif
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrth(const Eigen::half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrth(const Eigen::half& a) {
  return Eigen::half(::sqrtf(float(a)));
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half powh(const Eigen::half& a, const Eigen::half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half powh(const Eigen::half& a, const Eigen::half& b) {
  return Eigen::half(::powf(float(a), float(b)));
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floorh(const Eigen::half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floorh(const Eigen::half& a) {
  return Eigen::half(::floorf(float(a)));
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceilh(const Eigen::half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceilh(const Eigen::half& a) {
  return Eigen::half(::ceilf(float(a)));
 }
 // Global-namespace NaN test for Eigen::half. Forwards to Eigen::numext::isnan
 // and returns the result as an int.
 static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int (isnan)(const Eigen::half& a) {
   return (Eigen::numext::isnan)(a);
 }
 // Global-namespace infinity test for Eigen::half. Forwards to
 // Eigen::numext::isinf and returns the result as an int.
 static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int (isinf)(const Eigen::half& a) {
   return (Eigen::numext::isinf)(a);
 }
 // Global-namespace finiteness test for Eigen::half: non-zero only when the
 // value is neither infinite nor NaN according to the Eigen::numext predicates.
 static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int (isfinite)(const Eigen::half& a) {
   return !(Eigen::numext::isinf)(a) && !(Eigen::numext::isnan)(a);
 }
 namespace std {
 EIGEN_ALWAYS_INLINE ostream& operator << (ostream& os, const Eigen::half& v) {
  os << static_cast<float>(v);
  return os;
 }
 #if __cplusplus > 199711L
 template <>
 struct hash<Eigen::half> {
@ -551,19 +541,45 @@ struct hash<Eigen::half> {
 // Add the missing shfl_xor intrinsic
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
 // __shfl_xor overload for Eigen::half: widens var to float, forwards to the
 // float __shfl_xor with the same laneMask and width, then converts the result
 // back to Eigen::half. width defaults to warpSize.
 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
   return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
 }
 #endif
 // ldg() has an overload for __half, but we also need one for Eigen::half.
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 320
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
-  return Eigen::internal::raw_uint16_to_half(
+  return Eigen::half_impl::raw_uint16_to_half(
      __ldg(reinterpret_cast<const unsigned short*>(ptr)));
 }
 #endif
 #if defined(__CUDA_ARCH__)
 // Explicit specializations of the numext classification predicates for
 // Eigen::half. Each one simply forwards to the matching half_impl function.
 namespace Eigen {
 namespace numext {
 // NaN test: delegates to half_impl::isnan.
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 bool (isnan)(const Eigen::half& h) {
   return (half_impl::isnan)(h);
 }
 // Infinity test: delegates to half_impl::isinf.
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 bool (isinf)(const Eigen::half& h) {
   return (half_impl::isinf)(h);
 }
 // Finiteness test: delegates to half_impl::isfinite.
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 bool (isfinite)(const Eigen::half& h) {
   return (half_impl::isfinite)(h);
 }
 } // namespace numext
 }  // namespace Eigen
 #endif
 #endif // EIGEN_HALF_CUDA_H
--- a/Eigen/src/Core/arch/CUDA/MathFunctions.h
+++ b/Eigen/src/Core/arch/CUDA/MathFunctions.h
@ -27,9 +27,22 @@ float4 plog<float4>(const float4& a)
 // Packet log for double2: applies the global-namespace ::log to each of the
 // two components and repacks the results.
 template<>  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 double2 plog<double2>(const double2& a)
 {
   using ::log;
   return make_double2(log(a.x), log(a.y));
 }
 // Packet log1p for float4: applies log1pf to each of the four components.
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 float4 plog1p<float4>(const float4& a)
 {
   return make_float4(log1pf(a.x), log1pf(a.y), log1pf(a.z), log1pf(a.w));
 }
 // Packet log1p for double2: applies log1p to each of the two components.
 template<>  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 double2 plog1p<double2>(const double2& a)
 {
   return make_double2(log1p(a.x), log1p(a.y));
 }
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 float4 pexp<float4>(const float4& a)
 {
@ -39,6 +52,7 @@ float4 pexp<float4>(const float4& a)
 // Packet exp for double2: applies the global-namespace ::exp to each of the
 // two components and repacks the results.
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 double2 pexp<double2>(const double2& a)
 {
   using ::exp;
   return make_double2(exp(a.x), exp(a.y));
 }
@ -51,6 +65,7 @@ float4 psqrt<float4>(const float4& a)
 // Packet sqrt for double2: applies the global-namespace ::sqrt to each of the
 // two components and repacks the results.
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 double2 psqrt<double2>(const double2& a)
 {
   using ::sqrt;
   return make_double2(sqrt(a.x), sqrt(a.y));
 }
@ -66,120 +81,6 @@ double2 prsqrt<double2>(const double2& a)
  return make_double2(rsqrt(a.x), rsqrt(a.y));
 }
 // Packet lgamma for float4: applies lgammaf to each of the four components.
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 float4 plgamma<float4>(const float4& a)
 {
   return make_float4(lgammaf(a.x), lgammaf(a.y), lgammaf(a.z), lgammaf(a.w));
 }
 // Packet lgamma for double2: applies lgamma to each of the two components.
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 double2 plgamma<double2>(const double2& a)
 {
   return make_double2(lgamma(a.x), lgamma(a.y));
 }
 // Packet digamma for float4: evaluates numext::digamma on each of the four
 // components.
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 float4 pdigamma<float4>(const float4& a)
 {
   using numext::digamma;
   return make_float4(digamma(a.x), digamma(a.y), digamma(a.z), digamma(a.w));
 }
 // Packet digamma for double2: evaluates numext::digamma on each of the two
 // components.
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 double2 pdigamma<double2>(const double2& a)
 {
   using numext::digamma;
   return make_double2(digamma(a.x), digamma(a.y));
 }
 // Packet zeta for float4: evaluates numext::zeta(x, q) on matching components
 // of the two input packets.
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 float4 pzeta<float4>(const float4& x, const float4& q)
 {
    using numext::zeta;
    return make_float4(zeta(x.x, q.x), zeta(x.y, q.y), zeta(x.z, q.z), zeta(x.w, q.w));
 }
 // Packet zeta for double2: evaluates numext::zeta(x, q) on matching components
 // of the two input packets.
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 double2 pzeta<double2>(const double2& x, const double2& q)
 {
    using numext::zeta;
    return make_double2(zeta(x.x, q.x), zeta(x.y, q.y));
 }
 // Packet polygamma for float4: evaluates numext::polygamma(n, x) on matching
 // components of the two input packets.
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 float4 ppolygamma<float4>(const float4& n, const float4& x)
 {
    using numext::polygamma;
    return make_float4(polygamma(n.x, x.x), polygamma(n.y, x.y), polygamma(n.z, x.z), polygamma(n.w, x.w));
 }
 // Packet polygamma for double2: evaluates numext::polygamma(n, x) on matching
 // components of the two input packets.
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 double2 ppolygamma<double2>(const double2& n, const double2& x)
 {
    using numext::polygamma;
    return make_double2(polygamma(n.x, x.x), polygamma(n.y, x.y));
 }
 // Packet erf for float4: calls erf on each of the four components.
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 float4 perf<float4>(const float4& a)
 {
   return make_float4(erf(a.x), erf(a.y), erf(a.z), erf(a.w));
 }
 // Packet erf for double2: calls erf on each of the two components.
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 double2 perf<double2>(const double2& a)
 {
   return make_double2(erf(a.x), erf(a.y));
 }
 // Packet erfc for float4: calls erfc on each of the four components.
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 float4 perfc<float4>(const float4& a)
 {
   return make_float4(erfc(a.x), erfc(a.y), erfc(a.z), erfc(a.w));
 }
 // Packet erfc for double2: calls erfc on each of the two components.
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 double2 perfc<double2>(const double2& a)
 {
   return make_double2(erfc(a.x), erfc(a.y));
 }
 // Packet igamma for float4: evaluates numext::igamma(a, x) on matching
 // components of the two input packets.
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 float4 pigamma<float4>(const float4& a, const float4& x)
 {
   using numext::igamma;
   return make_float4(
       igamma(a.x, x.x),
       igamma(a.y, x.y),
       igamma(a.z, x.z),
       igamma(a.w, x.w));
 }
 // Packet igamma for double2: evaluates numext::igamma(a, x) on matching
 // components of the two input packets.
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 double2 pigamma<double2>(const double2& a, const double2& x)
 {
   using numext::igamma;
   return make_double2(igamma(a.x, x.x), igamma(a.y, x.y));
 }
 // Packet igammac for float4: evaluates numext::igammac(a, x) on matching
 // components of the two input packets.
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 float4 pigammac<float4>(const float4& a, const float4& x)
 {
   using numext::igammac;
   return make_float4(
       igammac(a.x, x.x),
       igammac(a.y, x.y),
       igammac(a.z, x.z),
       igammac(a.w, x.w));
 }
 // Packet igammac for double2: evaluates numext::igammac(a, x) on matching
 // components of the two input packets.
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 double2 pigammac<double2>(const double2& a, const double2& x)
 {
   using numext::igammac;
   return make_double2(igammac(a.x, x.x), igammac(a.y, x.y));
 }
 #endif
--- a/Eigen/src/Core/arch/CUDA/PacketMath.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMath.h
@ -44,8 +44,9 @@ template<> struct packet_traits<float> : default_packet_traits
    HasPolygamma = 1,
    HasErf = 1,
    HasErfc = 1,
-    HasIgamma = 1,
+    HasIGamma = 1,
    HasIGammac = 1,
    HasBetaInc = 1,
    HasBlend = 0,
  };
@ -68,10 +69,13 @@ template<> struct packet_traits<double> : default_packet_traits
    HasRsqrt = 1,
    HasLGamma = 1,
    HasDiGamma = 1,
    HasZeta = 1,
    HasPolygamma = 1,
    HasErf = 1,
    HasErfc = 1,
    HasIGamma = 1,
    HasIGammac = 1,
    HasBetaInc = 1,
    HasBlend = 0,
  };
@ -278,35 +282,6 @@ template<> EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a)
  return a.x * a.y;
 }
 // Rotation of the four components of a float4 by a compile-time offset.
 // offset 0 returns a copy; offset 1 yields (w, x, y, z); offset 2 yields
 // (z, w, x, y); any other offset yields (y, z, w, x).
 template<size_t offset>
 struct protate_impl<offset, float4>
 {
   static float4 run(const float4& a) {
     if (offset == 0) {
       return make_float4(a.x, a.y, a.z, a.w);
     }
     if (offset == 1) {
       return make_float4(a.w, a.x, a.y, a.z);
     }
     if (offset == 2) {
       return make_float4(a.z, a.w, a.x, a.y);
     }
     // Remaining case (offset 3 for a four-component packet).
     return make_float4(a.y, a.z, a.w, a.x);
   }
 };
 // Rotation of the two components of a double2 by a compile-time offset.
 // offset 0 returns a copy; any other offset swaps the two components.
 template<size_t offset>
 struct protate_impl<offset, double2>
 {
   static double2 run(const double2& a) {
     if (offset == 0) {
       return make_double2(a.x, a.y);
     }
     return make_double2(a.y, a.x);
   }
 };
 // Packet absolute value for float4: applies fabsf to each of the four components.
 template<> EIGEN_DEVICE_FUNC inline float4  pabs<float4>(const float4& a) {
   return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
 }
--- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
@ -10,22 +10,16 @@
 #ifndef EIGEN_PACKET_MATH_HALF_CUDA_H
 #define EIGEN_PACKET_MATH_HALF_CUDA_H
 #if defined(EIGEN_HAS_CUDA_FP16)
 // Make sure this is only available when targeting a GPU: we don't want to
 // introduce conflicts between these packet_traits definitions and the ones
 // we'll use on the host side (SSE, AVX, ...)
 #if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
 // Most of the following operations require arch >= 5.3
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
 namespace Eigen {
 namespace internal {
 // Most of the following operations require arch >= 3.0
 #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
 template<> struct is_arithmetic<half2> { enum { value = true }; };
-template<> struct packet_traits<half> : default_packet_traits
+template<> struct packet_traits<Eigen::half> : default_packet_traits
 {
  typedef half2 type;
  typedef half2 half;
@ -34,105 +28,172 @@ template<> struct packet_traits<half> : default_packet_traits
    AlignedOnScalar = 1,
    size=2,
    HasHalfPacket = 0,
-    HasDiv  = 1
+    HasAdd    = 1,
    HasMul    = 1,
    HasDiv    = 1,
    HasSqrt   = 1,
    HasRsqrt  = 1,
    HasExp    = 1,
    HasLog    = 1,
    HasLog1p  = 1
  };
 };
 template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
-template<> struct unpacket_traits<half2> { typedef half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
+template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const half& from) {
  return __half2half2(from);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload<half2>(const half* from) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
  return *reinterpret_cast<const half2*>(from);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu<half2>(const half* from) {
+template<> __device__ EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
  return __halves2half2(from[0], from[1]);
 }
-template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const half*  from) {
+template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half*  from) {
  return __halves2half2(from[0], from[0]);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<half>(half* to, const half2& from) {
+template<> __device__ EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
  *reinterpret_cast<half2*>(to) = from;
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<half>(half* to, const half2& from) {
+template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
  to[0] = __low2half(from);
  to[1] = __high2half(from);
 }
 template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const half* from) {
+ __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
 #if __CUDA_ARCH__ >= 350
   return __ldg((const half2*)from);
 #else
  return __halves2half2(*(from+0), *(from+1));
 #endif
 }
 template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const half* from) {
+__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
 #if __CUDA_ARCH__ >= 350
   return __halves2half2(__ldg(from+0), __ldg(from+1));
 #else
  return __halves2half2(*(from+0), *(from+1));
 #endif
 }
-template<> EIGEN_DEVICE_FUNC inline half2 pgather<half, half2>(const half* from, Index stride) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
  return __halves2half2(from[0*stride], from[1*stride]);
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<half, half2>(half* to, const half2& from, Index stride) {
+template<> __device__ EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
  to[stride*0] = __low2half(from);
  to[stride*1] = __high2half(from);
 }
-template<> EIGEN_DEVICE_FUNC inline half pfirst<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
  return __low2half(a);
 }
-template<> EIGEN_DEVICE_FUNC inline half2 pabs<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
  half2 result;
  result.x = a.x & 0x7FFF7FFF;
  return result;
 }
-EIGEN_DEVICE_FUNC inline void
+__device__ EIGEN_STRONG_INLINE void
 ptranspose(PacketBlock<half2,2>& kernel) {
-  half a1 = __low2half(kernel.packet[0]);
+  __half a1 = __low2half(kernel.packet[0]);
-  half a2 = __high2half(kernel.packet[0]);
+  __half a2 = __high2half(kernel.packet[0]);
-  half b1 = __low2half(kernel.packet[1]);
+  __half b1 = __low2half(kernel.packet[1]);
-  half b2 = __high2half(kernel.packet[1]);
+  __half b2 = __high2half(kernel.packet[1]);
  kernel.packet[0] = __halves2half2(a1, b1);
  kernel.packet[1] = __halves2half2(a2, b2);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const half& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
 #if __CUDA_ARCH__ >= 530
  return __halves2half2(a, __hadd(a, __float2half(1.0f)));
 #else
  float f = __half2float(a) + 1.0f;
  return __halves2half2(a, __float2half(f));
 #endif
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
 #if __CUDA_ARCH__ >= 530
  return __hadd2(a, b);
 #else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
  float r1 = a1 + b1;
  float r2 = a2 + b2;
  return __floats2half2_rn(r1, r2);
 #endif
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
 #if __CUDA_ARCH__ >= 530
  return __hsub2(a, b);
 #else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
  float r1 = a1 - b1;
  float r2 = a2 - b2;
  return __floats2half2_rn(r1, r2);
 #endif
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
 #if __CUDA_ARCH__ >= 530
  return __hneg2(a);
 #else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  return __floats2half2_rn(-a1, -a2);
 #endif
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
+template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
 #if __CUDA_ARCH__ >= 530
  return __hmul2(a, b);
 #else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
  float r1 = a1 * b1;
  float r2 = a2 * b2;
  return __floats2half2_rn(r1, r2);
 #endif
 }
- template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
 #if __CUDA_ARCH__ >= 530
   return __hfma2(a, b, c);
 #else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
  float c1 = __low2float(c);
  float c2 = __high2float(c);
  float r1 = a1 * b1 + c1;
  float r2 = a2 * b2 + c2;
  return __floats2half2_rn(r1, r2);
 #endif
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
@ -142,51 +203,529 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2&
  return __floats2half2_rn(r1, r2);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
-  half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
+  __half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
-  half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
+  __half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
  return __halves2half2(r1, r2);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
  float b2 = __high2float(b);
-  half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
+  __half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
-  half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
+  __half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
  return __halves2half2(r1, r2);
 }
-template<> EIGEN_DEVICE_FUNC inline half predux<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
 #if __CUDA_ARCH__ >= 530
  return __hadd(__low2half(a), __high2half(a));
 #else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 + a2)));
 #endif
 }
-template<> EIGEN_DEVICE_FUNC inline half predux_max<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
-  half first = __low2half(a);
+#if __CUDA_ARCH__ >= 530
-  half second = __high2half(a);
+  __half first = __low2half(a);
  __half second = __high2half(a);
  return __hgt(first, second) ? first : second;
 #else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  return a1 > a2 ? __low2half(a) : __high2half(a);
 #endif
 }
-template<> EIGEN_DEVICE_FUNC inline half predux_min<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
-  half first = __low2half(a);
+#if __CUDA_ARCH__ >= 530
-  half second = __high2half(a);
+  __half first = __low2half(a);
  __half second = __high2half(a);
  return __hlt(first, second) ? first : second;
 #else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  return a1 < a2 ? __low2half(a) : __high2half(a);
 #endif
 }
-template<> EIGEN_DEVICE_FUNC inline half predux_mul<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
 #if __CUDA_ARCH__ >= 530
  return __hmul(__low2half(a), __high2half(a));
 #else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 * a2)));
 #endif
 }
-} // end namespace internal
+template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = log1pf(a1);
  float r2 = log1pf(a2);
  return __floats2half2_rn(r1, r2);
 }
-} // end namespace Eigen
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
 // Logarithm of a half2 packet. This definition sits in the branch of the
 // enclosing #if that requires __CUDACC_VER__ >= 80000 and __CUDA_ARCH__ >= 530,
 // so it forwards directly to the h2log device intrinsic.
 template<>  __device__ EIGEN_STRONG_INLINE
 half2 plog<half2>(const half2& a) {
  return h2log(a);
 }
 // Exponential of a half2 packet; forwards to the h2exp device intrinsic in
 // the branch that requires __CUDACC_VER__ >= 80000 and __CUDA_ARCH__ >= 530.
 template<> __device__ EIGEN_STRONG_INLINE
 half2 pexp<half2>(const half2& a) {
  return h2exp(a);
 }
 // Square root of a half2 packet; forwards to the h2sqrt device intrinsic in
 // the branch that requires __CUDACC_VER__ >= 80000 and __CUDA_ARCH__ >= 530.
 template<> __device__ EIGEN_STRONG_INLINE
 half2 psqrt<half2>(const half2& a) {
  return h2sqrt(a);
 }
 // Reciprocal square root of a half2 packet; forwards to the h2rsqrt device
 // intrinsic in the branch that requires __CUDACC_VER__ >= 80000 and
 // __CUDA_ARCH__ >= 530.
 template<> __device__ EIGEN_STRONG_INLINE
 half2 prsqrt<half2>(const half2& a) {
  return h2rsqrt(a);
 }
 #else
 // Fallback logarithm for half2: each lane is widened to float, passed through
 // logf, and the two results are rounded back into a single half2.
 template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
  const float lo = logf(__low2float(a));
  const float hi = logf(__high2float(a));
  return __floats2half2_rn(lo, hi);
 }
 // Fallback exponential for half2: each lane is widened to float, passed
 // through expf, and the two results are rounded back into a single half2.
 template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
  const float lo = expf(__low2float(a));
  const float hi = expf(__high2float(a));
  return __floats2half2_rn(lo, hi);
 }
 // Fallback square root for half2: each lane is widened to float, passed
 // through sqrtf, and the two results are rounded back into a single half2.
 template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
  const float lo = sqrtf(__low2float(a));
  const float hi = sqrtf(__high2float(a));
  return __floats2half2_rn(lo, hi);
 }
 // Fallback reciprocal square root for half2: each lane is widened to float,
 // passed through rsqrtf, and the two results are rounded back into a half2.
 template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
  const float lo = rsqrtf(__low2float(a));
  const float hi = rsqrtf(__high2float(a));
  return __floats2half2_rn(lo, hi);
 }
 #endif
 #elif defined EIGEN_VECTORIZE_AVX
 // Packet of eight half values for the AVX code path. The values are kept as
 // raw 16-bit patterns inside one 128-bit integer register; the wrapper struct
 // gives the packet its own type, distinct from a bare __m128i.
 typedef struct {
  __m128i x;
 } Packet8h;
 // Flag Packet8h as an arithmetic type for Eigen's trait machinery.
 template<> struct is_arithmetic<Packet8h> { enum { value = true }; };
 // Packet traits for Eigen::half on the AVX path: the packet type is Packet8h
 // with 8 coefficients. Vectorizable and AlignedOnScalar are enabled, while
 // every Has* capability flag listed below is set to 0.
 template <>
 struct packet_traits<Eigen::half> : default_packet_traits {
  typedef Packet8h type;
  // There is no half-size packet for Packet8h.
  typedef Packet8h half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 8,
    HasHalfPacket = 0,
    // Arithmetic capability flags (all disabled).
    HasAdd    = 0,
    HasSub    = 0,
    HasMul    = 0,
    HasNegate = 0,
    HasAbs    = 0,
    HasAbs2   = 0,
    HasMin    = 0,
    HasMax    = 0,
    HasConj   = 0,
    HasSetLinear = 0,
    // Math-function and blend capability flags (all disabled).
    HasDiv = 0,
    HasSqrt = 0,
    HasRsqrt = 0,
    HasExp = 0,
    HasLog = 0,
    HasBlend = 0
  };
 };
 // Reverse mapping from Packet8h to its scalar type and layout constants.
 template<> struct unpacket_traits<Packet8h> {
  typedef Eigen::half type;
  typedef Packet8h half;
  enum {
    size = 8,
    alignment = Aligned16
  };
 };
 // Broadcasts the raw 16-bit pattern of `from` to all eight lanes.
 template<> EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
  Packet8h broadcast;
  broadcast.x = _mm_set1_epi16(from.x);
  return broadcast;
 }
 // Extracts lane 0 of the packet and reinterprets its 16 bits as a half.
 template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) {
  const unsigned short lane0 = static_cast<unsigned short>(_mm_extract_epi16(from.x, 0));
  return half_impl::raw_uint16_to_half(lane0);
 }
 // Aligned load of eight consecutive half values (uses _mm_load_si128).
 template<> EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) {
  const __m128i* src = reinterpret_cast<const __m128i*>(from);
  Packet8h loaded;
  loaded.x = _mm_load_si128(src);
  return loaded;
 }
 // Unaligned load of eight consecutive half values (uses _mm_loadu_si128).
 template<> EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) {
  const __m128i* src = reinterpret_cast<const __m128i*>(from);
  Packet8h loaded;
  loaded.x = _mm_loadu_si128(src);
  return loaded;
 }
 // Aligned store of the eight packed half values (uses _mm_store_si128).
 template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8h& from) {
  __m128i* dst = reinterpret_cast<__m128i*>(to);
  _mm_store_si128(dst, from.x);
 }
 // Unaligned store of the eight packed half values (uses _mm_storeu_si128).
 template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8h& from) {
  __m128i* dst = reinterpret_cast<__m128i*>(to);
  _mm_storeu_si128(dst, from.x);
 }
 // Reads two half values and replicates each four times: lanes 0-3 hold
 // from[0], lanes 4-7 hold from[1].
 template<> EIGEN_STRONG_INLINE Packet8h
 ploadquad<Packet8h>(const Eigen::half* from) {
  const unsigned short first = from[0].x;
  const unsigned short second = from[1].x;
  Packet8h quad;
  // _mm_set_epi16 takes its arguments from the highest lane down to lane 0.
  quad.x = _mm_set_epi16(second, second, second, second, first, first, first, first);
  return quad;
 }
 // Widens eight half values to eight floats. With EIGEN_HAS_FP16_C the
 // conversion is a single _mm256_cvtph_ps; otherwise the packet is spilled to
 // memory and each element is converted through Eigen::half's float conversion.
 EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) {
 #ifdef EIGEN_HAS_FP16_C
  return _mm256_cvtph_ps(a.x);
 #else
  EIGEN_ALIGN32 Eigen::half halves[8];
  pstore(halves, a);
  float widened[8];
  for (int i = 0; i < 8; ++i) {
    widened[i] = static_cast<float>(halves[i]);
  }
  // _mm256_set_ps takes its arguments from the highest lane down to lane 0.
  return _mm256_set_ps(widened[7], widened[6], widened[5], widened[4],
                       widened[3], widened[2], widened[1], widened[0]);
 #endif
 }
 // Narrows eight floats to eight half values. With EIGEN_HAS_FP16_C the
 // conversion is a single _mm256_cvtps_ph (round to nearest, exceptions
 // suppressed); otherwise the packet is spilled to memory and each element is
 // converted through the Eigen::half constructor.
 EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
 #ifdef EIGEN_HAS_FP16_C
  Packet8h narrowed;
  narrowed.x = _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
  return narrowed;
 #else
  EIGEN_ALIGN32 float lanes[8];
  pstore(lanes, a);
  unsigned short bits[8];
  for (int i = 0; i < 8; ++i) {
    bits[i] = Eigen::half(lanes[i]).x;
  }
  Packet8h narrowed;
  // _mm_set_epi16 takes its arguments from the highest lane down to lane 0.
  narrowed.x = _mm_set_epi16(bits[7], bits[6], bits[5], bits[4],
                             bits[3], bits[2], bits[1], bits[0]);
  return narrowed;
 #endif
 }
 // Conjugation is the identity for Packet8h: the input is returned unchanged.
 template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }
 // Lane-wise addition: both operands are widened to float packets, added
 // there, and the sum is narrowed back to half.
 template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
  const Packet8f lhs = half2float(a);
  const Packet8f rhs = half2float(b);
  return float2half(padd(lhs, rhs));
 }
 // Lane-wise multiplication: both operands are widened to float packets,
 // multiplied there, and the product is narrowed back to half.
 template<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
  const Packet8f lhs = half2float(a);
  const Packet8f rhs = half2float(b);
  return float2half(pmul(lhs, rhs));
 }
 // Strided gather: lane i of the result holds from[i*stride] for i in 0..7.
 template<> EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride)
 {
  unsigned short lanes[8];
  for (int i = 0; i < 8; ++i) {
    lanes[i] = from[i*stride].x;
  }
  Packet8h gathered;
  // _mm_set_epi16 takes its arguments from the highest lane down to lane 0.
  gathered.x = _mm_set_epi16(lanes[7], lanes[6], lanes[5], lanes[4],
                             lanes[3], lanes[2], lanes[1], lanes[0]);
  return gathered;
 }
 // Strided scatter: lane i of the packet is written to to[stride*i] for
 // i in 0..7. The packet is first spilled to an aligned temporary.
 template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride)
 {
  EIGEN_ALIGN32 Eigen::half lanes[8];
  pstore(lanes, from);
  for (int i = 0; i < 8; ++i) {
    to[stride*i].x = lanes[i].x;
  }
 }
 // In-place transpose of an 8x8 block of half values held in eight Packet8h
 // rows. The variable names record the contents of each intermediate, e.g.
 // a03b03 holds elements 0..3 of rows a and b interleaved.
 EIGEN_STRONG_INLINE void
 ptranspose(PacketBlock<Packet8h,8>& kernel) {
  // Name the eight input rows a..h.
  __m128i a = kernel.packet[0].x;
  __m128i b = kernel.packet[1].x;
  __m128i c = kernel.packet[2].x;
  __m128i d = kernel.packet[3].x;
  __m128i e = kernel.packet[4].x;
  __m128i f = kernel.packet[5].x;
  __m128i g = kernel.packet[6].x;
  __m128i h = kernel.packet[7].x;
  // Stage 1: interleave 16-bit lanes of adjacent row pairs (low halves, then
  // high halves).
  __m128i a03b03 = _mm_unpacklo_epi16(a, b);
  __m128i c03d03 = _mm_unpacklo_epi16(c, d);
  __m128i e03f03 = _mm_unpacklo_epi16(e, f);
  __m128i g03h03 = _mm_unpacklo_epi16(g, h);
  __m128i a47b47 = _mm_unpackhi_epi16(a, b);
  __m128i c47d47 = _mm_unpackhi_epi16(c, d);
  __m128i e47f47 = _mm_unpackhi_epi16(e, f);
  __m128i g47h47 = _mm_unpackhi_epi16(g, h);
  // Stage 2: interleave 32-bit groups so each register holds two columns of
  // four rows.
  __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
  __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
  __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
  __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
  __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
  __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
  __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
  __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);
  // Stage 3: interleave 64-bit groups so each register holds one full column
  // of all eight rows.
  __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
  __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
  __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
  __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
  __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
  __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
  __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
  __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);
  // Write the columns back as the new rows.
  kernel.packet[0].x = a0b0c0d0e0f0g0h0;
  kernel.packet[1].x = a1b1c1d1e1f1g1h1;
  kernel.packet[2].x = a2b2c2d2e2f2g2h2;
  kernel.packet[3].x = a3b3c3d3e3f3g3h3;
  kernel.packet[4].x = a4b4c4d4e4f4g4h4;
  kernel.packet[5].x = a5b5c5d5e5f5g5h5;
  kernel.packet[6].x = a6b6c6d6e6f6g6h6;
  kernel.packet[7].x = a7b7c7d7e7f7g7h7;
 }
 // Rearranges a block of four Packet8h through memory. Output packet i
 // receives element 2*i of every input packet in lanes 0-3 and element 2*i+1
 // of every input packet in lanes 4-7.
 EIGEN_STRONG_INLINE void
 ptranspose(PacketBlock<Packet8h,4>& kernel) {
  EIGEN_ALIGN32 Eigen::half in[4][8];
  for (int row = 0; row < 4; ++row) {
    pstore<Eigen::half>(in[row], kernel.packet[row]);
  }
  EIGEN_ALIGN32 Eigen::half out[4][8];
  for (int i = 0; i < 4; ++i) {
    for (int j = 0; j < 4; ++j) {
      out[i][j] = in[j][2*i];
      out[i][j+4] = in[j][2*i+1];
    }
  }
  for (int row = 0; row < 4; ++row) {
    kernel.packet[row] = pload<Packet8h>(out[row]);
  }
 }
 // Disable the following code since it's broken on too many platforms / compilers.
 //#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
 #elif 0
 // Packet of four half values stored as raw 16-bit patterns in one __m64.
 // This whole branch is currently compiled out by the enclosing `#elif 0`.
 typedef struct {
  __m64 x;
 } Packet4h;
 // Flag Packet4h as an arithmetic type for Eigen's trait machinery.
 template<> struct is_arithmetic<Packet4h> { enum { value = true }; };
 // Packet traits for Eigen::half on the (disabled) Packet4h path: the packet
 // type is Packet4h with 4 coefficients. Vectorizable and AlignedOnScalar are
 // enabled, while every Has* capability flag listed below is set to 0.
 template <>
 struct packet_traits<Eigen::half> : default_packet_traits {
  typedef Packet4h type;
  // There is no half-size packet for Packet4h.
  typedef Packet4h half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 4,
    HasHalfPacket = 0,
    // Arithmetic capability flags (all disabled).
    HasAdd    = 0,
    HasSub    = 0,
    HasMul    = 0,
    HasNegate = 0,
    HasAbs    = 0,
    HasAbs2   = 0,
    HasMin    = 0,
    HasMax    = 0,
    HasConj   = 0,
    HasSetLinear = 0,
    // Math-function and blend capability flags (all disabled).
    HasDiv = 0,
    HasSqrt = 0,
    HasRsqrt = 0,
    HasExp = 0,
    HasLog = 0,
    HasBlend = 0
  };
 };
 // Reverse mapping from Packet4h to its scalar type and layout constants.
 template<> struct unpacket_traits<Packet4h> {
  typedef Eigen::half type;
  typedef Packet4h half;
  enum {
    size = 4,
    alignment = Aligned16
  };
 };
 // Broadcasts the raw 16-bit pattern of `from` to all four lanes.
 template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) {
  Packet4h broadcast;
  broadcast.x = _mm_set1_pi16(from.x);
  return broadcast;
 }
 // Takes the low 16 bits of the packet and reinterprets them as a half.
 template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h>(const Packet4h& from) {
  const unsigned short lane0 = static_cast<unsigned short>(_mm_cvtsi64_si32(from.x));
  return half_impl::raw_uint16_to_half(lane0);
 }
 // Conjugation is the identity for Packet4h: the input is returned unchanged.
 template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; }
 // Lane-wise addition done one scalar at a time: each 16-bit lane is shifted
 // out of the 64-bit payload, turned into an Eigen::half, and added with the
 // scalar operator+.
 template<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(const Packet4h& a, const Packet4h& b) {
  const __int64_t a64 = _mm_cvtm64_si64(a.x);
  const __int64_t b64 = _mm_cvtm64_si64(b.x);
  Eigen::half sums[4];
  for (int lane = 0; lane < 4; ++lane) {
    const int shift = 16 * lane;
    const Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> shift));
    const Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> shift));
    sums[lane] = ha + hb;
  }
  Packet4h result;
  // _mm_set_pi16 takes its arguments from the highest lane down to lane 0.
  result.x = _mm_set_pi16(sums[3].x, sums[2].x, sums[1].x, sums[0].x);
  return result;
 }
 // Lane-wise multiplication done one scalar at a time: each 16-bit lane is
 // shifted out of the 64-bit payload, turned into an Eigen::half, and
 // multiplied with the scalar operator*.
 template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const Packet4h& b) {
  const __int64_t a64 = _mm_cvtm64_si64(a.x);
  const __int64_t b64 = _mm_cvtm64_si64(b.x);
  Eigen::half products[4];
  for (int lane = 0; lane < 4; ++lane) {
    const int shift = 16 * lane;
    const Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> shift));
    const Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> shift));
    products[lane] = ha * hb;
  }
  Packet4h result;
  // _mm_set_pi16 takes its arguments from the highest lane down to lane 0.
  result.x = _mm_set_pi16(products[3].x, products[2].x, products[1].x, products[0].x);
  return result;
 }
 // Loads four consecutive half values by reading them as one 64-bit integer.
 template<> EIGEN_STRONG_INLINE Packet4h pload<Packet4h>(const Eigen::half* from) {
  const __int64_t raw = *reinterpret_cast<const __int64_t*>(from);
  Packet4h loaded;
  loaded.x = _mm_cvtsi64_m64(raw);
  return loaded;
 }
 // Unaligned-load entry point for Packet4h; it performs the same 64-bit read
 // through a reinterpret_cast pointer as the aligned variant.
 template<> EIGEN_STRONG_INLINE Packet4h ploadu<Packet4h>(const Eigen::half* from) {
  const __int64_t raw = *reinterpret_cast<const __int64_t*>(from);
  Packet4h loaded;
  loaded.x = _mm_cvtsi64_m64(raw);
  return loaded;
 }
 // Stores the four packed half values by writing them as one 64-bit integer.
 template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h& from) {
  __int64_t* dst = reinterpret_cast<__int64_t*>(to);
  *dst = _mm_cvtm64_si64(from.x);
 }
 // Unaligned-store entry point for Packet4h; it performs the same 64-bit
 // write through a reinterpret_cast pointer as the aligned variant.
 template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h& from) {
  __int64_t* dst = reinterpret_cast<__int64_t*>(to);
  *dst = _mm_cvtm64_si64(from.x);
 }
 // With only four lanes, the quad load reads a single value (*from) and
 // broadcasts it to every lane via pset1.
 template<> EIGEN_STRONG_INLINE Packet4h
 ploadquad<Packet4h>(const Eigen::half* from) {
  return pset1<Packet4h>(*from);
 }
 // Strided gather: lane i of the result holds from[i*stride] for i in 0..3.
 template<> EIGEN_STRONG_INLINE Packet4h pgather<Eigen::half, Packet4h>(const Eigen::half* from, Index stride)
 {
  unsigned short lanes[4];
  for (int i = 0; i < 4; ++i) {
    lanes[i] = from[i*stride].x;
  }
  Packet4h gathered;
  // _mm_set_pi16 takes its arguments from the highest lane down to lane 0.
  gathered.x = _mm_set_pi16(lanes[3], lanes[2], lanes[1], lanes[0]);
  return gathered;
 }
 // Strided scatter: lane i (bits 16*i .. 16*i+15 of the 64-bit payload) is
 // written to to[stride*i] for i in 0..3.
 template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h>(Eigen::half* to, const Packet4h& from, Index stride)
 {
  const __int64_t bits = _mm_cvtm64_si64(from.x);
  for (int lane = 0; lane < 4; ++lane) {
    to[stride*lane].x = static_cast<unsigned short>(bits >> (16 * lane));
  }
 }
 // In-place transpose of a 4x4 block of half values held in four Packet4h
 // rows: 16-bit lanes of row pairs are interleaved first, then the 32-bit
 // groups of those intermediates are interleaved.
 EIGEN_STRONG_INLINE void
 ptranspose(PacketBlock<Packet4h,4>& kernel) {
  const __m64 row0 = kernel.packet[0].x;
  const __m64 row1 = kernel.packet[1].x;
  const __m64 row2 = kernel.packet[2].x;
  const __m64 row3 = kernel.packet[3].x;
  const __m64 lo01 = _mm_unpacklo_pi16(row0, row1);
  const __m64 lo23 = _mm_unpacklo_pi16(row2, row3);
  const __m64 hi01 = _mm_unpackhi_pi16(row0, row1);
  const __m64 hi23 = _mm_unpackhi_pi16(row2, row3);
  kernel.packet[0].x = _mm_unpacklo_pi32(lo01, lo23);
  kernel.packet[1].x = _mm_unpackhi_pi32(lo01, lo23);
  kernel.packet[2].x = _mm_unpacklo_pi32(hi01, hi23);
  kernel.packet[3].x = _mm_unpackhi_pi32(hi01, hi23);
 }
 #endif
 }
 }
 #endif // EIGEN_PACKET_MATH_HALF_CUDA_H
--- a/Eigen/src/Core/arch/CUDA/TypeCasting.h
+++ b/Eigen/src/Core/arch/CUDA/TypeCasting.h
@ -14,50 +14,48 @@ namespace Eigen {
 namespace internal {
 #if defined(EIGEN_HAS_CUDA_FP16)
 template<>
-struct scalar_cast_op<float, half> {
+struct scalar_cast_op<float, Eigen::half> {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
-  typedef half result_type;
+  typedef Eigen::half result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half operator() (const float& a) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const {
-    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+    #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
      return __float2half(a);
    #else
-      return half(a);
+      return Eigen::half(a);
    #endif
  }
 };
 template<>
-struct functor_traits<scalar_cast_op<float, half> >
+struct functor_traits<scalar_cast_op<float, Eigen::half> >
 { enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
 template<>
-struct scalar_cast_op<int, half> {
+struct scalar_cast_op<int, Eigen::half> {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
-  typedef half result_type;
+  typedef Eigen::half result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half operator() (const int& a) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const {
-    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+    #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
      return __float2half(static_cast<float>(a));
    #else
-      return half(static_cast<float>(a));
+      return Eigen::half(static_cast<float>(a));
    #endif
  }
 };
 template<>
-struct functor_traits<scalar_cast_op<int, half> >
+struct functor_traits<scalar_cast_op<int, Eigen::half> >
 { enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
 template<>
-struct scalar_cast_op<half, float> {
+struct scalar_cast_op<Eigen::half, float> {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
  typedef float result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const half& a) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const {
-    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+    #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
      return __half2float(a);
    #else
      return static_cast<float>(a);
@ -66,15 +64,15 @@ struct scalar_cast_op<half, float> {
 };
 template<>
-struct functor_traits<scalar_cast_op<half, float> >
+struct functor_traits<scalar_cast_op<Eigen::half, float> >
 { enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
 template <>
-struct type_casting_traits<half, float> {
+struct type_casting_traits<Eigen::half, float> {
  enum {
    VectorizedCast = 1,
    SrcCoeffRatio = 2,
@ -89,7 +87,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<half2, float4>(con
 }
 template <>
-struct type_casting_traits<float, half> {
+struct type_casting_traits<float, Eigen::half> {
  enum {
    VectorizedCast = 1,
    SrcCoeffRatio = 1,
@ -97,12 +95,87 @@ struct type_casting_traits<float, half> {
  };
 };
-template<> EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) {
  // Simply discard the second half of the input
-  return __float22half2_rn(make_float2(a.x, a.y));
+  return __floats2half2_rn(a.x, a.y);
 }
 #elif defined EIGEN_VECTORIZE_AVX
 // Cast traits for Eigen::half -> float on the AVX path: the cast is marked
 // as vectorized, with both coefficient ratios set to 1.
 template <>
 struct type_casting_traits<Eigen::half, float> {
  enum {
    VectorizedCast = 1,
    SrcCoeffRatio = 1,
    TgtCoeffRatio = 1
  };
 };
 // Packet cast from eight halves to eight floats; forwards to half2float.
 template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8h, Packet8f>(const Packet8h& a) {
  return half2float(a);
 }
 // Cast traits for float -> Eigen::half on the AVX path: the cast is marked
 // as vectorized, with both coefficient ratios set to 1.
 template <>
 struct type_casting_traits<float, Eigen::half> {
  enum {
    VectorizedCast = 1,
    SrcCoeffRatio = 1,
    TgtCoeffRatio = 1
  };
 };
 // Packet cast from eight floats to eight halves; forwards to float2half.
 template<> EIGEN_STRONG_INLINE Packet8h pcast<Packet8f, Packet8h>(const Packet8f& a) {
  return float2half(a);
 }
 // Disable the following code since it's broken on too many platforms / compilers.
 //#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
 #elif 0
 // Cast traits for Eigen::half -> float on the Packet4h path (currently
 // compiled out by the enclosing `#elif 0`): vectorized, both ratios set to 1.
 template <>
 struct type_casting_traits<Eigen::half, float> {
  enum {
    VectorizedCast = 1,
    SrcCoeffRatio = 1,
    TgtCoeffRatio = 1
  };
 };
 // Widens the four half values packed in a Packet4h to a Packet4f. Each 16-bit
 // lane is shifted out of the 64-bit payload, reinterpreted as an Eigen::half
 // and converted to float one scalar at a time.
 //
 // raw_uint16_to_half is spelled with its half_impl:: qualifier, as the other
 // half packet kernels spell it: the only argument is an unsigned short, so
 // argument-dependent lookup cannot find the function from Eigen::internal and
 // the unqualified call would fail to compile once this branch is enabled.
 template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4h, Packet4f>(const Packet4h& a) {
  __int64_t a64 = _mm_cvtm64_si64(a.x);
  Eigen::half h = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
  float f1 = static_cast<float>(h);
  h = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
  float f2 = static_cast<float>(h);
  h = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
  float f3 = static_cast<float>(h);
  h = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
  float f4 = static_cast<float>(h);
  // _mm_set_ps takes its arguments from the highest lane down to lane 0.
  return _mm_set_ps(f4, f3, f2, f1);
 }
 // Cast traits for float -> Eigen::half on the Packet4h path (currently
 // compiled out by the enclosing `#elif 0`): vectorized, both ratios set to 1.
 template <>
 struct type_casting_traits<float, Eigen::half> {
  enum {
    VectorizedCast = 1,
    SrcCoeffRatio = 1,
    TgtCoeffRatio = 1
  };
 };
 // Narrows four floats to four half values: the packet is spilled to an
 // aligned buffer, each float goes through the Eigen::half constructor, and
 // the resulting 16-bit patterns are repacked.
 template<> EIGEN_STRONG_INLINE Packet4h pcast<Packet4f, Packet4h>(const Packet4f& a) {
  EIGEN_ALIGN16 float lanes[4];
  pstore(lanes, a);
  unsigned short bits[4];
  for (int i = 0; i < 4; ++i) {
    bits[i] = Eigen::half(lanes[i]).x;
  }
  Packet4h packed;
  // _mm_set_pi16 takes its arguments from the highest lane down to lane 0.
  packed.x = _mm_set_pi16(bits[3], bits[2], bits[1], bits[0]);
  return packed;
 }
 #endif
 #endif
 } // end namespace internal
--- a/Eigen/src/Core/arch/Default/CMakeLists.txt
+++ b/Eigen/src/Core/arch/Default/CMakeLists.txt
@ -1,6 +0,0 @@
 FILE(GLOB Eigen_Core_arch_Default_SRCS "*.h")
 INSTALL(FILES
  ${Eigen_Core_arch_Default_SRCS}
  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/Default COMPONENT Devel
 )
--- a/Eigen/src/Core/arch/NEON/CMakeLists.txt
+++ b/Eigen/src/Core/arch/NEON/CMakeLists.txt
@ -1,6 +0,0 @@
 FILE(GLOB Eigen_Core_arch_NEON_SRCS "*.h")
 INSTALL(FILES
  ${Eigen_Core_arch_NEON_SRCS}
  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/NEON COMPONENT Devel
 )
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2010 Konstantinos Margaritis <markos@freevec.org>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@ -14,8 +15,15 @@ namespace Eigen {
 namespace internal {
-static uint32x4_t p4ui_CONJ_XOR = EIGEN_INIT_NEON_PACKET4(0x00000000, 0x80000000, 0x00000000, 0x80000000);
+inline uint32x4_t p4ui_CONJ_XOR() {
-static uint32x2_t p2ui_CONJ_XOR = EIGEN_INIT_NEON_PACKET2(0x00000000, 0x80000000);
+  static const uint32_t conj_XOR_DATA[] = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
  return vld1q_u32( conj_XOR_DATA );
 }
 // Builds the two-lane XOR mask used for conjugation: lane 0 is zero and
 // lane 1 has only its top bit (0x80000000) set. The mask is loaded from a
 // function-local constant table on every call.
 inline uint32x2_t p2ui_CONJ_XOR() {
  static const uint32_t sign_flip_mask[] = { 0x00000000, 0x80000000 };
  return vld1_u32( sign_flip_mask );
 }
 //---------- float ----------
 struct Packet2cf
@ -64,7 +72,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Pa
 template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
 {
  Packet4ui b = vreinterpretq_u32_f32(a.v);
-  return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR)));
+  return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR())));
 }
 template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
@ -80,7 +88,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con
  // Multiply the imag a with b
  v2 = vmulq_f32(v2, b.v);
  // Conjugate v2 
-  v2 = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v2), p4ui_CONJ_XOR));
+  v2 = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v2), p4ui_CONJ_XOR()));
  // Swap real/imag elements in v2.
  v2 = vrev64q_f32(v2);
  // Add and return the result
@ -195,7 +203,7 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const P
  // Multiply the imag a with b
  v2 = vmul_f32(v2, a2);
  // Conjugate v2 
-  v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), p2ui_CONJ_XOR));
+  v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), p2ui_CONJ_XOR()));
  // Swap real/imag elements in v2.
  v2 = vrev64_f32(v2);
  // Add v1, v2
@ -274,7 +282,8 @@ ptranspose(PacketBlock<Packet2cf,2>& kernel) {
 //---------- double ----------
 #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
-static uint64x2_t p2ul_CONJ_XOR = EIGEN_INIT_NEON_PACKET2(0x0, 0x8000000000000000);
+const uint64_t  p2ul_conj_XOR_DATA[] = { 0x0, 0x8000000000000000 };
 static uint64x2_t p2ul_CONJ_XOR = vld1q_u64( p2ul_conj_XOR_DATA );
 struct Packet1cd
 {
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@ -2,7 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
-// Copyright (C) 2010 Konstantinos Margaritis <markos@codex.gr>
+// Copyright (C) 2010 Konstantinos Margaritis <markos@freevec.org>
 // Heavily based on Gael's SSE version.
 //
 // This Source Code Form is subject to the terms of the Mozilla
@ -49,17 +49,6 @@ typedef uint32x4_t  Packet4ui;
 #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
  const Packet4i p4i_##NAME = pset1<Packet4i>(X)
 #if EIGEN_COMP_LLVM && !EIGEN_COMP_CLANG
  //Special treatment for Apple's llvm-gcc, its NEON packet types are unions
  #define EIGEN_INIT_NEON_PACKET2(X, Y)       {{X, Y}}
  #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {{X, Y, Z, W}}
 #else
  //Default initializer for packets
  #define EIGEN_INIT_NEON_PACKET2(X, Y)       {X, Y}
  #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {X, Y, Z, W}
 #endif
 // arm64 does have the pld instruction. If available, let's trust the __builtin_prefetch built-in function
 // which is available on LLVM and GCC (at least)
 #if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
@ -122,12 +111,14 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)   {
 template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
 {
-  Packet4f countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3);
+  const float32_t f[] = {0, 1, 2, 3};
  Packet4f countdown = vld1q_f32(f);
  return vaddq_f32(pset1<Packet4f>(a), countdown);
 }
 template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a)
 {
-  Packet4i countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3);
+  const int32_t i[] = {0, 1, 2, 3};
  Packet4i countdown = vld1q_s32(i);
  return vaddq_s32(pset1<Packet4i>(a), countdown);
 }
@ -334,22 +325,6 @@ template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
  return vcombine_s32(a_hi, a_lo);
 }
 template<size_t offset>
 struct protate_impl<offset, Packet4f>
 {
  static Packet4f run(const Packet4f& a) {
    return vextq_f32(a, a, offset);
  }
 };
 template<size_t offset>
 struct protate_impl<offset, Packet4i>
 {
  static Packet4i run(const Packet4i& a) {
    return vextq_s32(a, a, offset);
  }
 };
 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); }
 template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); }
@ -601,7 +576,8 @@ template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double&  from) { r
 template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a)
 {
-  Packet2d countdown = EIGEN_INIT_NEON_PACKET2(0, 1);
+  const double countdown_raw[] = {0.0,1.0};
  const Packet2d countdown = vld1q_f64(countdown_raw);
  return vaddq_f64(pset1<Packet2d>(a), countdown);
 }
 template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); }
@ -679,14 +655,6 @@ template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { retu
 template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); }
 template<size_t offset>
 struct protate_impl<offset, Packet2d>
 {
  static Packet2d run(const Packet2d& a) {
    return vextq_f64(a, a, offset);
  }
 };
 template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); }
 #if EIGEN_COMP_CLANG && defined(__apple_build_version__)
--- a/Eigen/src/Core/arch/SSE/CMakeLists.txt
+++ b/Eigen/src/Core/arch/SSE/CMakeLists.txt
@ -1,6 +0,0 @@
 FILE(GLOB Eigen_Core_arch_SSE_SRCS "*.h")
 INSTALL(FILES
  ${Eigen_Core_arch_SSE_SRCS}
  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/SSE COMPONENT Devel
 )
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@ -517,52 +517,10 @@ Packet2d prsqrt<Packet2d>(const Packet2d& x) {
 }
 // Hyperbolic Tangent function.
 // Doesn't do anything fancy, just a 13/6-degree rational interpolant which
 // is accurate up to a couple of ulp in the range [-9, 9], outside of which the
 // fl(tanh(x)) = +/-1.
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
-ptanh<Packet4f>(const Packet4f& _x) {
+ptanh<Packet4f>(const Packet4f& x) {
-  // Clamp the inputs to the range [-9, 9] since anything outside
+  return internal::generic_fast_tanh_float(x);
  // this range is +/-1.0f in single-precision.
  _EIGEN_DECLARE_CONST_Packet4f(plus_9, 9.0f);
  _EIGEN_DECLARE_CONST_Packet4f(minus_9, -9.0f);
  const Packet4f x = pmax(p4f_minus_9, pmin(p4f_plus_9, _x));
  // The monomial coefficients of the numerator polynomial (odd).
  _EIGEN_DECLARE_CONST_Packet4f(alpha_1, 4.89352455891786e-03f);
  _EIGEN_DECLARE_CONST_Packet4f(alpha_3, 6.37261928875436e-04f);
  _EIGEN_DECLARE_CONST_Packet4f(alpha_5, 1.48572235717979e-05f);
  _EIGEN_DECLARE_CONST_Packet4f(alpha_7, 5.12229709037114e-08f);
  _EIGEN_DECLARE_CONST_Packet4f(alpha_9, -8.60467152213735e-11f);
  _EIGEN_DECLARE_CONST_Packet4f(alpha_11, 2.00018790482477e-13f);
  _EIGEN_DECLARE_CONST_Packet4f(alpha_13, -2.76076847742355e-16f);
  // The monomial coefficients of the denominator polynomial (even).
  _EIGEN_DECLARE_CONST_Packet4f(beta_0, 4.89352518554385e-03f);
  _EIGEN_DECLARE_CONST_Packet4f(beta_2, 2.26843463243900e-03f);
  _EIGEN_DECLARE_CONST_Packet4f(beta_4, 1.18534705686654e-04f);
  _EIGEN_DECLARE_CONST_Packet4f(beta_6, 1.19825839466702e-06f);
  // Since the polynomials are odd/even, we need x^2.
  const Packet4f x2 = pmul(x, x);
  // Evaluate the numerator polynomial p.
  Packet4f p = pmadd(x2, p4f_alpha_13, p4f_alpha_11);
  p = pmadd(x2, p, p4f_alpha_9);
  p = pmadd(x2, p, p4f_alpha_7);
  p = pmadd(x2, p, p4f_alpha_5);
  p = pmadd(x2, p, p4f_alpha_3);
  p = pmadd(x2, p, p4f_alpha_1);
  p = pmul(x, p);
  // Evaluate the denominator polynomial q.
  Packet4f q = pmadd(x2, p4f_beta_6, p4f_beta_4);
  q = pmadd(x2, q, p4f_beta_2);
  q = pmadd(x2, q, p4f_beta_0);
  // Divide the numerator by the denominator.
  return pdiv(p, q);
 }
 } // end namespace internal
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@ -162,6 +162,11 @@ template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4,
 template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
 template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
 #ifndef EIGEN_VECTORIZE_AVX
 template<> struct scalar_div_cost<float,true> { enum { value = 7 }; };
 template<> struct scalar_div_cost<double,true> { enum { value = 8 }; };
 #endif
 #if EIGEN_COMP_MSVC==1500
 // Workaround MSVC 9 internal compiler error.
 // TODO: It has been detected with win64 builds (amd64), so let's check whether it also happens in 32bits+SSE mode
@ -434,30 +439,6 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
 template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
 { return _mm_shuffle_epi32(a,0x1B); }
 template<size_t offset>
 struct protate_impl<offset, Packet4f>
 {
  static Packet4f run(const Packet4f& a) {
    return vec4f_swizzle1(a, offset, (offset + 1) % 4, (offset + 2) % 4, (offset + 3) % 4);
  }
 };
 template<size_t offset>
 struct protate_impl<offset, Packet4i>
 {
  static Packet4i run(const Packet4i& a) {
    return vec4i_swizzle1(a, offset, (offset + 1) % 4, (offset + 2) % 4, (offset + 3) % 4);
  }
 };
 template<size_t offset>
 struct protate_impl<offset, Packet2d>
 {
  static Packet2d run(const Packet2d& a) {
    return vec2d_swizzle1(a, offset, (offset + 1) % 2);
  }
 };
 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
 {
  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
@ -837,6 +818,16 @@ template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, cons
 #endif
 }
 // Scalar path for pmadd with FMA to ensure consistency with vectorized path.
 #ifdef __FMA__
 template<> EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) {
  return ::fmaf(a,b,c);
 }
 template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) {
  return ::fma(a,b,c);
 }
 #endif
 } // end namespace internal
 } // end namespace Eigen
--- a/Eigen/src/Core/arch/ZVector/CMakeLists.txt
+++ b/Eigen/src/Core/arch/ZVector/CMakeLists.txt
@ -1,6 +0,0 @@
 FILE(GLOB Eigen_Core_arch_ZVector_SRCS "*.h")
 INSTALL(FILES
  ${Eigen_Core_arch_ZVector_SRCS}
  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/ZVector COMPONENT Devel
 )
--- a/Eigen/src/Core/arch/ZVector/Complex.h
+++ b/Eigen/src/Core/arch/ZVector/Complex.h
@ -57,21 +57,6 @@ template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<
 template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
 { /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
 template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
 {
  std::complex<double> EIGEN_ALIGN16 af[2];
  af[0] = from[0*stride];
  af[1] = from[1*stride];
  return pload<Packet1cd>(af);
 }
 template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride)
 {
  std::complex<double> EIGEN_ALIGN16 af[2];
  pstore<std::complex<double> >(af, from);
  to[0*stride] = af[0];
  to[1*stride] = af[1];
 }
 template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); }
 template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); }
 template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
--- a/Eigen/src/Core/functors/AssignmentFunctors.h
+++ b/Eigen/src/Core/functors/AssignmentFunctors.h
@ -18,20 +18,24 @@ namespace internal {
  * \brief Template functor for scalar/packet assignment
  *
  */
-template<typename Scalar> struct assign_op {
+template<typename DstScalar,typename SrcScalar> struct assign_op {
  EIGEN_EMPTY_STRUCT_CTOR(assign_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { a = b; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a = b; }
  template<int Alignment, typename Packet>
-  EIGEN_STRONG_INLINE void assignPacket(Scalar* a, const Packet& b) const
+  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const
-  { internal::pstoret<Scalar,Packet,Alignment>(a,b); }
+  { internal::pstoret<DstScalar,Packet,Alignment>(a,b); }
 };
-template<typename Scalar>
+
-struct functor_traits<assign_op<Scalar> > {
+// Empty specialization for void type (used by PermutationMatrix)
 template<typename DstScalar> struct assign_op<DstScalar,void> {};
 template<typename DstScalar,typename SrcScalar>
 struct functor_traits<assign_op<DstScalar,SrcScalar> > {
  enum {
-    Cost = NumTraits<Scalar>::ReadCost,
+    Cost = NumTraits<DstScalar>::ReadCost,
-    PacketAccess = packet_traits<Scalar>::Vectorizable
+    PacketAccess = is_same<DstScalar,SrcScalar>::value && packet_traits<DstScalar>::Vectorizable && packet_traits<SrcScalar>::Vectorizable
  };
 };
@ -39,20 +43,20 @@ struct functor_traits<assign_op<Scalar> > {
  * \brief Template functor for scalar/packet assignment with addition
  *
  */
-template<typename Scalar> struct add_assign_op {
+template<typename DstScalar,typename SrcScalar> struct add_assign_op {
  EIGEN_EMPTY_STRUCT_CTOR(add_assign_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { a += b; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a += b; }
  template<int Alignment, typename Packet>
-  EIGEN_STRONG_INLINE void assignPacket(Scalar* a, const Packet& b) const
+  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const
-  { internal::pstoret<Scalar,Packet,Alignment>(a,internal::padd(internal::ploadt<Packet,Alignment>(a),b)); }
+  { internal::pstoret<DstScalar,Packet,Alignment>(a,internal::padd(internal::ploadt<Packet,Alignment>(a),b)); }
 };
-template<typename Scalar>
+template<typename DstScalar,typename SrcScalar>
-struct functor_traits<add_assign_op<Scalar> > {
+struct functor_traits<add_assign_op<DstScalar,SrcScalar> > {
  enum {
-    Cost = NumTraits<Scalar>::ReadCost + NumTraits<Scalar>::AddCost,
+    Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasAdd
+    PacketAccess = is_same<DstScalar,SrcScalar>::value && packet_traits<DstScalar>::HasAdd
  };
 };
@ -60,20 +64,20 @@ struct functor_traits<add_assign_op<Scalar> > {
  * \brief Template functor for scalar/packet assignment with subtraction
  *
  */
-template<typename Scalar> struct sub_assign_op {
+template<typename DstScalar,typename SrcScalar> struct sub_assign_op {
  EIGEN_EMPTY_STRUCT_CTOR(sub_assign_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { a -= b; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a -= b; }
  template<int Alignment, typename Packet>
-  EIGEN_STRONG_INLINE void assignPacket(Scalar* a, const Packet& b) const
+  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const
-  { internal::pstoret<Scalar,Packet,Alignment>(a,internal::psub(internal::ploadt<Packet,Alignment>(a),b)); }
+  { internal::pstoret<DstScalar,Packet,Alignment>(a,internal::psub(internal::ploadt<Packet,Alignment>(a),b)); }
 };
-template<typename Scalar>
+template<typename DstScalar,typename SrcScalar>
-struct functor_traits<sub_assign_op<Scalar> > {
+struct functor_traits<sub_assign_op<DstScalar,SrcScalar> > {
  enum {
-    Cost = NumTraits<Scalar>::ReadCost + NumTraits<Scalar>::AddCost,
+    Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasSub
+    PacketAccess = is_same<DstScalar,SrcScalar>::value && packet_traits<DstScalar>::HasSub
  };
 };
@ -98,30 +102,28 @@ struct functor_traits<mul_assign_op<DstScalar,SrcScalar> > {
    PacketAccess = is_same<DstScalar,SrcScalar>::value && packet_traits<DstScalar>::HasMul
  };
 };
 template<typename DstScalar,typename SrcScalar> struct functor_is_product_like<mul_assign_op<DstScalar,SrcScalar> > { enum { ret = 1 }; };
 /** \internal
  * \brief Template functor for scalar/packet assignment with division
  *
  */
-template<typename Scalar> struct div_assign_op {
+template<typename DstScalar, typename SrcScalar=DstScalar> struct div_assign_op {
  EIGEN_EMPTY_STRUCT_CTOR(div_assign_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { a /= b; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a /= b; }
  template<int Alignment, typename Packet>
-  EIGEN_STRONG_INLINE void assignPacket(Scalar* a, const Packet& b) const
+  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const
-  { internal::pstoret<Scalar,Packet,Alignment>(a,internal::pdiv(internal::ploadt<Packet,Alignment>(a),b)); }
+  { internal::pstoret<DstScalar,Packet,Alignment>(a,internal::pdiv(internal::ploadt<Packet,Alignment>(a),b)); }
 };
-template<typename Scalar>
+template<typename DstScalar, typename SrcScalar>
-struct functor_traits<div_assign_op<Scalar> > {
+struct functor_traits<div_assign_op<DstScalar,SrcScalar> > {
  enum {
-    Cost = NumTraits<Scalar>::ReadCost + NumTraits<Scalar>::MulCost,
+    Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::MulCost,
-    PacketAccess = packet_traits<Scalar>::HasDiv
+    PacketAccess = is_same<DstScalar,SrcScalar>::value && packet_traits<DstScalar>::HasDiv
  };
 };
 /** \internal
  * \brief Template functor for scalar/packet assignment with swapping
  *
--- a/Eigen/src/Core/functors/BinaryFunctors.h
+++ b/Eigen/src/Core/functors/BinaryFunctors.h
@ -16,27 +16,43 @@ namespace internal {
 //---------- associative binary functors ----------
 template<typename Arg1, typename Arg2>
 struct binary_op_base
 {
  typedef Arg1 first_argument_type;
  typedef Arg2 second_argument_type;
 };
 /** \internal
  * \brief Template functor to compute the sum of two scalars
  *
  * \sa class CwiseBinaryOp, MatrixBase::operator+, class VectorwiseOp, DenseBase::sum()
  */
-template<typename Scalar> struct scalar_sum_op {
+template<typename LhsScalar,typename RhsScalar>
-//   typedef Scalar result_type;
+struct scalar_sum_op : binary_op_base<LhsScalar,RhsScalar>
 {
  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_sum_op>::ReturnType result_type;
 #ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
  EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a + b; }
+#else
  scalar_sum_op() {
    EIGEN_SCALAR_BINARY_OP_PLUGIN
  }
 #endif
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a + b; }
  template<typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
  { return internal::padd(a,b); }
  template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
  { return internal::predux(a); }
 };
-template<typename Scalar>
+template<typename LhsScalar,typename RhsScalar>
-struct functor_traits<scalar_sum_op<Scalar> > {
+struct functor_traits<scalar_sum_op<LhsScalar,RhsScalar> > {
  enum {
-    Cost = NumTraits<Scalar>::AddCost,
+    Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2, // rough estimate!
-    PacketAccess = packet_traits<Scalar>::HasAdd
+    PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasAdd && packet_traits<RhsScalar>::HasAdd
    // TODO vectorize mixed sum
  };
 };
@ -45,7 +61,7 @@ struct functor_traits<scalar_sum_op<Scalar> > {
  * This is required to solve Bug 426.
  * \sa DenseBase::count(), DenseBase::any(), ArrayBase::cast(), MatrixBase::cast()
  */
-template<> struct scalar_sum_op<bool> : scalar_sum_op<int> {
+template<> struct scalar_sum_op<bool,bool> : scalar_sum_op<int,int> {
  EIGEN_DEPRECATED
  scalar_sum_op() {}
 };
@ -56,13 +72,17 @@ template<> struct scalar_sum_op<bool> : scalar_sum_op<int> {
  *
  * \sa class CwiseBinaryOp, Cwise::operator*(), class VectorwiseOp, MatrixBase::redux()
  */
-template<typename LhsScalar,typename RhsScalar> struct scalar_product_op {
+template<typename LhsScalar,typename RhsScalar>
-  enum {
+struct scalar_product_op  : binary_op_base<LhsScalar,RhsScalar>
-    // TODO vectorize mixed product
+{
-    Vectorizable = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasMul && packet_traits<RhsScalar>::HasMul
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_product_op>::ReturnType result_type;
-  };
+#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
  typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;
  EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
 #else
  scalar_product_op() {
    EIGEN_SCALAR_BINARY_OP_PLUGIN
  }
 #endif
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; }
  template<typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
@ -75,7 +95,8 @@ template<typename LhsScalar,typename RhsScalar>
 struct functor_traits<scalar_product_op<LhsScalar,RhsScalar> > {
  enum {
    Cost = (NumTraits<LhsScalar>::MulCost + NumTraits<RhsScalar>::MulCost)/2, // rough estimate!
-    PacketAccess = scalar_product_op<LhsScalar,RhsScalar>::Vectorizable
+    PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasMul && packet_traits<RhsScalar>::HasMul
    // TODO vectorize mixed product
  };
 };
@ -84,13 +105,15 @@ struct functor_traits<scalar_product_op<LhsScalar,RhsScalar> > {
  *
  * This is a short cut for conj(x) * y which is needed for optimization purpose; in Eigen2 support mode, this becomes x * conj(y)
  */
-template<typename LhsScalar,typename RhsScalar> struct scalar_conj_product_op {
+template<typename LhsScalar,typename RhsScalar>
 struct scalar_conj_product_op  : binary_op_base<LhsScalar,RhsScalar>
 {
  enum {
    Conj = NumTraits<LhsScalar>::IsComplex
  };
-  typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_conj_product_op>::ReturnType result_type;
  EIGEN_EMPTY_STRUCT_CTOR(scalar_conj_product_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const
@ -113,21 +136,24 @@ struct functor_traits<scalar_conj_product_op<LhsScalar,RhsScalar> > {
  *
  * \sa class CwiseBinaryOp, MatrixBase::cwiseMin, class VectorwiseOp, MatrixBase::minCoeff()
  */
-template<typename Scalar> struct scalar_min_op {
+template<typename LhsScalar,typename RhsScalar>
 struct scalar_min_op : binary_op_base<LhsScalar,RhsScalar>
 {
  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_min_op>::ReturnType result_type;
  EIGEN_EMPTY_STRUCT_CTOR(scalar_min_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return numext::mini(a, b); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::mini(a, b); }
  template<typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
  { return internal::pmin(a,b); }
  template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
  { return internal::predux_min(a); }
 };
-template<typename Scalar>
+template<typename LhsScalar,typename RhsScalar>
-struct functor_traits<scalar_min_op<Scalar> > {
+struct functor_traits<scalar_min_op<LhsScalar,RhsScalar> > {
  enum {
-    Cost = NumTraits<Scalar>::AddCost,
+    Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
-    PacketAccess = packet_traits<Scalar>::HasMin
+    PacketAccess = internal::is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMin
  };
 };
@ -136,21 +162,24 @@ struct functor_traits<scalar_min_op<Scalar> > {
  *
  * \sa class CwiseBinaryOp, MatrixBase::cwiseMax, class VectorwiseOp, MatrixBase::maxCoeff()
  */
-template<typename Scalar> struct scalar_max_op {
+template<typename LhsScalar,typename RhsScalar>
 struct scalar_max_op  : binary_op_base<LhsScalar,RhsScalar>
 {
  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_max_op>::ReturnType result_type;
  EIGEN_EMPTY_STRUCT_CTOR(scalar_max_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return numext::maxi(a, b); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::maxi(a, b); }
  template<typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
  { return internal::pmax(a,b); }
  template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
  { return internal::predux_max(a); }
 };
-template<typename Scalar>
+template<typename LhsScalar,typename RhsScalar>
-struct functor_traits<scalar_max_op<Scalar> > {
+struct functor_traits<scalar_max_op<LhsScalar,RhsScalar> > {
  enum {
-    Cost = NumTraits<Scalar>::AddCost,
+    Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
-    PacketAccess = packet_traits<Scalar>::HasMax
+    PacketAccess = internal::is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMax
  };
 };
@ -158,56 +187,70 @@ struct functor_traits<scalar_max_op<Scalar> > {
  * \brief Template functors for comparison of two scalars
  * \todo Implement packet-comparisons
  */
-template<typename Scalar, ComparisonName cmp> struct scalar_cmp_op;
+template<typename LhsScalar, typename RhsScalar, ComparisonName cmp> struct scalar_cmp_op;
-template<typename Scalar, ComparisonName cmp>
+template<typename LhsScalar, typename RhsScalar, ComparisonName cmp>
-struct functor_traits<scalar_cmp_op<Scalar, cmp> > {
+struct functor_traits<scalar_cmp_op<LhsScalar,RhsScalar, cmp> > {
  enum {
-    Cost = NumTraits<Scalar>::AddCost,
+    Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
    PacketAccess = false
  };
 };
-template<ComparisonName Cmp, typename Scalar>
+template<ComparisonName Cmp, typename LhsScalar, typename RhsScalar>
-struct result_of<scalar_cmp_op<Scalar, Cmp>(Scalar,Scalar)> {
+struct result_of<scalar_cmp_op<LhsScalar, RhsScalar, Cmp>(LhsScalar,RhsScalar)> {
  typedef bool type;
 };
-template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_EQ> {
+template<typename LhsScalar, typename RhsScalar>
 struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_EQ> : binary_op_base<LhsScalar,RhsScalar>
 {
  typedef bool result_type;
  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a==b;}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a==b;}
 };
-template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_LT> {
+template<typename LhsScalar, typename RhsScalar>
 struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_LT> : binary_op_base<LhsScalar,RhsScalar>
 {
  typedef bool result_type;
  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a<b;}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a<b;}
 };
-template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_LE> {
+template<typename LhsScalar, typename RhsScalar>
 struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_LE> : binary_op_base<LhsScalar,RhsScalar>
 {
  typedef bool result_type;
  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a<=b;}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a<=b;}
 };
-template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_GT> {
+template<typename LhsScalar, typename RhsScalar>
 struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_GT> : binary_op_base<LhsScalar,RhsScalar>
 {
  typedef bool result_type;
  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a>b;}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a>b;}
 };
-template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_GE> {
+template<typename LhsScalar, typename RhsScalar>
 struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_GE> : binary_op_base<LhsScalar,RhsScalar>
 {
  typedef bool result_type;
  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a>=b;}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a>=b;}
 };
-template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_UNORD> {
+template<typename LhsScalar, typename RhsScalar>
 struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_UNORD> : binary_op_base<LhsScalar,RhsScalar>
 {
  typedef bool result_type;
  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return !(a<=b || b<=a);}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return !(a<=b || b<=a);}
 };
-template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_NEQ> {
+template<typename LhsScalar, typename RhsScalar>
 struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_NEQ> : binary_op_base<LhsScalar,RhsScalar>
 {
  typedef bool result_type;
  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a!=b;}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a!=b;}
 };
@ -216,7 +259,9 @@ template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_NEQ> {
  *
  * \sa MatrixBase::stableNorm(), class Redux
  */
-template<typename Scalar> struct scalar_hypot_op {
+template<typename Scalar>
 struct scalar_hypot_op<Scalar,Scalar> : binary_op_base<Scalar,Scalar>
 {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_hypot_op)
 //   typedef typename NumTraits<Scalar>::Real result_type;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& _x, const Scalar& _y) const
@ -237,12 +282,12 @@ template<typename Scalar> struct scalar_hypot_op {
  }
 };
 template<typename Scalar>
-struct functor_traits<scalar_hypot_op<Scalar> > {
+struct functor_traits<scalar_hypot_op<Scalar,Scalar> > {
  enum
  {
    Cost = 3 * NumTraits<Scalar>::AddCost +
           2 * NumTraits<Scalar>::MulCost +
-           2 * NumTraits<Scalar>::template Div<false>::Cost,
+           2 * scalar_div_cost<Scalar,false>::value,
    PacketAccess = false
  };
 };
@ -250,13 +295,24 @@ struct functor_traits<scalar_hypot_op<Scalar> > {
 /** \internal
  * \brief Template functor to compute the pow of two scalars
  */
-template<typename Scalar, typename OtherScalar> struct scalar_binary_pow_op {
+template<typename Scalar, typename Exponent>
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_binary_pow_op)
+struct scalar_pow_op  : binary_op_base<Scalar,Exponent>
 {
  typedef typename ScalarBinaryOpTraits<Scalar,Exponent,scalar_pow_op>::ReturnType result_type;
 #ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
  EIGEN_EMPTY_STRUCT_CTOR(scalar_pow_op)
 #else
  scalar_pow_op() {
    typedef Scalar LhsScalar;
    typedef Exponent RhsScalar;
    EIGEN_SCALAR_BINARY_OP_PLUGIN
  }
 #endif
  EIGEN_DEVICE_FUNC
-  inline Scalar operator() (const Scalar& a, const OtherScalar& b) const { return numext::pow(a, b); }
+  inline result_type operator() (const Scalar& a, const Exponent& b) const { return numext::pow(a, b); }
 };
-template<typename Scalar, typename OtherScalar>
+template<typename Scalar, typename Exponent>
-struct functor_traits<scalar_binary_pow_op<Scalar,OtherScalar> > {
+struct functor_traits<scalar_pow_op<Scalar,Exponent> > {
  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
 };
@ -269,18 +325,27 @@ struct functor_traits<scalar_binary_pow_op<Scalar,OtherScalar> > {
  *
  * \sa class CwiseBinaryOp, MatrixBase::operator-
  */
-template<typename Scalar> struct scalar_difference_op {
+template<typename LhsScalar,typename RhsScalar>
 struct scalar_difference_op : binary_op_base<LhsScalar,RhsScalar>
 {
  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_difference_op>::ReturnType result_type;
 #ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
  EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a - b; }
+#else
  scalar_difference_op() {
    EIGEN_SCALAR_BINARY_OP_PLUGIN
  }
 #endif
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a - b; }
  template<typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
  { return internal::psub(a,b); }
 };
-template<typename Scalar>
+template<typename LhsScalar,typename RhsScalar>
-struct functor_traits<scalar_difference_op<Scalar> > {
+struct functor_traits<scalar_difference_op<LhsScalar,RhsScalar> > {
  enum {
-    Cost = NumTraits<Scalar>::AddCost,
+    Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
-    PacketAccess = packet_traits<Scalar>::HasSub
+    PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasSub && packet_traits<RhsScalar>::HasSub
  };
 };
@ -289,13 +354,17 @@ struct functor_traits<scalar_difference_op<Scalar> > {
  *
  * \sa class CwiseBinaryOp, Cwise::operator/()
  */
-template<typename LhsScalar,typename RhsScalar> struct scalar_quotient_op {
+template<typename LhsScalar,typename RhsScalar>
-  enum {
+struct scalar_quotient_op  : binary_op_base<LhsScalar,RhsScalar>
-    // TODO vectorize mixed product
+{
-    Vectorizable = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasDiv && packet_traits<RhsScalar>::HasDiv
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_quotient_op>::ReturnType result_type;
-  };
+#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
  typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;
  EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
 #else
  scalar_quotient_op() {
    EIGEN_SCALAR_BINARY_OP_PLUGIN
  }
 #endif
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a / b; }
  template<typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
@ -305,8 +374,8 @@ template<typename LhsScalar,typename RhsScalar>
 struct functor_traits<scalar_quotient_op<LhsScalar,RhsScalar> > {
  typedef typename scalar_quotient_op<LhsScalar,RhsScalar>::result_type result_type;
  enum {
-    PacketAccess = scalar_quotient_op<LhsScalar,RhsScalar>::Vectorizable,
+    PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasDiv && packet_traits<RhsScalar>::HasDiv,
-    Cost = NumTraits<result_type>::template Div<PacketAccess>::Cost
+    Cost = scalar_div_cost<result_type,PacketAccess>::value
  };
 };
@ -360,236 +429,50 @@ template<> struct functor_traits<scalar_boolean_xor_op> {
  };
 };
 /** \internal
  * \brief Template functor to compute the incomplete gamma function igamma(a, x)
  *
  * \sa class CwiseBinaryOp, Cwise::igamma
  */
 template<typename Scalar> struct scalar_igamma_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_igamma_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const {
    using numext::igamma; return igamma(a, x);
  }
  template<typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const {
    return internal::pigammac(a, x);
  }
 };
 template<typename Scalar>
 struct functor_traits<scalar_igamma_op<Scalar> > {
  enum {
    // Guesstimate
    Cost = 20 * NumTraits<Scalar>::MulCost + 10 * NumTraits<Scalar>::AddCost,
    PacketAccess = packet_traits<Scalar>::HasIGamma
  };
 };
 /** \internal
  * \brief Template functor to compute the complementary incomplete gamma function igammac(a, x)
  *
  * \sa class CwiseBinaryOp, Cwise::igammac
  */
 template<typename Scalar> struct scalar_igammac_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_igammac_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const {
    using numext::igammac; return igammac(a, x);
  }
  template<typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const
  {
    return internal::pigammac(a, x);
  }
 };
 template<typename Scalar>
 struct functor_traits<scalar_igammac_op<Scalar> > {
  enum {
    // Guesstimate
    Cost = 20 * NumTraits<Scalar>::MulCost + 10 * NumTraits<Scalar>::AddCost,
    PacketAccess = packet_traits<Scalar>::HasIGammac
  };
 };
 //---------- binary functors bound to a constant, thus appearing as a unary functor ----------
-/** \internal
+// The following two classes permits to turn any binary functor into a unary one with one argument bound to a constant value.
-  * \brief Template functor to multiply a scalar by a fixed other one
+// They are analogues to std::binder1st/binder2nd but with the following differences:
-  *
+//  - they are compatible with packetOp
-  * \sa class CwiseUnaryOp, MatrixBase::operator*, MatrixBase::operator/
+//  - they are portable across C++ versions (the std::binder* are deprecated in C++11)
-  */
+template<typename BinaryOp> struct bind1st_op : BinaryOp {
-/* NOTE why doing the pset1() in packetOp *is* an optimization ?
+
- * indeed it seems better to declare m_other as a Packet and do the pset1() once
+  typedef typename BinaryOp::first_argument_type  first_argument_type;
- * in the constructor. However, in practice:
+  typedef typename BinaryOp::second_argument_type second_argument_type;
- *  - GCC does not like m_other as a Packet and generate a load every time it needs it
+  typedef typename BinaryOp::result_type          result_type;
- *  - on the other hand GCC is able to moves the pset1() outside the loop :)
+
- *  - simpler code ;)
+  bind1st_op(const first_argument_type &val) : m_value(val) {}
- * (ICC and gcc 4.4 seems to perform well in both cases, the issue is visible with y = a*x + b*y)
+
- */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const second_argument_type& b) const { return BinaryOp::operator()(m_value,b); }
-template<typename Scalar>
+
-struct scalar_multiple_op {
+  template<typename Packet>
-  // FIXME default copy constructors seems bugged with std::complex<>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& b) const
-  EIGEN_DEVICE_FUNC
+  { return BinaryOp::packetOp(internal::pset1<Packet>(m_value), b); }
-  EIGEN_STRONG_INLINE scalar_multiple_op(const scalar_multiple_op& other) : m_other(other.m_other) { }
+
-  EIGEN_DEVICE_FUNC
+  first_argument_type m_value;
-  EIGEN_STRONG_INLINE scalar_multiple_op(const Scalar& other) : m_other(other) { }
+};
-  EIGEN_DEVICE_FUNC
+template<typename BinaryOp> struct functor_traits<bind1st_op<BinaryOp> > : functor_traits<BinaryOp> {};
-  EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a * m_other; }
+
 template<typename BinaryOp> struct bind2nd_op : BinaryOp {
  typedef typename BinaryOp::first_argument_type  first_argument_type;
  typedef typename BinaryOp::second_argument_type second_argument_type;
  typedef typename BinaryOp::result_type          result_type;
  bind2nd_op(const second_argument_type &val) : m_value(val) {}
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const first_argument_type& a) const { return BinaryOp::operator()(a,m_value); }
  template<typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
-  { return internal::pmul(a, pset1<Packet>(m_other)); }
+  { return BinaryOp::packetOp(a,internal::pset1<Packet>(m_value)); }
-  typename add_const_on_value_type<typename NumTraits<Scalar>::Nested>::type m_other;
+
  second_argument_type m_value;
 };
-template<typename Scalar>
+template<typename BinaryOp> struct functor_traits<bind2nd_op<BinaryOp> > : functor_traits<BinaryOp> {};
 struct functor_traits<scalar_multiple_op<Scalar> >
 { enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
 template<typename Scalar1, typename Scalar2>
 struct scalar_multiple2_op {
  typedef typename scalar_product_traits<Scalar1,Scalar2>::ReturnType result_type;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_multiple2_op(const scalar_multiple2_op& other) : m_other(other.m_other) { }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_multiple2_op(const Scalar2& other) : m_other(other) { }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar1& a) const { return a * m_other; }
  typename add_const_on_value_type<typename NumTraits<Scalar2>::Nested>::type m_other;
 };
 template<typename Scalar1,typename Scalar2>
 struct functor_traits<scalar_multiple2_op<Scalar1,Scalar2> >
 { enum { Cost = NumTraits<Scalar1>::MulCost, PacketAccess = false }; };
 /** \internal
  * \brief Template functor to divide a scalar by a fixed other one
  *
  * This functor is used to implement the quotient of a matrix by
  * a scalar where the scalar type is not necessarily a floating point type.
  *
  * \sa class CwiseUnaryOp, MatrixBase::operator/
  */
 template<typename Scalar>
 struct scalar_quotient1_op {
  // FIXME default copy constructors seems bugged with std::complex<>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_quotient1_op(const scalar_quotient1_op& other) : m_other(other.m_other) { }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_quotient1_op(const Scalar& other) : m_other(other) {}
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a / m_other; }
  template <typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
  { return internal::pdiv(a, pset1<Packet>(m_other)); }
  typename add_const_on_value_type<typename NumTraits<Scalar>::Nested>::type m_other;
 };
 template<typename Scalar>
 struct functor_traits<scalar_quotient1_op<Scalar> >
 { enum { Cost = 2 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasDiv }; };
 template<typename Scalar1, typename Scalar2>
 struct scalar_quotient2_op {
  typedef typename scalar_product_traits<Scalar1,Scalar2>::ReturnType result_type;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_quotient2_op(const scalar_quotient2_op& other) : m_other(other.m_other) { }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_quotient2_op(const Scalar2& other) : m_other(other) { }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar1& a) const { return a / m_other; }
  typename add_const_on_value_type<typename NumTraits<Scalar2>::Nested>::type m_other;
 };
 template<typename Scalar1,typename Scalar2>
 struct functor_traits<scalar_quotient2_op<Scalar1,Scalar2> >
 { enum { Cost = 2 * NumTraits<Scalar1>::MulCost, PacketAccess = false }; };
 // In Eigen, any binary op (Product, CwiseBinaryOp) require the Lhs and Rhs to have the same scalar type, except for multiplication
 // where the mixing of different types is handled by scalar_product_traits
 // In particular, real * complex<real> is allowed.
 // FIXME move this to functor_traits adding a functor_default
 template<typename Functor> struct functor_is_product_like { enum { ret = 0 }; };
 template<typename LhsScalar,typename RhsScalar> struct functor_is_product_like<scalar_product_op<LhsScalar,RhsScalar> > { enum { ret = 1 }; };
 template<typename LhsScalar,typename RhsScalar> struct functor_is_product_like<scalar_conj_product_op<LhsScalar,RhsScalar> > { enum { ret = 1 }; };
 template<typename LhsScalar,typename RhsScalar> struct functor_is_product_like<scalar_quotient_op<LhsScalar,RhsScalar> > { enum { ret = 1 }; };
 /** \internal
  * \brief Template functor to add a scalar to a fixed other one
  * \sa class CwiseUnaryOp, Array::operator+
  */
 /* If you wonder why doing the pset1() in packetOp() is an optimization check scalar_multiple_op */
 template<typename Scalar>
 struct scalar_add_op {
  // FIXME default copy constructors seems bugged with std::complex<>
  EIGEN_DEVICE_FUNC inline scalar_add_op(const scalar_add_op& other) : m_other(other.m_other) { }
  EIGEN_DEVICE_FUNC inline scalar_add_op(const Scalar& other) : m_other(other) { }
  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a + m_other; }
  template <typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
  { return internal::padd(a, pset1<Packet>(m_other)); }
  const Scalar m_other;
 };
 template<typename Scalar>
 struct functor_traits<scalar_add_op<Scalar> >
 { enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasAdd }; };
 /** \internal
  * \brief Template functor to subtract a fixed scalar to another one
  * \sa class CwiseUnaryOp, Array::operator-, struct scalar_add_op, struct scalar_rsub_op
  */
 template<typename Scalar>
 struct scalar_sub_op {
  EIGEN_DEVICE_FUNC inline scalar_sub_op(const scalar_sub_op& other) : m_other(other.m_other) { }
  EIGEN_DEVICE_FUNC inline scalar_sub_op(const Scalar& other) : m_other(other) { }
  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a - m_other; }
  template <typename Packet>
  EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
  { return internal::psub(a, pset1<Packet>(m_other)); }
  const Scalar m_other;
 };
 template<typename Scalar>
 struct functor_traits<scalar_sub_op<Scalar> >
 { enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasAdd }; };
 /** \internal
  * \brief Template functor to subtract a scalar to fixed another one
  * \sa class CwiseUnaryOp, Array::operator-, struct scalar_add_op, struct scalar_sub_op
  */
 template<typename Scalar>
 struct scalar_rsub_op {
  EIGEN_DEVICE_FUNC inline scalar_rsub_op(const scalar_rsub_op& other) : m_other(other.m_other) { }
  EIGEN_DEVICE_FUNC inline scalar_rsub_op(const Scalar& other) : m_other(other) { }
  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return m_other - a; }
  template <typename Packet>
  EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
  { return internal::psub(pset1<Packet>(m_other), a); }
  const Scalar m_other;
 };
 template<typename Scalar>
 struct functor_traits<scalar_rsub_op<Scalar> >
 { enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasAdd }; };
 /** \internal
  * \brief Template functor to raise a scalar to a power
  * \sa class CwiseUnaryOp, Cwise::pow
  */
 template<typename Scalar>
 struct scalar_pow_op {
  // FIXME default copy constructors seems bugged with std::complex<>
  EIGEN_DEVICE_FUNC inline scalar_pow_op(const scalar_pow_op& other) : m_exponent(other.m_exponent) { }
  EIGEN_DEVICE_FUNC inline scalar_pow_op(const Scalar& exponent) : m_exponent(exponent) {}
  EIGEN_DEVICE_FUNC
  inline Scalar operator() (const Scalar& a) const { return numext::pow(a, m_exponent); }
  const Scalar m_exponent;
 };
 template<typename Scalar>
 struct functor_traits<scalar_pow_op<Scalar> >
 { enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false }; };
 /** \internal
  * \brief Template functor to compute the quotient between a scalar and array entries.
  * \sa class CwiseUnaryOp, Cwise::inverse()
  */
 template<typename Scalar>
 struct scalar_inverse_mult_op {
  EIGEN_DEVICE_FUNC scalar_inverse_mult_op(const Scalar& other) : m_other(other) {}
  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return m_other / a; }
  template<typename Packet>
  EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
  { return internal::pdiv(pset1<Packet>(m_other),a); }
  Scalar m_other;
 };
 template<typename Scalar>
 struct functor_traits<scalar_inverse_mult_op<Scalar> >
 { enum { PacketAccess = packet_traits<Scalar>::HasDiv, Cost = NumTraits<Scalar>::template Div<PacketAccess>::Cost }; };
 } // end namespace internal
--- a/Eigen/src/Core/functors/CMakeLists.txt
+++ b/Eigen/src/Core/functors/CMakeLists.txt
@ -1,6 +0,0 @@
 FILE(GLOB Eigen_Core_Functor_SRCS "*.h")
 INSTALL(FILES
  ${Eigen_Core_Functor_SRCS}
  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/functors COMPONENT Devel
  )
--- a/Eigen/src/Core/functors/NullaryFunctors.h
+++ b/Eigen/src/Core/functors/NullaryFunctors.h
@ -18,20 +18,20 @@ template<typename Scalar>
 struct scalar_constant_op {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_constant_op(const scalar_constant_op& other) : m_other(other.m_other) { }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_constant_op(const Scalar& other) : m_other(other) { }
-  template<typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() () const { return m_other; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index, Index = 0) const { return m_other; }
+  template<typename PacketType>
-  template<typename Index, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetOp() const { return internal::pset1<PacketType>(m_other); }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetOp(Index, Index = 0) const { return internal::pset1<PacketType>(m_other); }
  const Scalar m_other;
 };
 template<typename Scalar>
 struct functor_traits<scalar_constant_op<Scalar> >
-{ enum { Cost = 1, PacketAccess = packet_traits<Scalar>::Vectorizable, IsRepeatable = true }; };
+{ enum { Cost = 0 /* as the constant value should be loaded in register only once for the whole expression */,
         PacketAccess = packet_traits<Scalar>::Vectorizable, IsRepeatable = true }; };
 template<typename Scalar> struct scalar_identity_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_identity_op)
-  template<typename Index>
+  template<typename IndexType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index row, Index col) const { return row==col ? Scalar(1) : Scalar(0); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType row, IndexType col) const { return row==col ? Scalar(1) : Scalar(0); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_identity_op<Scalar> >
@ -55,15 +55,15 @@ struct linspaced_op_impl<Scalar,Packet,/*RandomAccess*/false,/*IsInteger*/false>
    m_packetStep(pset1<Packet>(unpacket_traits<Packet>::size*m_step)),
    m_base(padd(pset1<Packet>(low), pmul(pset1<Packet>(m_step),plset<Packet>(-unpacket_traits<Packet>::size)))) {}
-  template<typename Index>
+  template<typename IndexType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const
  { 
    m_base = padd(m_base, pset1<Packet>(m_step));
    return m_low+Scalar(i)*m_step; 
  }
-  template<typename Index>
+  template<typename IndexType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(Index) const { return m_base = padd(m_base,m_packetStep); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType) const { return m_base = padd(m_base,m_packetStep); }
  const Scalar m_low;
  const Scalar m_step;
@ -81,11 +81,11 @@ struct linspaced_op_impl<Scalar,Packet,/*RandomAccess*/true,/*IsInteger*/false>
    m_low(low), m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)),
    m_lowPacket(pset1<Packet>(m_low)), m_stepPacket(pset1<Packet>(m_step)), m_interPacket(plset<Packet>(0)) {}
-  template<typename Index>
+  template<typename IndexType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return m_low+i*m_step; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const { return m_low+i*m_step; }
-  template<typename Index>
+  template<typename IndexType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(Index i) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const
  { return internal::padd(m_lowPacket, pmul(m_stepPacket, padd(pset1<Packet>(Scalar(i)),m_interPacket))); }
  const Scalar m_low;
@ -99,24 +99,24 @@ template <typename Scalar, typename Packet>
 struct linspaced_op_impl<Scalar,Packet,/*RandomAccess*/true,/*IsInteger*/true>
 {
  linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
-    m_low(low), m_length(high-low), m_divisor(num_steps==1?1:num_steps-1), m_interPacket(plset<Packet>(0))
+    m_low(low), m_length(high-low), m_divisor(convert_index<Scalar>(num_steps==1?1:num_steps-1)), m_interPacket(plset<Packet>(0))
  {}
-  template<typename Index>
+  template<typename IndexType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  const Scalar operator() (Index i) const {
+  const Scalar operator() (IndexType i) const {
    return m_low + (m_length*Scalar(i))/m_divisor;
  }
-  template<typename Index>
+  template<typename IndexType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  const Packet packetOp(Index i) const {
+  const Packet packetOp(IndexType i) const {
    return internal::padd(pset1<Packet>(m_low), pdiv(pmul(pset1<Packet>(m_length), padd(pset1<Packet>(Scalar(i)),m_interPacket)),
                                                     pset1<Packet>(m_divisor))); }
  const Scalar m_low;
  const Scalar m_length;
-  const Index  m_divisor;
+  const Scalar  m_divisor;
  const Packet m_interPacket;
 };
@ -142,29 +142,11 @@ template <typename Scalar, typename PacketType, bool RandomAccess> struct linspa
    : impl((num_steps==1 ? high : low),high,num_steps)
  {}
-  template<typename Index>
+  template<typename IndexType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return impl(i); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const { return impl(i); }
-  // We need this function when assigning e.g. a RowVectorXd to a MatrixXd since
+  template<typename Packet,typename IndexType>
-  // there row==0 and col is used for the actual iteration.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const { return impl.packetOp(i); }
  template<typename Index>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index row, Index col) const 
  {
    eigen_assert(col==0 || row==0);
    return impl(col + row);
  }
  template<typename Index, typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(Index i) const { return impl.packetOp(i); }
  // We need this function when assigning e.g. a RowVectorXd to a MatrixXd since
  // there row==0 and col is used for the actual iteration.
  template<typename Index, typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(Index row, Index col) const
  {
    eigen_assert(col==0 || row==0);
    return impl.packetOp(col + row);
  }
  // This proxy object handles the actual required temporaries, the different
  // implementations (random vs. sequential access) as well as the
@ -174,11 +156,11 @@ template <typename Scalar, typename PacketType, bool RandomAccess> struct linspa
  const linspaced_op_impl<Scalar,PacketType,(NumTraits<Scalar>::IsInteger?true:RandomAccess),NumTraits<Scalar>::IsInteger> impl;
 };
-// all functors allow linear access, except scalar_identity_op. So we fix here a quick meta
+// Linear access is automatically determined from the operator() prototypes available for the given functor.
-// to indicate whether a functor allows linear access, just always answering 'yes' except for
+// If it exposes an operator()(i,j), then we assume the i and j coefficients are required independently
-// scalar_identity_op.
+// and linear access is not possible. In all other cases, linear access is enabled.
-template<typename Functor> struct functor_has_linear_access { enum { ret = 1 }; };
+// Users should not have to deal with this struture.
-template<typename Scalar> struct functor_has_linear_access<scalar_identity_op<Scalar> > { enum { ret = 0 }; };
+template<typename Functor> struct functor_has_linear_access { enum { ret = !has_binary_operator<Functor>::value }; };
 } // end namespace internal
--- a/Eigen/src/Core/functors/TernaryFunctors.h
+++ b/Eigen/src/Core/functors/TernaryFunctors.h
@ -0,0 +1,25 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
 // Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #ifndef EIGEN_TERNARY_FUNCTORS_H
 #define EIGEN_TERNARY_FUNCTORS_H
 namespace Eigen {
 namespace internal {
 //---------- associative ternary functors ----------
 } // end namespace internal
 } // end namespace Eigen
 #endif // EIGEN_TERNARY_FUNCTORS_H
--- a/Eigen/src/Core/functors/UnaryFunctors.h
+++ b/Eigen/src/Core/functors/UnaryFunctors.h
@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@ -248,7 +248,7 @@ struct functor_traits<scalar_exp_op<Scalar> > {
     // double: 7 pmadd, 5 pmul, 3 padd/psub, 1 div,  13 other
     : (14 * NumTraits<Scalar>::AddCost +
        6 * NumTraits<Scalar>::MulCost +
-        NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost))
+        scalar_div_cost<Scalar,packet_traits<Scalar>::HasDiv>::value))
 #else
    Cost =
    (sizeof(Scalar) == 4
@ -257,7 +257,7 @@ struct functor_traits<scalar_exp_op<Scalar> > {
     // double: 7 pmadd, 5 pmul, 3 padd/psub, 1 div,  13 other
     : (23 * NumTraits<Scalar>::AddCost +
        12 * NumTraits<Scalar>::MulCost +
-        NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost))
+        scalar_div_cost<Scalar,packet_traits<Scalar>::HasDiv>::value))
 #endif
  };
 };
@ -266,7 +266,7 @@ struct functor_traits<scalar_exp_op<Scalar> > {
  *
  * \brief Template functor to compute the logarithm of a scalar
  *
-  * \sa class CwiseUnaryOp, Cwise::log()
+  * \sa class CwiseUnaryOp, ArrayBase::log()
  */
 template<typename Scalar> struct scalar_log_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_log_op)
@ -293,6 +293,26 @@ struct functor_traits<scalar_log_op<Scalar> > {
  };
 };
 /** \internal
  *
  * \brief Template functor to compute the logarithm of 1 plus a scalar value
  *
  * \sa class CwiseUnaryOp, ArrayBase::log1p()
  */
 template<typename Scalar> struct scalar_log1p_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_log1p_op)
  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::log1p(a); }
  template <typename Packet>
  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog1p(a); }
 };
 template <typename Scalar>
 struct functor_traits<scalar_log1p_op<Scalar> > {
  enum {
    PacketAccess = packet_traits<Scalar>::HasLog1p,
    Cost = functor_traits<scalar_log_op<Scalar> >::Cost // TODO measure cost of log1p
  };
 };
 /** \internal
  *
  * \brief Template functor to compute the base-10 logarithm of a scalar
@ -452,142 +472,6 @@ struct functor_traits<scalar_asin_op<Scalar> >
 };
 /** \internal
 * \brief Template functor to compute the natural log of the absolute
 * value of Gamma of a scalar
 * \sa class CwiseUnaryOp, Cwise::lgamma()
 */
 template<typename Scalar> struct scalar_lgamma_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_lgamma_op)
  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
    using numext::lgamma; return lgamma(a);
  }
  typedef typename packet_traits<Scalar>::type Packet;
  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plgamma(a); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_lgamma_op<Scalar> >
 {
  enum {
    // Guesstimate
    Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
    PacketAccess = packet_traits<Scalar>::HasLGamma
  };
 };
 /** \internal
 * \brief Template functor to compute psi, the derivative of lgamma of a scalar.
 * \sa class CwiseUnaryOp, Cwise::digamma()
 */
 template<typename Scalar> struct scalar_digamma_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_digamma_op)
  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
    using numext::digamma; return digamma(a);
  }
  typedef typename packet_traits<Scalar>::type Packet;
  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pdigamma(a); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_digamma_op<Scalar> >
 {
  enum {
    // Guesstimate
    Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
    PacketAccess = packet_traits<Scalar>::HasDiGamma
  };
 };
 /** \internal
 * \brief Template functor to compute the Riemann Zeta function of two arguments.
 * \sa class CwiseUnaryOp, Cwise::zeta()
 */
 template<typename Scalar> struct scalar_zeta_op {
    EIGEN_EMPTY_STRUCT_CTOR(scalar_zeta_op)
    EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& x, const Scalar& q) const {
        using numext::zeta; return zeta(x, q);
    }
    typedef typename packet_traits<Scalar>::type Packet;
    EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x, const Packet& q) const { return internal::pzeta(x, q); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_zeta_op<Scalar> >
 {
    enum {
        // Guesstimate
        Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
        PacketAccess = packet_traits<Scalar>::HasZeta
    };
 };
 /** \internal
 * \brief Template functor to compute the polygamma function.
 * \sa class CwiseUnaryOp, Cwise::polygamma()
 */
 template<typename Scalar> struct scalar_polygamma_op {
    EIGEN_EMPTY_STRUCT_CTOR(scalar_polygamma_op)
    EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& n, const Scalar& x) const {
        using numext::polygamma; return polygamma(n, x);
    }
    typedef typename packet_traits<Scalar>::type Packet;
    EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& n, const Packet& x) const { return internal::ppolygamma(n, x); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_polygamma_op<Scalar> >
 {
    enum {
        // Guesstimate
        Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
        PacketAccess = packet_traits<Scalar>::HasPolygamma
    };
 };
 /** \internal
 * \brief Template functor to compute the Gauss error function of a
 * scalar
 * \sa class CwiseUnaryOp, Cwise::erf()
 */
 template<typename Scalar> struct scalar_erf_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_erf_op)
  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
    using numext::erf; return erf(a);
  }
  typedef typename packet_traits<Scalar>::type Packet;
  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::perf(a); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_erf_op<Scalar> >
 {
  enum {
    // Guesstimate
    Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
    PacketAccess = packet_traits<Scalar>::HasErf
  };
 };
 /** \internal
 * \brief Template functor to compute the Complementary Error Function
 * of a scalar
 * \sa class CwiseUnaryOp, Cwise::erfc()
 */
 template<typename Scalar> struct scalar_erfc_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_erfc_op)
  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
    using numext::erfc; return erfc(a);
  }
  typedef typename packet_traits<Scalar>::type Packet;
  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::perfc(a); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_erfc_op<Scalar> >
 {
  enum {
    // Guesstimate
    Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
    PacketAccess = packet_traits<Scalar>::HasErfc
  };
 };
 /** \internal
  * \brief Template functor to compute the atan of a scalar
  * \sa class CwiseUnaryOp, ArrayBase::atan()
@ -607,38 +491,39 @@ struct functor_traits<scalar_atan_op<Scalar> >
  };
 };
 /** \internal
  * \brief Template functor to compute the tanh of a scalar
  * \sa class CwiseUnaryOp, ArrayBase::tanh()
  */
-template<typename Scalar> struct scalar_tanh_op {
+template <typename Scalar>
 struct scalar_tanh_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_tanh_op)
  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::tanh(a); }
  template <typename Packet>
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::ptanh(a); }
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x) const { return ptanh(x); }
 };
 template <typename Scalar>
-struct functor_traits<scalar_tanh_op<Scalar> >
+struct functor_traits<scalar_tanh_op<Scalar> > {
 {
  enum {
    PacketAccess = packet_traits<Scalar>::HasTanh,
-    Cost =
+    Cost = ( (EIGEN_FAST_MATH && is_same<Scalar,float>::value)
    (PacketAccess
 // The following numbers are based on the AVX implementation,
 #ifdef EIGEN_VECTORIZE_FMA
                // Haswell can issue 2 add/mul/madd per cycle.
                // 9 pmadd, 2 pmul, 1 div, 2 other
-     ? (2 * NumTraits<Scalar>::AddCost + 6 * NumTraits<Scalar>::MulCost +
+                ? (2 * NumTraits<Scalar>::AddCost +
-     NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost)
+                   6 * NumTraits<Scalar>::MulCost +
                   scalar_div_cost<Scalar,packet_traits<Scalar>::HasDiv>::value)
 #else
                ? (11 * NumTraits<Scalar>::AddCost +
                   11 * NumTraits<Scalar>::MulCost +
-        NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost)
+                   scalar_div_cost<Scalar,packet_traits<Scalar>::HasDiv>::value)
 #endif
                // This number assumes a naive implementation of tanh
-     : (6 * NumTraits<Scalar>::AddCost + 3 * NumTraits<Scalar>::MulCost +
+                : (6 * NumTraits<Scalar>::AddCost +
-        2 * NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost +
+                   3 * NumTraits<Scalar>::MulCost +
                   2 * scalar_div_cost<Scalar,packet_traits<Scalar>::HasDiv>::value +
                   functor_traits<scalar_exp_op<Scalar> >::Cost))
  };
 };
@ -880,9 +765,9 @@ struct scalar_sign_op<Scalar,true> {
  {
    typedef typename NumTraits<Scalar>::Real real_type;
    real_type aa = numext::abs(a);
-    if (aa==0)
+    if (aa==real_type(0))
      return Scalar(0);
-    aa = 1./aa;
+    aa = real_type(1)/aa;
    return Scalar(real(a)*aa, imag(a)*aa );
  }
  //TODO
--- a/Eigen/src/Core/products/CMakeLists.txt
+++ b/Eigen/src/Core/products/CMakeLists.txt
@ -1,6 +0,0 @@
 # Collect every file matching *.h (a relative glob, resolved against the
 # current source directory when CMake configures) into Eigen_Core_Product_SRCS.
 FILE(GLOB Eigen_Core_Product_SRCS "*.h")
 # Install the collected headers under
 # ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/products as part of the Devel component.
 INSTALL(FILES
  ${Eigen_Core_Product_SRCS}
  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/products COMPONENT Devel
  )
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@ -299,16 +299,6 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads
  if (!useSpecificBlockingSizes(k, m, n)) {
    evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
  }
  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
  enum {
    kr = 8,
    mr = Traits::mr,
    nr = Traits::nr
  };
  if (k > kr) k -= k % kr;
  if (m > mr) m -= m % mr;
  if (n > nr) n -= n % nr;
 }
 template<typename LhsScalar, typename RhsScalar, typename Index>
@ -363,7 +353,7 @@ class gebp_traits
 public:
  typedef _LhsScalar LhsScalar;
  typedef _RhsScalar RhsScalar;
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
  enum {
    ConjLhs = _ConjLhs,
@ -444,15 +434,16 @@ public:
  template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const
  {
    conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
    // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
    // let gcc allocate the register in which to store the result of the pmul
    // (in the case where there is no FMA) gcc fails to figure out how to avoid
    // spilling registers.
 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    EIGEN_UNUSED_VARIABLE(tmp);
-    c = pmadd(a,b,c);
+    c = cj.pmadd(a,b,c);
 #else
-    tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
+    tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp);
 #endif
  }
@ -467,9 +458,6 @@ public:
    r = pmadd(c,alpha,r);
  }
 protected:
 //   conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
 //   conj_helper<LhsPacket,RhsPacket,ConjLhs,ConjRhs> pcj;
 };
 template<typename RealScalar, bool _ConjLhs>
@ -478,7 +466,7 @@ class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
 public:
  typedef std::complex<RealScalar> LhsScalar;
  typedef RealScalar RhsScalar;
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
  enum {
    ConjLhs = _ConjLhs,
@ -860,80 +848,6 @@ protected:
  conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
 };
 // helper for the rotating kernel below
 // Primary template of the helper used by the rotating kernel. The comment
 // inside labels it the default, non-rotating implementation; the
 // <GebpKernel, true> specialization provides the rotating variant.
 // UseRotatingKernel defaults to GebpKernel::UseRotatingKernel.
 template <typename GebpKernel, bool UseRotatingKernel = GebpKernel::UseRotatingKernel>
 struct PossiblyRotatingKernelHelper
 {
  // default implementation, not rotating
  typedef typename GebpKernel::Traits Traits;
  typedef typename Traits::RhsScalar RhsScalar;
  typedef typename Traits::RhsPacket RhsPacket;
  typedef typename Traits::AccPacket AccPacket;
  // Held by reference: the traits object given to the constructor is not copied.
  const Traits& traits;
  PossiblyRotatingKernelHelper(const Traits& t) : traits(t) {}
  // Loads one rhs packet into 'to' by calling traits.loadRhs on the address
  // from + (Index+4*K)*Traits::RhsProgress. K and Index are compile-time
  // constants chosen by the caller.
  template <size_t K, size_t Index>
  void loadOrRotateRhs(RhsPacket& to, const RhsScalar* from) const
  {
    traits.loadRhs(from + (Index+4*K)*Traits::RhsProgress, to);
  }
  // Intentionally empty: in the non-rotating case the four accumulators are
  // left untouched.
  void unrotateResult(AccPacket&,
                      AccPacket&,
                      AccPacket&,
                      AccPacket&)
  {
  }
 };
 // rotating implementation
 // Specialization selected when the second template argument is true.
 // Instead of loading a fresh rhs packet for every step it loads once and then
 // rotates the packet in place; unrotateResult() post-processes the
 // accumulators afterwards.
 template <typename GebpKernel>
 struct PossiblyRotatingKernelHelper<GebpKernel, true>
 {
  typedef typename GebpKernel::Traits Traits;
  typedef typename Traits::RhsScalar RhsScalar;
  typedef typename Traits::RhsPacket RhsPacket;
  typedef typename Traits::AccPacket AccPacket;
  // Held by reference: the traits object given to the constructor is not copied.
  const Traits& traits;
  PossiblyRotatingKernelHelper(const Traits& t) : traits(t) {}
  // Index == 0: loads a packet with pload from from + 4*K*Traits::RhsProgress.
  // Any other Index: ignores 'from' and replaces 'to' with protate<1>(to).
  // Index is a template constant, so the branch condition is known at compile time.
  template <size_t K, size_t Index>
  void loadOrRotateRhs(RhsPacket& to, const RhsScalar* from) const
  {
    if (Index == 0) {
      to = pload<RhsPacket>(from + 4*K*Traits::RhsProgress);
    } else {
      EIGEN_ASM_COMMENT("Do not reorder code, we're very tight on registers");
      to = protate<1>(to);
    }
  }
  // Rewrites the four accumulators in place: they are copied into a
  // PacketBlock, transposed, packets 3, 2 and 1 are rotated by 1, 2 and 3
  // respectively (packet 0 is left as is), transposed again and copied back.
  // NOTE(review): presumably this undoes the element permutation introduced by
  // the rotations in loadOrRotateRhs -- confirm against the calling kernel.
  void unrotateResult(AccPacket& res0,
                      AccPacket& res1,
                      AccPacket& res2,
                      AccPacket& res3)
  {
    PacketBlock<AccPacket> resblock;
    resblock.packet[0] = res0;
    resblock.packet[1] = res1;
    resblock.packet[2] = res2;
    resblock.packet[3] = res3;
    ptranspose(resblock);
    resblock.packet[3] = protate<1>(resblock.packet[3]);
    resblock.packet[2] = protate<2>(resblock.packet[2]);
    resblock.packet[1] = protate<3>(resblock.packet[1]);
    ptranspose(resblock);
    res0 = resblock.packet[0];
    res1 = resblock.packet[1];
    res2 = resblock.packet[2];
    res3 = resblock.packet[3];
  }
 };
 /* optimized GEneral packed Block * packed Panel product kernel
 *
 * Mixing type logic: C += A * B
@ -967,16 +881,6 @@ struct gebp_kernel
    ResPacketSize = Traits::ResPacketSize
  };
  static const bool UseRotatingKernel =
    EIGEN_ARCH_ARM &&
    internal::is_same<LhsScalar, float>::value &&
    internal::is_same<RhsScalar, float>::value &&
    internal::is_same<ResScalar, float>::value &&
    Traits::LhsPacketSize == 4 &&
    Traits::RhsPacketSize == 4 &&
    Traits::ResPacketSize == 4;
  EIGEN_DONT_INLINE
  void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
                  Index rows, Index depth, Index cols, ResScalar alpha,
@ -1010,8 +914,6 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
    // Usually makes sense only with FMA
    if(mr>=3*Traits::LhsProgress)
    {
      PossiblyRotatingKernelHelper<gebp_kernel> possiblyRotatingKernelHelper(traits);
      // Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x depth)
      // and on each largest micro vertical panel of the rhs (depth * nr).
      // Blocking sizes, i.e., 'depth' has been computed so that the micro horizontal panel of the lhs fit in L1.
@ -1074,19 +976,19 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
              traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0);  \
              traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1);  \
              traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2);  \
-              possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 0>(B_0, blB); \
+              traits.loadRhs(blB + (0+4*K)*Traits::RhsProgress, B_0); \
              traits.madd(A0, B_0, C0, T0); \
              traits.madd(A1, B_0, C4, T0); \
              traits.madd(A2, B_0, C8, B_0); \
-              possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 1>(B_0, blB); \
+              traits.loadRhs(blB + (1+4*K)*Traits::RhsProgress, B_0); \
              traits.madd(A0, B_0, C1, T0); \
              traits.madd(A1, B_0, C5, T0); \
              traits.madd(A2, B_0, C9, B_0); \
-              possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 2>(B_0, blB); \
+              traits.loadRhs(blB + (2+4*K)*Traits::RhsProgress, B_0); \
              traits.madd(A0, B_0, C2,  T0); \
              traits.madd(A1, B_0, C6,  T0); \
              traits.madd(A2, B_0, C10, B_0); \
-              possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 3>(B_0, blB); \
+              traits.loadRhs(blB + (3+4*K)*Traits::RhsProgress, B_0); \
              traits.madd(A0, B_0, C3 , T0); \
              traits.madd(A1, B_0, C7,  T0); \
              traits.madd(A2, B_0, C11, B_0); \
@ -1120,10 +1022,6 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
 #undef EIGEN_GEBP_ONESTEP
          possiblyRotatingKernelHelper.unrotateResult(C0, C1, C2, C3);
          possiblyRotatingKernelHelper.unrotateResult(C4, C5, C6, C7);
          possiblyRotatingKernelHelper.unrotateResult(C8, C9, C10, C11);
          ResPacket R0, R1, R2;
          ResPacket alphav = pset1<ResPacket>(alpha);
@ -1625,9 +1523,13 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
          prefetch(&blA[0]);
          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
-          // NOTE The following piece of code doesn't work for 512 bit registers,
+          // The following piece of code won't work for 512 bit registers
-          // so we don't call it for registers that contain more than 8 values.
+          // Moreover, if LhsProgress==8 it assumes that there is a half packet of the same size
-          if( ((SwappedTraits::LhsProgress % 4)==0) && (SwappedTraits::LhsProgress <= 8))
+          // as nr (which is currently 4) for the return type.
          typedef typename unpacket_traits<SResPacket>::half SResPacketHalf;
          if ((SwappedTraits::LhsProgress % 4) == 0 &&
              (SwappedTraits::LhsProgress <= 8) &&
              (SwappedTraits::LhsProgress!=8 || unpacket_traits<SResPacketHalf>::size==nr))
          {
            SAccPacket C0, C1, C2, C3;
            straits.initAcc(C0);
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@ -25,7 +25,7 @@ struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLh
 {
  typedef gebp_traits<RhsScalar,LhsScalar> Traits;
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
  static EIGEN_STRONG_INLINE void run(
    Index rows, Index cols, Index depth,
    const LhsScalar* lhs, Index lhsStride,
@ -55,7 +55,7 @@ struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLh
 typedef gebp_traits<LhsScalar,RhsScalar> Traits;
-typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
 static void run(Index rows, Index cols, Index depth,
  const LhsScalar* _lhs, Index lhsStride,
  const RhsScalar* _rhs, Index rhsStride,
@ -309,8 +309,8 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
      this->m_blockA = m_staticA;
      this->m_blockB = m_staticB;
 #else
-      this->m_blockA = reinterpret_cast<LhsScalar*>((std::size_t(m_staticA) + (EIGEN_DEFAULT_ALIGN_BYTES-1)) & ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1));
+      this->m_blockA = reinterpret_cast<LhsScalar*>((internal::UIntPtr(m_staticA) + (EIGEN_DEFAULT_ALIGN_BYTES-1)) & ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1));
-      this->m_blockB = reinterpret_cast<RhsScalar*>((std::size_t(m_staticB) + (EIGEN_DEFAULT_ALIGN_BYTES-1)) & ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1));
+      this->m_blockB = reinterpret_cast<RhsScalar*>((internal::UIntPtr(m_staticB) + (EIGEN_DEFAULT_ALIGN_BYTES-1)) & ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1));
 #endif
    }
--- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
@ -40,7 +40,7 @@ template <typename Index, typename LhsScalar, int LhsStorageOrder, bool Conjugat
                          typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, int  UpLo, int Version>
 struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor,UpLo,Version>
 {
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
  static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride,
                                      const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride,
                                      const ResScalar& alpha, level3_blocking<RhsScalar,LhsScalar>& blocking)
@ -57,7 +57,7 @@ template <typename Index, typename LhsScalar, int LhsStorageOrder, bool Conjugat
                          typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, int  UpLo, int Version>
 struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo,Version>
 {
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
  static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride,
                                      const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride,
                                      const ResScalar& alpha, level3_blocking<LhsScalar,RhsScalar>& blocking)
--- a/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector.h
@ -58,7 +58,7 @@ namespace internal {
 template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
 struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
 {
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
 enum {
  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
@ -140,7 +140,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,C
  // find how many columns we have to skip to be aligned with the result (if possible)
  Index skipColumns = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
-  if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (size_t(res)%sizeof(ResScalar)) )
+  if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (UIntPtr(res)%sizeof(ResScalar)) )
  {
    alignedSize = 0;
    alignedStart = 0;
@ -183,8 +183,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,C
    alignmentPattern = AllAligned;
  }
-  const Index offset1 = (FirstAligned && alignmentStep==1?3:1);
+  const Index offset1 = (FirstAligned && alignmentStep==1)?3:1;
-  const Index offset3 = (FirstAligned && alignmentStep==1?1:3);
+  const Index offset3 = (FirstAligned && alignmentStep==1)?1:3;
  Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
  for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
@ -334,7 +334,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,C
 template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
 struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
 {
-typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
 enum {
  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
@ -457,8 +457,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,R
    alignmentPattern = AllAligned;
  }
-  const Index offset1 = (FirstAligned && alignmentStep==1?3:1);
+  const Index offset1 = (FirstAligned && alignmentStep==1)?3:1;
-  const Index offset3 = (FirstAligned && alignmentStep==1?1:3);
+  const Index offset3 = (FirstAligned && alignmentStep==1)?1:3;
  Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
  for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
--- a/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h
@ -122,7 +122,7 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh
      Map<const Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder>, 0, OuterStride<> > lhs(_lhs,m,m,OuterStride<>(lhsStride)); \
      a_tmp = lhs.conjugate(); \
      a = a_tmp.data(); \
-      lda = a_tmp.outerStride(); \
+      lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
    } else a = _lhs; \
    if (LhsStorageOrder==RowMajor) uplo='U'; \
 \
@ -256,7 +256,7 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL
        b_tmp = lhs.transpose(); \
      } \
      b = b_tmp.data(); \
-      ldb = b_tmp.outerStride(); \
+      ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
    } \
 \
    BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
--- a/Eigen/src/Core/products/SelfadjointMatrixVector.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h
@ -179,7 +179,7 @@ struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,0,true>
  {
    typedef typename Dest::Scalar ResScalar;
    typedef typename Rhs::Scalar RhsScalar;
-    typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+    typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
    eigen_assert(dest.rows()==a_lhs.rows() && dest.cols()==a_rhs.cols());
--- a/Eigen/src/Core/products/TriangularMatrixVector.h
+++ b/Eigen/src/Core/products/TriangularMatrixVector.h
@ -20,7 +20,7 @@ struct triangular_matrix_vector_product;
 template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version>
 struct triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor,Version>
 {
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
  enum {
    IsLower = ((Mode&Lower)==Lower),
    HasUnitDiag = (Mode & UnitDiag)==UnitDiag,
@ -91,7 +91,7 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
 template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs,int Version>
 struct triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,RowMajor,Version>
 {
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
  enum {
    IsLower = ((Mode&Lower)==Lower),
    HasUnitDiag = (Mode & UnitDiag)==UnitDiag,
@ -216,7 +216,7 @@ template<int Mode> struct trmv_selector<Mode,ColMajor>
    typedef internal::blas_traits<Rhs> RhsBlasTraits;
    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
-    typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+    typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
    typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
    typename internal::add_const_on_value_type<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@ -44,16 +44,29 @@ template<bool Conjugate> struct conj_if;
 template<> struct conj_if<true> {
  template<typename T>
-  inline T operator()(const T& x) { return numext::conj(x); }
+  inline T operator()(const T& x) const { return numext::conj(x); }
  template<typename T>
-  inline T pconj(const T& x) { return internal::pconj(x); }
+  inline T pconj(const T& x) const { return internal::pconj(x); }
 };
 template<> struct conj_if<false> {
  template<typename T>
-  inline const T& operator()(const T& x) { return x; }
+  inline const T& operator()(const T& x) const { return x; }
  template<typename T>
-  inline const T& pconj(const T& x) { return x; }
+  inline const T& pconj(const T& x) const { return x; }
 };
 // Generic implementation for custom complex types.
 // Primary template: multiplies an LhsScalar by an RhsScalar, conjugating the
 // left operand when ConjLhs is true and the right operand when ConjRhs is true.
 template<typename LhsScalar, typename RhsScalar, bool ConjLhs, bool ConjRhs>
 struct conj_helper
 {
  // Result type of the LhsScalar/RhsScalar pair as reported by ScalarBinaryOpTraits.
  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar>::ReturnType Scalar;
  // Multiply-add: returns padd(c, pmul(x,y)). The unqualified pmul resolves to
  // the member function below, so the conjugation flags are honoured.
  EIGEN_STRONG_INLINE Scalar pmadd(const LhsScalar& x, const RhsScalar& y, const Scalar& c) const
  { return padd(c, pmul(x,y)); }
  // Returns conj_if<ConjLhs>()(x) * conj_if<ConjRhs>()(y); conj_if<false> returns
  // its argument unchanged and conj_if<true> applies numext::conj.
  EIGEN_STRONG_INLINE Scalar pmul(const LhsScalar& x, const RhsScalar& y) const
  { return conj_if<ConjLhs>()(x) *  conj_if<ConjRhs>()(y); }
 };
 template<typename Scalar> struct conj_helper<Scalar,Scalar,false,false>
@ -111,7 +124,7 @@ template<typename RealScalar,bool Conj> struct conj_helper<RealScalar, std::comp
 };
 template<typename From,typename To> struct get_factor {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE To run(const From& x) { return x; }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE To run(const From& x) { return To(x); }
 };
 template<typename Scalar> struct get_factor<Scalar,typename NumTraits<Scalar>::Real> {
@ -135,7 +148,7 @@ class BlasVectorMapper {
  template <typename Packet>
  EIGEN_DEVICE_FUNC bool aligned(Index i) const {
-    return (size_t(m_data+i)%sizeof(Packet))==0;
+    return (UIntPtr(m_data+i)%sizeof(Packet))==0;
  }
  protected:
@ -227,7 +240,7 @@ class blas_data_mapper {
  EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; }
  EIGEN_DEVICE_FUNC Index firstAligned(Index size) const {
-    if (size_t(m_data)%sizeof(Scalar)) {
+    if (UIntPtr(m_data)%sizeof(Scalar)) {
      return -1;
    }
    return internal::first_default_aligned(m_data, size);
@ -293,17 +306,33 @@ struct blas_traits<CwiseUnaryOp<scalar_conjugate_op<Scalar>, NestedXpr> >
 };
 // pop scalar multiple
-template<typename Scalar, typename NestedXpr>
+template<typename Scalar, typename NestedXpr, typename Plain>
-struct blas_traits<CwiseUnaryOp<scalar_multiple_op<Scalar>, NestedXpr> >
+struct blas_traits<CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain>, NestedXpr> >
 : blas_traits<NestedXpr>
 {
  typedef blas_traits<NestedXpr> Base;
-  typedef CwiseUnaryOp<scalar_multiple_op<Scalar>, NestedXpr> XprType;
+  typedef CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain>, NestedXpr> XprType;
  typedef typename Base::ExtractType ExtractType;
-  static inline ExtractType extract(const XprType& x) { return Base::extract(x.nestedExpression()); }
+  static inline ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); }
  static inline Scalar extractScalarFactor(const XprType& x)
-  { return x.functor().m_other * Base::extractScalarFactor(x.nestedExpression()); }
+  { return x.lhs().functor().m_other * Base::extractScalarFactor(x.rhs()); }
 };
 // blas_traits specialization for a coefficient-wise product whose right
 // operand is a constant nullary expression (i.e. "NestedXpr * constant").
 // Everything not redefined here is inherited from blas_traits<NestedXpr>.
 template<typename Scalar, typename NestedXpr, typename Plain>
 struct blas_traits<CwiseBinaryOp<scalar_product_op<Scalar>, NestedXpr, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain> > >
 : blas_traits<NestedXpr>
 {
  typedef blas_traits<NestedXpr> Base;
  typedef CwiseBinaryOp<scalar_product_op<Scalar>, NestedXpr, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain> > XprType;
  typedef typename Base::ExtractType ExtractType;
  // Skips the constant factor: extraction is delegated to the left operand.
  static inline ExtractType extract(const XprType& x) { return Base::extract(x.lhs()); }
  // Scalar factor of the left operand multiplied by the constant stored in the
  // right operand's functor (m_other).
  static inline Scalar extractScalarFactor(const XprType& x)
  { return Base::extractScalarFactor(x.lhs()) * x.rhs().functor().m_other; }
 };
 // blas_traits specialization for a product of two constant nullary
 // expressions. It declares no members of its own and inherits everything from
 // blas_traits of the left-hand constant expression.
 // NOTE(review): presumably present because the "constant * xpr" and
 // "xpr * constant" specializations would otherwise both match this type --
 // confirm.
 template<typename Scalar, typename Plain1, typename Plain2>
 struct blas_traits<CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain1>,
                                                            const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain2> > >
 : blas_traits<CwiseNullaryOp<scalar_constant_op<Scalar>,Plain1> >
 {};
 // pop opposite
 template<typename Scalar, typename NestedXpr>
--- a/Eigen/src/Core/util/CMakeLists.txt
+++ b/Eigen/src/Core/util/CMakeLists.txt
@ -1,6 +0,0 @@
 # Collect every file matching *.h (a relative glob, resolved against the
 # current source directory when CMake configures) into Eigen_Core_util_SRCS.
 FILE(GLOB Eigen_Core_util_SRCS "*.h")
 # Install the collected headers under
 # ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/util as part of the Devel component.
 INSTALL(FILES 
  ${Eigen_Core_util_SRCS}
  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/util COMPONENT Devel
  )
--- a/Eigen/src/Core/util/Constants.h
+++ b/Eigen/src/Core/util/Constants.h
@ -199,7 +199,7 @@ const unsigned int HereditaryBits = RowMajorBit
 /** \ingroup enums
  * Enum containing possible values for the \c Mode or \c UpLo parameter of
  * MatrixBase::selfadjointView() and MatrixBase::triangularView(), and selfadjoint solvers. */
-enum {
+enum UpLoType {
  /** View matrix as a lower triangular matrix. */
  Lower=0x1,                      
  /** View matrix as an upper triangular matrix. */
@ -224,7 +224,7 @@ enum {
 /** \ingroup enums
  * Enum for indicating whether a buffer is aligned or not. */
-enum {
+enum AlignmentType {
  Unaligned=0,        /**< Data pointer has no specific alignment. */
  Aligned8=8,         /**< Data pointer is aligned on a 8 bytes boundary. */
  Aligned16=16,       /**< Data pointer is aligned on a 16 bytes boundary. */
@ -273,7 +273,7 @@ enum DirectionType {
 /** \internal \ingroup enums
  * Enum to specify how to traverse the entries of a matrix. */
-enum {
+enum TraversalType {
  /** \internal Default traversal, no vectorization, no index-based access */
  DefaultTraversal,
  /** \internal No vectorization, use index-based access to have only one for loop instead of 2 nested loops */
@ -295,7 +295,7 @@ enum {
 /** \internal \ingroup enums
  * Enum to specify whether to unroll loops when traversing over the entries of a matrix. */
-enum {
+enum UnrollingType {
  /** \internal Do not unroll loops. */
  NoUnrolling,
  /** \internal Unroll only the inner loop, but not the outer loop. */
@ -307,7 +307,7 @@ enum {
 /** \internal \ingroup enums
  * Enum to specify whether to use the default (built-in) implementation or the specialization. */
-enum {
+enum SpecializedType {
  Specialized,
  BuiltIn
 };
@ -315,7 +315,7 @@ enum {
 /** \ingroup enums
  * Enum containing possible values for the \p _Options template parameter of
  * Matrix, Array and BandMatrix. */
-enum {
+enum StorageOptions {
  /** Storage order is column major (see \ref TopicStorageOrders). */
  ColMajor = 0,
  /** Storage order is row major (see \ref TopicStorageOrders). */
@ -328,7 +328,7 @@ enum {
 /** \ingroup enums
  * Enum for specifying whether to apply or solve on the left or right. */
-enum {
+enum SideType {
  /** Apply transformation on the left. */
  OnTheLeft = 1,  
  /** Apply transformation on the right. */
@ -353,7 +353,7 @@ enum Default_t    { Default };
 /** \internal \ingroup enums
  * Used in AmbiVector. */
-enum {
+enum AmbiVectorMode {
  IsDense         = 0,
  IsSparse
 };
@ -479,8 +479,9 @@ namespace Architecture
 }
 /** \internal \ingroup enums
-  * Enum used as template parameter in Product and product evalautors. */
+  * Enum used as template parameter in Product and product evaluators. */
-enum { DefaultProduct=0, LazyProduct, AliasFreeProduct, CoeffBasedProductMode, LazyCoeffBasedProductMode, OuterProduct, InnerProduct, GemvProduct, GemmProduct };
+enum ProductImplType
 { DefaultProduct=0, LazyProduct, AliasFreeProduct, CoeffBasedProductMode, LazyCoeffBasedProductMode, OuterProduct, InnerProduct, GemvProduct, GemmProduct };
 /** \internal \ingroup enums
  * Enum used in experimental parallel implementation. */
@ -492,7 +493,7 @@ struct Dense {};
 /** The type used to identify a general sparse storage. */
 struct Sparse {};
-/** The type used to identify a general solver (foctored) storage. */
+/** The type used to identify a general solver (factored) storage. */
 struct SolverStorage {};
 /** The type used to identify a permutation storage. */
--- a/Eigen/src/Core/util/DisableStupidWarnings.h
+++ b/Eigen/src/Core/util/DisableStupidWarnings.h
@ -14,12 +14,13 @@
  // 4512 - assignment operator could not be generated
  // 4522 - 'class' : multiple assignment operators specified
  // 4700 - uninitialized local variable 'xyz' used
  // 4714 - function marked as __forceinline not inlined
  // 4717 - 'function' : recursive on all control paths, function will cause runtime stack overflow
  // 4800 - 'type' : forcing value to bool 'true' or 'false' (performance warning)
  #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
    #pragma warning( push )
  #endif
-  #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4717 4800)
+  #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800)
 #elif defined __INTEL_COMPILER
  // 2196 - routine is both "inline" and "noinline" ("noinline" assumed)
@ -41,6 +42,14 @@
    #pragma clang diagnostic push
  #endif
  #pragma clang diagnostic ignored "-Wconstant-logical-operand"
 #elif defined __GNUC__ && __GNUC__>=6
  #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
    #pragma GCC diagnostic push
  #endif
  #pragma GCC diagnostic ignored "-Wignored-attributes"
 #endif
 #if defined __NVCC__
@ -48,11 +57,19 @@
  #pragma diag_suppress code_is_unreachable
  // Disable the "dynamic initialization in unreachable code" message
  #pragma diag_suppress initialization_not_reachable
-  // Disable the "calling a __host__ function from a __host__ __device__ function is not allowed" messages (yes, there are 4 of them)
+  // Disable the "invalid error number" message that we get with older versions of nvcc
  #pragma diag_suppress 1222
  // Disable the "calling a __host__ function from a __host__ __device__ function is not allowed" messages (yes, there are many of them and they seem to change with every version of the compiler)
  #pragma diag_suppress 2527
  #pragma diag_suppress 2529
  #pragma diag_suppress 2651
  #pragma diag_suppress 2653
  #pragma diag_suppress 2668
  #pragma diag_suppress 2669
  #pragma diag_suppress 2670
  #pragma diag_suppress 2671
  #pragma diag_suppress 2735
  #pragma diag_suppress 2737
 #endif
 #endif // not EIGEN_WARNINGS_DISABLED
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@ -91,6 +91,7 @@ template<typename NullaryOp, typename MatrixType>         class CwiseNullaryOp;
 template<typename UnaryOp,   typename MatrixType>         class CwiseUnaryOp;
 template<typename ViewOp,    typename MatrixType>         class CwiseUnaryView;
 template<typename BinaryOp,  typename Lhs, typename Rhs>  class CwiseBinaryOp;
 template<typename TernaryOp, typename Arg1, typename Arg2, typename Arg3>  class CwiseTernaryOp;
 template<typename Decomposition, typename Rhstype>        class Solve;
 template<typename XprType>                                class Inverse;
@ -174,9 +175,11 @@ namespace internal {
 // with optional conjugation of the arguments.
 template<typename LhsScalar, typename RhsScalar, bool ConjLhs=false, bool ConjRhs=false> struct conj_helper;
-template<typename Scalar> struct scalar_sum_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_sum_op;
-template<typename Scalar> struct scalar_difference_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_difference_op;
-template<typename LhsScalar,typename RhsScalar> struct scalar_conj_product_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_conj_product_op;
 template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_min_op;
 template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_max_op;
 template<typename Scalar> struct scalar_opposite_op;
 template<typename Scalar> struct scalar_conjugate_op;
 template<typename Scalar> struct scalar_real_op;
@ -192,27 +195,28 @@ template<typename Scalar> struct scalar_sin_op;
 template<typename Scalar> struct scalar_acos_op;
 template<typename Scalar> struct scalar_asin_op;
 template<typename Scalar> struct scalar_tan_op;
 template<typename Scalar> struct scalar_pow_op;
 template<typename Scalar> struct scalar_inverse_op;
 template<typename Scalar> struct scalar_square_op;
 template<typename Scalar> struct scalar_cube_op;
 template<typename Scalar, typename NewType> struct scalar_cast_op;
 template<typename Scalar> struct scalar_multiple_op;
 template<typename Scalar> struct scalar_quotient1_op;
 template<typename Scalar> struct scalar_min_op;
 template<typename Scalar> struct scalar_max_op;
 template<typename Scalar> struct scalar_random_op;
 template<typename Scalar> struct scalar_add_op;
 template<typename Scalar> struct scalar_constant_op;
 template<typename Scalar> struct scalar_identity_op;
 template<typename Scalar,bool iscpx> struct scalar_sign_op;
 template<typename Scalar,typename ScalarExponent> struct scalar_pow_op;
 template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_hypot_op;
 template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_product_op;
 template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_quotient_op;
 // SpecialFunctions module
 template<typename Scalar> struct scalar_lgamma_op;
 template<typename Scalar> struct scalar_digamma_op;
 template<typename Scalar> struct scalar_erf_op;
 template<typename Scalar> struct scalar_erfc_op;
 template<typename Scalar> struct scalar_igamma_op;
 template<typename Scalar> struct scalar_igammac_op;
-
+template<typename Scalar> struct scalar_zeta_op;
-template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_product_op;
+template<typename Scalar> struct scalar_betainc_op;
 template<typename LhsScalar,typename RhsScalar> struct scalar_multiple2_op;
 template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_quotient_op;
 template<typename LhsScalar,typename RhsScalar> struct scalar_quotient2_op;
 } // end namespace internal
--- a/Eigen/src/Core/util/MKL_support.h
+++ b/Eigen/src/Core/util/MKL_support.h
@ -49,7 +49,7 @@
  #define EIGEN_USE_LAPACKE
 #endif
-#if defined(EIGEN_USE_LAPACKE) || defined(EIGEN_USE_MKL_VML)
+#if defined(EIGEN_USE_MKL_VML)
  #define EIGEN_USE_MKL
 #endif
@ -72,7 +72,7 @@
 #endif
 #if defined EIGEN_USE_MKL
-#include <mkl_lapacke.h>
+
 #define EIGEN_MKL_VML_THRESHOLD 128
 /* MKL_DOMAIN_BLAS, etc are defined only in 10.3 update 7 */
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@ -13,7 +13,7 @@
 #define EIGEN_WORLD_VERSION 3
 #define EIGEN_MAJOR_VERSION 2
-#define EIGEN_MINOR_VERSION 92
+#define EIGEN_MINOR_VERSION 94
 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \
                                      (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \
@ -28,9 +28,9 @@
  #define EIGEN_COMP_GNUC 0
 #endif
-/// \internal EIGEN_COMP_CLANG set to 1 if the compiler is clang (alias for __clang__)
+/// \internal EIGEN_COMP_CLANG set to major+minor version (e.g., 307 for clang 3.7) if the compiler is clang
 #if defined(__clang__)
-  #define EIGEN_COMP_CLANG 1
+  #define EIGEN_COMP_CLANG (__clang_major__*100+__clang_minor__)
 #else
  #define EIGEN_COMP_CLANG 0
 #endif
@ -71,6 +71,15 @@
  #define EIGEN_COMP_MSVC 0
 #endif
 // For the record, here is a table summarizing the possible values for EIGEN_COMP_MSVC:
 //  name  ver   MSC_VER
 //  2008    9      1500
 //  2010   10      1600
 //  2012   11      1700
 //  2013   12      1800
 //  2015   14      1900
 //  "15"   15      1900
 /// \internal EIGEN_COMP_MSVC_STRICT set to the value of _MSC_VER if the compiler is really Microsoft Visual C++ and not, e.g., ICC
 #if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC)
  #define EIGEN_COMP_MSVC_STRICT _MSC_VER
@ -340,50 +349,82 @@
 # define __has_feature(x) 0
 #endif
 // Upper bound on the C++ version to use.
 // Expected values are 03, 11, 14, 17, etc.
 // By default, let's use an arbitrarily large C++ version.
 #ifndef EIGEN_MAX_CPP_VER
 #define EIGEN_MAX_CPP_VER 99
 #endif
 // Do we support r-value references?
-#if (__has_feature(cxx_rvalue_references) || \
+#ifndef EIGEN_HAS_RVALUE_REFERENCES
 #if EIGEN_MAX_CPP_VER>=11 && \
    (__has_feature(cxx_rvalue_references) || \
    (defined(__cplusplus) && __cplusplus >= 201103L) || \
    (EIGEN_COMP_MSVC >= 1600))
-  #define EIGEN_HAVE_RVALUE_REFERENCES
+  #define EIGEN_HAS_RVALUE_REFERENCES 1
 #else
  #define EIGEN_HAS_RVALUE_REFERENCES 0
 #endif
 #endif
 // Does the compiler support C99?
-#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901))       \
+#ifndef EIGEN_HAS_C99_MATH
 #if EIGEN_MAX_CPP_VER>=11 && \
    ((defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901))       \
  || (defined(__GNUC__) && defined(_GLIBCXX_USE_C99)) \
-  || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER))
+  || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)))
  #define EIGEN_HAS_C99_MATH 1
 #else
  #define EIGEN_HAS_C99_MATH 0
 #endif
 #endif
 // Does the compiler support result_of?
-#if (__has_feature(cxx_lambdas) || (defined(__cplusplus) && __cplusplus >= 201103L))
+#ifndef EIGEN_HAS_STD_RESULT_OF
 #if EIGEN_MAX_CPP_VER>=11 && ((__has_feature(cxx_lambdas) || (defined(__cplusplus) && __cplusplus >= 201103L)))
 #define EIGEN_HAS_STD_RESULT_OF 1
 #else
 #define EIGEN_HAS_STD_RESULT_OF 0
 #endif
 #endif
 // Does the compiler support variadic templates?
-#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900
+#ifndef EIGEN_HAS_VARIADIC_TEMPLATES
-// Disable the use of variadic templates when compiling with nvcc on ARM devices:
+#if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \
    && ( !defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 )
    // ^^ Disable the use of variadic templates when compiling with nvcc on ARM devices:
    //    this prevents nvcc from crashing when compiling Eigen on Tegra X1
 #if !defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64
 #define EIGEN_HAS_VARIADIC_TEMPLATES 1
 #else
 #define EIGEN_HAS_VARIADIC_TEMPLATES 0
 #endif
 #endif
-// Does the compiler support const expressions?
+// Does the compiler fully support const expressions? (as in c++14)
 #ifndef EIGEN_HAS_CONSTEXPR
 #ifdef __CUDACC__
 // Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above
-#if __cplusplus > 199711L && defined(__CUDACC_VER__) && (EIGEN_COMP_CLANG || __CUDACC_VER__ >= 70500)
+#if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && defined(__CUDACC_VER__) && (EIGEN_COMP_CLANG || __CUDACC_VER__ >= 70500))
  #define EIGEN_HAS_CONSTEXPR 1
 #endif
-#elif __has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \
+#elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \
-  (EIGEN_GNUC_AT_LEAST(4,8) && (__cplusplus > 199711L))
+  (EIGEN_GNUC_AT_LEAST(4,8) && (__cplusplus > 199711L)))
 #define EIGEN_HAS_CONSTEXPR 1
 #endif
 #ifndef EIGEN_HAS_CONSTEXPR
 #define EIGEN_HAS_CONSTEXPR 0
 #endif
 #endif
 // Does the compiler support C++11 math?
 // Let's be conservative and enable the default C++11 implementation only if we are sure it exists
 #ifndef EIGEN_HAS_CXX11_MATH
-  #if (__cplusplus > 201103L) || (__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC)  \
+  #if EIGEN_MAX_CPP_VER>=11 && ((__cplusplus > 201103L) || (__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC)  \
-      && (EIGEN_ARCH_i386_OR_x86_64) && (EIGEN_OS_GNULINUX || EIGEN_OS_WIN_STRICT || EIGEN_OS_MAC)
+      && (EIGEN_ARCH_i386_OR_x86_64) && (EIGEN_OS_GNULINUX || EIGEN_OS_WIN_STRICT || EIGEN_OS_MAC))
    #define EIGEN_HAS_CXX11_MATH 1
  #else
    #define EIGEN_HAS_CXX11_MATH 0
@ -392,9 +433,10 @@
 // Does the compiler support proper C++11 containers?
 #ifndef EIGEN_HAS_CXX11_CONTAINERS
-  #if    (__cplusplus > 201103L) \
+  #if    EIGEN_MAX_CPP_VER>=11 && \
         ((__cplusplus > 201103L) \
      || ((__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_ICC>=1400)) \
-      || EIGEN_COMP_MSVC >= 1900
+      || EIGEN_COMP_MSVC >= 1900)
    #define EIGEN_HAS_CXX11_CONTAINERS 1
  #else
    #define EIGEN_HAS_CXX11_CONTAINERS 0
@ -403,9 +445,11 @@
 // Does the compiler support C++11 noexcept?
 #ifndef EIGEN_HAS_CXX11_NOEXCEPT
-  #if    (__cplusplus > 201103L) \
+  #if    EIGEN_MAX_CPP_VER>=11 && \
         (__has_feature(cxx_noexcept) \
      || (__cplusplus > 201103L) \
      || ((__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_ICC>=1400)) \
-      || EIGEN_COMP_MSVC >= 1900
+      || EIGEN_COMP_MSVC >= 1900)
    #define EIGEN_HAS_CXX11_NOEXCEPT 1
  #else
    #define EIGEN_HAS_CXX11_NOEXCEPT 0
@ -427,6 +471,8 @@
 // Token pasting helpers: EIGEN_CAT2 pastes its two arguments directly with ##,
 // while EIGEN_CAT forwards to EIGEN_CAT2 so that its arguments are macro-expanded before being pasted.
 #define EIGEN_CAT2(a,b) a ## b
 #define EIGEN_CAT(a,b) EIGEN_CAT2(a,b)
 // Expands to a single comma; lets a comma be written inside a macro argument
 // without being taken as an argument separator.
 #define EIGEN_COMMA ,
 // convert a token to a string
 // (EIGEN_MAKESTRING forwards to EIGEN_MAKESTRING2 so that its argument is macro-expanded before # is applied)
 #define EIGEN_MAKESTRING2(a) #a
 #define EIGEN_MAKESTRING(a) EIGEN_MAKESTRING2(a)
@ -725,6 +771,11 @@ namespace Eigen {
 #define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
 #endif
 // Default EIGEN_UNALIGNED_VECTORIZE to 1 unless it has already been defined before this point.
 // NOTE(review): presumably toggles vectorization of unaligned data; confirm against the code that reads it.
 #ifndef EIGEN_UNALIGNED_VECTORIZE
 #define EIGEN_UNALIGNED_VECTORIZE 1
 #endif
 //----------------------------------------------------------------------
@ -839,18 +890,10 @@ namespace Eigen {
 #define EIGEN_IMPLIES(a,b) (!(a) || (b))
-#define EIGEN_MAKE_CWISE_BINARY_OP(METHOD,FUNCTOR) \
+// the expression type of a standard coefficient wise binary operation
-  template<typename OtherDerived> \
+#define EIGEN_CWISE_BINARY_RETURN_TYPE(LHS,RHS,OPNAME) \
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseBinaryOp<FUNCTOR<Scalar>, const Derived, const OtherDerived> \
  (METHOD)(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const \
  { \
    return CwiseBinaryOp<FUNCTOR<Scalar>, const Derived, const OtherDerived>(derived(), other.derived()); \
  }
 // the expression type of a cwise product
 #define EIGEN_CWISE_PRODUCT_RETURN_TYPE(LHS,RHS) \
    CwiseBinaryOp< \
-      internal::scalar_product_op< \
+      EIGEN_CAT(EIGEN_CAT(internal::scalar_,OPNAME),_op)< \
          typename internal::traits<LHS>::Scalar, \
          typename internal::traits<RHS>::Scalar \
      >, \
@ -858,6 +901,55 @@ namespace Eigen {
      const RHS \
    >
 // Defines a const member function template named METHOD that takes another expression
 // (EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>) and returns
 // EIGEN_CWISE_BINARY_RETURN_TYPE(Derived,OtherDerived,OPNAME) constructed from (derived(), other.derived()).
 // OPNAME is the functor's short name (it is forwarded to EIGEN_CWISE_BINARY_RETURN_TYPE).
 #define EIGEN_MAKE_CWISE_BINARY_OP(METHOD,OPNAME) \
   template<typename OtherDerived> \
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const EIGEN_CWISE_BINARY_RETURN_TYPE(Derived,OtherDerived,OPNAME) \
   (METHOD)(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const \
   { \
     return EIGEN_CWISE_BINARY_RETURN_TYPE(Derived,OtherDerived,OPNAME)(derived(), other.derived()); \
   }
 // Compile-time boolean: the ::value of internal::has_ReturnType applied to
 // ScalarBinaryOpTraits<TYPEA,TYPEB,internal::scalar_<OPNAME>_op<TYPEA,TYPEB> >.
 // NOTE(review): presumably true exactly when that traits specialization declares a ReturnType; confirm in has_ReturnType.
 #define EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,TYPEA,TYPEB) \
   (Eigen::internal::has_ReturnType<Eigen::ScalarBinaryOpTraits<TYPEA,TYPEB,EIGEN_CAT(EIGEN_CAT(Eigen::internal::scalar_,OPNAME),_op)<TYPEA,TYPEB>  > >::value)
 // Expression type for "EXPR op scalar": a CwiseBinaryOp using internal::scalar_<OPNAME>_op<EXPR's Scalar, SCALAR>,
 // with EXPR as the left operand and internal::plain_constant_type<EXPR,SCALAR>::type as the right operand.
 #define EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(EXPR,SCALAR,OPNAME) \
   CwiseBinaryOp<EIGEN_CAT(EIGEN_CAT(internal::scalar_,OPNAME),_op)<typename internal::traits<EXPR>::Scalar,SCALAR>, const EXPR, \
                 const typename internal::plain_constant_type<EXPR,SCALAR>::type>
 // Expression type for "scalar op EXPR": a CwiseBinaryOp using internal::scalar_<OPNAME>_op<SCALAR, EXPR's Scalar>,
 // with internal::plain_constant_type<EXPR,SCALAR>::type as the left operand and EXPR as the right operand.
 #define EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(SCALAR,EXPR,OPNAME) \
   CwiseBinaryOp<EIGEN_CAT(EIGEN_CAT(internal::scalar_,OPNAME),_op)<SCALAR,typename internal::traits<EXPR>::Scalar>, \
                 const typename internal::plain_constant_type<EXPR,SCALAR>::type, const EXPR>
 // Workaround for MSVC 2010 (see ML thread "patch with compile for for MSVC 2010")
 // For genuine MSVC up to version 1600 the return type X is wrapped as enable_if<true,X>::type;
 // everywhere else the macro expands to X unchanged.
 // EIGEN_COMP_MSVC_STRICT evaluates to 0 in #if when the compiler is not strict MSVC, so it must be
 // tested for non-zero first: a bare "<=1600" comparison would also hold for every other compiler.
 #if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC_STRICT<=1600)
 #define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) typename internal::enable_if<true,X>::type
 #else
 #define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) X
 #endif
 // Defines a const member function template METHOD(const T& scalar) for "expression op scalar"
 // (expression on the left, scalar on the right), using internal::scalar_<OPNAME>_op.
 // T is mapped to PromotedT by internal::promote_scalar_arg, whose third argument is
 // EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,Scalar,T). The scalar is then wrapped in a constant expression
 // of size derived().rows() x derived().cols() built from internal::scalar_constant_op<PromotedT>(scalar).
 // EIGEN_COMMA replaces ',' in the return type so that promote_scalar_arg<...> is passed as one macro argument.
 #define EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(METHOD,OPNAME) \
   template <typename T> EIGEN_DEVICE_FUNC inline \
   EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg<Scalar EIGEN_COMMA T EIGEN_COMMA EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,Scalar,T)>::type,OPNAME))\
   (METHOD)(const T& scalar) const { \
     typedef typename internal::promote_scalar_arg<Scalar,T,EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,Scalar,T)>::type PromotedT; \
     return EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,PromotedT,OPNAME)(derived(), \
            typename internal::plain_constant_type<Derived,PromotedT>::type(derived().rows(), derived().cols(), internal::scalar_constant_op<PromotedT>(scalar))); \
   }
 // Defines a friend function template METHOD(const T& scalar, const StorageBaseType& matrix) for
 // "scalar op expression" (scalar on the left, expression on the right), using internal::scalar_<OPNAME>_op.
 // T is mapped to PromotedT by internal::promote_scalar_arg, whose third argument is
 // EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,T,Scalar). The scalar is then wrapped in a constant expression
 // of size matrix.derived().rows() x matrix.derived().cols() built from internal::scalar_constant_op<PromotedT>(scalar).
 // EIGEN_COMMA replaces ',' in the return type so that promote_scalar_arg<...> is passed as one macro argument.
 #define EIGEN_MAKE_SCALAR_BINARY_OP_ONTHELEFT(METHOD,OPNAME) \
   template <typename T> EIGEN_DEVICE_FUNC inline friend \
   EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename internal::promote_scalar_arg<Scalar EIGEN_COMMA T EIGEN_COMMA EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,T,Scalar)>::type,Derived,OPNAME)) \
   (METHOD)(const T& scalar, const StorageBaseType& matrix) { \
     typedef typename internal::promote_scalar_arg<Scalar,T,EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,T,Scalar)>::type PromotedT; \
     return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(PromotedT,Derived,OPNAME)( \
            typename internal::plain_constant_type<Derived,PromotedT>::type(matrix.derived().rows(), matrix.derived().cols(), internal::scalar_constant_op<PromotedT>(scalar)), matrix.derived()); \
   }
 // Emits both scalar overloads of METHOD for OPNAME: the friend "scalar op expression" form
 // (EIGEN_MAKE_SCALAR_BINARY_OP_ONTHELEFT) followed by the member "expression op scalar" form
 // (EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT).
 #define EIGEN_MAKE_SCALAR_BINARY_OP(METHOD,OPNAME) \
   EIGEN_MAKE_SCALAR_BINARY_OP_ONTHELEFT(METHOD,OPNAME) \
   EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(METHOD,OPNAME)
 #ifdef EIGEN_EXCEPTIONS
 #  define EIGEN_THROW_X(X) throw X
 #  define EIGEN_THROW throw
@ -865,8 +957,8 @@ namespace Eigen {
 #  define EIGEN_CATCH(X) catch (X)
 #else
 #  ifdef __CUDA_ARCH__
-#    define EIGEN_THROW_X(X) asm("trap;") return {}
+#    define EIGEN_THROW_X(X) asm("trap;")
-#    define EIGEN_THROW asm("trap;"); return {}
+#    define EIGEN_THROW asm("trap;")
 #  else
 #    define EIGEN_THROW_X(X) std::abort()
 #    define EIGEN_THROW std::abort()
@ -875,10 +967,16 @@ namespace Eigen {
 #  define EIGEN_CATCH(X) else
 #endif
 // Exception-specification macros, selected by EIGEN_HAS_CXX11_NOEXCEPT.
 #if EIGEN_HAS_CXX11_NOEXCEPT
 // noexcept is available: map the macros onto the noexcept forms.
 // NOTE(review): EIGEN_INCLUDE_TYPE_TRAITS presumably requests <type_traits> elsewhere; confirm where it is consumed.
 #   define EIGEN_INCLUDE_TYPE_TRAITS
 #   define EIGEN_NOEXCEPT noexcept
 #   define EIGEN_NOEXCEPT_IF(x) noexcept(x)
 #   define EIGEN_NO_THROW noexcept(true)
 // The argument X is ignored here; the result is always noexcept(false).
 #   define EIGEN_EXCEPTION_SPEC(X) noexcept(false)
 #else
 // No noexcept: EIGEN_NOEXCEPT and EIGEN_NOEXCEPT_IF expand to nothing, and the remaining
 // macros fall back to dynamic exception specifications throw() / throw(X).
 #   define EIGEN_NOEXCEPT
 #   define EIGEN_NOEXCEPT_IF(x)
 #   define EIGEN_NO_THROW throw()
 #   define EIGEN_EXCEPTION_SPEC(X) throw(X)
 #endif
--- a/Show More
+++ b/Show More