Mirror of https://gitlab.com/libeigen/eigen.git (synced 2025-08-13 04:09:10 +08:00)

commit 9115896590
Merged eigen/eigen into default
@@ -83,22 +83,10 @@ template<typename Derived> class ArrayBase
 #endif // not EIGEN_PARSED_BY_DOXYGEN

 #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** \internal the plain matrix type corresponding to this expression. Note that is not necessarily
-      * exactly the return type of eval(): in the case of plain matrices, the return type of eval() is a const
-      * reference to a matrix, not a matrix! It is however guaranteed that the return type of eval() is either
-      * PlainObject or const PlainObject&.
-      */
-    typedef Array<typename internal::traits<Derived>::Scalar,
-                internal::traits<Derived>::RowsAtCompileTime,
-                internal::traits<Derived>::ColsAtCompileTime,
-                AutoAlign | (internal::traits<Derived>::Flags&RowMajorBit ? RowMajor : ColMajor),
-                internal::traits<Derived>::MaxRowsAtCompileTime,
-                internal::traits<Derived>::MaxColsAtCompileTime
-    > PlainObject;
+    typedef typename Base::PlainObject PlainObject;

     /** \internal Represents a matrix with all coefficients equal to one another*/
-    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,Derived> ConstantReturnType;
+    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,PlainObject> ConstantReturnType;
 #endif // not EIGEN_PARSED_BY_DOXYGEN

 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::ArrayBase
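The ArrayBase hunk above replaces the locally spelled-out plain type with DenseBase's PlainObject and retargets ConstantReturnType at that plain type. User-facing behaviour is unchanged; the following sketch uses only the standard public API (values and sizes are illustrative):

    #include <Eigen/Core>
    using namespace Eigen;

    // Constant()/Zero() still return lazy CwiseNullaryOp expressions that mix freely
    // with other array expressions and are evaluated on assignment.
    ArrayXXf a = ArrayXXf::Constant(3, 4, 2.5f);
    ArrayXXf b = a + ArrayXXf::Zero(3, 4);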
@@ -756,6 +756,26 @@ EIGEN_DEVICE_FUNC void call_assignment_no_alias(Dst& dst, const Src& src)
   call_assignment_no_alias(dst, src, internal::assign_op<typename Dst::Scalar>());
 }

+template<typename Dst, typename Src, typename Func>
+EIGEN_DEVICE_FUNC void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src, const Func& func)
+{
+  Index dstRows = src.rows();
+  Index dstCols = src.cols();
+  if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+    dst.resize(dstRows, dstCols);
+
+  // TODO check whether this is the right place to perform these checks:
+  EIGEN_STATIC_ASSERT_LVALUE(Dst)
+  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Dst,Src)
+
+  Assignment<Dst,Src,Func>::run(dst, src, func);
+}
+template<typename Dst, typename Src>
+EIGEN_DEVICE_FUNC void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src)
+{
+  call_assignment_no_alias_no_transpose(dst, src, internal::assign_op<typename Dst::Scalar>());
+}
+
 // forward declaration
 template<typename Dst, typename Src> void check_for_aliasing(const Dst &dst, const Src &src);

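The new call_assignment_no_alias_no_transpose helper resizes the destination to the source's dimensions, checks the static lvalue/size constraints, and then invokes the Assignment kernel directly. At the public API level this is the usual resizing assignment; a minimal sketch (sizes are illustrative):

    #include <Eigen/Core>

    Eigen::MatrixXd dst;                                // 0x0 at this point
    Eigen::MatrixXd src = Eigen::MatrixXd::Ones(3, 4);
    dst = 2.0 * src;  // dst is resized to 3x4 before the expression is evaluated into it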
@@ -783,7 +803,6 @@ struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Scalar>
   EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/)
   {
     eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
-
     src.evalTo(dst);
   }
 };
Eigen/src/Core/Assign_MKL.h (256 changes, Normal file → Executable file)
@@ -1,5 +1,6 @@
 /*
  Copyright (c) 2011, Intel Corporation. All rights reserved.
+ Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>

  Redistribution and use in source and binary forms, with or without modification,
  are permitted provided that the following conditions are met:
@@ -37,17 +38,13 @@ namespace Eigen {

 namespace internal {

-template<typename Op> struct vml_call
-{ enum { IsSupported = 0 }; };
-
-template<typename Dst, typename Src, typename UnaryOp>
+template<typename Dst, typename Src>
 class vml_assign_traits
 {
   private:
     enum {
       DstHasDirectAccess = Dst::Flags & DirectAccessBit,
       SrcHasDirectAccess = Src::Flags & DirectAccessBit,
-
       StorageOrdersAgree = (int(Dst::IsRowMajor) == int(Src::IsRowMajor)),
       InnerSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::SizeAtCompileTime)
                 : int(Dst::Flags)&RowMajorBit ? int(Dst::ColsAtCompileTime)
@@ -57,173 +54,118 @@ class vml_assign_traits
                 : int(Dst::MaxRowsAtCompileTime),
       MaxSizeAtCompileTime = Dst::SizeAtCompileTime,

-      MightEnableVml = vml_call<UnaryOp>::IsSupported && StorageOrdersAgree && DstHasDirectAccess && SrcHasDirectAccess
-                       && Src::InnerStrideAtCompileTime==1 && Dst::InnerStrideAtCompileTime==1,
+      MightEnableVml = StorageOrdersAgree && DstHasDirectAccess && SrcHasDirectAccess && Src::InnerStrideAtCompileTime==1 && Dst::InnerStrideAtCompileTime==1,
       MightLinearize = MightEnableVml && (int(Dst::Flags) & int(Src::Flags) & LinearAccessBit),
       VmlSize = MightLinearize ? MaxSizeAtCompileTime : InnerMaxSize,
-      LargeEnough = VmlSize==Dynamic || VmlSize>=EIGEN_MKL_VML_THRESHOLD,
-      MayEnableVml = MightEnableVml && LargeEnough,
-      MayLinearize = MayEnableVml && MightLinearize
+      LargeEnough = VmlSize==Dynamic || VmlSize>=EIGEN_MKL_VML_THRESHOLD
     };
   public:
     enum {
-      Traversal = MayLinearize ? LinearVectorizedTraversal
-                : MayEnableVml ? InnerVectorizedTraversal
-                : DefaultTraversal
+      EnableVml = MightEnableVml && LargeEnough,
+      Traversal = MightLinearize ? LinearTraversal : DefaultTraversal
     };
 };

-template<typename Derived1, typename Derived2, typename UnaryOp, int Traversal, int Unrolling,
-         int VmlTraversal = vml_assign_traits<Derived1, Derived2, UnaryOp>::Traversal >
-struct vml_assign_impl
-  : assign_impl<Derived1, Eigen::CwiseUnaryOp<UnaryOp, Derived2>,Traversal,Unrolling,BuiltIn>
-{
-};
+#define EIGEN_PP_EXPAND(ARG) ARG

-template<typename Derived1, typename Derived2, typename UnaryOp, int Traversal, int Unrolling>
-struct vml_assign_impl<Derived1, Derived2, UnaryOp, Traversal, Unrolling, InnerVectorizedTraversal>
-{
-  typedef typename Derived1::Scalar Scalar;
-  static inline void run(Derived1& dst, const CwiseUnaryOp<UnaryOp, Derived2>& src)
-  {
-    // in case we want to (or have to) skip VML at runtime we can call:
-    // assign_impl<Derived1,Eigen::CwiseUnaryOp<UnaryOp, Derived2>,Traversal,Unrolling,BuiltIn>::run(dst,src);
-    const Index innerSize = dst.innerSize();
-    const Index outerSize = dst.outerSize();
-    for(Index outer = 0; outer < outerSize; ++outer) {
-      const Scalar *src_ptr = src.IsRowMajor ? &(src.nestedExpression().coeffRef(outer,0)) :
-                                               &(src.nestedExpression().coeffRef(0, outer));
-      Scalar *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer));
-      vml_call<UnaryOp>::run(src.functor(), innerSize, src_ptr, dst_ptr );
-    }
-  }
-};
-
-template<typename Derived1, typename Derived2, typename UnaryOp, int Traversal, int Unrolling>
-struct vml_assign_impl<Derived1, Derived2, UnaryOp, Traversal, Unrolling, LinearVectorizedTraversal>
-{
-  static inline void run(Derived1& dst, const CwiseUnaryOp<UnaryOp, Derived2>& src)
-  {
-    // in case we want to (or have to) skip VML at runtime we can call:
-    // assign_impl<Derived1,Eigen::CwiseUnaryOp<UnaryOp, Derived2>,Traversal,Unrolling,BuiltIn>::run(dst,src);
-    vml_call<UnaryOp>::run(src.functor(), dst.size(), src.nestedExpression().data(), dst.data() );
-  }
-};
-
-// Macroses
-
-#define EIGEN_MKL_VML_SPECIALIZE_ASSIGN(TRAVERSAL,UNROLLING) \
-  template<typename Derived1, typename Derived2, typename UnaryOp> \
-  struct assign_impl<Derived1, Eigen::CwiseUnaryOp<UnaryOp, Derived2>, TRAVERSAL, UNROLLING, Specialized> { \
-    static inline void run(Derived1 &dst, const Eigen::CwiseUnaryOp<UnaryOp, Derived2> &src) { \
-      vml_assign_impl<Derived1,Derived2,UnaryOp,TRAVERSAL,UNROLLING>::run(dst, src); \
-    } \
-  };
-
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(DefaultTraversal,NoUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(DefaultTraversal,CompleteUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(DefaultTraversal,InnerUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(LinearTraversal,NoUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(LinearTraversal,CompleteUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(InnerVectorizedTraversal,NoUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(InnerVectorizedTraversal,CompleteUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(InnerVectorizedTraversal,InnerUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(LinearVectorizedTraversal,CompleteUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(LinearVectorizedTraversal,NoUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(SliceVectorizedTraversal,NoUnrolling)

 #if !defined (EIGEN_FAST_MATH) || (EIGEN_FAST_MATH != 1)
-#define EIGEN_MKL_VML_MODE VML_HA
+#define EIGEN_VMLMODE_EXPAND_LA , VML_HA
 #else
-#define EIGEN_MKL_VML_MODE VML_LA
+#define EIGEN_VMLMODE_EXPAND_LA , VML_LA
 #endif

-#define EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE) \
-  template<> struct vml_call< scalar_##EIGENOP##_op<EIGENTYPE> > { \
-    enum { IsSupported = 1 }; \
-    static inline void run( const scalar_##EIGENOP##_op<EIGENTYPE>& /*func*/, \
-                            int size, const EIGENTYPE* src, EIGENTYPE* dst) { \
-      VMLOP(size, (const VMLTYPE*)src, (VMLTYPE*)dst); \
-    } \
-  };
+#define EIGEN_VMLMODE_EXPAND__
+
+#define EIGEN_VMLMODE_PREFIX_LA vm
+#define EIGEN_VMLMODE_PREFIX__  v
+#define EIGEN_VMLMODE_PREFIX(VMLMODE) EIGEN_CAT(EIGEN_VMLMODE_PREFIX_,VMLMODE)
+
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE, VMLMODE) \
+  template< typename DstXprType, typename SrcXprNested> \
+  struct Assignment<DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested>, assign_op<EIGENTYPE>, \
+                    Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml,EIGENTYPE>::type> { \
+    typedef CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested> SrcXprType; \
+    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE> &/*func*/) { \
+      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
+      if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) { \
+        VMLOP(dst.size(), (const VMLTYPE*)src.nestedExpression().data(), \
+              (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE) ); \
+      } else { \
+        const Index outerSize = dst.outerSize(); \
+        for(Index outer = 0; outer < outerSize; ++outer) { \
+          const EIGENTYPE *src_ptr = src.IsRowMajor ? &(src.nestedExpression().coeffRef(outer,0)) : \
+                                                      &(src.nestedExpression().coeffRef(0, outer)); \
+          EIGENTYPE *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer)); \
+          VMLOP( dst.innerSize(), (const VMLTYPE*)src_ptr, \
+                 (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE)); \
+        } \
+      } \
+    } \
+  }; \
+
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(EIGENOP, VMLOP, VMLMODE) \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, EIGEN_CAT(EIGEN_VMLMODE_PREFIX(VMLMODE),s##VMLOP), float, float, VMLMODE) \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, EIGEN_CAT(EIGEN_VMLMODE_PREFIX(VMLMODE),d##VMLOP), double, double, VMLMODE)
+
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_CPLX(EIGENOP, VMLOP, VMLMODE) \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, EIGEN_CAT(EIGEN_VMLMODE_PREFIX(VMLMODE),c##VMLOP), scomplex, MKL_Complex8, VMLMODE) \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, EIGEN_CAT(EIGEN_VMLMODE_PREFIX(VMLMODE),z##VMLOP), dcomplex, MKL_Complex16, VMLMODE)
+
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS(EIGENOP, VMLOP, VMLMODE) \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(EIGENOP, VMLOP, VMLMODE) \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_CPLX(EIGENOP, VMLOP, VMLMODE)
+
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(sin,   Sin,   LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(asin,  Asin,  LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(sinh,  Sinh,  LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(cos,   Cos,   LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(acos,  Acos,  LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(cosh,  Cosh,  LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(tan,   Tan,   LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(atan,  Atan,  LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(tanh,  Tanh,  LA)
+// EIGEN_MKL_VML_DECLARE_UNARY_CALLS(abs, Abs, _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(exp,   Exp,   LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(log,   Ln,    LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(log10, Log10, LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(sqrt,  Sqrt,  _)
+
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(square, Sqr,   _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_CPLX(arg,    Arg,   _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(round,  Round, _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(floor,  Floor, _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil,   Ceil,  _)
+
+#define EIGEN_MKL_VML_DECLARE_POW_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE, VMLMODE) \
+  template< typename DstXprType, typename SrcXprNested> \
+  struct Assignment<DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested>, assign_op<EIGENTYPE>, \
+                    Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml,EIGENTYPE>::type> { \
+    typedef CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested> SrcXprType; \
+    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE> &/*func*/) { \
+      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
+      VMLTYPE exponent = reinterpret_cast<const VMLTYPE&>(src.functor().m_exponent); \
+      if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) \
+      { \
+        VMLOP( dst.size(), (const VMLTYPE*)src.nestedExpression().data(), exponent, \
+               (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE) ); \
+      } else { \
+        const Index outerSize = dst.outerSize(); \
+        for(Index outer = 0; outer < outerSize; ++outer) { \
+          const EIGENTYPE *src_ptr = src.IsRowMajor ? &(src.nestedExpression().coeffRef(outer,0)) : \
+                                                      &(src.nestedExpression().coeffRef(0, outer)); \
+          EIGENTYPE *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer)); \
+          VMLOP( dst.innerSize(), (const VMLTYPE*)src_ptr, exponent, \
+                 (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE)); \
+        } \
+      } \
     } \
   };

-#define EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE) \
-  template<> struct vml_call< scalar_##EIGENOP##_op<EIGENTYPE> > { \
-    enum { IsSupported = 1 }; \
-    static inline void run( const scalar_##EIGENOP##_op<EIGENTYPE>& /*func*/, \
-                            int size, const EIGENTYPE* src, EIGENTYPE* dst) { \
-      MKL_INT64 vmlMode = EIGEN_MKL_VML_MODE; \
-      VMLOP(size, (const VMLTYPE*)src, (VMLTYPE*)dst, vmlMode); \
-    } \
-  };
+EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmsPowx, float,    float,         LA)
+EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmdPowx, double,   double,        LA)
+EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmcPowx, scomplex, MKL_Complex8,  LA)
+EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmzPowx, dcomplex, MKL_Complex16, LA)

-#define EIGEN_MKL_VML_DECLARE_POW_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE) \
-  template<> struct vml_call< scalar_##EIGENOP##_op<EIGENTYPE> > { \
-    enum { IsSupported = 1 }; \
-    static inline void run( const scalar_##EIGENOP##_op<EIGENTYPE>& func, \
-                            int size, const EIGENTYPE* src, EIGENTYPE* dst) { \
-      EIGENTYPE exponent = func.m_exponent; \
-      MKL_INT64 vmlMode = EIGEN_MKL_VML_MODE; \
-      VMLOP(&size, (const VMLTYPE*)src, (const VMLTYPE*)&exponent, \
-            (VMLTYPE*)dst, &vmlMode); \
-    } \
-  };
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(EIGENOP, VMLOP) \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, vs##VMLOP, float, float) \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, vd##VMLOP, double, double)
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_COMPLEX(EIGENOP, VMLOP) \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, vc##VMLOP, scomplex, MKL_Complex8) \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, vz##VMLOP, dcomplex, MKL_Complex16)
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS(EIGENOP, VMLOP) \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(EIGENOP, VMLOP) \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_COMPLEX(EIGENOP, VMLOP)
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL_LA(EIGENOP, VMLOP) \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, vms##VMLOP, float, float) \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, vmd##VMLOP, double, double)
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_COMPLEX_LA(EIGENOP, VMLOP) \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, vmc##VMLOP, scomplex, MKL_Complex8) \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, vmz##VMLOP, dcomplex, MKL_Complex16)
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(EIGENOP, VMLOP) \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL_LA(EIGENOP, VMLOP) \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_COMPLEX_LA(EIGENOP, VMLOP)
-
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(sin, Sin)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(asin, Asin)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(sinh, Sinh)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(cos, Cos)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(acos, Acos)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(cosh, Cosh)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(tan, Tan)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(atan, Atan)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(tanh, Tanh)
-//EIGEN_MKL_VML_DECLARE_UNARY_CALLS(abs, Abs)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(exp, Exp)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(log, Ln)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(log10, Log10)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS(sqrt, Sqrt)
-
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(square, Sqr)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_COMPLEX(arg, Arg)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(round, Round)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(floor, Floor)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil, Ceil)
-
-// The vm*powx functions are not avaibale in the windows version of MKL.
-#ifndef _WIN32
-EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmspowx_, float, float)
-EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmdpowx_, double, double)
-EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmcpowx_, scomplex, MKL_Complex8)
-EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmzpowx_, dcomplex, MKL_Complex16)
-#endif

 } // end namespace internal
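With the rewritten macros, each supported coefficient-wise function gets an Assignment specialization that forwards a whole eligible expression to the matching VML batch routine (vsSin, vdExp, vmsPowx, and so on). The sketch below shows the kind of user code that can take this path; it assumes Eigen is built with MKL's VML support enabled and that the arrays are large enough to pass the size threshold:

    #include <Eigen/Core>
    using namespace Eigen;

    ArrayXf x = ArrayXf::Random(1 << 20);
    ArrayXf y(x.size());
    y = x.sin();              // candidate for a single vsSin call over the whole array
    ArrayXf z = x.pow(3.5f);  // pow with a scalar exponent maps to the Powx-style variants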
@@ -113,10 +113,10 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp
   */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, Derived>
+EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
 DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func)
 {
-  return CwiseNullaryOp<CustomNullaryOp, Derived>(rows, cols, func);
+  return CwiseNullaryOp<CustomNullaryOp, PlainObject>(rows, cols, func);
 }

 /** \returns an expression of a matrix defined by a custom functor \a func
@@ -139,12 +139,12 @@ DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& f
   */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, Derived>
+EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
 DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  if(RowsAtCompileTime == 1) return CwiseNullaryOp<CustomNullaryOp, Derived>(1, size, func);
-  else return CwiseNullaryOp<CustomNullaryOp, Derived>(size, 1, func);
+  if(RowsAtCompileTime == 1) return CwiseNullaryOp<CustomNullaryOp, PlainObject>(1, size, func);
+  else return CwiseNullaryOp<CustomNullaryOp, PlainObject>(size, 1, func);
 }

 /** \returns an expression of a matrix defined by a custom functor \a func
@@ -158,10 +158,10 @@ DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
   */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, Derived>
+EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
 DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
 {
-  return CwiseNullaryOp<CustomNullaryOp, Derived>(RowsAtCompileTime, ColsAtCompileTime, func);
+  return CwiseNullaryOp<CustomNullaryOp, PlainObject>(RowsAtCompileTime, ColsAtCompileTime, func);
 }

 /** \returns an expression of a constant matrix of value \a value
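NullaryExpr now types its CwiseNullaryOp on the expression's PlainObject instead of the derived expression; call sites do not change. A small usage sketch with a made-up functor (IndexSum is hypothetical, not part of Eigen):

    #include <Eigen/Core>
    using namespace Eigen;

    struct IndexSum {
      double operator()(Index row, Index col) const { return double(row + col); }
    };

    MatrixXd m = MatrixXd::NullaryExpr(4, 5, IndexSum());  // coefficient (i,j) equals i + j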
@@ -49,6 +49,8 @@ template<typename Derived> class DenseBase
   public:
     using internal::special_scalar_op_base<Derived,typename internal::traits<Derived>::Scalar,
                   typename NumTraits<typename internal::traits<Derived>::Scalar>::Real>::operator*;
+    using internal::special_scalar_op_base<Derived,typename internal::traits<Derived>::Scalar,
+                  typename NumTraits<typename internal::traits<Derived>::Scalar>::Real>::operator/;


     /** Inner iterator type to iterate over the coefficients of a row or column.
@@ -179,6 +181,35 @@ template<typename Derived> class DenseBase

     enum { IsPlainObjectBase = 0 };

+    /** The plain matrix type corresponding to this expression.
+      * \sa PlainObject */
+    typedef Matrix<typename internal::traits<Derived>::Scalar,
+                internal::traits<Derived>::RowsAtCompileTime,
+                internal::traits<Derived>::ColsAtCompileTime,
+                AutoAlign | (internal::traits<Derived>::Flags&RowMajorBit ? RowMajor : ColMajor),
+                internal::traits<Derived>::MaxRowsAtCompileTime,
+                internal::traits<Derived>::MaxColsAtCompileTime
+          > PlainMatrix;
+
+    /** The plain array type corresponding to this expression.
+      * \sa PlainObject */
+    typedef Array<typename internal::traits<Derived>::Scalar,
+                internal::traits<Derived>::RowsAtCompileTime,
+                internal::traits<Derived>::ColsAtCompileTime,
+                AutoAlign | (internal::traits<Derived>::Flags&RowMajorBit ? RowMajor : ColMajor),
+                internal::traits<Derived>::MaxRowsAtCompileTime,
+                internal::traits<Derived>::MaxColsAtCompileTime
+          > PlainArray;
+
+    /** \brief The plain matrix or array type corresponding to this expression.
+      *
+      * This is not necessarily exactly the return type of eval(). In the case of plain matrices,
+      * the return type of eval() is a const reference to a matrix, not a matrix! It is however guaranteed
+      * that the return type of eval() is either PlainObject or const PlainObject&.
+      */
+    typedef typename internal::conditional<internal::is_same<typename internal::traits<Derived>::XprKind,MatrixXpr >::value,
+                                           PlainMatrix, PlainArray>::type PlainObject;
+
     /** \returns the number of nonzero coefficients which is in practice the number
       * of stored coefficients. */
     EIGEN_DEVICE_FUNC
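With PlainMatrix, PlainArray and PlainObject defined in DenseBase, generic code can name the concrete type an arbitrary dense expression evaluates to, whether it is matrix-like or array-like. A common pattern, sketched under that assumption:

    #include <Eigen/Core>

    // Returns an evaluated copy of any dense expression (matrix or array alike).
    template<typename Derived>
    typename Eigen::DenseBase<Derived>::PlainObject
    evaluated(const Eigen::DenseBase<Derived>& expr)
    {
      return expr.derived();  // constructing the PlainObject forces evaluation
    }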
@@ -237,13 +268,12 @@ template<typename Derived> class DenseBase
     }

 #ifndef EIGEN_PARSED_BY_DOXYGEN
-
     /** \internal Represents a matrix with all coefficients equal to one another*/
-    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,Derived> ConstantReturnType;
+    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,PlainObject> ConstantReturnType;
     /** \internal Represents a vector with linearly spaced coefficients that allows sequential access only. */
-    typedef CwiseNullaryOp<internal::linspaced_op<Scalar,false>,Derived> SequentialLinSpacedReturnType;
+    typedef CwiseNullaryOp<internal::linspaced_op<Scalar,false>,PlainObject> SequentialLinSpacedReturnType;
     /** \internal Represents a vector with linearly spaced coefficients that allows random access. */
-    typedef CwiseNullaryOp<internal::linspaced_op<Scalar,true>,Derived> RandomAccessLinSpacedReturnType;
+    typedef CwiseNullaryOp<internal::linspaced_op<Scalar,true>,PlainObject> RandomAccessLinSpacedReturnType;
     /** \internal the return type of MatrixBase::eigenvalues() */
     typedef Matrix<typename NumTraits<typename internal::traits<Derived>::Scalar>::Real, internal::traits<Derived>::ColsAtCompileTime, 1> EigenvaluesReturnType;

@@ -322,13 +352,13 @@ template<typename Derived> class DenseBase
               LinSpaced(const Scalar& low, const Scalar& high);

     template<typename CustomNullaryOp> EIGEN_DEVICE_FUNC
-    static const CwiseNullaryOp<CustomNullaryOp, Derived>
+    static const CwiseNullaryOp<CustomNullaryOp, PlainObject>
                  NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func);
     template<typename CustomNullaryOp> EIGEN_DEVICE_FUNC
-    static const CwiseNullaryOp<CustomNullaryOp, Derived>
+    static const CwiseNullaryOp<CustomNullaryOp, PlainObject>
                  NullaryExpr(Index size, const CustomNullaryOp& func);
     template<typename CustomNullaryOp> EIGEN_DEVICE_FUNC
-    static const CwiseNullaryOp<CustomNullaryOp, Derived>
+    static const CwiseNullaryOp<CustomNullaryOp, PlainObject>
                  NullaryExpr(const CustomNullaryOp& func);

     EIGEN_DEVICE_FUNC static const ConstantReturnType Zero(Index rows, Index cols);
@@ -466,9 +496,10 @@ template<typename Derived> class DenseBase
     ConstColwiseReturnType colwise() const;
     ColwiseReturnType colwise();

-    static const CwiseNullaryOp<internal::scalar_random_op<Scalar>,Derived> Random(Index rows, Index cols);
-    static const CwiseNullaryOp<internal::scalar_random_op<Scalar>,Derived> Random(Index size);
-    static const CwiseNullaryOp<internal::scalar_random_op<Scalar>,Derived> Random();
+    typedef CwiseNullaryOp<internal::scalar_random_op<Scalar>,PlainObject> RandomReturnType;
+    static const RandomReturnType Random(Index rows, Index cols);
+    static const RandomReturnType Random(Index size);
+    static const RandomReturnType Random();

     template<typename ThenDerived,typename ElseDerived>
     const Select<Derived,ThenDerived,ElseDerived>
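RandomReturnType only renames the nullary expression returned by the three Random() overloads; usage is unchanged (sizes are illustrative):

    #include <Eigen/Core>
    using namespace Eigen;

    MatrixXd A = MatrixXd::Random(3, 3);  // dynamic size: Random(rows, cols)
    VectorXd v = VectorXd::Random(10);    // vectors: Random(size)
    Matrix2f B = Matrix2f::Random();      // fixed size: Random()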
@@ -349,6 +349,7 @@ struct hypot_retval
 template<typename OldType, typename NewType>
 struct cast_impl
 {
+  EIGEN_DEVICE_FUNC
   static inline NewType run(const OldType& x)
   {
     return static_cast<NewType>(x);
@@ -360,6 +361,7 @@ struct cast_impl
 template<typename OldType, typename NewType>
 inline NewType cast(const OldType& x)
 {
+  EIGEN_DEVICE_FUNC
   return cast_impl<OldType, NewType>::run(x);
 }

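cast_impl is the per-coefficient conversion used by DenseBase::cast<NewType>(); marking it EIGEN_DEVICE_FUNC lets the same path compile in device code. Typical host-side usage:

    #include <Eigen/Core>

    Eigen::VectorXd xd = Eigen::VectorXd::LinSpaced(5, 0.0, 1.0);
    Eigen::VectorXf xf = xd.cast<float>();  // each coefficient goes through internal::cast_impl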
@@ -81,6 +81,7 @@ template<typename Derived> class MatrixBase
     using Base::operator*=;
     using Base::operator/=;
     using Base::operator*;
+    using Base::operator/;

     typedef typename Base::CoeffReturnType CoeffReturnType;
     typedef typename Base::ConstTransposeReturnType ConstTransposeReturnType;
@@ -101,23 +102,11 @@ template<typename Derived> class MatrixBase
     EIGEN_DEVICE_FUNC
     inline Index diagonalSize() const { return (std::min)(rows(),cols()); }

-    /** \brief The plain matrix type corresponding to this expression.
-      *
-      * This is not necessarily exactly the return type of eval(). In the case of plain matrices,
-      * the return type of eval() is a const reference to a matrix, not a matrix! It is however guaranteed
-      * that the return type of eval() is either PlainObject or const PlainObject&.
-      */
-    typedef Matrix<typename internal::traits<Derived>::Scalar,
-                internal::traits<Derived>::RowsAtCompileTime,
-                internal::traits<Derived>::ColsAtCompileTime,
-                AutoAlign | (internal::traits<Derived>::Flags&RowMajorBit ? RowMajor : ColMajor),
-                internal::traits<Derived>::MaxRowsAtCompileTime,
-                internal::traits<Derived>::MaxColsAtCompileTime
-    > PlainObject;
+    typedef typename Base::PlainObject PlainObject;

 #ifndef EIGEN_PARSED_BY_DOXYGEN
     /** \internal Represents a matrix with all coefficients equal to one another*/
-    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,Derived> ConstantReturnType;
+    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,PlainObject> ConstantReturnType;
     /** \internal the return type of MatrixBase::adjoint() */
     typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
                       CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, ConstTransposeReturnType>,
@@ -126,7 +115,7 @@ template<typename Derived> class MatrixBase
     /** \internal Return type of eigenvalues() */
     typedef Matrix<std::complex<RealScalar>, internal::traits<Derived>::ColsAtCompileTime, 1, ColMajor> EigenvaluesReturnType;
     /** \internal the return type of identity */
-    typedef CwiseNullaryOp<internal::scalar_identity_op<Scalar>,Derived> IdentityReturnType;
+    typedef CwiseNullaryOp<internal::scalar_identity_op<Scalar>,PlainObject> IdentityReturnType;
     /** \internal the return type of unit vectors */
     typedef Block<const CwiseNullaryOp<internal::scalar_identity_op<Scalar>, SquareMatrixType>,
                   internal::traits<Derived>::RowsAtCompileTime,
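IdentityReturnType, like the other nullary return types, is now expressed in terms of PlainObject; Identity() keeps its lazy-expression semantics:

    #include <Eigen/Core>
    using namespace Eigen;

    Matrix3d I3 = Matrix3d::Identity();            // fixed-size identity
    MatrixXd I  = MatrixXd::Identity(4, 4);        // dynamic-size identity
    MatrixXd M  = 2.0 * MatrixXd::Identity(4, 4);  // still an expression until assigned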
Eigen/src/Core/ProductEvaluators.h (5 changes, Normal file → Executable file)
@@ -751,7 +751,6 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
   using Base::m_diagImpl;
   using Base::m_matImpl;
   using Base::coeff;
-  using Base::packet_impl;
   typedef typename Base::Scalar Scalar;
   typedef typename Base::PacketScalar PacketScalar;

@@ -776,7 +775,8 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
   template<int LoadMode>
   EIGEN_STRONG_INLINE PacketScalar packet(Index row, Index col) const
   {
-    // NVCC complains about template keyword, so we disable this function in CUDA mode
+    // FIXME: NVCC used to complain about the template keyword, but we have to check whether this is still the case.
+    // See also similar calls below.
     return this->template packet_impl<LoadMode>(row,col, row,
                  typename internal::conditional<int(StorageOrder)==RowMajor, internal::true_type, internal::false_type>::type());
   }
@@ -798,7 +798,6 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
   using Base::m_diagImpl;
   using Base::m_matImpl;
   using Base::coeff;
-  using Base::packet_impl;
   typedef typename Base::Scalar Scalar;
   typedef typename Base::PacketScalar PacketScalar;

|
@@ -53,7 +53,7 @@ struct functor_traits<scalar_random_op<Scalar> >
   * \sa DenseBase::setRandom(), DenseBase::Random(Index), DenseBase::Random()
   */
 template<typename Derived>
-inline const CwiseNullaryOp<internal::scalar_random_op<typename internal::traits<Derived>::Scalar>, Derived>
+inline const typename DenseBase<Derived>::RandomReturnType
 DenseBase<Derived>::Random(Index rows, Index cols)
 {
   return NullaryExpr(rows, cols, internal::scalar_random_op<Scalar>());
@@ -84,7 +84,7 @@ DenseBase<Derived>::Random(Index rows, Index cols)
   * \sa DenseBase::setRandom(), DenseBase::Random(Index,Index), DenseBase::Random()
   */
 template<typename Derived>
-inline const CwiseNullaryOp<internal::scalar_random_op<typename internal::traits<Derived>::Scalar>, Derived>
+inline const typename DenseBase<Derived>::RandomReturnType
 DenseBase<Derived>::Random(Index size)
 {
   return NullaryExpr(size, internal::scalar_random_op<Scalar>());
@@ -110,7 +110,7 @@ DenseBase<Derived>::Random(Index size)
   * \sa DenseBase::setRandom(), DenseBase::Random(Index,Index), DenseBase::Random(Index)
   */
 template<typename Derived>
-inline const CwiseNullaryOp<internal::scalar_random_op<typename internal::traits<Derived>::Scalar>, Derived>
+inline const typename DenseBase<Derived>::RandomReturnType
 DenseBase<Derived>::Random()
 {
   return NullaryExpr(RowsAtCompileTime, ColsAtCompileTime, internal::scalar_random_op<Scalar>());
|
@@ -70,10 +70,6 @@ template<typename MatrixType, int Direction> class Reverse
     typedef typename internal::remove_all<MatrixType>::type NestedExpression;
     using Base::IsRowMajor;

-    // next line is necessary because otherwise const version of operator()
-    // is hidden by non-const version defined in this file
-    using Base::operator();
-
   protected:
     enum {
       PacketSize = internal::packet_traits<Scalar>::size,
@@ -101,69 +97,6 @@ template<typename MatrixType, int Direction> class Reverse
       return -m_matrix.innerStride();
     }

-    EIGEN_DEVICE_FUNC inline Scalar& operator()(Index row, Index col)
-    {
-      eigen_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
-      return coeffRef(row, col);
-    }
-
-    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col)
-    {
-      return m_matrix.const_cast_derived().coeffRef(ReverseRow ? m_matrix.rows() - row - 1 : row,
-                                                    ReverseCol ? m_matrix.cols() - col - 1 : col);
-    }
-
-    EIGEN_DEVICE_FUNC inline CoeffReturnType coeff(Index row, Index col) const
-    {
-      return m_matrix.coeff(ReverseRow ? m_matrix.rows() - row - 1 : row,
-                            ReverseCol ? m_matrix.cols() - col - 1 : col);
-    }
-
-    EIGEN_DEVICE_FUNC inline CoeffReturnType coeff(Index index) const
-    {
-      return m_matrix.coeff(m_matrix.size() - index - 1);
-    }
-
-    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index)
-    {
-      return m_matrix.const_cast_derived().coeffRef(m_matrix.size() - index - 1);
-    }
-
-    EIGEN_DEVICE_FUNC inline Scalar& operator()(Index index)
-    {
-      eigen_assert(index >= 0 && index < m_matrix.size());
-      return coeffRef(index);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index row, Index col) const
-    {
-      return reverse_packet::run(m_matrix.template packet<LoadMode>(
-                                    ReverseRow ? m_matrix.rows() - row - OffsetRow : row,
-                                    ReverseCol ? m_matrix.cols() - col - OffsetCol : col));
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index row, Index col, const PacketScalar& x)
-    {
-      m_matrix.const_cast_derived().template writePacket<LoadMode>(
-                                    ReverseRow ? m_matrix.rows() - row - OffsetRow : row,
-                                    ReverseCol ? m_matrix.cols() - col - OffsetCol : col,
-                                    reverse_packet::run(x));
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index index) const
-    {
-      return internal::preverse(m_matrix.template packet<LoadMode>( m_matrix.size() - index - PacketSize ));
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index index, const PacketScalar& x)
-    {
-      m_matrix.const_cast_derived().template writePacket<LoadMode>(m_matrix.size() - index - PacketSize, internal::preverse(x));
-    }
-
     EIGEN_DEVICE_FUNC const typename internal::remove_all<typename MatrixType::Nested>::type&
     nestedExpression() const
     {
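The coefficient and packet accessors removed above are supplied by the evaluator framework instead, so user-level reverse() is unaffected:

    #include <Eigen/Core>
    using namespace Eigen;

    VectorXd v = VectorXd::LinSpaced(5, 0.0, 4.0);
    VectorXd r = v.reverse();          // 4, 3, 2, 1, 0
    MatrixXd m = MatrixXd::Random(3, 4);
    MatrixXd f = m.reverse();          // both row and column order reversed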
|
@@ -157,6 +157,7 @@ inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
 MatrixBase<Derived>::stableNorm() const
 {
   using std::sqrt;
+  using std::abs;
   const Index blockSize = 4096;
   RealScalar scale(0);
   RealScalar invScale(1);
@@ -164,12 +165,18 @@ MatrixBase<Derived>::stableNorm() const
   enum {
     Alignment = (int(Flags)&DirectAccessBit) || (int(Flags)&AlignedBit) ? 1 : 0
   };
+  typedef typename internal::conditional<Alignment, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, Aligned>,
+                                         typename Base::ConstSegmentReturnType>::type SegmentWrapper;
   Index n = size();

+  if(n==1)
+    return abs(this->coeff(0));
+
   Index bi = internal::first_aligned(derived());
   if (bi>0)
     internal::stable_norm_kernel(this->head(bi), ssq, scale, invScale);
   for (; bi<n; bi+=blockSize)
-    internal::stable_norm_kernel(this->segment(bi,numext::mini(blockSize, n - bi)).template forceAlignedAccessIf<Alignment>(), ssq, scale, invScale);
+    internal::stable_norm_kernel(SegmentWrapper(this->segment(bi,numext::mini(blockSize, n - bi))), ssq, scale, invScale);
   return scale * sqrt(ssq);
 }

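stableNorm() rescales blockwise to avoid underflow and overflow, and with this change a one-element vector short-circuits to abs(). An illustrative comparison (values chosen so that a plain squared-sum accumulation underflows):

    #include <Eigen/Core>
    using namespace Eigen;

    VectorXd v(3);
    v << 1e-170, 2e-170, 2e-170;
    double n1 = v.norm();        // squared terms underflow, so this can return 0
    double n2 = v.stableNorm();  // rescaled accumulation returns a sensible value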
|
@@ -392,6 +392,18 @@ template<typename Scalar>
 struct functor_traits<scalar_quotient1_op<Scalar> >
 { enum { Cost = 2 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasDiv }; };

+template<typename Scalar1, typename Scalar2>
+struct scalar_quotient2_op {
+  typedef typename scalar_product_traits<Scalar1,Scalar2>::ReturnType result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_quotient2_op(const scalar_quotient2_op& other) : m_other(other.m_other) { }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_quotient2_op(const Scalar2& other) : m_other(other) { }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar1& a) const { return a / m_other; }
+  typename add_const_on_value_type<typename NumTraits<Scalar2>::Nested>::type m_other;
+};
+template<typename Scalar1,typename Scalar2>
+struct functor_traits<scalar_quotient2_op<Scalar1,Scalar2> >
+{ enum { Cost = 2 * NumTraits<Scalar1>::MulCost, PacketAccess = false }; };
+
 // In Eigen, any binary op (Product, CwiseBinaryOp) require the Lhs and Rhs to have the same scalar type, except for multiplication
 // where the mixing of different types is handled by scalar_product_traits
 // In particular, real * complex<real> is allowed.
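scalar_quotient2_op divides each coefficient by a fixed scalar of a possibly different type, mirroring scalar_multiple2_op for division. Together with the operator/ added to special_scalar_op_base further below it enables mixed real/complex quotients; a sketch assuming this patch is in place:

    #include <Eigen/Core>
    using namespace Eigen;

    MatrixXd  A = MatrixXd::Random(3, 3);
    MatrixXd  B = A / 2.0;               // same-scalar quotient, as before
    VectorXcf v = VectorXcf::Random(8);
    VectorXcf w = v / 2.0f;              // complex<float> coefficients divided by a real float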
|
Eigen/src/Core/products/GeneralMatrixVector_MKL.h (25 changes, Normal file → Executable file)
@@ -46,38 +46,37 @@ namespace internal {

 // gemv specialization

-template<typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs>
-struct general_matrix_vector_product_gemv :
-  general_matrix_vector_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,ConjugateRhs,BuiltIn> {};
+template<typename Index, typename LhsScalar, int StorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs>
+struct general_matrix_vector_product_gemv;

 #define EIGEN_MKL_GEMV_SPECIALIZE(Scalar) \
 template<typename Index, bool ConjugateLhs, bool ConjugateRhs> \
-struct general_matrix_vector_product<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs,Specialized> { \
+struct general_matrix_vector_product<Index,Scalar,const_blas_data_mapper<Scalar,Index,ColMajor>,ColMajor,ConjugateLhs,Scalar,const_blas_data_mapper<Scalar,Index,RowMajor>,ConjugateRhs,Specialized> { \
 static void run( \
   Index rows, Index cols, \
-  const Scalar* lhs, Index lhsStride, \
-  const Scalar* rhs, Index rhsIncr, \
+  const const_blas_data_mapper<Scalar,Index,ColMajor> &lhs, \
+  const const_blas_data_mapper<Scalar,Index,RowMajor> &rhs, \
   Scalar* res, Index resIncr, Scalar alpha) \
 { \
   if (ConjugateLhs) { \
-    general_matrix_vector_product<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs,BuiltIn>::run( \
-      rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \
+    general_matrix_vector_product<Index,Scalar,const_blas_data_mapper<Scalar,Index,ColMajor>,ColMajor,ConjugateLhs,Scalar,const_blas_data_mapper<Scalar,Index,RowMajor>,ConjugateRhs,BuiltIn>::run( \
+      rows, cols, lhs, rhs, res, resIncr, alpha); \
   } else { \
     general_matrix_vector_product_gemv<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs>::run( \
-      rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \
+      rows, cols, lhs.data(), lhs.stride(), rhs.data(), rhs.stride(), res, resIncr, alpha); \
   } \
 } \
 }; \
 template<typename Index, bool ConjugateLhs, bool ConjugateRhs> \
-struct general_matrix_vector_product<Index,Scalar,RowMajor,ConjugateLhs,Scalar,ConjugateRhs,Specialized> { \
+struct general_matrix_vector_product<Index,Scalar,const_blas_data_mapper<Scalar,Index,RowMajor>,RowMajor,ConjugateLhs,Scalar,const_blas_data_mapper<Scalar,Index,ColMajor>,ConjugateRhs,Specialized> { \
 static void run( \
   Index rows, Index cols, \
-  const Scalar* lhs, Index lhsStride, \
-  const Scalar* rhs, Index rhsIncr, \
+  const const_blas_data_mapper<Scalar,Index,RowMajor> &lhs, \
+  const const_blas_data_mapper<Scalar,Index,ColMajor> &rhs, \
   Scalar* res, Index resIncr, Scalar alpha) \
 { \
   general_matrix_vector_product_gemv<Index,Scalar,RowMajor,ConjugateLhs,Scalar,ConjugateRhs>::run( \
-    rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \
+    rows, cols, lhs.data(), lhs.stride(), rhs.data(), rhs.stride(), res, resIncr, alpha); \
 } \
 }; \

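The specialized GEMV kernels now receive blas_data_mapper arguments and unpack data() and stride() before calling the underlying routine. From user code this is just a matrix-vector product; whether the MKL path is taken depends on the build configuration and scalar type:

    #include <Eigen/Dense>
    using namespace Eigen;

    MatrixXf A = MatrixXf::Random(512, 512);
    VectorXf x = VectorXf::Random(512);
    VectorXf y(512);
    y.noalias() = A * x;  // dispatched to the gemv kernel; an MKL ?gemv call when MKL support is enabled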
|
Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h (4 changes, Normal file → Executable file)
@@ -122,7 +122,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
     Map<const MatrixLhs, 0, OuterStride<> > lhsMap(_lhs,rows,depth,OuterStride<>(lhsStride)); \
     MatrixLhs aa_tmp=lhsMap.template triangularView<Mode>(); \
     MKL_INT aStride = aa_tmp.outerStride(); \
-    gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth); \
+    gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \
     general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor>::run( \
       rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, resStride, alpha, gemm_blocking, 0); \
 \
@@ -236,7 +236,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
     Map<const MatrixRhs, 0, OuterStride<> > rhsMap(_rhs,depth,cols, OuterStride<>(rhsStride)); \
     MatrixRhs aa_tmp=rhsMap.template triangularView<Mode>(); \
     MKL_INT aStride = aa_tmp.outerStride(); \
-    gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth); \
+    gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \
     general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor>::run( \
       rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, resStride, alpha, gemm_blocking, 0); \
 \
|
Eigen/src/Core/util/BlasUtil.h (1 change, Normal file → Executable file)
@@ -224,6 +224,7 @@ class blas_data_mapper {
   }

   const Index stride() const { return m_stride; }
+  const Scalar* data() const { return m_data; }

   Index firstAligned(Index size) const {
     if (size_t(m_data)%sizeof(Scalar)) {
|
@@ -213,6 +213,7 @@ template<typename Scalar> struct scalar_identity_op;
 template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_product_op;
 template<typename LhsScalar,typename RhsScalar> struct scalar_multiple2_op;
 template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_quotient_op;
+template<typename LhsScalar,typename RhsScalar> struct scalar_quotient2_op;

 } // end namespace internal

|
@@ -645,7 +645,7 @@ namespace Eigen {
  // just an empty macro !
  #define EIGEN_EMPTY

- #if EIGEN_COMP_MSVC_STRICT && EIGEN_COMP_MSVC < 1800 // for older MSVC versions using the base operator is sufficient (cf Bug 1000)
+ #if EIGEN_COMP_MSVC_STRICT && EIGEN_COMP_MSVC < 1900 // for older MSVC versions using the base operator is sufficient (cf Bug 1000)
  #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
  using Base::operator =;
  #elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
@@ -427,7 +427,9 @@ struct special_scalar_op_base : public DenseCoeffsBase<Derived>
  {
  // dummy operator* so that the
  // "using special_scalar_op_base::operator*" compiles
- void operator*() const;
+ struct dummy {};
+ void operator*(dummy) const;
+ void operator/(dummy) const;
  };

  template<typename Derived,typename Scalar,typename OtherScalar>

@@ -451,6 +453,16 @@ struct special_scalar_op_base<Derived,Scalar,OtherScalar,true> : public DenseCo
  #endif
  return static_cast<const special_scalar_op_base&>(matrix).operator*(scalar);
  }

+ const CwiseUnaryOp<scalar_quotient2_op<Scalar,OtherScalar>, Derived>
+ operator/(const OtherScalar& scalar) const
+ {
+ #ifdef EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN
+ EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN
+ #endif
+ return CwiseUnaryOp<scalar_quotient2_op<Scalar,OtherScalar>, Derived>
+ (*static_cast<const Derived*>(this), scalar_quotient2_op<Scalar,OtherScalar>(scalar));
+ }
  };

  template<typename XprType, typename CastType> struct cast_return_type
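For reference, a minimal user-level sketch of what the operator/ overload added above is meant to enable: dividing a real-valued expression by a scalar of a wider type (e.g. a complex scalar), mirroring the existing mixed-type product. This is a sketch written against this patch, not authoritative documentation; the multiplication line already works through scalar_multiple2_op, the division line is the new path.

    #include <Eigen/Dense>
    #include <complex>
    using namespace Eigen;

    int main()
    {
      MatrixXd m = MatrixXd::Random(2,2);   // real matrix
      std::complex<double> s(2.0, 1.0);     // complex scalar
      MatrixXcd p = m * s;                  // mixed-type product (pre-existing)
      MatrixXcd q = m / s;                  // mixed-type quotient (what this change adds)
      return 0;
    }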
@@ -486,10 +486,11 @@ ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag

  while (end>0)
  {
+ EIGEN_ASM_COMMENT("beginabs");
  for (Index i = start; i<end; ++i)
  if (internal::isMuchSmallerThan(abs(subdiag[i]),(abs(diag[i])+abs(diag[i+1]))) || abs(subdiag[i]) <= considerAsZero)
  subdiag[i] = 0;
+ EIGEN_ASM_COMMENT("endabs");
  // find the largest unreduced block
  while (end>0 && subdiag[end-1]==0)
  {
@@ -464,9 +464,10 @@ struct tridiagonalization_inplace_selector<MatrixType,3,false>
  static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ)
  {
  using std::sqrt;
+ const RealScalar tol = (std::numeric_limits<RealScalar>::min)();
  diag[0] = mat(0,0);
  RealScalar v1norm2 = numext::abs2(mat(2,0));
- if(v1norm2 == RealScalar(0))
+ if(v1norm2 <= tol)
  {
  diag[1] = mat(1,1);
  diag[2] = mat(2,2);
@@ -18,6 +18,10 @@ namespace Eigen {
  * \returns the cross product of \c *this and \a other
  *
  * Here is a very good explanation of cross-product: http://xkcd.com/199/
+ *
+ * With complex numbers, the cross product is implemented as
+ * \f$ (\mathbf{a}+i\mathbf{b}) \times (\mathbf{c}+i\mathbf{d}) = (\mathbf{a} \times \mathbf{c} - \mathbf{b} \times \mathbf{d}) - i(\mathbf{a} \times \mathbf{d} - \mathbf{b} \times \mathbf{c})\f$
+ *
  * \sa MatrixBase::cross3()
  */
  template<typename Derived>
@@ -75,8 +75,9 @@ void MatrixBase<Derived>::makeHouseholder(

  RealScalar tailSqNorm = size()==1 ? RealScalar(0) : tail.squaredNorm();
  Scalar c0 = coeff(0);
+ const RealScalar tol = (std::numeric_limits<RealScalar>::min)();

- if(tailSqNorm == RealScalar(0) && numext::imag(c0)==RealScalar(0))
+ if(tailSqNorm <= tol && numext::abs2(numext::imag(c0))<=tol)
  {
  tau = RealScalar(0);
  beta = numext::real(c0);
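Since makeHouseholder() is a public MatrixBase method, the relaxed zero test above (smallest normalized value instead of exact zero) is observable from user code. A small usage sketch under arbitrary input values:

    #include <Eigen/Dense>
    using namespace Eigen;

    int main()
    {
      VectorXd v = VectorXd::Random(5);
      VectorXd essential(4);       // Householder vector without its first entry
      double tau, beta;
      v.makeHouseholder(essential, tau, beta);
      // Applying H = I - tau * w * w^T with w = [1; essential] to v
      // maps it to [beta, 0, 0, 0, 0]^T up to rounding.
      return 0;
    }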
@@ -136,6 +136,12 @@ struct traits<BiCGSTAB<_MatrixType,_Preconditioner> >
  * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations
  * and NumTraits<Scalar>::epsilon() for the tolerance.
  *
+ * The tolerance corresponds to the relative residual error: |Ax-b|/|b|
+ *
+ * \b Performance: when using sparse matrices, best performance is achied for a row-major sparse matrix format.
+ * Moreover, in this case multi-threading can be exploited if the user code is compiled with OpenMP enabled.
+ * See \ref TopicMultiThreading for details.
+ *
  * This class can be used as the direct solver classes. Here is a typical usage example:
  * \include BiCGSTAB_simple.cpp
  *
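The new performance note recommends a row-major sparse matrix for BiCGSTAB. A hedged sketch of that recommendation (the system here is a placeholder; the multi-threaded path additionally assumes the code is compiled with OpenMP enabled):

    #include <Eigen/Sparse>
    #include <Eigen/IterativeLinearSolvers>
    using namespace Eigen;

    int main()
    {
      int n = 1000;
      SparseMatrix<double,RowMajor> A(n,n);   // row-major storage, as the doc suggests
      VectorXd b = VectorXd::Ones(n), x;
      A.setIdentity();                        // placeholder system; fill A for a real problem
      BiCGSTAB<SparseMatrix<double,RowMajor> > solver;
      solver.compute(A);
      x = solver.solve(b);
      return 0;
    }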
@@ -114,20 +114,28 @@ struct traits<ConjugateGradient<_MatrixType,_UpLo,_Preconditioner> >
  *
  * \tparam _MatrixType the type of the matrix A, can be a dense or a sparse matrix.
  * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower,
- * Upper, or Lower|Upper in which the full matrix entries will be considered. Default is Lower.
+ * \c Upper, or \c Lower|Upper in which the full matrix entries will be considered.
+ * Default is \c Lower, best performance is \c Lower|Upper.
  * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner
  *
  * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
  * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations
  * and NumTraits<Scalar>::epsilon() for the tolerance.
  *
+ * The tolerance corresponds to the relative residual error: |Ax-b|/|b|
+ *
+ * \b Performance: Even though the default value of \c _UpLo is \c Lower, significantly higher performance is
+ * achieved when using a complete matrix and \b Lower|Upper as the \a _UpLo template parameter. Moreover, in this
+ * case multi-threading can be exploited if the user code is compiled with OpenMP enabled.
+ * See \ref TopicMultiThreading for details.
+ *
  * This class can be used as the direct solver classes. Here is a typical usage example:
  \code
  int n = 10000;
  VectorXd x(n), b(n);
  SparseMatrix<double> A(n,n);
  // fill A and b
- ConjugateGradient<SparseMatrix<double> > cg;
+ ConjugateGradient<SparseMatrix<double>, Lower|Upper> cg;
  cg.compute(A);
  x = cg.solve(b);
  std::cout << "#iterations: " << cg.iterations() << std::endl;
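For completeness, a sketch of the variant that the new \b Performance paragraph describes: a fully stored symmetric matrix, Lower|Upper, row-major storage, and (optionally) OpenMP. The matrix must then contain both triangles, not just one. Values and sizes below are placeholders.

    #include <Eigen/Sparse>
    #include <Eigen/IterativeLinearSolvers>
    using namespace Eigen;

    int main()
    {
      int n = 1000;
      SparseMatrix<double,RowMajor> A(n,n);   // full symmetric matrix, both triangles stored
      VectorXd b = VectorXd::Ones(n), x;
      A.setIdentity();                        // placeholder; fill with a real symmetric matrix
      ConjugateGradient<SparseMatrix<double,RowMajor>, Lower|Upper> cg;
      cg.compute(A);
      x = cg.solve(b);
      return 0;
    }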
@@ -183,10 +191,13 @@ public:
  template<typename Rhs,typename Dest>
  void _solve_with_guess_impl(const Rhs& b, Dest& x) const
  {
+ typedef Ref<const MatrixType> MatRef;
+ typedef typename internal::conditional<UpLo==(Lower|Upper) && (!MatrixType::IsRowMajor) && (!NumTraits<Scalar>::IsComplex),
+ Transpose<const MatRef>, MatRef const&>::type RowMajorWrapper;
  typedef typename internal::conditional<UpLo==(Lower|Upper),
- Ref<const MatrixType>&,
- typename Ref<const MatrixType>::template ConstSelfAdjointViewReturnType<UpLo>::Type
- >::type MatrixWrapperType;
+ RowMajorWrapper,
+ typename MatRef::template ConstSelfAdjointViewReturnType<UpLo>::Type
+ >::type SelfAdjointWrapper;
  m_iterations = Base::maxIterations();
  m_error = Base::m_tolerance;

@@ -196,7 +207,8 @@ public:
  m_error = Base::m_tolerance;

  typename Dest::ColXpr xj(x,j);
- internal::conjugate_gradient(MatrixWrapperType(mp_matrix), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error);
+ RowMajorWrapper row_mat(mp_matrix);
+ internal::conjugate_gradient(SelfAdjointWrapper(row_mat), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error);
  }

  m_isInitialized = true;
@@ -126,10 +126,16 @@ public:
  /** \internal */
  Index cols() const { return mp_matrix.cols(); }

- /** \returns the tolerance threshold used by the stopping criteria */
+ /** \returns the tolerance threshold used by the stopping criteria.
+  * \sa setTolerance()
+  */
  RealScalar tolerance() const { return m_tolerance; }

- /** Sets the tolerance threshold used by the stopping criteria */
+ /** Sets the tolerance threshold used by the stopping criteria.
+  *
+  * This value is used as an upper bound to the relative residual error: |Ax-b|/|b|.
+  * The default value is the machine precision given by NumTraits<Scalar>::epsilon()
+  */
  Derived& setTolerance(const RealScalar& tolerance)
  {
  m_tolerance = tolerance;

@@ -167,7 +173,9 @@ public:
  return m_iterations;
  }

- /** \returns the tolerance error reached during the last solve */
+ /** \returns the tolerance error reached during the last solve.
+  * It is a close approximation of the true relative residual error |Ax-b|/|b|.
+  */
  RealScalar error() const
  {
  eigen_assert(m_isInitialized && "ConjugateGradient is not initialized.");
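The clarified documentation ties tolerance() and error() to the relative residual |Ax-b|/|b|. A small sketch of the corresponding calls; any iterative solver built on IterativeSolverBase behaves the same way, and the system below is only a placeholder:

    #include <Eigen/Sparse>
    #include <Eigen/IterativeLinearSolvers>
    #include <iostream>
    using namespace Eigen;

    int main()
    {
      int n = 500;
      SparseMatrix<double> A(n,n);
      A.setIdentity();                          // placeholder system
      VectorXd b = VectorXd::Ones(n);
      ConjugateGradient<SparseMatrix<double>, Lower|Upper> cg;
      cg.setTolerance(1e-8);                    // upper bound on |Ax-b|/|b|
      cg.setMaxIterations(200);
      cg.compute(A);
      VectorXd x = cg.solve(b);
      std::cout << "estimated relative residual: " << cg.error() << "\n";
      return 0;
    }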
Eigen/src/PardisoSupport/PardisoSupport.h (49 changes, Normal file → Executable file)
@@ -54,7 +54,7 @@ namespace internal
  template<>
  struct pardiso_run_selector<long long int>
  {
- typedef long long int IndexTypeType;
+ typedef long long int IndexType;
  static IndexType run( _MKL_DSS_HANDLE_t pt, IndexType maxfct, IndexType mnum, IndexType type, IndexType phase, IndexType n, void *a,
  IndexType *ia, IndexType *ja, IndexType *perm, IndexType nrhs, IndexType *iparm, IndexType msglvl, void *b, void *x)
  {

@@ -93,19 +93,19 @@ namespace internal
  typedef typename _MatrixType::StorageIndex StorageIndex;
  };

- }
+ } // end namespace internal

  template<class Derived>
- class PardisoImpl : public SparseSolveBase<PardisoImpl<Derived>
+ class PardisoImpl : public SparseSolverBase<Derived>
  {
  protected:
- typedef SparseSolveBase<PardisoImpl<Derived> Base;
+ typedef SparseSolverBase<Derived> Base;
  using Base::derived;
  using Base::m_isInitialized;

  typedef internal::pardiso_traits<Derived> Traits;
  public:
- using base::_solve_impl;
+ using Base::_solve_impl;

  typedef typename Traits::MatrixType MatrixType;
  typedef typename Traits::Scalar Scalar;

@@ -173,16 +173,17 @@ class PardisoImpl : public SparseSolveBase<PardisoImpl<Derived>

  Derived& compute(const MatrixType& matrix);

- template<typename BDerived, typename XDerived>
- bool _solve_impl(const MatrixBase<BDerived> &b, MatrixBase<XDerived>& x) const;
+ template<typename Rhs,typename Dest>
+ void _solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const;

  protected:
  void pardisoRelease()
  {
  if(m_isInitialized) // Factorization ran at least once
  {
- internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, -1, m_size, 0, 0, 0, m_perm.data(), 0,
- m_iparm.data(), m_msglvl, 0, 0);
+ internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, -1, m_size,0, 0, 0, m_perm.data(), 0,
+ m_iparm.data(), m_msglvl, NULL, NULL);
+ m_isInitialized = false;
  }
  }

@@ -217,12 +218,14 @@ class PardisoImpl : public SparseSolveBase<PardisoImpl<Derived>
  m_iparm[27] = (sizeof(RealScalar) == 4) ? 1 : 0;
  m_iparm[34] = 1; // C indexing
  m_iparm[59] = 1; // Automatic switch between In-Core and Out-of-Core modes

+ memset(m_pt, 0, sizeof(m_pt));
  }

  protected:
  // cached data to reduce reallocation, etc.

- void manageErrorCode(Index error)
+ void manageErrorCode(Index error) const
  {
  switch(error)
  {

@@ -239,7 +242,7 @@ class PardisoImpl : public SparseSolveBase<PardisoImpl<Derived>
  }

  mutable SparseMatrixType m_matrix;
- ComputationInfo m_info;
+ mutable ComputationInfo m_info;
  bool m_analysisIsOk, m_factorizationIsOk;
  Index m_type, m_msglvl;
  mutable void *m_pt[64];

@@ -256,7 +259,6 @@ Derived& PardisoImpl<Derived>::compute(const MatrixType& a)
  eigen_assert(a.rows() == a.cols());

  pardisoRelease();
- memset(m_pt, 0, sizeof(m_pt));
  m_perm.setZero(m_size);
  derived().getMatrix(a);

@@ -279,7 +281,6 @@ Derived& PardisoImpl<Derived>::analyzePattern(const MatrixType& a)
  eigen_assert(m_size == a.cols());

  pardisoRelease();
- memset(m_pt, 0, sizeof(m_pt));
  m_perm.setZero(m_size);
  derived().getMatrix(a);

@@ -313,12 +314,15 @@ Derived& PardisoImpl<Derived>::factorize(const MatrixType& a)
  return derived();
  }

- template<class Base>
+ template<class Derived>
  template<typename BDerived,typename XDerived>
- bool PardisoImpl<Base>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBase<XDerived>& x) const
+ void PardisoImpl<Derived>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBase<XDerived>& x) const
  {
  if(m_iparm[0] == 0) // Factorization was not computed
- return false;
+ {
+ m_info = InvalidInput;
+ return;
+ }

  //Index n = m_matrix.rows();
  Index nrhs = Index(b.cols());

@@ -353,7 +357,7 @@ bool PardisoImpl<Base>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBase<XD
  m_perm.data(), nrhs, m_iparm.data(), m_msglvl,
  rhs_ptr, x.derived().data());

- return error==0;
+ manageErrorCode(error);
  }


@@ -373,7 +377,7 @@ template<typename MatrixType>
  class PardisoLU : public PardisoImpl< PardisoLU<MatrixType> >
  {
  protected:
- typedef PardisoImpl< PardisoLU<MatrixType> > Base;
+ typedef PardisoImpl<PardisoLU> Base;
  typedef typename Base::Scalar Scalar;
  typedef typename Base::RealScalar RealScalar;
  using Base::pardisoInit;

@@ -401,6 +405,7 @@ class PardisoLU : public PardisoImpl< PardisoLU<MatrixType> >
  void getMatrix(const MatrixType& matrix)
  {
  m_matrix = matrix;
+ m_matrix.makeCompressed();
  }
  };

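The PardisoImpl changes above (SparseSolverBase base class, void _solve_impl reporting through m_info, makeCompressed() on the cached matrix) do not alter the user-facing calls. A hedged sketch of typical usage, assuming Eigen is built and linked against Intel MKL so that the PardisoSupport module is available:

    #include <Eigen/Sparse>
    #include <Eigen/PardisoSupport>   // requires MKL at build/link time
    using namespace Eigen;

    int solve_with_pardiso(const SparseMatrix<double>& A, const VectorXd& b, VectorXd& x)
    {
      PardisoLU<SparseMatrix<double> > solver;
      solver.compute(A);              // analyzePattern + factorize
      if(solver.info() != Success)
        return 1;                     // failure reported via manageErrorCode() above
      x = solver.solve(b);
      return solver.info() == Success ? 0 : 1;
    }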
@@ -424,7 +429,6 @@ class PardisoLLT : public PardisoImpl< PardisoLLT<MatrixType,_UpLo> >
  protected:
  typedef PardisoImpl< PardisoLLT<MatrixType,_UpLo> > Base;
  typedef typename Base::Scalar Scalar;
- typedef typename Base::StorageIndex StorageIndex;
  typedef typename Base::RealScalar RealScalar;
  using Base::pardisoInit;
  using Base::m_matrix;

@@ -432,9 +436,9 @@ class PardisoLLT : public PardisoImpl< PardisoLLT<MatrixType,_UpLo> >

  public:

+ typedef typename Base::StorageIndex StorageIndex;
  enum { UpLo = _UpLo };
  using Base::compute;
- using Base::solve;

  PardisoLLT()
  : Base()

@@ -457,6 +461,7 @@ class PardisoLLT : public PardisoImpl< PardisoLLT<MatrixType,_UpLo> >
  PermutationMatrix<Dynamic,Dynamic,StorageIndex> p_null;
  m_matrix.resize(matrix.rows(), matrix.cols());
  m_matrix.template selfadjointView<Upper>() = matrix.template selfadjointView<UpLo>().twistedBy(p_null);
+ m_matrix.makeCompressed();
  }
  };

@@ -482,7 +487,6 @@ class PardisoLDLT : public PardisoImpl< PardisoLDLT<MatrixType,Options> >
  protected:
  typedef PardisoImpl< PardisoLDLT<MatrixType,Options> > Base;
  typedef typename Base::Scalar Scalar;
- typedef typename Base::StorageIndex StorageIndex;
  typedef typename Base::RealScalar RealScalar;
  using Base::pardisoInit;
  using Base::m_matrix;

@@ -490,8 +494,8 @@ class PardisoLDLT : public PardisoImpl< PardisoLDLT<MatrixType,Options> >

  public:

+ typedef typename Base::StorageIndex StorageIndex;
  using Base::compute;
- using Base::solve;
  enum { UpLo = Options&(Upper|Lower) };

  PardisoLDLT()

@@ -513,6 +517,7 @@ class PardisoLDLT : public PardisoImpl< PardisoLDLT<MatrixType,Options> >
  PermutationMatrix<Dynamic,Dynamic,StorageIndex> p_null;
  m_matrix.resize(matrix.rows(), matrix.cols());
  m_matrix.template selfadjointView<Upper>() = matrix.template selfadjointView<UpLo>().twistedBy(p_null);
+ m_matrix.makeCompressed();
  }
  };

@@ -50,9 +50,9 @@ class CompressedStorage

  CompressedStorage& operator=(const CompressedStorage& other)
  {
+ resize(other.size());
  if(other.size()>0)
  {
- resize(other.size());
  internal::smart_copy(other.m_values, other.m_values + m_size, m_values);
  internal::smart_copy(other.m_indices, other.m_indices + m_size, m_indices);
  }
@@ -16,8 +16,7 @@ template<typename Derived>
  template<typename OtherDerived>
  Derived& SparseMatrixBase<Derived>::operator=(const EigenBase<OtherDerived> &other)
  {
- // TODO use the evaluator mechanism
- other.derived().evalTo(derived());
+ internal::call_assignment_no_alias(derived(), other.derived());
  return derived();
  }

@@ -182,6 +181,39 @@ struct Assignment<DstXprType, Solve<DecType,RhsType>, internal::assign_op<Scalar
  }
  };

+ struct Diagonal2Sparse {};
+
+ template<> struct AssignmentKind<SparseShape,DiagonalShape> { typedef Diagonal2Sparse Kind; };
+
+ template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
+ struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Sparse, Scalar>
+ {
+ typedef typename DstXprType::StorageIndex StorageIndex;
+ typedef Array<StorageIndex,Dynamic,1> ArrayXI;
+ typedef Array<Scalar,Dynamic,1> ArrayXS;
+ template<int Options>
+ static void run(SparseMatrix<Scalar,Options,StorageIndex> &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/)
+ {
+ Index size = src.diagonal().size();
+ dst.makeCompressed();
+ dst.resizeNonZeros(size);
+ Map<ArrayXI>(dst.innerIndexPtr(), size).setLinSpaced(0,StorageIndex(size)-1);
+ Map<ArrayXI>(dst.outerIndexPtr(), size+1).setLinSpaced(0,StorageIndex(size));
+ Map<ArrayXS>(dst.valuePtr(), size) = src.diagonal();
+ }
+
+ template<typename DstDerived>
+ static void run(SparseMatrixBase<DstDerived> &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/)
+ {
+ dst.diagonal() = src.diagonal();
+ }
+
+ static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar> &/*func*/)
+ { dst.diagonal() += src.diagonal(); }
+
+ static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar> &/*func*/)
+ { dst.diagonal() -= src.diagonal(); }
+ };
  } // end namespace internal

  } // end namespace Eigen
@@ -390,6 +390,22 @@ SparseMatrixBase<Derived>::operator+=(const SparseMatrixBase<OtherDerived>& othe
  return derived() = derived() + other.derived();
  }

+ template<typename Derived>
+ template<typename OtherDerived>
+ Derived& SparseMatrixBase<Derived>::operator+=(const DiagonalBase<OtherDerived>& other)
+ {
+ call_assignment_no_alias(derived(), other.derived(), internal::add_assign_op<Scalar>());
+ return derived();
+ }
+
+ template<typename Derived>
+ template<typename OtherDerived>
+ Derived& SparseMatrixBase<Derived>::operator-=(const DiagonalBase<OtherDerived>& other)
+ {
+ call_assignment_no_alias(derived(), other.derived(), internal::sub_assign_op<Scalar>());
+ return derived();
+ }
+
  template<typename Derived>
  template<typename OtherDerived>
  EIGEN_STRONG_INLINE const EIGEN_SPARSE_CWISE_PRODUCT_RETURN_TYPE
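The two new overloads above are what make in-place updates of a sparse matrix by a diagonal expression legal. A minimal sketch (sizes and values are arbitrary); note that through the Diagonal2Sparse add/sub path the diagonal coefficients must already be stored in the sparse matrix, which setIdentity() guarantees here:

    #include <Eigen/Sparse>
    using namespace Eigen;

    int main()
    {
      int n = 4;
      SparseMatrix<double> A(n,n);
      A.setIdentity();                                 // diagonal entries now exist
      VectorXd d = VectorXd::LinSpaced(n, 1.0, 4.0);
      A += d.asDiagonal();                             // new operator+=(DiagonalBase)
      A -= (0.5 * VectorXd::Ones(n)).asDiagonal();     // new operator-=(DiagonalBase)
      return 0;
    }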
@@ -30,23 +30,48 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, t
  typedef typename internal::remove_all<DenseRhsType>::type Rhs;
  typedef typename internal::remove_all<DenseResType>::type Res;
  typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator;
+ typedef typename evaluator<Lhs>::type LhsEval;
  static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha)
  {
- typename evaluator<Lhs>::type lhsEval(lhs);
+ LhsEval lhsEval(lhs);
+
+ Index n = lhs.outerSize();
+ #ifdef EIGEN_HAS_OPENMP
+ Eigen::initParallel();
+ Index threads = Eigen::nbThreads();
+ #endif
+
  for(Index c=0; c<rhs.cols(); ++c)
  {
- Index n = lhs.outerSize();
- for(Index j=0; j<n; ++j)
+ #ifdef EIGEN_HAS_OPENMP
+ // This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
+ // It basically represents the minimal amount of work to be done to be worth it.
+ if(threads>1 && lhs.nonZeros() > 20000)
+ {
+ #pragma omp parallel for schedule(static) num_threads(threads)
+ for(Index i=0; i<n; ++i)
+ processRow(lhsEval,rhs,res,alpha,i,c);
+ }
+ else
+ #endif
+ {
+ for(Index i=0; i<n; ++i)
+ processRow(lhsEval,rhs,res,alpha,i,c);
+ }
+ }
+ }
+
+ static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha, Index i, Index col)
  {
  typename Res::Scalar tmp(0);
- for(LhsInnerIterator it(lhsEval,j); it ;++it)
- tmp += it.value() * rhs.coeff(it.index(),c);
- res.coeffRef(j,c) += alpha * tmp;
- }
- }
+ for(LhsInnerIterator it(lhsEval,i); it ;++it)
+ tmp += it.value() * rhs.coeff(it.index(),col);
+ res.coeffRef(i,col) += alpha * tmp;
  }
  };

+ // FIXME: what is the purpose of the following specialization? Is it for the BlockedSparse format?
  template<typename T1, typename T2/*, int _Options, typename _StrideType*/>
  struct scalar_product_traits<T1, Ref<T2/*, _Options, _StrideType*/> >
  {
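The parallel path above only kicks in when the library is compiled with OpenMP, more than one thread is available, and the left-hand side has more than 20000 non-zeros. A sketch of how the thread count is typically controlled from user code (compile with -fopenmp or the equivalent flag; the matrix below is only a placeholder and is too small to trigger the threshold):

    #include <Eigen/Core>
    #include <Eigen/Sparse>
    using namespace Eigen;

    int main()
    {
      initParallel();                          // optional explicit init
      setNbThreads(4);                         // threads used by parallelized kernels
      int n = 10000;
      SparseMatrix<double,RowMajor> A(n,n);    // this kernel handles the row-major case
      A.setIdentity();                         // placeholder; a real A should exceed 20000 non-zeros
      MatrixXd B = MatrixXd::Random(n, 8), C;
      C = A * B;                               // sparse * dense, parallel when the conditions hold
      return 0;
    }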
@@ -97,8 +97,8 @@ class SparseMatrix
  using Base::isCompressed;
  using Base::nonZeros;
  _EIGEN_SPARSE_PUBLIC_INTERFACE(SparseMatrix)
- EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, +=)
- EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, -=)
+ using Base::operator+=;
+ using Base::operator-=;

  typedef MappedSparseMatrix<Scalar,Flags> Map;
  typedef Diagonal<SparseMatrix> DiagonalReturnType;
@@ -696,6 +696,15 @@ class SparseMatrix
  other.evalTo(*this);
  }

+ /** \brief Copy constructor with in-place evaluation */
+ template<typename OtherDerived>
+ explicit SparseMatrix(const DiagonalBase<OtherDerived>& other)
+ : Base(), m_outerSize(0), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0)
+ {
+ check_template_parameters();
+ *this = other.derived();
+ }
+
  /** Swaps the content of two sparse matrices of the same type.
  * This is a fast operation that simply swaps the underlying pointers and parameters. */
  inline void swap(SparseMatrix& other)
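The new explicit constructor above allows a SparseMatrix to be built directly from a diagonal expression. A minimal sketch, assuming the Diagonal2Sparse assignment path introduced earlier in this commit is in place:

    #include <Eigen/Sparse>
    using namespace Eigen;

    int main()
    {
      VectorXd d = VectorXd::LinSpaced(5, 1.0, 5.0);
      SparseMatrix<double> D(d.asDiagonal());   // 5x5 sparse matrix with d on its diagonal
      // plain assignment (D = d.asDiagonal()) is expected to go through the same path
      return 0;
    }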
@@ -243,6 +243,11 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
  template<typename OtherDerived>
  Derived& operator-=(const SparseMatrixBase<OtherDerived>& other);

+ template<typename OtherDerived>
+ Derived& operator+=(const DiagonalBase<OtherDerived>& other);
+ template<typename OtherDerived>
+ Derived& operator-=(const DiagonalBase<OtherDerived>& other);
+
  Derived& operator*=(const Scalar& other);
  Derived& operator/=(const Scalar& other);

@@ -367,6 +372,8 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
  static inline StorageIndex convert_index(const Index idx) {
  return internal::convert_index<StorageIndex>(idx);
  }
+ private:
+ template<typename Dest> void evalTo(Dest &) const;
  };

  } // end namespace Eigen
@@ -45,8 +45,13 @@ template<typename MatrixType, unsigned int _Mode> class SparseSelfAdjointView
  {
  public:

- enum { Mode = _Mode };
+ enum {
+ Mode = _Mode,
+ RowsAtCompileTime = internal::traits<SparseSelfAdjointView>::RowsAtCompileTime,
+ ColsAtCompileTime = internal::traits<SparseSelfAdjointView>::ColsAtCompileTime
+ };
+
+ typedef EigenBase<SparseSelfAdjointView> Base;
  typedef typename MatrixType::Scalar Scalar;
  typedef typename MatrixType::StorageIndex StorageIndex;
  typedef Matrix<StorageIndex,Dynamic,1> VectorI;

@@ -116,20 +121,6 @@ template<typename MatrixType, unsigned int _Mode> class SparseSelfAdjointView
  template<typename DerivedU>
  SparseSelfAdjointView& rankUpdate(const SparseMatrixBase<DerivedU>& u, const Scalar& alpha = Scalar(1));

- /** \internal triggered by sparse_matrix = SparseSelfadjointView; */
- template<typename DestScalar,int StorageOrder> void evalTo(SparseMatrix<DestScalar,StorageOrder,StorageIndex>& _dest) const
- {
- internal::permute_symm_to_fullsymm<Mode>(m_matrix, _dest);
- }
-
- template<typename DestScalar> void evalTo(DynamicSparseMatrix<DestScalar,ColMajor,StorageIndex>& _dest) const
- {
- // TODO directly evaluate into _dest;
- SparseMatrix<DestScalar,ColMajor,StorageIndex> tmp(_dest.rows(),_dest.cols());
- internal::permute_symm_to_fullsymm<Mode>(m_matrix, tmp);
- _dest = tmp;
- }
-
  /** \returns an expression of P H P^-1 */
  // TODO implement twists in a more evaluator friendly fashion
  SparseSymmetricPermutationProduct<_MatrixTypeNested,Mode> twistedBy(const PermutationMatrix<Dynamic,Dynamic,StorageIndex>& perm) const

@@ -140,7 +131,7 @@ template<typename MatrixType, unsigned int _Mode> class SparseSelfAdjointView
  template<typename SrcMatrixType,int SrcMode>
  SparseSelfAdjointView& operator=(const SparseSymmetricPermutationProduct<SrcMatrixType,SrcMode>& permutedMatrix)
  {
- permutedMatrix.evalTo(*this);
+ internal::call_assignment_no_alias_no_transpose(*this, permutedMatrix);
  return *this;
  }

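The operator= rewritten above is the one exercised when a symmetric sparse matrix is reordered through a selfadjoint view. A hedged sketch of that use; the permutation here is an identity placeholder, a real one would come from an ordering method such as AMD:

    #include <Eigen/Sparse>
    using namespace Eigen;

    int main()
    {
      int n = 6;
      SparseMatrix<double> A(n,n), B(n,n);
      A.setIdentity();                               // placeholder symmetric matrix (lower part used)
      PermutationMatrix<Dynamic,Dynamic,int> P(n);
      P.setIdentity();
      // Compute P A P^T, keeping only the upper triangle of the result:
      B.selfadjointView<Upper>() = A.selfadjointView<Lower>().twistedBy(P);
      return 0;
    }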
@@ -157,11 +148,21 @@ template<typename MatrixType, unsigned int _Mode> class SparseSelfAdjointView
  return *this = src.twistedBy(pnull);
  }

+ void resize(Index rows, Index cols)
+ {
+ EIGEN_ONLY_USED_FOR_DEBUG(rows);
+ EIGEN_ONLY_USED_FOR_DEBUG(cols);
+ eigen_assert(rows == this->rows() && cols == this->cols()
+ && "SparseSelfadjointView::resize() does not actually allow to resize.");
+ }
+
  protected:

  typename MatrixType::Nested m_matrix;
  //mutable VectorI m_countPerRow;
  //mutable VectorI m_countPerCol;
+ private:
+ template<typename Dest> void evalTo(Dest &) const;
  };

  /***************************************************************************

@@ -200,6 +201,47 @@ SparseSelfAdjointView<MatrixType,Mode>::rankUpdate(const SparseMatrixBase<Derive
  return *this;
  }

+ namespace internal {
+
+ // TODO currently a selfadjoint expression has the form SelfAdjointView<.,.>
+ // in the future selfadjoint-ness should be defined by the expression traits
+ // such that Transpose<SelfAdjointView<.,.> > is valid. (currently TriangularBase::transpose() is overloaded to make it work)
+ template<typename MatrixType, unsigned int Mode>
+ struct evaluator_traits<SparseSelfAdjointView<MatrixType,Mode> >
+ {
+ typedef typename storage_kind_to_evaluator_kind<typename MatrixType::StorageKind>::Kind Kind;
+ typedef SparseSelfAdjointShape Shape;
+
+ static const int AssumeAliasing = 0;
+ };
+
+ struct SparseSelfAdjoint2Sparse {};
+
+ template<> struct AssignmentKind<SparseShape,SparseSelfAdjointShape> { typedef SparseSelfAdjoint2Sparse Kind; };
+ template<> struct AssignmentKind<SparseSelfAdjointShape,SparseShape> { typedef Sparse2Sparse Kind; };
+
+ template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
+ struct Assignment<DstXprType, SrcXprType, Functor, SparseSelfAdjoint2Sparse, Scalar>
+ {
+ typedef typename DstXprType::StorageIndex StorageIndex;
+ template<typename DestScalar,int StorageOrder>
+ static void run(SparseMatrix<DestScalar,StorageOrder,StorageIndex> &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/)
+ {
+ internal::permute_symm_to_fullsymm<SrcXprType::Mode>(src.matrix(), dst);
+ }
+
+ template<typename DestScalar>
+ static void run(DynamicSparseMatrix<DestScalar,ColMajor,StorageIndex>& dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/)
+ {
+ // TODO directly evaluate into dst;
+ SparseMatrix<DestScalar,ColMajor,StorageIndex> tmp(dst.rows(),dst.cols());
+ internal::permute_symm_to_fullsymm<SrcXprType::Mode>(src.matrix(), tmp);
+ dst = tmp;
+ }
+ };
+
+ } // end namespace internal
+
  /***************************************************************************
  * Implementation of sparse self-adjoint time dense matrix
  ***************************************************************************/
@@ -253,17 +295,6 @@ inline void sparse_selfadjoint_time_dense_product(const SparseLhsType& lhs, cons
  }
  }

- // TODO currently a selfadjoint expression has the form SelfAdjointView<.,.>
- // in the future selfadjoint-ness should be defined by the expression traits
- // such that Transpose<SelfAdjointView<.,.> > is valid. (currently TriangularBase::transpose() is overloaded to make it work)
- template<typename MatrixType, unsigned int Mode>
- struct evaluator_traits<SparseSelfAdjointView<MatrixType,Mode> >
- {
- typedef typename storage_kind_to_evaluator_kind<typename MatrixType::StorageKind>::Kind Kind;
- typedef SparseSelfAdjointShape Shape;
-
- static const int AssumeAliasing = 0;
- };
-
  template<typename LhsView, typename Rhs, int ProductType>
  struct generic_product_impl<LhsView, Rhs, SparseSelfAdjointShape, DenseShape, ProductType>
@@ -519,12 +550,16 @@ class SparseSymmetricPermutationProduct
  public:
  typedef typename MatrixType::Scalar Scalar;
  typedef typename MatrixType::StorageIndex StorageIndex;
+ enum {
+ RowsAtCompileTime = internal::traits<SparseSymmetricPermutationProduct>::RowsAtCompileTime,
+ ColsAtCompileTime = internal::traits<SparseSymmetricPermutationProduct>::ColsAtCompileTime
+ };
  protected:
  typedef PermutationMatrix<Dynamic,Dynamic,StorageIndex> Perm;
  public:
  typedef Matrix<StorageIndex,Dynamic,1> VectorI;
  typedef typename MatrixType::Nested MatrixTypeNested;
- typedef typename internal::remove_all<MatrixTypeNested>::type _MatrixTypeNested;
+ typedef typename internal::remove_all<MatrixTypeNested>::type NestedExpression;

  SparseSymmetricPermutationProduct(const MatrixType& mat, const Perm& perm)
  : m_matrix(mat), m_perm(perm)

@@ -533,19 +568,8 @@ class SparseSymmetricPermutationProduct
  inline Index rows() const { return m_matrix.rows(); }
  inline Index cols() const { return m_matrix.cols(); }

- template<typename DestScalar, int Options, typename DstIndex>
- void evalTo(SparseMatrix<DestScalar,Options,DstIndex>& _dest) const
- {
- // internal::permute_symm_to_fullsymm<Mode>(m_matrix,_dest,m_perm.indices().data());
- SparseMatrix<DestScalar,(Options&RowMajor)==RowMajor ? ColMajor : RowMajor, DstIndex> tmp;
- internal::permute_symm_to_fullsymm<Mode>(m_matrix,tmp,m_perm.indices().data());
- _dest = tmp;
- }
-
- template<typename DestType,unsigned int DestMode> void evalTo(SparseSelfAdjointView<DestType,DestMode>& dest) const
- {
- internal::permute_symm_to_symm<Mode,DestMode>(m_matrix,dest.matrix(),m_perm.indices().data());
- }
+ const NestedExpression& matrix() const { return m_matrix; }
+ const Perm& perm() const { return m_perm; }

  protected:
  MatrixTypeNested m_matrix;
|
|||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
namespace internal {
|
||||||
|
|
||||||
|
template<typename DstXprType, typename MatrixType, int Mode, typename Scalar>
|
||||||
|
struct Assignment<DstXprType, SparseSymmetricPermutationProduct<MatrixType,Mode>, internal::assign_op<Scalar>, Sparse2Sparse>
|
||||||
|
{
|
||||||
|
typedef SparseSymmetricPermutationProduct<MatrixType,Mode> SrcXprType;
|
||||||
|
typedef typename DstXprType::StorageIndex DstIndex;
|
||||||
|
template<int Options>
|
||||||
|
static void run(SparseMatrix<Scalar,Options,DstIndex> &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
|
||||||
|
{
|
||||||
|
// internal::permute_symm_to_fullsymm<Mode>(m_matrix,_dest,m_perm.indices().data());
|
||||||
|
SparseMatrix<Scalar,(Options&RowMajor)==RowMajor ? ColMajor : RowMajor, DstIndex> tmp;
|
||||||
|
internal::permute_symm_to_fullsymm<Mode>(src.matrix(),tmp,src.perm().indices().data());
|
||||||
|
dst = tmp;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename DestType,unsigned int DestMode>
|
||||||
|
static void run(SparseSelfAdjointView<DestType,DestMode>& dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
|
||||||
|
{
|
||||||
|
internal::permute_symm_to_symm<Mode,DestMode>(src.matrix(),dst.matrix(),src.perm().indices().data());
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // end namespace internal
|
||||||
|
|
||||||
} // end namespace Eigen
|
} // end namespace Eigen
|
||||||
|
|
||||||
#endif // EIGEN_SPARSE_SELFADJOINTVIEW_H
|
#endif // EIGEN_SPARSE_SELFADJOINTVIEW_H
|
||||||
|
@@ -37,11 +37,7 @@ EIGEN_STRONG_INLINE Derived& operator Op(const Other& scalar) \
  }

  #define EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATORS(Derived) \
- EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(Derived, =) \
- EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(Derived, +=) \
- EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(Derived, -=) \
- EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, *=) \
- EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, /=)
+ EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(Derived, =)

  // TODO this is mostly the same as EIGEN_GENERIC_PUBLIC_INTERFACE
  #define _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) \
@@ -152,8 +152,8 @@ Index SparseLUImpl<Scalar,StorageIndex>::memInit(Index m, Index n, Index annz, I
  {
  Index& num_expansions = glu.num_expansions; //No memory expansions so far
  num_expansions = 0;
- glu.nzumax = glu.nzlumax = (std::min)(fillratio * annz / n, m) * n; // estimated number of nonzeros in U
- glu.nzlmax = (std::max)(Index(4), fillratio) * annz / 4; // estimated nnz in L factor
+ glu.nzumax = glu.nzlumax = (std::min)(fillratio * (annz+1) / n, m) * n; // estimated number of nonzeros in U
+ glu.nzlmax = (std::max)(Index(4), fillratio) * (annz+1) / 4; // estimated nnz in L factor
  // Return the estimated size to the user if necessary
  Index tempSpace;
  tempSpace = (2*panel_size + 4 + LUNoMarker) * m * sizeof(Index) + (panel_size + 1) * m * sizeof(Scalar);
@@ -23,6 +23,10 @@ namespace internal {
  typedef typename SparseQRType::MatrixType ReturnType;
  typedef typename ReturnType::StorageIndex StorageIndex;
  typedef typename ReturnType::StorageKind StorageKind;
+ enum {
+ RowsAtCompileTime = Dynamic,
+ ColsAtCompileTime = Dynamic
+ };
  };
  template <typename SparseQRType> struct traits<SparseQRMatrixQTransposeReturnType<SparseQRType> >
  {
@@ -235,8 +239,9 @@ class SparseQR : public SparseSolverBase<SparseQR<_MatrixType,_OrderingType> >
  return m_info;
  }

- protected:
- inline void sort_matrix_Q()
+ /** \internal */
+ inline void _sort_matrix_Q()
  {
  if(this->m_isQSorted) return;
  // The matrix Q is sorted during the transposition

@@ -267,7 +272,6 @@ class SparseQR : public SparseSolverBase<SparseQR<_MatrixType,_OrderingType> >
  bool m_isEtreeOk; // whether the elimination tree match the initial input matrix

  template <typename, typename > friend struct SparseQR_QProduct;
- template <typename > friend struct SparseQRMatrixQReturnType;

  };

@@ -635,6 +639,10 @@ struct SparseQRMatrixQReturnType : public EigenBase<SparseQRMatrixQReturnType<Sp
  {
  typedef typename SparseQRType::Scalar Scalar;
  typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
+ enum {
+ RowsAtCompileTime = Dynamic,
+ ColsAtCompileTime = Dynamic
+ };
  explicit SparseQRMatrixQReturnType(const SparseQRType& qr) : m_qr(qr) {}
  template<typename Derived>
  SparseQR_QProduct<SparseQRType, Derived> operator*(const MatrixBase<Derived>& other)
@@ -652,19 +660,6 @@
  {
  return SparseQRMatrixQTransposeReturnType<SparseQRType>(m_qr);
  }
- template<typename Dest> void evalTo(MatrixBase<Dest>& dest) const
- {
- dest.derived() = m_qr.matrixQ() * Dest::Identity(m_qr.rows(), m_qr.rows());
- }
- template<typename Dest> void evalTo(SparseMatrixBase<Dest>& dest) const
- {
- Dest idMat(m_qr.rows(), m_qr.rows());
- idMat.setIdentity();
- // Sort the sparse householder reflectors if needed
- const_cast<SparseQRType *>(&m_qr)->sort_matrix_Q();
- dest.derived() = SparseQR_QProduct<SparseQRType, Dest>(m_qr, idMat, false);
- }
-
  const SparseQRType& m_qr;
  };

@@ -680,6 +675,47 @@ struct SparseQRMatrixQTransposeReturnType
  const SparseQRType& m_qr;
  };

+ namespace internal {
+
+ template<typename SparseQRType>
+ struct evaluator_traits<SparseQRMatrixQReturnType<SparseQRType> >
+ {
+ typedef typename SparseQRType::MatrixType MatrixType;
+ typedef typename storage_kind_to_evaluator_kind<typename MatrixType::StorageKind>::Kind Kind;
+ typedef SparseShape Shape;
+ static const int AssumeAliasing = 0;
+ };
+
+ template< typename DstXprType, typename SparseQRType>
+ struct Assignment<DstXprType, SparseQRMatrixQReturnType<SparseQRType>, internal::assign_op<typename DstXprType::Scalar>, Sparse2Sparse>
+ {
+ typedef SparseQRMatrixQReturnType<SparseQRType> SrcXprType;
+ typedef typename DstXprType::Scalar Scalar;
+ typedef typename DstXprType::StorageIndex StorageIndex;
+ static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &/*func*/)
+ {
+ typename DstXprType::PlainObject idMat(src.m_qr.rows(), src.m_qr.rows());
+ idMat.setIdentity();
+ // Sort the sparse householder reflectors if needed
+ const_cast<SparseQRType *>(&src.m_qr)->_sort_matrix_Q();
+ dst = SparseQR_QProduct<SparseQRType, DstXprType>(src.m_qr, idMat, false);
+ }
+ };
+
+ template< typename DstXprType, typename SparseQRType>
+ struct Assignment<DstXprType, SparseQRMatrixQReturnType<SparseQRType>, internal::assign_op<typename DstXprType::Scalar>, Sparse2Dense>
+ {
+ typedef SparseQRMatrixQReturnType<SparseQRType> SrcXprType;
+ typedef typename DstXprType::Scalar Scalar;
+ typedef typename DstXprType::StorageIndex StorageIndex;
+ static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &/*func*/)
+ {
+ dst = src.m_qr.matrixQ() * DstXprType::Identity(src.m_qr.rows(), src.m_qr.rows());
+ }
+ };
+
+ } // end namespace internal
+
  } // end namespace Eigen

  #endif
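The two Assignment specializations above replace the old evalTo() members, so materializing Q keeps working through plain assignment to either a sparse or a dense destination. A hedged sketch; the ordering type and the factored matrix below are placeholders:

    #include <Eigen/Sparse>
    #include <Eigen/OrderingMethods>
    #include <Eigen/SparseQR>
    using namespace Eigen;

    int main()
    {
      int n = 8;
      SparseMatrix<double> A(n,n);
      A.setIdentity();                                         // placeholder; use a real sparse A
      SparseQR<SparseMatrix<double>, COLAMDOrdering<int> > qr(A);
      SparseMatrix<double> Qs;
      Qs = qr.matrixQ();                                       // sparse destination (Sparse2Sparse)
      MatrixXd Qd;
      Qd = qr.matrixQ();                                       // dense destination (Sparse2Dense)
      return 0;
    }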
@@ -26,6 +26,8 @@ function(EigenDetermineVSServicePack _pack)
  set(_sp "vc110sp2")
  elseif(${_cl_version} VERSION_EQUAL "17.00.60610.1")
  set(_sp "vc110sp3")
+ else()
+ set(_sp ${CMAKE_CXX_COMPILER_VERSION})
  endif()
  endif()

@@ -367,7 +367,7 @@ macro(ei_get_compilerver VAR)
  # on all other system we rely on ${CMAKE_CXX_COMPILER}
  # supporting a "--version" or "/version" flag

- if(WIN32 AND NOT CYGWIN)
+ if(WIN32 AND NOT CYGWIN AND NOT MINGW)
  set(EIGEN_CXX_FLAG_VERSION "/version")
  else()
  set(EIGEN_CXX_FLAG_VERSION "--version")
@@ -13,17 +13,17 @@ The Eigen library is divided in a Core module and several additional modules. Ea

  <table class="manual">
  <tr><th>Module</th><th>Header file</th><th>Contents</th></tr>
- <tr><td>\link Core_Module Core \endlink</td><td>\code#include <Eigen/Core>\endcode</td><td>Matrix and Array classes, basic linear algebra (including triangular and selfadjoint products), array manipulation</td></tr>
+ <tr ><td>\link Core_Module Core \endlink</td><td>\code#include <Eigen/Core>\endcode</td><td>Matrix and Array classes, basic linear algebra (including triangular and selfadjoint products), array manipulation</td></tr>
  <tr class="alt"><td>\link Geometry_Module Geometry \endlink</td><td>\code#include <Eigen/Geometry>\endcode</td><td>Transform, Translation, Scaling, Rotation2D and 3D rotations (Quaternion, AngleAxis)</td></tr>
- <tr><td>\link LU_Module LU \endlink</td><td>\code#include <Eigen/LU>\endcode</td><td>Inverse, determinant, LU decompositions with solver (FullPivLU, PartialPivLU)</td></tr>
+ <tr ><td>\link LU_Module LU \endlink</td><td>\code#include <Eigen/LU>\endcode</td><td>Inverse, determinant, LU decompositions with solver (FullPivLU, PartialPivLU)</td></tr>
- <tr><td>\link Cholesky_Module Cholesky \endlink</td><td>\code#include <Eigen/Cholesky>\endcode</td><td>LLT and LDLT Cholesky factorization with solver</td></tr>
+ <tr class="alt"><td>\link Cholesky_Module Cholesky \endlink</td><td>\code#include <Eigen/Cholesky>\endcode</td><td>LLT and LDLT Cholesky factorization with solver</td></tr>
- <tr class="alt"><td>\link Householder_Module Householder \endlink</td><td>\code#include <Eigen/Householder>\endcode</td><td>Householder transformations; this module is used by several linear algebra modules</td></tr>
+ <tr ><td>\link Householder_Module Householder \endlink</td><td>\code#include <Eigen/Householder>\endcode</td><td>Householder transformations; this module is used by several linear algebra modules</td></tr>
- <tr><td>\link SVD_Module SVD \endlink</td><td>\code#include <Eigen/SVD>\endcode</td><td>SVD decomposition with least-squares solver (JacobiSVD)</td></tr>
+ <tr class="alt"><td>\link SVD_Module SVD \endlink</td><td>\code#include <Eigen/SVD>\endcode</td><td>SVD decompositions with least-squares solver (JacobiSVD, BDCSVD)</td></tr>
- <tr class="alt"><td>\link QR_Module QR \endlink</td><td>\code#include <Eigen/QR>\endcode</td><td>QR decomposition with solver (HouseholderQR, ColPivHouseholderQR, FullPivHouseholderQR)</td></tr>
+ <tr ><td>\link QR_Module QR \endlink</td><td>\code#include <Eigen/QR>\endcode</td><td>QR decomposition with solver (HouseholderQR, ColPivHouseholderQR, FullPivHouseholderQR)</td></tr>
- <tr><td>\link Eigenvalues_Module Eigenvalues \endlink</td><td>\code#include <Eigen/Eigenvalues>\endcode</td><td>Eigenvalue, eigenvector decompositions (EigenSolver, SelfAdjointEigenSolver, ComplexEigenSolver)</td></tr>
+ <tr class="alt"><td>\link Eigenvalues_Module Eigenvalues \endlink</td><td>\code#include <Eigen/Eigenvalues>\endcode</td><td>Eigenvalue, eigenvector decompositions (EigenSolver, SelfAdjointEigenSolver, ComplexEigenSolver)</td></tr>
- <tr class="alt"><td>\link Sparse_modules Sparse \endlink</td><td>\code#include <Eigen/Sparse>\endcode</td><td>%Sparse matrix storage and related basic linear algebra (SparseMatrix, DynamicSparseMatrix, SparseVector)</td></tr>
+ <tr ><td>\link Sparse_modules Sparse \endlink</td><td>\code#include <Eigen/Sparse>\endcode</td><td>%Sparse matrix storage and related basic linear algebra (SparseMatrix, SparseVector) \n (see \ref SparseQuickRefPage for details on sparse modules)</td></tr>
- <tr><td></td><td>\code#include <Eigen/Dense>\endcode</td><td>Includes Core, Geometry, LU, Cholesky, SVD, QR, and Eigenvalues header files</td></tr>
+ <tr class="alt"><td></td><td>\code#include <Eigen/Dense>\endcode</td><td>Includes Core, Geometry, LU, Cholesky, SVD, QR, and Eigenvalues header files</td></tr>
- <tr class="alt"><td></td><td>\code#include <Eigen/Eigen>\endcode</td><td>Includes %Dense and %Sparse header files (the whole Eigen library)</td></tr>
+ <tr ><td></td><td>\code#include <Eigen/Eigen>\endcode</td><td>Includes %Dense and %Sparse header files (the whole Eigen library)</td></tr>
  </table>

  <a href="#" class="top">top</a>
|
||||||
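As a reading aid for the module table, here is a tiny, self-contained program using the umbrella headers it mentions (a generic sketch, not part of the patch):

\code
#include <Eigen/Dense>   // Core, Geometry, LU, Cholesky, SVD, QR, Eigenvalues
#include <Eigen/Sparse>  // sparse storage and basic sparse linear algebra

int main()
{
  Eigen::Matrix3d M = Eigen::Matrix3d::Random();
  Eigen::Vector3d b(1.0, 2.0, 3.0);
  Eigen::Vector3d x = M.partialPivLu().solve(b);  // LU module, pulled in via <Eigen/Dense>
  return x.allFinite() ? 0 : 1;
}
\endcode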
@@ -364,32 +364,10 @@ vec3 = vec1.cross(vec2);\endcode</td></tr>
 
 <a href="#" class="top">top</a>
 \section QuickRef_Coeffwise Coefficient-wise \& Array operators
-Coefficient-wise operators for matrices and vectors:
-<table class="manual">
-<tr><th>Matrix API \matrixworld</th><th>Via Array conversions</th></tr>
-<tr><td>\code
-mat1.cwiseMin(mat2)
-mat1.cwiseMax(mat2)
-mat1.cwiseAbs2()
-mat1.cwiseAbs()
-mat1.cwiseSqrt()
-mat1.cwiseProduct(mat2)
-mat1.cwiseQuotient(mat2)\endcode
-</td><td>\code
-mat1.array().min(mat2.array())
-mat1.array().max(mat2.array())
-mat1.array().abs2()
-mat1.array().abs()
-mat1.array().sqrt()
-mat1.array() * mat2.array()
-mat1.array() / mat2.array()
-\endcode</td></tr>
-</table>
 
-It is also very simple to apply any user defined function \c foo using DenseBase::unaryExpr together with std::ptr_fun:
-\code mat1.unaryExpr(std::ptr_fun(foo))\endcode
-Array operators:\arrayworld
+In addition to the aforementioned operators, Eigen supports numerous coefficient-wise operators and functions.
+Most of them unambiguously make sense in array-world\arrayworld. The following operators are readily available for arrays,
+or available through .array() for vectors and matrices:
 
 <table class="manual">
 <tr><td>Arithmetic operators</td><td>\code
@@ -400,28 +378,107 @@ array1 + scalar  array1 - scalar  array1 += scalar  array1 -= scalar
 array1 < array2   array1 > array2   array1 < scalar   array1 > scalar
 array1 <= array2  array1 >= array2  array1 <= scalar  array1 >= scalar
 array1 == array2  array1 != array2  array1 == scalar  array1 != scalar
+array1.min(array2)  array1.max(array2)  array1.min(scalar)  array1.max(scalar)
 \endcode</td></tr>
-<tr><td>Trigo, power, and \n misc functions \n and the STL variants</td><td>\code
+<tr><td>Trigo, power, and \n misc functions \n and the STL-like variants</td><td>\code
-array1.min(array2)
-array1.max(array2)
 array1.abs2()
 array1.abs()    abs(array1)
 array1.sqrt()   sqrt(array1)
 array1.log()    log(array1)
+array1.log10()  log10(array1)
 array1.exp()    exp(array1)
-array1.pow(exponent)  pow(array1,exponent)
+array1.pow(array2)  pow(array1,array2)
+array1.pow(scalar)  pow(array1,scalar)
 array1.square()
 array1.cube()
 array1.inverse()
 
 array1.sin()    sin(array1)
 array1.cos()    cos(array1)
 array1.tan()    tan(array1)
 array1.asin()   asin(array1)
 array1.acos()   acos(array1)
+array1.atan()   atan(array1)
+array1.sinh()   sinh(array1)
+array1.cosh()   cosh(array1)
+array1.tanh()   tanh(array1)
+array1.arg()    arg(array1)
+
+array1.floor()  floor(array1)
+array1.ceil()   ceil(array1)
+array1.round()  round(array1)
+
+array1.isFinite()  isfinite(array1)
+array1.isInf()     isinf(array1)
+array1.isNaN()     isnan(array1)
 \endcode
 </td></tr>
 </table>
 
+The following coefficient-wise operators are available for all kinds of expressions (matrices, vectors, and arrays), and for both real and complex scalar types:
+
+<table class="manual">
+<tr><th>Eigen's API</th><th>STL-like APIs\arrayworld </th><th>Comments</th></tr>
+<tr><td>\code
+mat1.real()
+mat1.imag()
+mat1.conjugate()
+\endcode
+</td><td>\code
+real(array1)
+imag(array1)
+conj(array1)
+\endcode
+</td><td>
+\code
+// read-write, no-op for real expressions
+// read-only for real, read-write for complexes
+// no-op for real expressions
+\endcode
+</td></tr>
+</table>
+
+Some coefficient-wise operators are readily available for matrices and vectors through the following cwise* methods:
+<table class="manual">
+<tr><th>Matrix API \matrixworld</th><th>Via Array conversions</th></tr>
+<tr><td>\code
+mat1.cwiseMin(mat2)      mat1.cwiseMin(scalar)
+mat1.cwiseMax(mat2)      mat1.cwiseMax(scalar)
+mat1.cwiseAbs2()
+mat1.cwiseAbs()
+mat1.cwiseSqrt()
+mat1.cwiseInverse()
+mat1.cwiseProduct(mat2)
+mat1.cwiseQuotient(mat2)
+mat1.cwiseEqual(mat2)    mat1.cwiseEqual(scalar)
+mat1.cwiseNotEqual(mat2)
+\endcode
+</td><td>\code
+mat1.array().min(mat2.array())   mat1.array().min(scalar)
+mat1.array().max(mat2.array())   mat1.array().max(scalar)
+mat1.array().abs2()
+mat1.array().abs()
+mat1.array().sqrt()
+mat1.array().inverse()
+mat1.array() * mat2.array()
+mat1.array() / mat2.array()
+mat1.array() == mat2.array()     mat1.array() == scalar
+mat1.array() != mat2.array()
+\endcode</td></tr>
+</table>
+The main difference between the two APIs is that the one based on cwise* methods returns an expression in the matrix world,
+while the second one (based on .array()) returns an array expression.
+Recall that .array() has no cost: it only changes the available API and the interpretation of the data.
+
+It is also very simple to apply any user defined function \c foo using DenseBase::unaryExpr together with <a href="http://en.cppreference.com/w/cpp/utility/functional/ptr_fun">std::ptr_fun</a> (c++03), <a href="http://en.cppreference.com/w/cpp/utility/functional/ref">std::ref</a> (c++11), or <a href="http://en.cppreference.com/w/cpp/language/lambda">lambdas</a> (c++11):
+\code
+mat1.unaryExpr(std::ptr_fun(foo));
+mat1.unaryExpr(std::ref(foo));
+mat1.unaryExpr([](double x) { return foo(x); });
+\endcode
 
 <a href="#" class="top">top</a>
 \section QuickRef_Reductions Reductions
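To make the cwise*/.array() distinction described in the rewritten section above concrete, a small sketch (generic illustration, not taken from the patch):

\code
#include <Eigen/Dense>

void coeffwise_demo()
{
  Eigen::MatrixXd a = Eigen::MatrixXd::Random(3, 3), b = Eigen::MatrixXd::Random(3, 3);

  Eigen::MatrixXd p1 = a.cwiseProduct(b);                 // matrix-world expression
  Eigen::MatrixXd p2 = (a.array() * b.array()).matrix();  // array-world, converted back

  // User-defined coefficient-wise function via unaryExpr and a C++11 lambda.
  Eigen::MatrixXd clipped = a.unaryExpr([](double x) { return x < 0.0 ? 0.0 : x; });
  (void)p1; (void)p2; (void)clipped;
}
\endcode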
@@ -21,7 +21,7 @@ They are summarized in the following table:
 <tr><td>ConjugateGradient</td><td>\link IterativeLinearSolvers_Module IterativeLinearSolvers \endlink</td><td>Classic iterative CG</td><td>SPD</td><td>Preconditionning</td>
   <td>built-in, MPL2</td>
   <td>Recommended for large symmetric problems (e.g., 3D Poisson eq.)</td></tr>
-<tr><td>LSCG</td><td>\link IterativeLinearSolvers_Module IterativeLinearSolvers \endlink</td><td>CG for rectangular least-square problem</td><td>Rectangular</td><td>Preconditionning</td>
+<tr><td>LeastSquaresConjugateGradient</td><td>\link IterativeLinearSolvers_Module IterativeLinearSolvers \endlink</td><td>CG for rectangular least-square problem</td><td>Rectangular</td><td>Preconditionning</td>
   <td>built-in, MPL2</td>
   <td>Solve for min |A'Ax-b|^2 without forming A'A</td></tr>
 <tr><td>BiCGSTAB</td><td>\link IterativeLinearSolvers_Module IterativeLinearSolvers \endlink</td><td>Iterative stabilized bi-conjugate gradient</td><td>Square</td><td>Preconditionning</td>
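For the renamed LeastSquaresConjugateGradient entry, a minimal solve could look as follows (a sketch assuming a rectangular SparseMatrix<double> A and a matching right-hand side b):

\code
#include <Eigen/Sparse>
#include <Eigen/IterativeLinearSolvers>

Eigen::VectorXd least_squares(const Eigen::SparseMatrix<double>& A, const Eigen::VectorXd& b)
{
  // Minimizes |Ax - b|^2 without explicitly forming A'A.
  Eigen::LeastSquaresConjugateGradient<Eigen::SparseMatrix<double> > lscg;
  lscg.compute(A);
  return lscg.solve(b);
}
\endcode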
@@ -22,8 +22,12 @@ n = Eigen::nbThreads( );
 You can disable Eigen's multi threading at compile time by defining the EIGEN_DONT_PARALLELIZE preprocessor token.
 
 Currently, the following algorithms can make use of multi-threading:
-  * general matrix - matrix products
-  * PartialPivLU
+  - general dense matrix - matrix products
+  - PartialPivLU
+  - row-major-sparse * dense vector/matrix products
+  - ConjugateGradient with \c Lower|Upper as the \c UpLo template parameter.
+  - BiCGSTAB with a row-major sparse matrix format.
+  - LeastSquaresConjugateGradient
 
 \section TopicMultiThreading_UsingEigenWithMT Using Eigen in a multi-threaded application
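A short sketch of the run-time thread controls this page refers to (generic illustration; the calls only have an effect in OpenMP-enabled builds):

\code
#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::setNbThreads(4);  // cap Eigen's internal parallelism
  std::cout << "Eigen may use up to " << Eigen::nbThreads() << " threads\n";

  // Large dense matrix-matrix products are among the parallelized operations.
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(512, 512), B = Eigen::MatrixXd::Random(512, 512);
  Eigen::MatrixXd C = A * B;
  return C.size() != 0 ? 0 : 1;
}
\endcode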
@@ -125,7 +125,10 @@ template<typename ArrayType> void comparisons(const ArrayType& m)
   ArrayType m1 = ArrayType::Random(rows, cols),
             m2 = ArrayType::Random(rows, cols),
-            m3(rows, cols);
+            m3(rows, cols),
+            m4 = m1;
+
+  m4 = (m4.abs()==Scalar(0)).select(1,m4);
 
   VERIFY(((m1 + Scalar(1)) > m1).all());
   VERIFY(((m1 - Scalar(1)) < m1).all());
@@ -197,7 +200,10 @@ template<typename ArrayType> void array_real(const ArrayType& m)
   ArrayType m1 = ArrayType::Random(rows, cols),
             m2 = ArrayType::Random(rows, cols),
-            m3(rows, cols);
+            m3(rows, cols),
+            m4 = m1;
+
+  m4 = (m4.abs()==Scalar(0)).select(1,m4);
 
   Scalar s1 = internal::random<Scalar>();
 
@@ -215,9 +221,9 @@ template<typename ArrayType> void array_real(const ArrayType& m)
   VERIFY_IS_APPROX(m1.round(), round(m1));
   VERIFY_IS_APPROX(m1.floor(), floor(m1));
   VERIFY_IS_APPROX(m1.ceil(), ceil(m1));
-  VERIFY((m1.isNaN() == isnan(m1)).all());
-  VERIFY((m1.isInf() == isinf(m1)).all());
-  VERIFY((m1.isFinite() == isfinite(m1)).all());
+  VERIFY((m1.isNaN() == Eigen::isnan(m1)).all());
+  VERIFY((m1.isInf() == Eigen::isinf(m1)).all());
+  VERIFY((m1.isFinite() == Eigen::isfinite(m1)).all());
   VERIFY_IS_APPROX(m1.inverse(), inverse(m1));
   VERIFY_IS_APPROX(m1.abs(), abs(m1));
   VERIFY_IS_APPROX(m1.abs2(), abs2(m1));
@@ -243,9 +249,9 @@ template<typename ArrayType> void array_real(const ArrayType& m)
   VERIFY_IS_APPROX(tanh(m1), (0.5*(exp(m1)-exp(-m1)))/(0.5*(exp(m1)+exp(-m1))));
   VERIFY_IS_APPROX(arg(m1), ((ArrayType)(m1<0))*std::acos(-1.0));
   VERIFY((round(m1) <= ceil(m1) && round(m1) >= floor(m1)).all());
-  VERIFY(isnan(m1*0.0/0.0).all());
-  VERIFY(isinf(m1/0.0).all());
-  VERIFY((isfinite(m1) && !isfinite(m1*0.0/0.0) && !isfinite(m1/0.0)).all());
+  VERIFY(Eigen::isnan((m1*0.0)/0.0).all());
+  VERIFY(Eigen::isinf(m4/0.0).all());
+  VERIFY((Eigen::isfinite(m1) && (!Eigen::isfinite(m1*0.0/0.0)) && (!Eigen::isfinite(m4/0.0))).all());
   VERIFY_IS_APPROX(inverse(inverse(m1)),m1);
   VERIFY((abs(m1) == m1 || abs(m1) == -m1).all());
   VERIFY_IS_APPROX(m3, sqrt(abs2(m1)));
@@ -299,7 +305,11 @@ template<typename ArrayType> void array_complex(const ArrayType& m)
   Index cols = m.cols();
 
   ArrayType m1 = ArrayType::Random(rows, cols),
-            m2(rows, cols);
+            m2(rows, cols),
+            m4 = m1;
+
+  m4.real() = (m4.real().abs()==RealScalar(0)).select(RealScalar(1),m4.real());
+  m4.imag() = (m4.imag().abs()==RealScalar(0)).select(RealScalar(1),m4.imag());
 
   Array<RealScalar, -1, -1> m3(rows, cols);
 
@@ -317,9 +327,9 @@ template<typename ArrayType> void array_complex(const ArrayType& m)
   VERIFY_IS_APPROX(m1.cosh(), cosh(m1));
   VERIFY_IS_APPROX(m1.tanh(), tanh(m1));
   VERIFY_IS_APPROX(m1.arg(), arg(m1));
-  VERIFY((m1.isNaN() == isnan(m1)).all());
-  VERIFY((m1.isInf() == isinf(m1)).all());
-  VERIFY((m1.isFinite() == isfinite(m1)).all());
+  VERIFY((m1.isNaN() == Eigen::isnan(m1)).all());
+  VERIFY((m1.isInf() == Eigen::isinf(m1)).all());
+  VERIFY((m1.isFinite() == Eigen::isfinite(m1)).all());
   VERIFY_IS_APPROX(m1.inverse(), inverse(m1));
   VERIFY_IS_APPROX(m1.log(), log(m1));
   VERIFY_IS_APPROX(m1.log10(), log10(m1));
@@ -345,20 +355,20 @@ template<typename ArrayType> void array_complex(const ArrayType& m)
   VERIFY_IS_APPROX(arg(m1), m3);
 
   std::complex<RealScalar> zero(0.0,0.0);
-  VERIFY(isnan(m1*zero/zero).all());
+  VERIFY(Eigen::isnan(m1*zero/zero).all());
 #if EIGEN_COMP_CLANG
   // clang's complex division is notoriously broken
-  if(numext::isinf(m1(0,0)/Scalar(0))) {
+  if(numext::isinf(m4(0,0)/RealScalar(0))) {
 #endif
-    VERIFY(isinf(m1/zero).all());
+    VERIFY(Eigen::isinf(m4/zero).all());
 #if EIGEN_COMP_CLANG
   }
   else
   {
-    VERIFY(isinf(m1.real()/zero.real()).all());
+    VERIFY(Eigen::isinf(m4.real()/zero.real()).all());
   }
 #endif
-  VERIFY((isfinite(m1) && !isfinite(m1*zero/zero) && !isfinite(m1/zero)).all());
+  VERIFY((Eigen::isfinite(m1) && (!Eigen::isfinite(m1*zero/zero)) && (!Eigen::isfinite(m1/zero))).all());
 
   VERIFY_IS_APPROX(inverse(inverse(m1)),m1);
   VERIFY_IS_APPROX(conj(m1.conjugate()), m1);
@@ -223,7 +223,7 @@ void fixedSizeMatrixConstruction()
     for(int k=0; k<2; ++k) VERIFY(m2(k) == DenseIndex(raw[k]));
     for(int k=0; k<2; ++k) VERIFY(a2(k) == DenseIndex(raw[k]));
     for(int k=0; k<2; ++k) VERIFY(m3(k) == int(raw[k]));
-    for(int k=0; k<2; ++k) VERIFY(m4(k) == float(raw[k]));
+    for(int k=0; k<2; ++k) VERIFY((m4(k)) == Scalar(float(raw[k])));
   }
   {
     Matrix<Scalar,1,1> m(raw), m1(raw[0]), m2( (DenseIndex(raw[0])) ), m3( (int(raw[0])) );
@@ -69,8 +69,8 @@ void test_bdcsvd()
   CALL_SUBTEST_7(( svd_verify_assert<BDCSVD<MatrixXf> >(MatrixXf(10,12)) ));
   CALL_SUBTEST_8(( svd_verify_assert<BDCSVD<MatrixXcd> >(MatrixXcd(7,5)) ));
 
-  CALL_SUBTEST_1(( svd_all_trivial_2x2(bdcsvd<Matrix2cd>) ));
-  CALL_SUBTEST_1(( svd_all_trivial_2x2(bdcsvd<Matrix2d>) ));
+  CALL_SUBTEST_101(( svd_all_trivial_2x2(bdcsvd<Matrix2cd>) ));
+  CALL_SUBTEST_102(( svd_all_trivial_2x2(bdcsvd<Matrix2d>) ));
 
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_3(( bdcsvd<Matrix3f>() ));
@@ -104,8 +104,8 @@ void test_bdcsvd()
     CALL_SUBTEST_7( BDCSVD<MatrixXf>(10,10) );
 
     // Check that preallocation avoids subsequent mallocs
-    CALL_SUBTEST_9( svd_preallocate() );
+    CALL_SUBTEST_9( svd_preallocate<void>() );
 
-    CALL_SUBTEST_2( svd_underoverflow() );
+    CALL_SUBTEST_2( svd_underoverflow<void>() );
   }
 
@@ -114,7 +114,7 @@ void test_jacobisvd()
     CALL_SUBTEST_7( JacobiSVD<MatrixXf>(10,10) );
 
     // Check that preallocation avoids subsequent mallocs
-    CALL_SUBTEST_9( svd_preallocate() );
+    CALL_SUBTEST_9( svd_preallocate<void>() );
 
-    CALL_SUBTEST_2( svd_underoverflow() );
+    CALL_SUBTEST_2( svd_underoverflow<void>() );
   }
@@ -88,6 +88,10 @@ template<typename MatrixType> void real_complex(DenseIndex rows = MatrixType::Ro
   g_called = false;
   VERIFY_IS_APPROX(m1*s, m1*Scalar(s));
   VERIFY(g_called && "matrix<complex> * real not properly optimized");
+
+  g_called = false;
+  VERIFY_IS_APPROX(m1/s, m1/Scalar(s));
+  VERIFY(g_called && "matrix<complex> / real not properly optimized");
 }
 
 void test_linearstructure()
@@ -315,9 +315,29 @@ template<typename Scalar> void packetmath_real()
   CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasExp, std::exp, internal::pexp);
   {
     data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
+    data1[1] = std::numeric_limits<Scalar>::epsilon();
     packet_helper<internal::packet_traits<Scalar>::HasExp,Packet> h;
     h.store(data2, internal::pexp(h.load(data1)));
     VERIFY(numext::isnan(data2[0]));
+    VERIFY_IS_EQUAL(std::exp(std::numeric_limits<Scalar>::epsilon()), data2[1]);
+
+    data1[0] = -std::numeric_limits<Scalar>::epsilon();
+    data1[1] = 0;
+    h.store(data2, internal::pexp(h.load(data1)));
+    VERIFY_IS_EQUAL(std::exp(-std::numeric_limits<Scalar>::epsilon()), data2[0]);
+    VERIFY_IS_EQUAL(std::exp(0), data2[1]);
+
+    data1[0] = (std::numeric_limits<Scalar>::min)();
+    data1[1] = -(std::numeric_limits<Scalar>::min)();
+    h.store(data2, internal::pexp(h.load(data1)));
+    VERIFY_IS_EQUAL(std::exp((std::numeric_limits<Scalar>::min)()), data2[0]);
+    VERIFY_IS_EQUAL(std::exp(-(std::numeric_limits<Scalar>::min)()), data2[1]);
+
+    data1[0] = std::numeric_limits<Scalar>::denorm_min();
+    data1[1] = -std::numeric_limits<Scalar>::denorm_min();
+    h.store(data2, internal::pexp(h.load(data1)));
+    VERIFY_IS_EQUAL(std::exp(std::numeric_limits<Scalar>::denorm_min()), data2[0]);
+    VERIFY_IS_EQUAL(std::exp(-std::numeric_limits<Scalar>::denorm_min()), data2[1]);
   }
 
   for (int i=0; i<size; ++i)
@@ -331,12 +351,33 @@ template<typename Scalar> void packetmath_real()
   CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasLog, std::log, internal::plog);
   {
     data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
+    data1[1] = std::numeric_limits<Scalar>::epsilon();
     packet_helper<internal::packet_traits<Scalar>::HasLog,Packet> h;
     h.store(data2, internal::plog(h.load(data1)));
-    VERIFY(numext::isnan(data2[0]));
+    VERIFY(std::isnan(data2[0]));
+    // VERIFY_IS_EQUAL(std::log(std::numeric_limits<Scalar>::epsilon()), data2[1]);
+
+    data1[0] = -std::numeric_limits<Scalar>::epsilon();
+    data1[1] = 0;
+    h.store(data2, internal::plog(h.load(data1)));
+    VERIFY(std::isnan(data2[0]));
+    // VERIFY_IS_EQUAL(std::log(0), data2[1]);
+
+    data1[0] = (std::numeric_limits<Scalar>::min)();
+    data1[1] = -(std::numeric_limits<Scalar>::min)();
+    h.store(data2, internal::plog(h.load(data1)));
+    VERIFY_IS_EQUAL(std::log((std::numeric_limits<Scalar>::min)()), data2[0]);
+    // VERIFY(std::isnan(data2[1]));
+
+    data1[0] = std::numeric_limits<Scalar>::denorm_min();
+    data1[1] = -std::numeric_limits<Scalar>::denorm_min();
+    h.store(data2, internal::plog(h.load(data1)));
+    // VERIFY_IS_EQUAL(std::log(std::numeric_limits<Scalar>::denorm_min()), data2[0]);
+    // VERIFY(std::isnan(data2[1]));
+
     data1[0] = -1.0f;
     h.store(data2, internal::plog(h.load(data1)));
-    VERIFY(numext::isnan(data2[0]));
+    VERIFY(std::isnan(data2[0]));
 #if !EIGEN_FAST_MATH
     h.store(data2, internal::psqrt(h.load(data1)));
     VERIFY(numext::isnan(data2[0]));
@@ -23,8 +23,8 @@ template<typename MatrixType> void qr()
   MatrixType m1;
   createRandomPIMatrixOfRank(rank,rows,cols,m1);
   ColPivHouseholderQR<MatrixType> qr(m1);
-  VERIFY(rank == qr.rank());
-  VERIFY(cols - qr.rank() == qr.dimensionOfKernel());
+  VERIFY_IS_EQUAL(rank, qr.rank());
+  VERIFY_IS_EQUAL(cols - qr.rank(), qr.dimensionOfKernel());
   VERIFY(!qr.isInjective());
   VERIFY(!qr.isInvertible());
   VERIFY(!qr.isSurjective());
@@ -51,11 +51,11 @@ template<typename MatrixType, int Cols2> void qr_fixedsize()
   Matrix<Scalar,Rows,Cols> m1;
   createRandomPIMatrixOfRank(rank,Rows,Cols,m1);
   ColPivHouseholderQR<Matrix<Scalar,Rows,Cols> > qr(m1);
-  VERIFY(rank == qr.rank());
-  VERIFY(Cols - qr.rank() == qr.dimensionOfKernel());
-  VERIFY(qr.isInjective() == (rank == Rows));
-  VERIFY(qr.isSurjective() == (rank == Cols));
-  VERIFY(qr.isInvertible() == (qr.isInjective() && qr.isSurjective()));
+  VERIFY_IS_EQUAL(rank, qr.rank());
+  VERIFY_IS_EQUAL(Cols - qr.rank(), qr.dimensionOfKernel());
+  VERIFY_IS_EQUAL(qr.isInjective(), (rank == Rows));
+  VERIFY_IS_EQUAL(qr.isSurjective(), (rank == Cols));
+  VERIFY_IS_EQUAL(qr.isInvertible(), (qr.isInjective() && qr.isSurjective()));
 
   Matrix<Scalar,Rows,Cols> r = qr.matrixQR().template triangularView<Upper>();
   Matrix<Scalar,Rows,Cols> c = qr.householderQ() * r * qr.colsPermutation().inverse();
@@ -23,8 +23,8 @@ template<typename MatrixType> void qr()
   MatrixType m1;
   createRandomPIMatrixOfRank(rank,rows,cols,m1);
   FullPivHouseholderQR<MatrixType> qr(m1);
-  VERIFY(rank == qr.rank());
-  VERIFY(cols - qr.rank() == qr.dimensionOfKernel());
+  VERIFY_IS_EQUAL(rank, qr.rank());
+  VERIFY_IS_EQUAL(cols - qr.rank(), qr.dimensionOfKernel());
   VERIFY(!qr.isInjective());
   VERIFY(!qr.isInvertible());
   VERIFY(!qr.isSurjective());
@@ -221,6 +221,12 @@ int test_ref_overload_fun1(Ref<MatrixXf> ) { return 3; }
 int test_ref_overload_fun2(Ref<const MatrixXd> ) { return 4; }
 int test_ref_overload_fun2(Ref<const MatrixXf> ) { return 5; }
 
+void test_ref_ambiguous(const Ref<const ArrayXd> &A, Ref<ArrayXd> B)
+{
+  B = A;
+  B = A - A;
+}
+
 // See also bug 969
 void test_ref_overloads()
 {
@@ -233,6 +239,9 @@ void test_ref_overloads()
   VERIFY( test_ref_overload_fun2(Ad)==4 );
   VERIFY( test_ref_overload_fun2(Ad+Bd)==4 );
   VERIFY( test_ref_overload_fun2(Af+Bf)==5 );
+
+  ArrayXd A, B;
+  test_ref_ambiguous(A, B);
 }
 
 void test_ref()
@@ -365,6 +365,20 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
     VERIFY_IS_APPROX(m2, refMat2);
   }
 
+  // test diagonal to sparse
+  {
+    DenseVector d = DenseVector::Random(rows);
+    DenseMatrix refMat2 = d.asDiagonal();
+    SparseMatrixType m2(rows, rows);
+    m2 = d.asDiagonal();
+    VERIFY_IS_APPROX(m2, refMat2);
+    SparseMatrixType m3(d.asDiagonal());
+    VERIFY_IS_APPROX(m3, refMat2);
+    refMat2 += d.asDiagonal();
+    m2 += d.asDiagonal();
+    VERIFY_IS_APPROX(m2, refMat2);
+  }
+
   // test conservative resize
   {
     std::vector< std::pair<StorageIndex,StorageIndex> > inc;
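The new test block above covers sparse assignment from a diagonal expression; in user code the same feature reads roughly as follows (a sketch, not part of the test suite):

\code
#include <Eigen/Dense>
#include <Eigen/Sparse>

Eigen::SparseMatrix<double> sparse_diagonal(const Eigen::VectorXd& d)
{
  Eigen::SparseMatrix<double> D(d.size(), d.size());
  D  = d.asDiagonal();   // build a sparse diagonal matrix from a vector
  D += d.asDiagonal();   // add a diagonal expression to an existing sparse matrix
  return D;
}
\endcode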
@@ -272,6 +272,7 @@ template<typename Solver> void check_sparse_spd_solving(Solver& solver, int maxS
       DenseVector b = it.rhs();
       DenseVector refX = it.refX();
       PermutationMatrix<Dynamic, Dynamic, StorageIndex> pnull;
+      halfA.resize(A.rows(), A.cols());
       if(Solver::UpLo == (Lower|Upper))
         halfA = A;
       else
@@ -89,6 +89,11 @@ template<typename Scalar> void test_sparseqr_scalar()
   QtQ = Q * Q.adjoint();
   idM.resize(Q.rows(), Q.rows()); idM.setIdentity();
   VERIFY(idM.isApprox(QtQ));
+
+  // Q to dense
+  DenseMat dQ;
+  dQ = solver.matrixQ();
+  VERIFY_IS_APPROX(Q, dQ);
 }
 void test_sparseqr()
 {
@@ -33,6 +33,7 @@ void svd_check_full(const MatrixType& m, const SvdType& svd)
   };
 
   typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
   typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime> MatrixUType;
   typedef Matrix<Scalar, ColsAtCompileTime, ColsAtCompileTime> MatrixVType;
 
@@ -40,7 +41,10 @@ void svd_check_full(const MatrixType& m, const SvdType& svd)
   sigma.diagonal() = svd.singularValues().template cast<Scalar>();
   MatrixUType u = svd.matrixU();
   MatrixVType v = svd.matrixV();
-  VERIFY_IS_APPROX(m, u * sigma * v.adjoint());
+  RealScalar scaling = m.cwiseAbs().maxCoeff();
+  if(scaling<=(std::numeric_limits<RealScalar>::min)())
+    scaling = RealScalar(1);
+  VERIFY_IS_APPROX(m/scaling, u * (sigma/scaling) * v.adjoint());
   VERIFY_IS_UNITARY(u);
   VERIFY_IS_UNITARY(v);
 }
@@ -307,6 +311,7 @@ void svd_inf_nan()
 
 // Regression test for bug 286: JacobiSVD loops indefinitely with some
 // matrices containing denormal numbers.
+template<typename>
 void svd_underoverflow()
 {
 #if defined __INTEL_COMPILER
@@ -384,6 +389,7 @@ void svd_all_trivial_2x2( void (*cb)(const MatrixType&,bool) )
   } while((id<int(value_set.size())).all());
 }
 
+template<typename>
 void svd_preallocate()
 {
   Vector3f v(3.f, 2.f, 1.f);
@@ -59,8 +59,10 @@
 #include "Eigen/Core"
 
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h"
@@ -80,6 +82,7 @@
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h"
@@ -88,6 +91,7 @@
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h"
@@ -23,7 +23,7 @@ template <typename T, size_t n> class array {
   EIGEN_DEVICE_FUNC
   EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { return values[index]; }
 
-  static const std::size_t size = n;
+  static const std::size_t size() { return n; }
 
   T values[n];
 
|
@ -375,6 +375,28 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
|
|||||||
resize(dims);
|
resize(dims);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifndef EIGEN_EMULATE_CXX11_META_H
|
||||||
|
template <typename std::ptrdiff_t... Indices>
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
|
void resize(const Sizes<Indices...>& dimensions) {
|
||||||
|
array<Index, NumIndices> dims;
|
||||||
|
for (std::size_t i = 0; i < NumIndices; ++i) {
|
||||||
|
dims[i] = static_cast<Index>(dimensions[i]);
|
||||||
|
}
|
||||||
|
resize(dims);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5>
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
|
void resize(const Sizes<V1, V2, V3, V4, V5>& dimensions) {
|
||||||
|
array<Index, NumIndices> dims;
|
||||||
|
for (std::size_t i = 0; i < NumIndices; ++i) {
|
||||||
|
dims[i] = static_cast<Index>(dimensions[i]);
|
||||||
|
}
|
||||||
|
resize(dims);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
|
|
||||||
bool checkIndexRange(const array<Index, NumIndices>& indices) const
|
bool checkIndexRange(const array<Index, NumIndices>& indices) const
|
||||||
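The resize overloads added above take a compile-time Sizes<> list; a possible usage sketch with the unsupported Tensor module and a C++11 compiler (the concrete sizes are hypothetical, illustration only):

\code
#include <unsupported/Eigen/CXX11/Tensor>

void fixed_size_resize()
{
  Eigen::Tensor<float, 3> t;
  t.resize(Eigen::Sizes<2, 3, 4>());  // dimensions known at compile time
  t.setZero();
}
\endcode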
@@ -108,6 +108,12 @@ class TensorBase<Derived, ReadOnlyAccessors>
       return unaryExpr(internal::scalar_inverse_op<Scalar>());
     }
 
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_tanh_op<Scalar>, const Derived>
+    tanh() const {
+      return unaryExpr(internal::scalar_tanh_op<Scalar>());
+    }
+
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_exp_op<Scalar>, const Derived>
     exp() const {
@@ -295,11 +301,10 @@ class TensorBase<Derived, ReadOnlyAccessors>
       return TensorReductionOp<internal::SumReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::SumReducer<CoeffReturnType>());
     }
 
-    const TensorReductionOp<internal::SumReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>
+    const TensorReductionOp<internal::SumReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>
     sum() const {
-      array<Index, NumDimensions> in_dims;
-      for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i;
-      return TensorReductionOp<internal::SumReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::SumReducer<CoeffReturnType>());
+      DimensionList<Index, NumDimensions> in_dims;
+      return TensorReductionOp<internal::SumReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::SumReducer<CoeffReturnType>());
     }
 
     template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -308,11 +313,10 @@ class TensorBase<Derived, ReadOnlyAccessors>
       return TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::MeanReducer<CoeffReturnType>());
     }
 
-    const TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>
+    const TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>
     mean() const {
-      array<Index, NumDimensions> in_dims;
-      for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i;
-      return TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MeanReducer<CoeffReturnType>());
+      DimensionList<Index, NumDimensions> in_dims;
+      return TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MeanReducer<CoeffReturnType>());
     }
 
     template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -321,11 +325,10 @@ class TensorBase<Derived, ReadOnlyAccessors>
       return TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::ProdReducer<CoeffReturnType>());
     }
 
-    const TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>
+    const TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>
     prod() const {
-      array<Index, NumDimensions> in_dims;
-      for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i;
-      return TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::ProdReducer<CoeffReturnType>());
+      DimensionList<Index, NumDimensions> in_dims;
+      return TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::ProdReducer<CoeffReturnType>());
     }
 
     template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -334,11 +337,10 @@ class TensorBase<Derived, ReadOnlyAccessors>
       return TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::MaxReducer<CoeffReturnType>());
     }
 
-    const TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>
+    const TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>
     maximum() const {
-      array<Index, NumDimensions> in_dims;
-      for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i;
-      return TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MaxReducer<CoeffReturnType>());
+      DimensionList<Index, NumDimensions> in_dims;
+      return TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MaxReducer<CoeffReturnType>());
    }
 
     template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -347,11 +349,10 @@ class TensorBase<Derived, ReadOnlyAccessors>
       return TensorReductionOp<internal::MinReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::MinReducer<CoeffReturnType>());
     }
 
-    const TensorReductionOp<internal::MinReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>
+    const TensorReductionOp<internal::MinReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>
     minimum() const {
-      array<Index, NumDimensions> in_dims;
-      for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i;
-      return TensorReductionOp<internal::MinReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MinReducer<CoeffReturnType>());
+      DimensionList<Index, NumDimensions> in_dims;
+      return TensorReductionOp<internal::MinReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MinReducer<CoeffReturnType>());
     }
 
     template <typename Reducer, typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
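The five hunks above only change the internal dimension-list type used by the full reductions; the user-facing calls are unchanged. For reference, a full reduction over all dimensions yields a rank-0 tensor (a generic sketch using the unsupported Tensor module):

\code
#include <unsupported/Eigen/CXX11/Tensor>

void full_reductions()
{
  Eigen::Tensor<float, 2> t(2, 3);
  t.setConstant(1.0f);

  Eigen::Tensor<float, 0> total = t.sum();   // 6
  Eigen::Tensor<float, 0> avg   = t.mean();  // 1
  float s = total();                         // rank-0 tensors read out via operator()
  (void)s; (void)avg;
}
\endcode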
@@ -413,6 +414,26 @@ class TensorBase<Derived, ReadOnlyAccessors>
                                     padding_type);
     }
 
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>
+    extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols,
+                           const Index plane_stride = 1, const Index row_stride = 1, const Index col_stride = 1,
+                           const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = 0) const {
+      return TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, 1, 1, 1, padding_type, padding_value);
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>
+    extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols,
+                           const Index plane_stride, const Index row_stride, const Index col_stride,
+                           const Index plane_inflate_stride, const Index row_inflate_stride, const Index col_inflate_stride,
+                           const Index padding_top_z, const Index padding_bottom_z,
+                           const Index padding_top, const Index padding_bottom,
+                           const Index padding_left, const Index padding_right, const Scalar padding_value = 0) const {
+      return TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, plane_inflate_stride, row_inflate_stride, col_inflate_stride, padding_top_z, padding_bottom_z, padding_top, padding_bottom, padding_left, padding_right, padding_value);
+    }
+
     // Morphing operators.
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorLayoutSwapOp<const Derived>
@@ -460,6 +481,18 @@ class TensorBase<Derived, ReadOnlyAccessors>
       return TensorStridingOp<const Strides, const Derived>(derived(), strides);
     }
 
+    // Added support for custom unary and binary operations
+    template <typename CustomUnaryFunc>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCustomUnaryOp<const CustomUnaryFunc, const Derived> customOp(const CustomUnaryFunc& op) const {
+      return TensorCustomUnaryOp<const CustomUnaryFunc, const Derived>(derived(), op);
+    }
+    template <typename OtherDerived, typename CustomBinaryFunc>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCustomBinaryOp<const CustomBinaryFunc, const Derived, const OtherDerived> customOp(const OtherDerived& other, const CustomBinaryFunc& op) const {
+      return TensorCustomBinaryOp<const CustomBinaryFunc, const Derived, const OtherDerived>(derived(), other, op);
+    }
+
     // Force the evaluation of the expression.
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorForcedEvalOp<const Derived> eval() const {
@@ -106,8 +106,7 @@ class TensorChippingOp : public TensorBase<TensorChippingOp<DimId, XprType> >
     {
       typedef TensorAssignOp<TensorChippingOp, const TensorChippingOp> Assign;
       Assign assign(*this, other);
-      static const bool Vectorize = TensorEvaluator<const Assign, DefaultDevice>::PacketAccess;
-      internal::TensorExecutor<const Assign, DefaultDevice, Vectorize>::run(assign, DefaultDevice());
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
       return *this;
     }
 
@@ -117,8 +116,7 @@ class TensorChippingOp : public TensorBase<TensorChippingOp<DimId, XprType> >
     {
       typedef TensorAssignOp<TensorChippingOp, const OtherDerived> Assign;
       Assign assign(*this, other);
-      static const bool Vectorize = TensorEvaluator<const Assign, DefaultDevice>::PacketAccess;
-      internal::TensorExecutor<const Assign, DefaultDevice, Vectorize>::run(assign, DefaultDevice());
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
       return *this;
     }
 
|
@ -88,7 +88,7 @@ class TensorConcatenationOp : public TensorBase<TensorConcatenationOp<Axis, LhsX
|
|||||||
{
|
{
|
||||||
typedef TensorAssignOp<TensorConcatenationOp, const TensorConcatenationOp> Assign;
|
typedef TensorAssignOp<TensorConcatenationOp, const TensorConcatenationOp> Assign;
|
||||||
Assign assign(*this, other);
|
Assign assign(*this, other);
|
||||||
internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice());
|
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -98,7 +98,7 @@ class TensorConcatenationOp : public TensorBase<TensorConcatenationOp<Axis, LhsX
|
|||||||
{
|
{
|
||||||
typedef TensorAssignOp<TensorConcatenationOp, const OtherDerived> Assign;
|
typedef TensorAssignOp<TensorConcatenationOp, const OtherDerived> Assign;
|
||||||
Assign assign(*this, other);
|
Assign assign(*this, other);
|
||||||
internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice());
|
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -334,7 +334,7 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De
     eigen_assert(index + packetSize - 1 < this->dimensions().TotalSize());
 
     EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize];
-    PacketReturnType rslt = internal::pstore<PacketReturnType>(values, x);
+    internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
     for (int i = 0; i < packetSize; ++i) {
       coeffRef(index+i) = values[i];
     }
@@ -364,14 +364,6 @@ class TensorContractionInputMapper<Scalar, Index, side, Tensor, nocontract_t, co
 };
 
-template <size_t n> struct max_n_1 {
-  static const size_t size = n;
-};
-template <> struct max_n_1<0> {
-  static const size_t size = 1;
-};
-
 
 template<typename Dimensions, typename LhsXprType, typename RhsXprType>
 struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
 {
@@ -459,19 +451,6 @@ class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXp
 };
 
-template<bool cond> struct Cond {};
-
-template<typename T1, typename T2> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-const T1& choose(Cond<true>, const T1& first, const T2&) {
-  return first;
-}
-
-template<typename T1, typename T2> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-const T2& choose(Cond<false>, const T1&, const T2& second) {
-  return second;
-}
-
 
 template<typename Derived>
 struct TensorContractionEvaluatorBase
 {
@@ -508,13 +487,13 @@ struct TensorContractionEvaluatorBase
   static const int RDims =
       internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
   static const unsigned int ContractDims = internal::array_size<Indices>::value;
-  static const int NumDims = internal::max_n_1<LDims + RDims - 2 * ContractDims>::size;
+  static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size;
 
   typedef array<Index, LDims> left_dim_mapper_t;
   typedef array<Index, RDims> right_dim_mapper_t;
   typedef array<Index, ContractDims> contract_t;
-  typedef array<Index, internal::max_n_1<LDims - ContractDims>::size> left_nocontract_t;
-  typedef array<Index, internal::max_n_1<RDims - ContractDims>::size> right_nocontract_t;
+  typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t;
+  typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t;
 
   typedef DSizes<Index, NumDims> Dimensions;
 
@ -869,10 +848,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
|
|||||||
typedef array<Index, RDims> right_dim_mapper_t;
|
typedef array<Index, RDims> right_dim_mapper_t;
|
||||||
|
|
||||||
typedef array<Index, ContractDims> contract_t;
|
typedef array<Index, ContractDims> contract_t;
|
||||||
typedef array<Index, internal::max_n_1<LDims - ContractDims>::size> left_nocontract_t;
|
typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t;
|
||||||
typedef array<Index, internal::max_n_1<RDims - ContractDims>::size> right_nocontract_t;
|
typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t;
|
||||||
|
|
||||||
static const int NumDims = internal::max_n_1<LDims + RDims - 2 * ContractDims>::size;
|
static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size;
|
||||||
|
|
||||||
// Could we use NumDimensions here?
|
// Could we use NumDimensions here?
|
||||||
typedef DSizes<Index, NumDims> Dimensions;
|
typedef DSizes<Index, NumDims> Dimensions;
|
||||||
|
@ -1241,10 +1241,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
|
|||||||
typedef array<Index, RDims> right_dim_mapper_t;
|
typedef array<Index, RDims> right_dim_mapper_t;
|
||||||
|
|
||||||
typedef array<Index, ContractDims> contract_t;
|
typedef array<Index, ContractDims> contract_t;
|
||||||
typedef array<Index, internal::max_n_1<LDims - ContractDims>::size> left_nocontract_t;
|
typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t;
|
||||||
typedef array<Index, internal::max_n_1<RDims - ContractDims>::size> right_nocontract_t;
|
typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t;
|
||||||
|
|
||||||
static const int NumDims = internal::max_n_1<LDims + RDims - 2 * ContractDims>::size;
|
static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size;
|
||||||
|
|
||||||
typedef DSizes<Index, NumDims> Dimensions;
|
typedef DSizes<Index, NumDims> Dimensions;
|
||||||
|
|
||||||
|
@ -93,10 +93,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
|
|||||||
typedef array<Index, RDims> right_dim_mapper_t;
|
typedef array<Index, RDims> right_dim_mapper_t;
|
||||||
|
|
||||||
typedef array<Index, ContractDims> contract_t;
|
typedef array<Index, ContractDims> contract_t;
|
||||||
typedef array<Index, internal::max_n_1<LDims - ContractDims>::size> left_nocontract_t;
|
typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t;
|
||||||
typedef array<Index, internal::max_n_1<RDims - ContractDims>::size> right_nocontract_t;
|
typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t;
|
||||||
|
|
||||||
static const int NumDims = internal::max_n_1<LDims + RDims - 2 * ContractDims>::size;
|
static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size;
|
||||||
|
|
||||||
typedef DSizes<Index, NumDims> Dimensions;
|
typedef DSizes<Index, NumDims> Dimensions;
|
||||||
|
|
||||||
|
@ -510,7 +510,8 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
|||||||
Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
|
Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
|
||||||
typedef TensorEvalToOp<const KernelArgType> EvalTo;
|
typedef TensorEvalToOp<const KernelArgType> EvalTo;
|
||||||
EvalTo evalToTmp(local, m_kernelArg);
|
EvalTo evalToTmp(local, m_kernelArg);
|
||||||
internal::TensorExecutor<const EvalTo, Device, TensorEvaluator<KernelArgType, Device>::PacketAccess>::run(evalToTmp, m_device);
|
const bool PacketAccess = internal::IsVectorizable<Device, KernelArgType>::value;
|
||||||
|
internal::TensorExecutor<const EvalTo, Device, PacketAccess>::run(evalToTmp, m_device);
|
||||||
|
|
||||||
m_kernel = local;
|
m_kernel = local;
|
||||||
m_local_kernel = true;
|
m_local_kernel = true;
|
||||||
@ -815,7 +816,8 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
|||||||
Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
|
Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
|
||||||
typedef TensorEvalToOp<const KernelArgType> EvalTo;
|
typedef TensorEvalToOp<const KernelArgType> EvalTo;
|
||||||
EvalTo evalToTmp(local, m_kernelArg);
|
EvalTo evalToTmp(local, m_kernelArg);
|
||||||
internal::TensorExecutor<const EvalTo, GpuDevice, TensorEvaluator<KernelArgType, GpuDevice>::PacketAccess>::run(evalToTmp, m_device);
|
const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
|
||||||
|
internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);
|
||||||
|
|
||||||
m_kernel = local;
|
m_kernel = local;
|
||||||
m_local_kernel = true;
|
m_local_kernel = true;
|
||||||
|
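The two convolution hunks above stop reading TensorEvaluator<...>::PacketAccess directly and instead consult the internal::IsVectorizable trait that this commit moves into the forward declarations (see the TensorForwardDeclarations.h hunk further down); on GpuDevice that trait additionally requires IsAligned. A minimal sketch of the same kernel pre-evaluation pattern for a hypothetical expression type Expr and device Dev (the helper name eval_into is illustrative, not part of the commit):

template <typename Expr, typename Dev>
void eval_into(typename Expr::Scalar* buffer, const Expr& expr, const Dev& device) {
  typedef Eigen::TensorEvalToOp<const Expr> EvalTo;          // writes the result of 'expr' into 'buffer'
  EvalTo evalToTmp(buffer, expr);
  const bool Vectorize = Eigen::internal::IsVectorizable<Dev, Expr>::value;
  Eigen::internal::TensorExecutor<const EvalTo, Dev, Vectorize>::run(evalToTmp, device);
}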
unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h (new file, 310 lines)
@ -0,0 +1,310 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H
#define EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H

namespace Eigen {

/** \class TensorCustomUnaryOp
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor custom class.
  *
  */
namespace internal {
template<typename CustomUnaryFunc, typename XprType>
struct traits<TensorCustomUnaryOp<CustomUnaryFunc, XprType> >
{
  typedef typename XprType::Scalar Scalar;
  typedef typename packet_traits<Scalar>::type Packet;
  typedef typename XprType::StorageKind StorageKind;
  typedef typename XprType::Index Index;
  typedef typename XprType::Nested Nested;
  typedef typename remove_reference<Nested>::type _Nested;
  static const int NumDimensions = traits<XprType>::NumDimensions;
  static const int Layout = traits<XprType>::Layout;
};

template<typename CustomUnaryFunc, typename XprType>
struct eval<TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Eigen::Dense>
{
  typedef const TensorCustomUnaryOp<CustomUnaryFunc, XprType>& type;
};

template<typename CustomUnaryFunc, typename XprType>
struct nested<TensorCustomUnaryOp<CustomUnaryFunc, XprType> >
{
  typedef TensorCustomUnaryOp<CustomUnaryFunc, XprType> type;
};

} // end namespace internal


template<typename CustomUnaryFunc, typename XprType>
class TensorCustomUnaryOp : public TensorBase<TensorCustomUnaryOp<CustomUnaryFunc, XprType>, ReadOnlyAccessors>
{
  public:
    typedef typename internal::traits<TensorCustomUnaryOp>::Scalar Scalar;
    typedef typename internal::traits<TensorCustomUnaryOp>::Packet Packet;
    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
    typedef typename XprType::CoeffReturnType CoeffReturnType;
    typedef typename XprType::PacketReturnType PacketReturnType;
    typedef typename internal::nested<TensorCustomUnaryOp>::type Nested;
    typedef typename internal::traits<TensorCustomUnaryOp>::StorageKind StorageKind;
    typedef typename internal::traits<TensorCustomUnaryOp>::Index Index;

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomUnaryOp(const XprType& expr, const CustomUnaryFunc& func)
        : m_expr(expr), m_func(func) {}

    EIGEN_DEVICE_FUNC
    const CustomUnaryFunc& func() const { return m_func; }

    EIGEN_DEVICE_FUNC
    const typename internal::remove_all<typename XprType::Nested>::type&
    expression() const { return m_expr; }

  protected:
    typename XprType::Nested m_expr;
    const CustomUnaryFunc m_func;
};


// Eval as rvalue
template<typename CustomUnaryFunc, typename XprType, typename Device>
struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Device>
{
  typedef TensorCustomUnaryOp<CustomUnaryFunc, XprType> ArgType;
  typedef typename internal::traits<ArgType>::Index Index;
  static const int NumDims = internal::traits<ArgType>::NumDimensions;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef
      typename internal::remove_const<typename ArgType::Scalar>::type Scalar;

  enum {
    IsAligned = false,
    PacketAccess = (internal::packet_traits<Scalar>::size > 1),
    BlockAccess = false,
    Layout = TensorEvaluator<XprType, Device>::Layout,
    CoordAccess = false,  // to be implemented
  };

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device)
      : m_op(op), m_device(device), m_result(NULL)
  {
    m_dimensions = op.func().dimensions(op.expression());
  }

  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
  typedef typename XprType::PacketReturnType PacketReturnType;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
    if (data) {
      evalTo(data);
      return false;
    } else {
      m_result = static_cast<CoeffReturnType*>(
          m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
      evalTo(m_result);
      return true;
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
    if (m_result != NULL) {
      m_device.deallocate(m_result);
      m_result = NULL;
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
    return m_result[index];
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
    return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
  }

  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; }

 protected:
  EIGEN_DEVICE_FUNC void evalTo(Scalar* data) {
    TensorMap<Tensor<CoeffReturnType, NumDims, Layout, Index> > result(
        data, m_dimensions);
    m_op.func().eval(m_op.expression(), result, m_device);
  }

  Dimensions m_dimensions;
  const ArgType m_op;
  const Device& m_device;
  CoeffReturnType* m_result;
};


/** \class TensorCustomBinaryOp
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor custom class.
  *
  */
namespace internal {
template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>
struct traits<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> >
{
  typedef typename internal::promote_storage_type<typename LhsXprType::Scalar,
                                                  typename RhsXprType::Scalar>::ret Scalar;
  typedef typename packet_traits<Scalar>::type Packet;
  typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType,
                                                  typename RhsXprType::CoeffReturnType>::ret CoeffReturnType;
  typedef typename internal::promote_storage_type<typename LhsXprType::PacketReturnType,
                                                  typename RhsXprType::PacketReturnType>::ret PacketReturnType;
  typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
                                        typename traits<RhsXprType>::StorageKind>::ret StorageKind;
  typedef typename promote_index_type<typename traits<LhsXprType>::Index,
                                      typename traits<RhsXprType>::Index>::type Index;
  typedef typename LhsXprType::Nested LhsNested;
  typedef typename RhsXprType::Nested RhsNested;
  typedef typename remove_reference<LhsNested>::type _LhsNested;
  typedef typename remove_reference<RhsNested>::type _RhsNested;
  static const int NumDimensions = traits<LhsXprType>::NumDimensions;
  static const int Layout = traits<LhsXprType>::Layout;
};

template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>
struct eval<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Eigen::Dense>
{
  typedef const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>& type;
};

template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>
struct nested<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> >
{
  typedef TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> type;
};

} // end namespace internal


template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>
class TensorCustomBinaryOp : public TensorBase<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, ReadOnlyAccessors>
{
  public:
    typedef typename internal::traits<TensorCustomBinaryOp>::Scalar Scalar;
    typedef typename internal::traits<TensorCustomBinaryOp>::Packet Packet;
    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
    typedef typename internal::traits<TensorCustomBinaryOp>::CoeffReturnType CoeffReturnType;
    typedef typename internal::traits<TensorCustomBinaryOp>::PacketReturnType PacketReturnType;
    typedef typename internal::nested<TensorCustomBinaryOp>::type Nested;
    typedef typename internal::traits<TensorCustomBinaryOp>::StorageKind StorageKind;
    typedef typename internal::traits<TensorCustomBinaryOp>::Index Index;

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const CustomBinaryFunc& func)
        : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_func(func) {}

    EIGEN_DEVICE_FUNC
    const CustomBinaryFunc& func() const { return m_func; }

    EIGEN_DEVICE_FUNC
    const typename internal::remove_all<typename LhsXprType::Nested>::type&
    lhsExpression() const { return m_lhs_xpr; }

    EIGEN_DEVICE_FUNC
    const typename internal::remove_all<typename RhsXprType::Nested>::type&
    rhsExpression() const { return m_rhs_xpr; }

  protected:
    typename LhsXprType::Nested m_lhs_xpr;
    typename RhsXprType::Nested m_rhs_xpr;
    const CustomBinaryFunc m_func;
};


// Eval as rvalue
template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType, typename Device>
struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Device>
{
  typedef TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> XprType;
  typedef typename internal::traits<XprType>::Index Index;
  static const int NumDims = internal::traits<XprType>::NumDimensions;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename XprType::Scalar Scalar;

  enum {
    IsAligned = false,
    PacketAccess = (internal::packet_traits<Scalar>::size > 1),
    BlockAccess = false,
    Layout = TensorEvaluator<LhsXprType, Device>::Layout,
    CoordAccess = false,  // to be implemented
  };

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_op(op), m_device(device), m_result(NULL)
  {
    m_dimensions = op.func().dimensions(op.lhsExpression(), op.rhsExpression());
  }

  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
  typedef typename XprType::PacketReturnType PacketReturnType;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
    if (data) {
      evalTo(data);
      return false;
    } else {
      m_result = static_cast<Scalar *>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
      evalTo(m_result);
      return true;
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
    if (m_result != NULL) {
      m_device.deallocate(m_result);
      m_result = NULL;
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
    return m_result[index];
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
    return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
  }

  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; }

 protected:
  EIGEN_DEVICE_FUNC void evalTo(Scalar* data) {
    TensorMap<Tensor<Scalar, NumDims, Layout> > result(data, m_dimensions);
    m_op.func().eval(m_op.lhsExpression(), m_op.rhsExpression(), result, m_device);
  }

  Dimensions m_dimensions;
  const XprType m_op;
  const Device& m_device;
  CoeffReturnType* m_result;
};


} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H
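A custom functor only needs the two members that the evaluators above call: func().dimensions(expression) to report the output shape and func().eval(expression, result, device) to fill the result. A minimal sketch for a rank-2 input (the functor name and body are illustrative, not part of the commit):

struct DoubleValuesFunc {
  // Output shape: identical to the input tensor's shape.
  template <typename Input>
  Eigen::DSizes<Eigen::DenseIndex, 2> dimensions(const Input& input) const {
    return input.dimensions();
  }
  // Fill the preallocated output map on the given device.
  template <typename Input, typename Output, typename Device>
  void eval(const Input& input, Output& output, const Device& device) const {
    output.device(device) = input + input;
  }
};

// Usage: Eigen::TensorCustomUnaryOp<DoubleValuesFunc, decltype(tensor)> op(tensor, DoubleValuesFunc());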
@ -28,8 +28,25 @@ struct DefaultDevice {
     ::memset(buffer, c, n);
   }

-  EIGEN_STRONG_INLINE size_t numThreads() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
+#ifndef __CUDA_ARCH__
+    // Running on the host CPU
     return 1;
+#else
+    // Running on a CUDA device
+    return 32;
+#endif
   }

+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+#ifndef __CUDA_ARCH__
+    // Running single threaded on the host CPU
+    // Should return an enum that encodes the ISA supported by the CPU
+    return 1;
+#else
+    // Running on a CUDA device
+    return __CUDA_ARCH__ / 100;
+#endif
   }
 };

@ -38,10 +55,19 @@ struct DefaultDevice {
 // We should really use a thread pool here but first we need to find a portable thread pool library.
 #ifdef EIGEN_USE_THREADS

+// This defines an interface that ThreadPoolDevice can take to use
+// custom thread pools underneath.
+class ThreadPoolInterface {
+ public:
+  virtual void Schedule(std::function<void()> fn) = 0;
+
+  virtual ~ThreadPoolInterface() {}
+};
+
 // The implementation of the ThreadPool type ensures that the Schedule method
 // runs the functions it is provided in FIFO order when the scheduling is done
 // by a single thread.
-class ThreadPool {
+class ThreadPool : public ThreadPoolInterface {
  public:
  // Construct a pool that contains "num_threads" threads.
  explicit ThreadPool(int num_threads) {
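Any scheduler can now sit behind ThreadPoolDevice through this interface. A minimal sketch of an adapter built on std::thread (illustrative only; a real pool would reuse worker threads), assuming the interface lives in the Eigen namespace like the surrounding code:

#include <functional>
#include <thread>

class DetachedThreadPool : public Eigen::ThreadPoolInterface {
 public:
  // Run each scheduled closure on its own detached thread.
  virtual void Schedule(std::function<void()> fn) { std::thread(std::move(fn)).detach(); }
  virtual ~DetachedThreadPool() {}
};

// DetachedThreadPool pool;
// Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);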
@ -182,7 +208,7 @@ static EIGEN_STRONG_INLINE void wait_until_ready(Notification* n) {

 // Build a thread pool device on top the an existing pool of threads.
 struct ThreadPoolDevice {
-  ThreadPoolDevice(ThreadPool* pool, size_t num_cores) : pool_(pool), num_threads_(num_cores) { }
+  ThreadPoolDevice(ThreadPoolInterface* pool, size_t num_cores) : pool_(pool), num_threads_(num_cores) { }

   EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
     return internal::aligned_malloc(num_bytes);

@ -204,6 +230,11 @@ struct ThreadPoolDevice {
     return num_threads_;
   }

+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+    // Should return an enum that encodes the ISA supported by the CPU
+    return 1;
+  }
+
   template <class Function, class... Args>
   EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const {
     Notification* n = new Notification();

@ -219,7 +250,7 @@ struct ThreadPoolDevice {
   }

  private:
-  ThreadPool* pool_;
+  ThreadPoolInterface* pool_;
   size_t num_threads_;
 };

@ -260,9 +291,12 @@ static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
   assert(status == cudaSuccess);
 }

+// Cuda stream to use when no stream is specified explicitely.
+static const cudaStream_t default_stream = cudaStreamDefault;
+
 struct GpuDevice {
   // The cudastream is not owned: the caller is responsible for its initialization and eventual destruction.
-  GpuDevice(const cudaStream_t* stream) : stream_(stream) { eigen_assert(stream); }
+  GpuDevice(const cudaStream_t* stream = &default_stream) : stream_(stream) { eigen_assert(stream); }

   EIGEN_STRONG_INLINE const cudaStream_t& stream() const { return *stream_; }

@ -308,6 +342,8 @@ struct GpuDevice {
     return 32;
   }

+  inline int majorDeviceVersion() const { return m_deviceProperties.major; }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
     cudaStreamSynchronize(*stream_);
   }
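With the defaulted constructor argument a GpuDevice can now be created without explicitly managing a cudaStream_t; a short sketch (assumes a CUDA build):

Eigen::GpuDevice gpu;                        // bound to the default CUDA stream
const int arch = gpu.majorDeviceVersion();   // e.g. 3 on a compute capability 3.5 device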
unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h (new file, 235 lines)
@ -0,0 +1,235 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H
|
||||||
|
#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
|
||||||
|
/** \internal
|
||||||
|
*
|
||||||
|
* \class TensorDimensionList
|
||||||
|
* \ingroup CXX11_Tensor_Module
|
||||||
|
*
|
||||||
|
* \brief Special case of tensor index list used to list all the dimensions of a tensor of rank n.
|
||||||
|
*
|
||||||
|
* \sa Tensor
|
||||||
|
*/
|
||||||
|
|
||||||
|
template <typename Index, std::size_t Rank> struct DimensionList {
|
||||||
|
const Index operator[] (const Index i) const { return i; }
|
||||||
|
};
|
||||||
|
|
||||||
|
namespace internal {
|
||||||
|
|
||||||
|
template<typename Index, std::size_t Rank> struct array_size<DimensionList<Index, Rank> > {
|
||||||
|
static const size_t value = Rank;
|
||||||
|
};
|
||||||
|
template<typename Index, std::size_t Rank> struct array_size<const DimensionList<Index, Rank> > {
|
||||||
|
static const size_t value = Rank;
|
||||||
|
};
|
||||||
|
|
||||||
|
template<DenseIndex n, typename Index, std::size_t Rank> const Index array_get(DimensionList<Index, Rank>&) {
|
||||||
|
return n;
|
||||||
|
}
|
||||||
|
template<DenseIndex n, typename Index, std::size_t Rank> const Index array_get(const DimensionList<Index, Rank>&) {
|
||||||
|
return n;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(EIGEN_HAS_CONSTEXPR)
|
||||||
|
template <typename Index, std::size_t Rank>
|
||||||
|
struct index_known_statically<DimensionList<Index, Rank> > {
|
||||||
|
constexpr bool operator() (const DenseIndex) const {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template <typename Index, std::size_t Rank>
|
||||||
|
struct index_known_statically<const DimensionList<Index, Rank> > {
|
||||||
|
constexpr bool operator() (const DenseIndex) const {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Index, std::size_t Rank>
|
||||||
|
struct all_indices_known_statically<DimensionList<Index, Rank> > {
|
||||||
|
constexpr bool operator() () const {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template <typename Index, std::size_t Rank>
|
||||||
|
struct all_indices_known_statically<const DimensionList<Index, Rank> > {
|
||||||
|
constexpr bool operator() () const {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Index, std::size_t Rank>
|
||||||
|
struct indices_statically_known_to_increase<DimensionList<Index, Rank> > {
|
||||||
|
constexpr bool operator() () const {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template <typename Index, std::size_t Rank>
|
||||||
|
struct indices_statically_known_to_increase<const DimensionList<Index, Rank> > {
|
||||||
|
constexpr bool operator() () const {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Index, std::size_t Rank>
|
||||||
|
struct index_statically_eq<DimensionList<Index, Rank> > {
|
||||||
|
constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
|
||||||
|
return i == value;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template <typename Index, std::size_t Rank>
|
||||||
|
struct index_statically_eq<const DimensionList<Index, Rank> > {
|
||||||
|
constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
|
||||||
|
return i == value;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Index, std::size_t Rank>
|
||||||
|
struct index_statically_ne<DimensionList<Index, Rank> > {
|
||||||
|
constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
|
||||||
|
return i != value;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template <typename Index, std::size_t Rank>
|
||||||
|
struct index_statically_ne<const DimensionList<Index, Rank> > {
|
||||||
|
constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
|
||||||
|
return i != value;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Index, std::size_t Rank>
|
||||||
|
struct index_statically_gt<DimensionList<Index, Rank> > {
|
||||||
|
constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
|
||||||
|
return i > value;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template <typename Index, std::size_t Rank>
|
||||||
|
struct index_statically_gt<const DimensionList<Index, Rank> > {
|
||||||
|
constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
|
||||||
|
return i > value;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Index, std::size_t Rank>
|
||||||
|
struct index_statically_lt<DimensionList<Index, Rank> > {
|
||||||
|
constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
|
||||||
|
return i < value;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template <typename Index, std::size_t Rank>
|
||||||
|
struct index_statically_lt<const DimensionList<Index, Rank> > {
|
||||||
|
constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
|
||||||
|
return i < value;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
#else
|
||||||
|
template <typename Index, std::size_t Rank>
|
||||||
|
struct index_known_statically<DimensionList<Index, Rank> > {
|
||||||
|
EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex) const {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template <typename Index, std::size_t Rank>
|
||||||
|
struct index_known_statically<const DimensionList<Index, Rank> > {
|
||||||
|
EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex) const {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Index, std::size_t Rank>
|
||||||
|
struct all_indices_known_statically<DimensionList<Index, Rank> > {
|
||||||
|
EIGEN_ALWAYS_INLINE bool operator() () const {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template <typename Index, std::size_t Rank>
|
||||||
|
struct all_indices_known_statically<const DimensionList<Index, Rank> > {
|
||||||
|
EIGEN_ALWAYS_INLINE bool operator() () const {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Index, std::size_t Rank>
|
||||||
|
struct indices_statically_known_to_increase<DimensionList<Index, Rank> > {
|
||||||
|
EIGEN_ALWAYS_INLINE bool operator() () const {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template <typename Index, std::size_t Rank>
|
||||||
|
struct indices_statically_known_to_increase<const DimensionList<Index, Rank> > {
|
||||||
|
EIGEN_ALWAYS_INLINE bool operator() () const {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Index, std::size_t Rank>
|
||||||
|
struct index_statically_eq<DimensionList<Index, Rank> > {
|
||||||
|
EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex, const DenseIndex) const {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template <typename Index, std::size_t Rank>
|
||||||
|
struct index_statically_eq<const DimensionList<Index, Rank> > {
|
||||||
|
EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex, const DenseIndex) const {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Index, std::size_t Rank>
|
||||||
|
struct index_statically_ne<DimensionList<Index, Rank> > {
|
||||||
|
EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex, const DenseIndex) const {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template <typename Index, std::size_t Rank>
|
||||||
|
struct index_statically_ne<const DimensionList<Index, Rank> > {
|
||||||
|
EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex, const DenseIndex) const {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Index, std::size_t Rank>
|
||||||
|
struct index_statically_gt<DimensionList<Index, Rank> > {
|
||||||
|
EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex, const DenseIndex) const {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template <typename Index, std::size_t Rank>
|
||||||
|
struct index_statically_gt<const DimensionList<Index, Rank> > {
|
||||||
|
EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex, const DenseIndex) const {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Index, std::size_t Rank>
|
||||||
|
struct index_statically_lt<DimensionList<Index, Rank> > {
|
||||||
|
EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex, const DenseIndex) const {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template <typename Index, std::size_t Rank>
|
||||||
|
struct index_statically_lt<const DimensionList<Index, Rank> > {
|
||||||
|
EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex, const DenseIndex) const {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
|
} // end namespace internal
|
||||||
|
} // end namespace Eigen
|
||||||
|
|
||||||
|
|
||||||
|
#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H
|
@ -69,6 +69,31 @@ struct fixed_size_tensor_index_linearization_helper<Index, NumIndices, 0, RowMaj
   }
 };

+template<typename Index, std::size_t n>
+struct fixed_size_tensor_index_extraction_helper
+{
+  template <typename Dimensions> EIGEN_DEVICE_FUNC
+  static inline Index run(const Index index,
+                          const Dimensions& dimensions)
+  {
+    const Index mult = (index == n) ? 1 : 0;
+    return array_get<n>(dimensions) * mult +
+        fixed_size_tensor_index_extraction_helper<Index, n - 1>::run(index, dimensions);
+  }
+};
+
+template<typename Index>
+struct fixed_size_tensor_index_extraction_helper<Index, 0>
+{
+  template <typename Dimensions> EIGEN_DEVICE_FUNC
+  static inline Index run(const Index index,
+                          const Dimensions& dimensions)
+  {
+    const Index mult = (index == 0) ? 1 : 0;
+    return array_get<0>(dimensions) * mult;
+  }
+};
+
 } // end namespace internal

@ -99,6 +124,10 @@ struct Sizes : internal::numeric_list<std::ptrdiff_t, Indices...> {
   }
 #endif

+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const int index) const {
+    return internal::fixed_size_tensor_index_extraction_helper<std::ptrdiff_t, Base::count - 1>::run(index, *this);
+  }
+
   template <typename T> Sizes& operator = (const T& /*other*/) {
     // add assertion failure if the size of other is different
     return *this;

@ -114,10 +143,12 @@ struct Sizes : internal::numeric_list<std::ptrdiff_t, Indices...> {
   }
 };

+namespace internal {
 template <typename std::ptrdiff_t... Indices>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes<Indices...>&) {
   return Sizes<Indices...>::total_size;
 }
+}

 #else

@ -166,6 +197,24 @@ template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0
   }
 #endif

+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex operator[] (const int index) const {
+    switch (index) {
+      case 0:
+        return internal::get<0, Base>::value;
+      case 1:
+        return internal::get<1, Base>::value;
+      case 2:
+        return internal::get<2, Base>::value;
+      case 3:
+        return internal::get<3, Base>::value;
+      case 4:
+        return internal::get<4, Base>::value;
+      default:
+        eigen_assert(false && "index overflow");
+        return static_cast<std::size_t>(-1);
+    }
+  }
+
   template <typename T> Sizes& operator = (const T&) {
     // to do: check the size of other
     return *this;

@ -181,10 +230,12 @@ template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0
   }
 };

+namespace internal {
 template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes<V1, V2, V3, V4, V5>&) {
   return Sizes<V1, V2, V3, V4, V5>::total_size;
 }
+}

 #endif

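The added operator[] lets the purely compile-time Sizes<> dimension list be indexed with a runtime value, which generic evaluator code can rely on regardless of whether the sizes are static or dynamic. A short sketch (C++11 code path):

Eigen::Sizes<2, 3, 4> dims;
// dims[0] == 2, dims[1] == 3, dims[2] == 4; any other index hits eigen_assert("index overflow").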
@ -113,9 +113,9 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>

   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }

-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* scalar) {
-    m_impl.evalSubExprsIfNeeded(NULL);
+    eigen_assert(scalar == NULL);
-    return true;
+    return m_impl.evalSubExprsIfNeeded(m_buffer);
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
@ -22,13 +22,8 @@ namespace Eigen {
 */
 namespace internal {

-template <typename Device, typename Expression>
-struct IsVectorizable {
-  static const bool value = TensorEvaluator<Expression, Device>::PacketAccess;
-};
-
 // Default strategy: the expression is evaluated with a single cpu thread.
-template<typename Expression, typename Device = DefaultDevice, bool Vectorizable = IsVectorizable<Device, Expression>::value>
+template<typename Expression, typename Device, bool Vectorizable>
 class TensorExecutor
 {
  public:

@ -198,10 +193,6 @@ EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) {
   }
 }

-template <typename Expression>
-struct IsVectorizable<GpuDevice, Expression> {
-  static const bool value = TensorEvaluator<Expression, GpuDevice>::PacketAccess && TensorEvaluator<Expression, GpuDevice>::IsAligned;
-};
-
 template<typename Expression>
 class TensorExecutor<Expression, GpuDevice, false>
@ -116,7 +116,8 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
     }
     typedef TensorEvalToOp<const ArgType> EvalTo;
     EvalTo evalToTmp(m_buffer, m_op);
-    internal::TensorExecutor<const EvalTo, Device, TensorEvaluator<ArgType, Device>::PacketAccess>::run(evalToTmp, m_device);
+    const bool PacketAccess = internal::IsVectorizable<Device, ArgType>::value;
+    internal::TensorExecutor<const EvalTo, Device, PacketAccess>::run(evalToTmp, m_device);
     m_impl.cleanup();
     return true;
   }
@ -29,6 +29,7 @@ template<typename TargetType, typename XprType> class TensorConversionOp;
 template<typename Dimensions, typename InputXprType, typename KernelXprType> class TensorConvolutionOp;
 template<typename PatchDim, typename XprType> class TensorPatchOp;
 template<DenseIndex Rows, DenseIndex Cols, typename XprType> class TensorImagePatchOp;
+template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType> class TensorVolumePatchOp;
 template<typename Broadcast, typename XprType> class TensorBroadcastingOp;
 template<DenseIndex DimId, typename XprType> class TensorChippingOp;
 template<typename NewDimensions, typename XprType> class TensorReshapingOp;

@ -41,14 +42,36 @@ template<typename Strides, typename XprType> class TensorStridingOp;
 template<typename Generator, typename XprType> class TensorGeneratorOp;
 template<typename LeftXprType, typename RightXprType> class TensorAssignOp;

+template<typename CustomUnaryFunc, typename XprType> class TensorCustomUnaryOp;
+template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> class TensorCustomBinaryOp;
+
 template<typename XprType> class TensorEvalToOp;
 template<typename XprType> class TensorForcedEvalOp;

 template<typename ExpressionType, typename DeviceType> class TensorDevice;
 template<typename Derived, typename Device> struct TensorEvaluator;

+class DefaultDevice;
+class ThreadPoolDevice;
+class GpuDevice;
+
 namespace internal {
-template<typename Expression, typename Device, bool Vectorizable> class TensorExecutor;
+
+template <typename Device, typename Expression>
+struct IsVectorizable {
+  static const bool value = TensorEvaluator<Expression, Device>::PacketAccess;
+};
+
+template <typename Expression>
+struct IsVectorizable<GpuDevice, Expression> {
+  static const bool value = TensorEvaluator<Expression, GpuDevice>::PacketAccess &&
+                            TensorEvaluator<Expression, GpuDevice>::IsAligned;
+};
+
+template <typename Expression, typename Device,
+          bool Vectorizable = IsVectorizable<Device, Expression>::value>
+class TensorExecutor;
+
 } // end namespace internal

 } // end namespace Eigen
@ -17,6 +17,7 @@ namespace internal {
 template <typename T> struct SumReducer
 {
   static const bool PacketAccess = true;
+  static const bool IsStateful = false;

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
     (*accum) += t;

@ -49,6 +50,8 @@ template <typename T> struct SumReducer
 template <typename T> struct MeanReducer
 {
   static const bool PacketAccess = true;
+  static const bool IsStateful = true;
+
   MeanReducer() : scalarCount_(0), packetCount_(0) { }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) {

@ -88,6 +91,7 @@ template <typename T> struct MeanReducer
 template <typename T> struct MaxReducer
 {
   static const bool PacketAccess = true;
+  static const bool IsStateful = false;

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
     if (t > *accum) { *accum = t; }

@ -120,6 +124,7 @@ template <typename T> struct MaxReducer
 template <typename T> struct MinReducer
 {
   static const bool PacketAccess = true;
+  static const bool IsStateful = false;

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
     if (t < *accum) { *accum = t; }

@ -153,6 +158,7 @@ template <typename T> struct MinReducer
 template <typename T> struct ProdReducer
 {
   static const bool PacketAccess = true;
+  static const bool IsStateful = false;

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
     (*accum) *= t;
@ -90,7 +90,7 @@ class TensorLayoutSwapOp : public TensorBase<TensorLayoutSwapOp<XprType>, WriteA
     {
       typedef TensorAssignOp<TensorLayoutSwapOp, const TensorLayoutSwapOp> Assign;
       Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice());
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
       return *this;
     }

@ -100,7 +100,7 @@ class TensorLayoutSwapOp : public TensorBase<TensorLayoutSwapOp<XprType>, WriteA
     {
       typedef TensorAssignOp<TensorLayoutSwapOp, const OtherDerived> Assign;
       Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice());
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
       return *this;
     }

unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h (new file, 36 lines)
@ -0,0 +1,36 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_H
#define EIGEN_CXX11_TENSOR_TENSOR_META_H

namespace Eigen {

template<bool cond> struct Cond {};

template<typename T1, typename T2> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
const T1& choose(Cond<true>, const T1& first, const T2&) {
  return first;
}

template<typename T1, typename T2> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
const T2& choose(Cond<false>, const T1&, const T2& second) {
  return second;
}

template <size_t n> struct max_n_1 {
  static const size_t size = n;
};
template <> struct max_n_1<0> {
  static const size_t size = 1;
};

} // namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_META_H
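TensorMeta.h simply centralizes helpers that previously lived next to the contraction code: max_n_1<n>::size clamps a dimension count to at least one, and choose() selects between two values based on a compile-time condition. A small illustration (not part of the file):

static_assert(Eigen::max_n_1<5>::size == 5, "non-zero sizes are preserved");
static_assert(Eigen::max_n_1<0>::size == 1, "a rank-0 result still gets one coefficient");
// choose(Eigen::Cond<true>(), a, b) yields a; choose(Eigen::Cond<false>(), a, b) yields b.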
@ -78,7 +78,7 @@ class TensorReshapingOp : public TensorBase<TensorReshapingOp<NewDimensions, Xpr
     {
       typedef TensorAssignOp<TensorReshapingOp, const TensorReshapingOp> Assign;
       Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice());
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
       return *this;
     }

@ -88,7 +88,7 @@ class TensorReshapingOp : public TensorBase<TensorReshapingOp<NewDimensions, Xpr
     {
       typedef TensorAssignOp<TensorReshapingOp, const OtherDerived> Assign;
       Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice());
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
       return *this;
     }

@ -262,7 +262,7 @@ class TensorSlicingOp : public TensorBase<TensorSlicingOp<StartIndices, Sizes, X
     {
       typedef TensorAssignOp<TensorSlicingOp, const OtherDerived> Assign;
       Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice());
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
       return *this;
     }

@ -271,7 +271,7 @@ class TensorSlicingOp : public TensorBase<TensorSlicingOp<StartIndices, Sizes, X
     {
       typedef TensorAssignOp<TensorSlicingOp, const TensorSlicingOp> Assign;
       Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice());
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
       return *this;
     }

@ -411,7 +411,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
   {
     const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
     EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+    eigen_assert(index+packetSize-1 < internal::array_prod(dimensions()));

     Index inputIndices[] = {0, 0};
     Index indices[] = {index, index + packetSize - 1};
||||||
|
@ -44,6 +44,38 @@ struct nested<TensorReductionOp<Op, Dims, XprType>, 1, typename eval<TensorReduc
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
template <typename OutputDims> struct DimInitializer {
|
||||||
|
template <typename InputDims, typename ReducedDims> EIGEN_DEVICE_FUNC
|
||||||
|
static void run(const InputDims& input_dims,
|
||||||
|
const array<bool, internal::array_size<InputDims>::value>& reduced,
|
||||||
|
OutputDims* output_dims, ReducedDims* reduced_dims) {
|
||||||
|
const int NumInputDims = internal::array_size<InputDims>::value;
|
||||||
|
int outputIndex = 0;
|
||||||
|
int reduceIndex = 0;
|
||||||
|
for (int i = 0; i < NumInputDims; ++i) {
|
||||||
|
if (reduced[i]) {
|
||||||
|
(*reduced_dims)[reduceIndex] = input_dims[i];
|
||||||
|
++reduceIndex;
|
||||||
|
} else {
|
||||||
|
(*output_dims)[outputIndex] = input_dims[i];
|
||||||
|
++outputIndex;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <> struct DimInitializer<Sizes<1> > {
|
||||||
|
template <typename InputDims, typename Index, size_t Rank> EIGEN_DEVICE_FUNC
|
||||||
|
static void run(const InputDims& input_dims, const array<bool, Rank>&,
|
||||||
|
Sizes<1>*, array<Index, Rank>* reduced_dims) {
|
||||||
|
const int NumInputDims = internal::array_size<InputDims>::value;
|
||||||
|
for (int i = 0; i < NumInputDims; ++i) {
|
||||||
|
(*reduced_dims)[i] = input_dims[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
template <typename ReducedDims, int NumTensorDims, int Layout>
|
template <typename ReducedDims, int NumTensorDims, int Layout>
|
||||||
struct are_inner_most_dims {
|
struct are_inner_most_dims {
|
||||||
static const bool value = false;
|
static const bool value = false;
|
||||||
@ -144,7 +176,7 @@ template <int DimIndex, typename Self, typename Op>
 struct InnerMostDimPreserver<DimIndex, Self, Op, true> {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
     EIGEN_STATIC_ASSERT(DimIndex > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
-    for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
+    for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
       const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
       InnerMostDimPreserver<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
     }
|
||||||
@@ -154,13 +186,325 @@ struct InnerMostDimPreserver<DimIndex, Self, Op, true> {
 template <typename Self, typename Op>
 struct InnerMostDimPreserver<0, Self, Op, true> {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
-    for (int j = 0; j < self.m_reducedDims[0]; ++j) {
+    for (typename Self::Index j = 0; j < self.m_reducedDims[0]; ++j) {
       const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
       reducer.reducePacket(self.m_impl.template packet<Unaligned>(input), accum);
     }
   }
 };

+// Default full reducer
+template <typename Self, typename Op, typename Device, bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
+struct FullReducer {
+  static const bool HasOptimizedImplementation = false;
+
+  static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) {
+    const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
+    *output = InnerMostDimReducer<Self, Op>::reduce(self, 0, num_coeffs, reducer);
+  }
+};
+
+
+#ifdef EIGEN_USE_THREADS
+// Multithreaded full reducers
+template <typename Eval, typename Op, bool Vectorizable = (Eval::InputPacketAccess & Op::PacketAccess)>
+struct FullReducerShard {
+  static void run(const Eval& eval, typename Eval::Index firstIndex, typename Eval::Index numValuesToReduce, Op& reducer, FullReducerShard* shard) {
+
+    shard->saccum = reducer.initialize();
+    for (typename Eval::Index j = 0; j < numValuesToReduce; ++j) {
+      reducer.reduce(eval.m_impl.coeff(firstIndex + j), &shard->saccum);
+    }
+  }
+
+  typename Eval::CoeffReturnType saccum;
+};
+
+template <typename Eval, typename Op>
+struct FullReducerShard<Eval, Op, true> {
+  static void run(const Eval& eval, typename Eval::Index firstIndex, typename Eval::Index numValuesToReduce, Op& reducer, FullReducerShard* shard) {
+
+    const int packetSize = internal::unpacket_traits<typename Eval::PacketReturnType>::size;
+    const typename Eval::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
+
+    shard->paccum = reducer.template initializePacket<typename Eval::PacketReturnType>();
+    for (typename Eval::Index j = 0; j < VectorizedSize; j += packetSize) {
+      reducer.reducePacket(eval.m_impl.template packet<Unaligned>(firstIndex + j), &shard->paccum);
+    }
+    shard->saccum = reducer.initialize();
+    for (typename Eval::Index j = VectorizedSize; j < numValuesToReduce; ++j) {
+      reducer.reduce(eval.m_impl.coeff(firstIndex + j), &shard->saccum);
+    }
+  }
+
+  typename Eval::PacketReturnType paccum;
+  typename Eval::CoeffReturnType saccum;
+};
+
+
+template <typename Self, typename Op>
+struct FullReducer<Self, Op, ThreadPoolDevice, false> {
+  static const bool HasOptimizedImplementation = !Op::IsStateful;
+
+  // launch one reducer per thread and accumulate the result.
+  static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device, typename Self::CoeffReturnType* output) {
+    typedef typename Self::Index Index;
+    const Index num_coeffs = array_prod(self.m_impl.dimensions());
+    const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs)/device.numThreads());
+    const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
+    eigen_assert(num_coeffs >= numblocks * blocksize);
+
+    std::vector<Notification*> results;
+    results.reserve(numblocks);
+    std::vector<FullReducerShard<Self, Op, false> > shards;
+    shards.resize(numblocks);
+    for (Index i = 0; i < numblocks; ++i) {
+      results.push_back(device.enqueue(&FullReducerShard<Self, Op, false>::run, self, i*blocksize, blocksize, reducer, &shards[i]));
+    }
+
+    FullReducerShard<Self, Op, false> finalShard;
+    if (numblocks * blocksize < num_coeffs) {
+      FullReducerShard<Self, Op, false>::run(self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer, &finalShard);
+    } else {
+      finalShard.saccum = reducer.initialize();
+    }
+
+    for (Index i = 0; i < numblocks; ++i) {
+      wait_until_ready(results[i]);
+      delete results[i];
+    }
+
+    for (Index i = 0; i < numblocks; ++i) {
+      reducer.reduce(shards[i].saccum, &finalShard.saccum);
+    }
+    *output = reducer.finalize(finalShard.saccum);
+  }
+};
+
+template <typename Self, typename Op>
+struct FullReducer<Self, Op, ThreadPoolDevice, true> {
+  static const bool HasOptimizedImplementation = !Op::IsStateful;
+
+  // launch one reducer per thread and accumulate the result.
+  static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device, typename Self::CoeffReturnType* output) {
+    typedef typename Self::Index Index;
+    const Index num_coeffs = array_prod(self.m_impl.dimensions());
+    const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs)/device.numThreads());
+    const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
+    eigen_assert(num_coeffs >= numblocks * blocksize);
+
+    std::vector<Notification*> results;
+    results.reserve(numblocks);
+    std::vector<FullReducerShard<Self, Op, true> > shards;
+    shards.resize(numblocks);
+    for (Index i = 0; i < numblocks; ++i) {
+      results.push_back(device.enqueue(&FullReducerShard<Self, Op, true>::run, self, i*blocksize, blocksize, reducer, &shards[i]));
+    }
+
+    FullReducerShard<Self, Op, true> finalShard;
+    if (numblocks * blocksize < num_coeffs) {
+      FullReducerShard<Self, Op, true>::run(self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer, &finalShard);
+    } else {
+      finalShard.paccum = reducer.template initializePacket<typename Self::PacketReturnType>();
+      finalShard.saccum = reducer.initialize();
+    }
+
+    for (Index i = 0; i < numblocks; ++i) {
+      wait_until_ready(results[i]);
+      delete results[i];
+    }
+
+    for (Index i = 0; i < numblocks; ++i) {
+      reducer.reducePacket(shards[i].paccum, &finalShard.paccum);
+      reducer.reduce(shards[i].saccum, &finalShard.saccum);
+    }
+
+    *output = reducer.finalizeBoth(finalShard.saccum, finalShard.paccum);
+  }
+};
+#endif
+
+
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+// Full reducers for GPU, don't vectorize for now
+
+// Reducer function that enables multiple cuda thread to safely accumulate at the same
+// output address. It basically reads the current value of the output variable, and
+// attempts to update it with the new value. If in the meantime another cuda thread
+// updated the content of the output address it will try again.
+template <typename T, typename R>
+__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
+#if __CUDA_ARCH__ >= 300
+  if (sizeof(T) == 4)
+  {
+    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
+    unsigned int newval = oldval;
+    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+    if (newval == oldval) {
+      return;
+    }
+    unsigned int readback;
+    while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
+      oldval = readback;
+      newval = oldval;
+      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+      if (newval == oldval) {
+        return;
+      }
+    }
+  }
+  else if (sizeof(T) == 8) {
+    unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
+    unsigned long long newval = oldval;
+    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+    if (newval == oldval) {
+      return;
+    }
+    unsigned long long readback;
+    while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
+      oldval = readback;
+      newval = oldval;
+      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+      if (newval == oldval) {
+        return;
+      }
+    }
+  }
+  else {
+    assert(0 && "Wordsize not supported");
+  }
+#else
+  assert(0 && "Shouldn't be called on unsupported device");
+#endif
+}
+
+template <typename T>
+__device__ inline void atomicReduce(T* output, T accum, SumReducer<T>&) {
+#if __CUDA_ARCH__ >= 300
+  atomicAdd(output, accum);
+#else
+  assert(0 && "Shouldn't be called on unsupported device");
+#endif
+}
+
+template <int BlockSize, int NumPerThread, typename Self,
+          typename Reducer, typename Index>
+__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
+                                    typename Self::CoeffReturnType* output) {
+  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
+
+  if (first_index == 0) {
+    *output = reducer.initialize();
+  }
+
+  typename Self::CoeffReturnType accum = reducer.initialize();
+  for (Index i = 0; i < NumPerThread; ++i) {
+    const Index index = first_index + i * BlockSize;
+    if (index >= num_coeffs) {
+      break;
+    }
+    typename Self::CoeffReturnType val = input.m_impl.coeff(index);
+    reducer.reduce(val, &accum);
+  }
+
+  for (int offset = warpSize/2; offset > 0; offset /= 2) {
+    reducer.reduce(__shfl_down(accum, offset), &accum);
+  }
+
+  if ((threadIdx.x & (warpSize - 1)) == 0) {
+    atomicReduce(output, accum, reducer);
+  }
+}
+
+
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
+  // Unfortunately nvidia doesn't support well exotic types such as complex,
+  // so reduce the scope of the optimized version of the code to the simple case
+  // of floats.
+  static const bool HasOptimizedImplementation = !Op::IsStateful &&
+                                                 internal::is_same<typename Self::CoeffReturnType, float>::value;
+
+  template <typename OutputType>
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
+    assert(false && "Should only be called on floats");
+  }
+
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) {
+    typedef typename Self::Index Index;
+
+    const Index num_coeffs = array_prod(self.m_impl.dimensions());
+    const int block_size = 256;
+    const int num_per_thread = 128;
+    const int num_blocks = std::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread));
+    LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread>),
+                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output);
+  }
+};
+
+#endif
+
+
+template <typename Self, typename Op,
+          bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
+class BlockReducer {
+ public:
+  typedef typename Self::Index Index;
+  typedef typename Self::Scalar Scalar;
+  typedef typename Self::CoeffReturnType CoeffReturnType;
+  explicit BlockReducer(const Op& reducer) : op_(reducer) {
+    accum_ = op_.initialize();
+  }
+  void Reduce(Index index, Index num_values_to_reduce, Scalar* data) {
+    for (Index i = 0; i < num_values_to_reduce; ++i) {
+      op_.reduce(data[index + i], &accum_);
+    }
+  }
+  CoeffReturnType Finalize() {
+    return op_.finalize(accum_);
+  }
+
+ private:
+  CoeffReturnType accum_;
+  Op op_;
+};
+
+
+template <typename Self, typename Op>
+class BlockReducer<Self, Op, true> {
+ public:
+  typedef typename Self::Index Index;
+  typedef typename Self::Scalar Scalar;
+  typedef typename Self::CoeffReturnType CoeffReturnType;
+  typedef typename Self::PacketReturnType PacketReturnType;
+  explicit BlockReducer(const Op& reducer) : op_(reducer) {
+    vaccum_ = op_.template initializePacket<PacketReturnType>();
+    accum_ = op_.initialize();
+  }
+  void Reduce(Index index, Index num_values_to_reduce, Scalar* data) {
+    const int packet_size = internal::unpacket_traits<PacketReturnType>::size;
+    const typename Self::Index vectorized_size = (num_values_to_reduce /
+                                                  packet_size) * packet_size;
+    for (typename Self::Index i = 0; i < vectorized_size; i += packet_size) {
+      op_.reducePacket(internal::ploadt<PacketReturnType, Unaligned>(
+                           &data[index + i]), &vaccum_);
+    }
+
+    for (typename Self::Index i = vectorized_size;
+         i < num_values_to_reduce; ++i) {
+      op_.reduce(data[index + i], &accum_);
+    }
+  }
+  typename Self::CoeffReturnType Finalize() {
+    return op_.finalizeBoth(accum_, vaccum_);
+  }
+
+ private:
+  typename Self::PacketReturnType vaccum_;
+  typename Self::CoeffReturnType accum_;
+  Op op_;
+};
+
 } // end namespace internal

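The ThreadPoolDevice specializations above follow a plain shard-and-combine pattern: cut the coefficient range into roughly equal blocks, reduce each block into a per-shard accumulator, then fold the shard results into a final accumulator. A minimal CPU-only sketch of that pattern using std::thread (standing in for the Eigen thread pool; none of the Eigen types are used) looks like this:

#include <iostream>
#include <numeric>
#include <thread>
#include <vector>

// Shard-and-combine sum: each worker reduces one block, the caller combines
// the per-shard partial sums after joining the workers.
float parallel_sum(const std::vector<float>& data, int num_threads) {
  const std::size_t blocksize = data.size() / num_threads;
  std::vector<float> partial(num_threads, 0.0f);
  std::vector<std::thread> workers;
  for (int i = 0; i < num_threads; ++i) {
    const std::size_t begin = i * blocksize;
    const std::size_t end = (i + 1 == num_threads) ? data.size() : begin + blocksize;
    workers.emplace_back([&, begin, end, i] {
      partial[i] = std::accumulate(data.begin() + begin, data.begin() + end, 0.0f);
    });
  }
  for (auto& w : workers) w.join();                                // wait for all shards
  return std::accumulate(partial.begin(), partial.end(), 0.0f);    // combine
}

int main() {
  std::vector<float> v(1000, 1.0f);
  std::cout << parallel_sum(v, 4) << "\n";  // prints 1000
  return 0;
}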
@@ -179,6 +523,7 @@ class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType>
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     TensorReductionOp(const XprType& expr, const Dims& dims) : m_expr(expr), m_dims(dims)
     { }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer) : m_expr(expr), m_dims(dims), m_reducer(reducer)
     { }

@@ -186,6 +531,7 @@ class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType>
     const XprType& expression() const { return m_expr; }
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const Dims& dims() const { return m_dims; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const Op& reducer() const { return m_reducer; }

   protected:
@@ -201,10 +547,11 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
 {
   typedef TensorReductionOp<Op, Dims, ArgType> XprType;
   typedef typename XprType::Index Index;
-  static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
+  static const int NumInputDims = internal::array_size<InputDimensions>::value;
   static const int NumReducedDims = internal::array_size<Dims>::value;
   static const int NumOutputDims = (NumInputDims==NumReducedDims) ? 1 : NumInputDims - NumReducedDims;
-  typedef DSizes<Index, NumOutputDims> Dimensions;
+  typedef typename internal::conditional<NumInputDims==NumReducedDims, Sizes<1>, DSizes<Index, NumOutputDims> >::type Dimensions;
   typedef typename XprType::Scalar Scalar;
   typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> Self;
   static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;
@@ -218,9 +565,10 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>

   static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
   static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
+  static const bool RunningFullReduction = (NumInputDims==NumReducedDims);

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device), m_reducer(op.reducer())
+      : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device)
   {
     EIGEN_STATIC_ASSERT(NumInputDims >= NumReducedDims, YOU_MADE_A_PROGRAMMING_MISTAKE);
     EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
@@ -238,17 +586,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
     }

     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
-    int outputIndex = 0;
-    int reduceIndex = 0;
-    for (int i = 0; i < NumInputDims; ++i) {
-      if (reduced[i]) {
-        m_reducedDims[reduceIndex] = input_dims[i];
-        ++reduceIndex;
-      } else {
-        m_dimensions[outputIndex] = input_dims[i];
-        ++outputIndex;
-      }
-    }
+    internal::DimInitializer<Dimensions>::run(input_dims, reduced, &m_dimensions, &m_reducedDims);

     // Precompute output strides.
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -277,8 +615,8 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
       }
     }

-    outputIndex = 0;
-    reduceIndex = 0;
+    int outputIndex = 0;
+    int reduceIndex = 0;
     for (int i = 0; i < NumInputDims; ++i) {
       if (reduced[i]) {
         m_reducedStrides[reduceIndex] = input_strides[i];
@@ -291,27 +629,50 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>

     // Special case for full reductions
     if (NumInputDims == NumReducedDims) {
-      m_dimensions[0] = 1;
+      eigen_assert(m_dimensions[0] == 1);
       m_preservedStrides[0] = internal::array_prod(input_dims);
     }
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+  typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType;
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
     m_impl.evalSubExprsIfNeeded(NULL);
+
+    // Use the FullReducer if possible.
+    if (RunningFullReduction && internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
+        ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
+         (internal::array_prod(m_impl.dimensions()) > 1024 * 1024))) {
+
+      bool need_assign = false;
+      if (!data) {
+        m_result = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType)));
+        data = m_result;
+        need_assign = true;
+      }
+
+      Op reducer(m_reducer);
+      internal::FullReducer<Self, Op, Device>::run(*this, reducer, m_device, data);
+      return need_assign;
+    }
     return true;
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
+    if (m_result) {
+      m_device.deallocate(m_result);
+    }
   }

-  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
-  typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType;

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
+    if (RunningFullReduction && m_result) {
+      return *m_result;
+    }
     Op reducer(m_reducer);
     if (ReducingInnerMostDims) {
       const Index num_values_to_reduce =
@@ -372,6 +733,13 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
   template <int, typename, typename> friend struct internal::GenericDimReducer;
   template <typename, typename, bool> friend struct internal::InnerMostDimReducer;
   template <int, typename, typename, bool> friend struct internal::InnerMostDimPreserver;
+  template <typename S, typename O, typename D, bool V> friend struct internal::FullReducer;
+#ifdef EIGEN_USE_THREADS
+  template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
+#endif
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*);
+#endif

   // Returns the Index in the input tensor of the first value that needs to be
   // used to compute the reduction at output index "index".
@@ -392,7 +760,12 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
         startInput += idx * m_preservedStrides[i];
         index -= idx * m_outputStrides[i];
       }
+      if (PreservingInnerMostDims) {
+        eigen_assert(m_preservedStrides[0] == 1);
+        startInput += index;
+      } else {
       startInput += index * m_preservedStrides[0];
+      }
     } else {
       for (int i = 0; i < NumOutputDims - 1; ++i) {
         // This is index_i in the output tensor.
@@ -400,8 +773,13 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
         startInput += idx * m_preservedStrides[i];
         index -= idx * m_outputStrides[i];
       }
+      if (PreservingInnerMostDims) {
+        eigen_assert(m_preservedStrides[NumOutputDims - 1] == 1);
+        startInput += index;
+      } else {
       startInput += index * m_preservedStrides[NumOutputDims - 1];
       }
+    }
     return startInput;
   }

@@ -425,6 +803,16 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>

   // Operation to apply for computing the reduction.
   Op m_reducer;
+
+  // For full reductions
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+  static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
+#else
+  static const bool RunningOnGPU = false;
+#endif
+  CoeffReturnType* m_result;
+
+  const Device& m_device;
 };

 } // end namespace Eigen
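As a usage sketch (assuming the unsupported Eigen CXX11 Tensor module that this file belongs to): reducing over a subset of dimensions keeps the remaining ones, while reducing over every dimension yields a rank-0 tensor, which is the case the FullReducer path added above targets.

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<float, 3> t(2, 3, 4);
  t.setConstant(1.0f);

  // Reduce over dimensions 1 and 2: the result keeps dimension 0, shape (2).
  Eigen::array<int, 2> dims{{1, 2}};
  Eigen::Tensor<float, 1> partial = t.sum(dims);

  // Reduce over everything: rank-0 result (the full-reduction special case).
  Eigen::Tensor<float, 0> total = t.sum();

  std::cout << partial(0) << " " << total() << "\n";  // 12 24
  return 0;
}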
@@ -80,7 +80,7 @@ class TensorReverseOp : public TensorBase<TensorReverseOp<ReverseDimensions,
     {
       typedef TensorAssignOp<TensorReverseOp, const TensorReverseOp> Assign;
       Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice());
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
       return *this;
     }

@@ -90,7 +90,7 @@ class TensorReverseOp : public TensorBase<TensorReverseOp<ReverseDimensions,
     {
       typedef TensorAssignOp<TensorReverseOp, const OtherDerived> Assign;
       Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice());
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
       return *this;
     }

@@ -78,7 +78,7 @@ class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType>
     {
       typedef TensorAssignOp<TensorShufflingOp, const TensorShufflingOp> Assign;
       Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice());
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
       return *this;
     }

@@ -88,7 +88,7 @@ class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType>
     {
       typedef TensorAssignOp<TensorShufflingOp, const OtherDerived> Assign;
       Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice());
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
       return *this;
     }

@@ -78,7 +78,7 @@ class TensorStridingOp : public TensorBase<TensorStridingOp<Strides, XprType> >
     {
       typedef TensorAssignOp<TensorStridingOp, const TensorStridingOp> Assign;
       Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice());
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
       return *this;
     }

@@ -88,7 +88,7 @@ class TensorStridingOp : public TensorBase<TensorStridingOp<Strides, XprType> >
     {
       typedef TensorAssignOp<TensorStridingOp, const OtherDerived> Assign;
       Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice());
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
      return *this;
     }

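These hunks all drop the explicit third template argument of TensorExecutor (previously forced to false) and rely on the executor's default instead. A minimal sketch of the user-facing behaviour (again assuming the unsupported Eigen CXX11 Tensor module) is that a plain assignment and a device-targeted assignment of, for example, a reversed expression both route through the same executor entry point:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> a(4, 8), b(4, 8);
  a.setRandom();

  Eigen::array<bool, 2> rev{{true, false}};  // reverse the first dimension
  b = a.reverse(rev);                        // implicit DefaultDevice executor

  Eigen::DefaultDevice dev;
  b.device(dev) = a.reverse(rev);            // explicit device, same result
  return 0;
}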
677
unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
Normal file
677
unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
Normal file
@ -0,0 +1,677 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
|
||||||
|
#ifndef EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H
|
||||||
|
#define EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
|
||||||
|
/** \class TensorVolumePatch
|
||||||
|
* \ingroup CXX11_Tensor_Module
|
||||||
|
*
|
||||||
|
* \brief Patch extraction specialized for processing of volumetric data.
|
||||||
|
* This assumes that the input has a least 4 dimensions ordered as follows:
|
||||||
|
* - channels
|
||||||
|
* - planes
|
||||||
|
* - rows
|
||||||
|
* - columns
|
||||||
|
* - (optional) additional dimensions such as time or batch size.
|
||||||
|
* Calling the volume patch code with patch_planes, patch_rows, and patch_cols
|
||||||
|
* is equivalent to calling the regular patch extraction code with parameters
|
||||||
|
* d, patch_planes, patch_rows, patch_cols, and 1 for all the additional
|
||||||
|
* dimensions.
|
||||||
|
*/
|
||||||
|
namespace internal {
|
||||||
|
template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
|
||||||
|
struct traits<TensorVolumePatchOp<Planes, Rows, Cols, XprType> > : public traits<XprType>
|
||||||
|
{
|
||||||
|
typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
|
||||||
|
typedef traits<XprType> XprTraits;
|
||||||
|
typedef typename packet_traits<Scalar>::type Packet;
|
||||||
|
typedef typename XprTraits::StorageKind StorageKind;
|
||||||
|
typedef typename XprTraits::Index Index;
|
||||||
|
typedef typename XprType::Nested Nested;
|
||||||
|
typedef typename remove_reference<Nested>::type _Nested;
|
||||||
|
static const int NumDimensions = XprTraits::NumDimensions + 1;
|
||||||
|
static const int Layout = XprTraits::Layout;
|
||||||
|
};
|
||||||
|
|
||||||
|
template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
|
||||||
|
struct eval<TensorVolumePatchOp<Planes, Rows, Cols, XprType>, Eigen::Dense>
|
||||||
|
{
|
||||||
|
typedef const TensorVolumePatchOp<Planes, Rows, Cols, XprType>& type;
|
||||||
|
};
|
||||||
|
|
||||||
|
template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
|
||||||
|
struct nested<TensorVolumePatchOp<Planes, Rows, Cols, XprType>, 1, typename eval<TensorVolumePatchOp<Planes, Rows, Cols, XprType> >::type>
|
||||||
|
{
|
||||||
|
typedef TensorVolumePatchOp<Planes, Rows, Cols, XprType> type;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // end namespace internal
|
||||||
|
|
||||||
|
template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
|
||||||
|
class TensorVolumePatchOp : public TensorBase<TensorVolumePatchOp<Planes, Rows, Cols, XprType>, ReadOnlyAccessors>
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
typedef typename Eigen::internal::traits<TensorVolumePatchOp>::Scalar Scalar;
|
||||||
|
typedef typename Eigen::internal::traits<TensorVolumePatchOp>::Packet Packet;
|
||||||
|
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
|
||||||
|
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||||
|
typedef typename XprType::PacketReturnType PacketReturnType;
|
||||||
|
typedef typename Eigen::internal::nested<TensorVolumePatchOp>::type Nested;
|
||||||
|
typedef typename Eigen::internal::traits<TensorVolumePatchOp>::StorageKind StorageKind;
|
||||||
|
typedef typename Eigen::internal::traits<TensorVolumePatchOp>::Index Index;
|
||||||
|
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp(const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols,
|
||||||
|
DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides,
|
||||||
|
DenseIndex in_plane_strides, DenseIndex in_row_strides, DenseIndex in_col_strides,
|
||||||
|
DenseIndex plane_inflate_strides, DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
|
||||||
|
PaddingType padding_type, Scalar padding_value)
|
||||||
|
: m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
|
||||||
|
m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
|
||||||
|
m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
|
||||||
|
m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
|
||||||
|
m_padding_explicit(false), m_padding_top_z(0), m_padding_bottom_z(0), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0),
|
||||||
|
m_padding_type(padding_type), m_padding_value(padding_value) {}
|
||||||
|
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp(const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols,
|
||||||
|
DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides,
|
||||||
|
DenseIndex in_plane_strides, DenseIndex in_row_strides, DenseIndex in_col_strides,
|
||||||
|
DenseIndex plane_inflate_strides, DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
|
||||||
|
DenseIndex padding_top_z, DenseIndex padding_bottom_z,
|
||||||
|
DenseIndex padding_top, DenseIndex padding_bottom,
|
||||||
|
DenseIndex padding_left, DenseIndex padding_right,
|
||||||
|
Scalar padding_value)
|
||||||
|
: m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
|
||||||
|
m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
|
||||||
|
m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
|
||||||
|
m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
|
||||||
|
m_padding_explicit(true), m_padding_top_z(padding_top_z), m_padding_bottom_z(padding_bottom_z), m_padding_top(padding_top), m_padding_bottom(padding_bottom),
|
||||||
|
m_padding_left(padding_left), m_padding_right(padding_right),
|
||||||
|
m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}
|
||||||
|
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
|
DenseIndex patch_planes() const { return m_patch_planes; }
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
|
DenseIndex patch_rows() const { return m_patch_rows; }
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
|
DenseIndex patch_cols() const { return m_patch_cols; }
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
|
DenseIndex plane_strides() const { return m_plane_strides; }
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
|
DenseIndex row_strides() const { return m_row_strides; }
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
|
DenseIndex col_strides() const { return m_col_strides; }
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
|
DenseIndex in_plane_strides() const { return m_in_plane_strides; }
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
|
DenseIndex in_row_strides() const { return m_in_row_strides; }
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
|
DenseIndex in_col_strides() const { return m_in_col_strides; }
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
|
DenseIndex plane_inflate_strides() const { return m_plane_inflate_strides; }
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
|
DenseIndex row_inflate_strides() const { return m_row_inflate_strides; }
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
|
DenseIndex col_inflate_strides() const { return m_col_inflate_strides; }
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
|
bool padding_explicit() const { return m_padding_explicit; }
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
|
DenseIndex padding_top_z() const { return m_padding_top_z; }
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
|
DenseIndex padding_bottom_z() const { return m_padding_bottom_z; }
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
|
DenseIndex padding_top() const { return m_padding_top; }
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
|
DenseIndex padding_bottom() const { return m_padding_bottom; }
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
|
DenseIndex padding_left() const { return m_padding_left; }
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
|
DenseIndex padding_right() const { return m_padding_right; }
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
|
PaddingType padding_type() const { return m_padding_type; }
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
|
Scalar padding_value() const { return m_padding_value; }
|
||||||
|
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
|
const typename internal::remove_all<typename XprType::Nested>::type&
|
||||||
|
expression() const { return m_xpr; }
|
||||||
|
|
||||||
|
protected:
|
||||||
|
typename XprType::Nested m_xpr;
|
||||||
|
const DenseIndex m_patch_planes;
|
||||||
|
const DenseIndex m_patch_rows;
|
||||||
|
const DenseIndex m_patch_cols;
|
||||||
|
const DenseIndex m_plane_strides;
|
||||||
|
const DenseIndex m_row_strides;
|
||||||
|
const DenseIndex m_col_strides;
|
||||||
|
const DenseIndex m_in_plane_strides;
|
||||||
|
const DenseIndex m_in_row_strides;
|
||||||
|
const DenseIndex m_in_col_strides;
|
||||||
|
const DenseIndex m_plane_inflate_strides;
|
||||||
|
const DenseIndex m_row_inflate_strides;
|
||||||
|
const DenseIndex m_col_inflate_strides;
|
||||||
|
const bool m_padding_explicit;
|
||||||
|
const DenseIndex m_padding_top_z;
|
||||||
|
const DenseIndex m_padding_bottom_z;
|
||||||
|
const DenseIndex m_padding_top;
|
||||||
|
const DenseIndex m_padding_bottom;
|
||||||
|
const DenseIndex m_padding_left;
|
||||||
|
const DenseIndex m_padding_right;
|
||||||
|
const PaddingType m_padding_type;
|
||||||
|
const Scalar m_padding_value;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
// Eval as rvalue
|
||||||
|
template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename ArgType, typename Device>
|
||||||
|
struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, Device>
|
||||||
|
{
|
||||||
|
typedef TensorVolumePatchOp<Planes, Rows, Cols, ArgType> XprType;
|
||||||
|
typedef typename XprType::Index Index;
|
||||||
|
static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
|
||||||
|
static const int NumDims = NumInputDims + 1;
|
||||||
|
typedef DSizes<Index, NumDims> Dimensions;
|
||||||
|
typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
|
||||||
|
|
||||||
|
enum {
|
||||||
|
IsAligned = false,
|
||||||
|
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||||
|
BlockAccess = false,
|
||||||
|
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||||
|
CoordAccess = NumDims == 6,
|
||||||
|
};
|
||||||
|
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||||
|
: m_impl(op.expression(), device)
|
||||||
|
{
|
||||||
|
EIGEN_STATIC_ASSERT(NumDims >= 5, YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||||
|
|
||||||
|
m_paddingValue = op.padding_value();
|
||||||
|
|
||||||
|
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
|
||||||
|
|
||||||
|
// Cache a few variables.
|
||||||
|
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||||
|
m_inputDepth = input_dims[0];
|
||||||
|
m_inputPlanes = input_dims[1];
|
||||||
|
m_inputRows = input_dims[2];
|
||||||
|
m_inputCols = input_dims[3];
|
||||||
|
} else {
|
||||||
|
m_inputDepth = input_dims[NumInputDims-1];
|
||||||
|
m_inputPlanes = input_dims[NumInputDims-2];
|
||||||
|
m_inputRows = input_dims[NumInputDims-3];
|
||||||
|
m_inputCols = input_dims[NumInputDims-4];
|
||||||
|
}
|
||||||
|
|
||||||
|
m_plane_strides = op.plane_strides();
|
||||||
|
m_row_strides = op.row_strides();
|
||||||
|
m_col_strides = op.col_strides();
|
||||||
|
|
||||||
|
// Input strides and effective input/patch size
|
||||||
|
m_in_plane_strides = op.in_plane_strides();
|
||||||
|
m_in_row_strides = op.in_row_strides();
|
||||||
|
m_in_col_strides = op.in_col_strides();
|
||||||
|
m_plane_inflate_strides = op.plane_inflate_strides();
|
||||||
|
m_row_inflate_strides = op.row_inflate_strides();
|
||||||
|
m_col_inflate_strides = op.col_inflate_strides();
|
||||||
|
|
||||||
|
// The "effective" spatial size after inflating data with zeros.
|
||||||
|
m_input_planes_eff = (m_inputPlanes - 1) * m_plane_inflate_strides + 1;
|
||||||
|
m_input_rows_eff = (m_inputRows - 1) * m_row_inflate_strides + 1;
|
||||||
|
m_input_cols_eff = (m_inputCols - 1) * m_col_inflate_strides + 1;
|
||||||
|
m_patch_planes_eff = op.patch_planes() + (op.patch_planes() - 1) * (m_in_plane_strides - 1);
|
||||||
|
m_patch_rows_eff = op.patch_rows() + (op.patch_rows() - 1) * (m_in_row_strides - 1);
|
||||||
|
m_patch_cols_eff = op.patch_cols() + (op.patch_cols() - 1) * (m_in_col_strides - 1);
|
||||||
|
|
||||||
|
if (op.padding_explicit()) {
|
||||||
|
m_outputPlanes = numext::ceil((m_input_planes_eff + op.padding_top_z() + op.padding_bottom_z() - m_patch_planes_eff + 1.f) / static_cast<float>(m_plane_strides));
|
||||||
|
m_outputRows = numext::ceil((m_input_rows_eff + op.padding_top() + op.padding_bottom() - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides));
|
||||||
|
m_outputCols = numext::ceil((m_input_cols_eff + op.padding_left() + op.padding_right() - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides));
|
||||||
|
m_planePaddingTop = op.padding_top_z();
|
||||||
|
m_rowPaddingTop = op.padding_top();
|
||||||
|
m_colPaddingLeft = op.padding_left();
|
||||||
|
} else {
|
||||||
|
// Computing padding from the type
|
||||||
|
switch (op.padding_type()) {
|
||||||
|
case PADDING_VALID:
|
||||||
|
m_outputPlanes = numext::ceil((m_input_planes_eff - m_patch_planes_eff + 1.f) / static_cast<float>(m_plane_strides));
|
||||||
|
m_outputRows = numext::ceil((m_input_rows_eff - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides));
|
||||||
|
m_outputCols = numext::ceil((m_input_cols_eff - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides));
|
||||||
|
m_planePaddingTop = 0;
|
||||||
|
m_rowPaddingTop = 0;
|
||||||
|
m_colPaddingLeft = 0;
|
||||||
|
break;
|
||||||
|
case PADDING_SAME: {
|
||||||
|
m_outputPlanes = numext::ceil(m_input_planes_eff / static_cast<float>(m_plane_strides));
|
||||||
|
m_outputRows = numext::ceil(m_input_rows_eff / static_cast<float>(m_row_strides));
|
||||||
|
m_outputCols = numext::ceil(m_input_cols_eff / static_cast<float>(m_col_strides));
|
||||||
|
const Index dz = m_outputPlanes * m_plane_strides + m_patch_planes_eff - 1 - m_input_planes_eff;
|
||||||
|
const Index dy = m_outputRows * m_row_strides + m_patch_rows_eff - 1 - m_input_rows_eff;
|
||||||
|
const Index dx = m_outputCols * m_col_strides + m_patch_cols_eff - 1 - m_input_cols_eff;
|
||||||
|
m_planePaddingTop = dz - dz / 2;
|
||||||
|
m_rowPaddingTop = dy - dy / 2;
|
||||||
|
m_colPaddingLeft = dx - dx / 2;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
eigen_assert(false && "unexpected padding");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
eigen_assert(m_outputRows > 0);
|
||||||
|
eigen_assert(m_outputCols > 0);
|
||||||
|
eigen_assert(m_outputPlanes > 0);
|
||||||
|
|
||||||
|
// Dimensions for result of extraction.
|
||||||
|
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||||
|
// ColMajor
|
||||||
|
// 0: depth
|
||||||
|
// 1: patch_planes
|
||||||
|
// 2: patch_rows
|
||||||
|
// 3: patch_cols
|
||||||
|
// 4: number of patches
|
||||||
|
// 5 and beyond: anything else (such as batch).
|
||||||
|
m_dimensions[0] = input_dims[0];
|
||||||
|
m_dimensions[1] = op.patch_planes();
|
||||||
|
m_dimensions[2] = op.patch_rows();
|
||||||
|
m_dimensions[3] = op.patch_cols();
|
||||||
|
m_dimensions[4] = m_outputPlanes * m_outputRows * m_outputCols;
|
||||||
|
for (int i = 5; i < NumDims; ++i) {
|
||||||
|
m_dimensions[i] = input_dims[i-1];
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// RowMajor
|
||||||
|
// NumDims-1: depth
|
||||||
|
// NumDims-2: patch_planes
|
||||||
|
// NumDims-3: patch_rows
|
||||||
|
// NumDims-4: patch_cols
|
||||||
|
// NumDims-5: number of patches
|
||||||
|
// NumDims-6 and beyond: anything else (such as batch).
|
||||||
|
m_dimensions[NumDims-1] = input_dims[NumInputDims-1];
|
||||||
|
m_dimensions[NumDims-2] = op.patch_planes();
|
||||||
|
m_dimensions[NumDims-3] = op.patch_rows();
|
||||||
|
m_dimensions[NumDims-4] = op.patch_cols();
|
||||||
|
m_dimensions[NumDims-5] = m_outputPlanes * m_outputRows * m_outputCols;
|
||||||
|
for (int i = NumDims-6; i >= 0; --i) {
|
||||||
|
m_dimensions[i] = input_dims[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Strides for the output tensor.
|
||||||
|
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||||
|
m_rowStride = m_dimensions[1];
|
||||||
|
m_colStride = m_dimensions[2] * m_rowStride;
|
||||||
|
m_patchStride = m_colStride * m_dimensions[3] * m_dimensions[0];
|
||||||
|
m_otherStride = m_patchStride * m_dimensions[4];
|
||||||
|
} else {
|
||||||
|
m_rowStride = m_dimensions[NumDims-2];
|
||||||
|
m_colStride = m_dimensions[NumDims-3] * m_rowStride;
|
||||||
|
m_patchStride = m_colStride * m_dimensions[NumDims-4] * m_dimensions[NumDims-1];
|
||||||
|
m_otherStride = m_patchStride * m_dimensions[NumDims-5];
|
||||||
|
}
|
||||||
|
|
||||||
|
// Strides for navigating through the input tensor.
|
||||||
|
m_planeInputStride = m_inputDepth;
|
||||||
|
m_rowInputStride = m_inputDepth * m_inputPlanes;
|
||||||
|
m_colInputStride = m_inputDepth * m_inputRows * m_inputPlanes;
|
||||||
|
m_otherInputStride = m_inputDepth * m_inputRows * m_inputCols * m_inputPlanes;
|
||||||
|
|
||||||
|
m_outputPlanesRows = m_outputPlanes * m_outputRows;
|
||||||
|
|
||||||
|
// Fast representations of different variables.
|
||||||
|
m_fastOtherStride = internal::TensorIntDivisor<Index>(m_otherStride);
|
||||||
|
m_fastPatchStride = internal::TensorIntDivisor<Index>(m_patchStride);
|
||||||
|
m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride);
|
||||||
|
m_fastRowStride = internal::TensorIntDivisor<Index>(m_rowStride);
|
||||||
|
m_fastInputRowStride = internal::TensorIntDivisor<Index>(m_row_inflate_strides);
|
||||||
|
m_fastInputColStride = internal::TensorIntDivisor<Index>(m_col_inflate_strides);
|
||||||
|
m_fastInputPlaneStride = internal::TensorIntDivisor<Index>(m_plane_inflate_strides);
|
||||||
|
m_fastInputColsEff = internal::TensorIntDivisor<Index>(m_input_cols_eff);
|
||||||
|
m_fastOutputPlanes = internal::TensorIntDivisor<Index>(m_outputPlanes);
|
||||||
|
m_fastOutputPlanesRows = internal::TensorIntDivisor<Index>(m_outputPlanesRows);
|
||||||
|
|
||||||
|
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||||
|
m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[0]);
|
||||||
|
} else {
|
||||||
|
m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[NumDims-1]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||||
|
typedef typename XprType::PacketReturnType PacketReturnType;
|
||||||
|
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||||
|
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
|
||||||
|
m_impl.evalSubExprsIfNeeded(NULL);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||||
|
m_impl.cleanup();
|
||||||
|
}
|
||||||
|
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
|
||||||
|
{
|
||||||
|
// Patch index corresponding to the passed in index.
|
||||||
|
const Index patchIndex = index / m_fastPatchStride;
|
||||||
|
|
||||||
|
// Spatial offset within the patch. This has to be translated into 3D
|
||||||
|
// coordinates within the patch.
|
||||||
|
const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastOutputDepth;
|
||||||
|
|
||||||
|
// Batch, etc.
|
||||||
|
const Index otherIndex = (NumDims == 5) ? 0 : index / m_fastOtherStride;
|
||||||
|
const Index patch3DIndex = (NumDims == 5) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride;
|
||||||
|
|
||||||
|
// Calculate column index in the input original tensor.
|
||||||
|
const Index colIndex = patch3DIndex / m_fastOutputPlanesRows;
|
||||||
|
const Index colOffset = patchOffset / m_fastColStride;
|
||||||
|
const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft;
|
||||||
|
const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0);
|
||||||
|
if (inputCol < 0 || inputCol >= m_input_cols_eff ||
|
||||||
|
((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) {
|
||||||
|
return Scalar(m_paddingValue);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calculate row index in the original input tensor.
|
||||||
|
const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes;
|
||||||
|
const Index rowOffset = (patchOffset - colOffset * m_colStride) / m_fastRowStride;
|
||||||
|
const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop;
|
||||||
|
const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0);
|
||||||
|
if (inputRow < 0 || inputRow >= m_input_rows_eff ||
|
||||||
|
((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) {
|
||||||
|
return Scalar(m_paddingValue);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calculate plane index in the original input tensor.
|
||||||
|
const Index planeIndex = (patch3DIndex - m_outputPlanes * (colIndex * m_outputRows + rowIndex));
|
||||||
|
const Index planeOffset = patchOffset - colOffset * m_colStride - rowOffset * m_rowStride;
|
||||||
|
const Index inputPlane = planeIndex * m_plane_strides + planeOffset * m_in_plane_strides - m_planePaddingTop;
|
||||||
|
const Index origInputPlane = (m_plane_inflate_strides == 1) ? inputPlane : ((inputPlane >= 0) ? (inputPlane / m_fastInputPlaneStride) : 0);
|
||||||
|
if (inputPlane < 0 || inputPlane >= m_input_planes_eff ||
|
||||||
|
((m_plane_inflate_strides != 1) && (inputPlane != origInputPlane * m_plane_inflate_strides))) {
|
||||||
|
return Scalar(m_paddingValue);
|
||||||
|
}
|
||||||
|
|
||||||
|
const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
|
||||||
|
const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index];
|
||||||
|
|
||||||
|
const Index inputIndex = depth +
|
||||||
|
origInputRow * m_rowInputStride +
|
||||||
|
origInputCol * m_colInputStride +
|
||||||
|
origInputPlane * m_planeInputStride +
|
||||||
|
otherIndex * m_otherInputStride;
|
||||||
|
|
||||||
|
return m_impl.coeff(inputIndex);
|
||||||
|
}

  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
    const Index packetSize = internal::unpacket_traits<PacketReturnType>::size;
    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+packetSize-1 < dimensions().TotalSize());

    if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1 ||
        m_in_plane_strides != 1 || m_plane_inflate_strides != 1) {
      return packetWithPossibleZero(index);
    }

    const Index indices[2] = {index, index + packetSize - 1};
    const Index patchIndex = indices[0] / m_fastPatchStride;
    if (patchIndex != indices[1] / m_fastPatchStride) {
      return packetWithPossibleZero(index);
    }
    const Index otherIndex = (NumDims == 5) ? 0 : indices[0] / m_fastOtherStride;
    eigen_assert(otherIndex == indices[1] / m_fastOtherStride);

    // Find the offset of the element wrt the location of the first element.
    const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth,
                                   (indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth};

    const Index patch3DIndex = (NumDims == 5) ? patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride;
    eigen_assert(patch3DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride);

    const Index colIndex = patch3DIndex / m_fastOutputPlanesRows;
    const Index colOffsets[2] = {
      patchOffsets[0] / m_fastColStride,
      patchOffsets[1] / m_fastColStride};

    // Calculate col indices in the original input tensor.
    const Index inputCols[2] = {
      colIndex * m_col_strides + colOffsets[0] - m_colPaddingLeft,
      colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft};
    if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) {
      return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
    }

    if (inputCols[0] != inputCols[1]) {
      return packetWithPossibleZero(index);
    }

    const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes;
    const Index rowOffsets[2] = {
      (patchOffsets[0] - colOffsets[0] * m_colStride) / m_fastRowStride,
      (patchOffsets[1] - colOffsets[1] * m_colStride) / m_fastRowStride};
    eigen_assert(rowOffsets[0] <= rowOffsets[1]);
    // Calculate row indices in the original input tensor.
    const Index inputRows[2] = {
      rowIndex * m_row_strides + rowOffsets[0] - m_rowPaddingTop,
      rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop};

    if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) {
      return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
    }

    if (inputRows[0] != inputRows[1]) {
      return packetWithPossibleZero(index);
    }

    const Index planeIndex = (patch3DIndex - m_outputPlanes * (colIndex * m_outputRows + rowIndex));
    const Index planeOffsets[2] = {
      patchOffsets[0] - colOffsets[0] * m_colStride - rowOffsets[0] * m_rowStride,
      patchOffsets[1] - colOffsets[1] * m_colStride - rowOffsets[1] * m_rowStride};
    eigen_assert(planeOffsets[0] <= planeOffsets[1]);
    const Index inputPlanes[2] = {
      planeIndex * m_plane_strides + planeOffsets[0] - m_planePaddingTop,
      planeIndex * m_plane_strides + planeOffsets[1] - m_planePaddingTop};

    if (inputPlanes[1] < 0 || inputPlanes[0] >= m_inputPlanes) {
      return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
    }

    if (inputPlanes[0] >= 0 && inputPlanes[1] < m_inputPlanes) {
      // no padding
      const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
      const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index];
      const Index inputIndex = depth +
          inputRows[0] * m_rowInputStride +
          inputCols[0] * m_colInputStride +
          m_planeInputStride * inputPlanes[0] +
          otherIndex * m_otherInputStride;
      return m_impl.template packet<Unaligned>(inputIndex);
    }

    return packetWithPossibleZero(index);
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }

  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }

  Index planePaddingTop() const { return m_planePaddingTop; }
  Index rowPaddingTop() const { return m_rowPaddingTop; }
  Index colPaddingLeft() const { return m_colPaddingLeft; }
  Index outputPlanes() const { return m_outputPlanes; }
  Index outputRows() const { return m_outputRows; }
  Index outputCols() const { return m_outputCols; }
  Index userPlaneStride() const { return m_plane_strides; }
  Index userRowStride() const { return m_row_strides; }
  Index userColStride() const { return m_col_strides; }
  Index userInPlaneStride() const { return m_in_plane_strides; }
  Index userInRowStride() const { return m_in_row_strides; }
  Index userInColStride() const { return m_in_col_strides; }
  Index planeInflateStride() const { return m_plane_inflate_strides; }
  Index rowInflateStride() const { return m_row_inflate_strides; }
  Index colInflateStride() const { return m_col_inflate_strides; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const
  {
    // ColMajor
    // 0: depth, 1: patch_planes, 2: patch_rows, 3: patch_cols, 4: number of patches, 5: batches
    // RowMajor
    // 0: batches, 1: number of patches, 2: patch_cols, 3: patch_rows, 4: patch_planes, 5: depth
    const Index patch3DIndex = coords[static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 4 : 1];
    const Index colOffset = coords[static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 3 : 2];
    const Index rowOffset = coords[static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 3];
    const Index planeOffset = coords[static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 4];

    array<Index, NumDims-1> inputCoords;

    const Index colIndex = patch3DIndex / m_fastOutputPlanesRows;
    const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft;
    const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0);
    if (inputCol < 0 || inputCol >= m_input_cols_eff ||
        ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) {
      return Scalar(m_paddingValue);
    }

    const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes;
    const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop;
    const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0);
    if (inputRow < 0 || inputRow >= m_input_rows_eff ||
        ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) {
      return Scalar(m_paddingValue);
    }

    const Index planeIndex = patch3DIndex - colIndex * m_outputPlanesRows - rowIndex * m_outputRows;
    const Index inputPlane = planeIndex * m_plane_strides + planeOffset * m_in_plane_strides - m_planePaddingTop;
    const Index origInputPlane = (m_plane_inflate_strides == 1) ? inputPlane : ((inputPlane >= 0) ? (inputPlane / m_fastInputPlaneStride) : 0);
    if (inputPlane < 0 || inputPlane >= m_input_planes_eff ||
        ((m_plane_inflate_strides != 1) && (inputPlane != origInputPlane * m_plane_inflate_strides))) {
      return Scalar(m_paddingValue);
    }

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      inputCoords[0] = coords[0];  // depth
      inputCoords[1] = origInputPlane;
      inputCoords[2] = origInputRow;
      inputCoords[3] = origInputCol;
      inputCoords[4] = coords[5];  // batch
    } else {
      inputCoords[4] = coords[5];  // depth
      inputCoords[3] = origInputPlane;
      inputCoords[2] = origInputRow;
      inputCoords[1] = origInputCol;
      inputCoords[0] = coords[0];  // batch
    }
    if (TensorEvaluator<ArgType, Device>::CoordAccess) {
      return m_impl.coeff(inputCoords);
    } else {
      Index inputIndex;
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        inputIndex =
          inputCoords[4] * m_otherInputStride +
          inputCoords[3] * m_colInputStride +
          inputCoords[2] * m_rowInputStride +
          inputCoords[1] * m_planeInputStride +
          inputCoords[0];
      } else {
        inputIndex =
          inputCoords[0] * m_otherInputStride +
          inputCoords[1] * m_colInputStride +
          inputCoords[2] * m_rowInputStride +
          inputCoords[3] * m_planeInputStride +
          inputCoords[4];
      }
      return m_impl.coeff(inputIndex);
    }
  }

 protected:
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
  {
    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
    EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
    for (int i = 0; i < packetSize; ++i) {
      values[i] = coeff(index+i);
    }
    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
    return rslt;
  }

  Dimensions m_dimensions;

  // Parameters passed to the constructor.
  Index m_plane_strides;
  Index m_row_strides;
  Index m_col_strides;

  Index m_outputPlanes;
  Index m_outputRows;
  Index m_outputCols;

  Index m_planePaddingTop;
  Index m_rowPaddingTop;
  Index m_colPaddingLeft;

  Index m_in_plane_strides;
  Index m_in_row_strides;
  Index m_in_col_strides;

  Index m_plane_inflate_strides;
  Index m_row_inflate_strides;
  Index m_col_inflate_strides;

  // Cached input size.
  Index m_inputDepth;
  Index m_inputPlanes;
  Index m_inputRows;
  Index m_inputCols;

  // Other cached variables.
  Index m_outputPlanesRows;

  // Effective input/patch post-inflation size.
  Index m_input_planes_eff;
  Index m_input_rows_eff;
  Index m_input_cols_eff;
  Index m_patch_planes_eff;
  Index m_patch_rows_eff;
  Index m_patch_cols_eff;

  // Strides for the output tensor.
  Index m_otherStride;
  Index m_patchStride;
  Index m_rowStride;
  Index m_colStride;

  // Strides for the input tensor.
  Index m_planeInputStride;
  Index m_rowInputStride;
  Index m_colInputStride;
  Index m_otherInputStride;
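
  // Precomputed fast divisors (an explanatory note added for readability, not part
  // of the original patch): TensorIntDivisor replaces the integer divisions in the
  // index arithmetic above with cheaper multiply/shift sequences, which matters on
  // devices where integer division is slow.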
  internal::TensorIntDivisor<Index> m_fastOtherStride;
  internal::TensorIntDivisor<Index> m_fastPatchStride;
  internal::TensorIntDivisor<Index> m_fastColStride;
  internal::TensorIntDivisor<Index> m_fastRowStride;
  internal::TensorIntDivisor<Index> m_fastInputPlaneStride;
  internal::TensorIntDivisor<Index> m_fastInputRowStride;
  internal::TensorIntDivisor<Index> m_fastInputColStride;
  internal::TensorIntDivisor<Index> m_fastInputColsEff;
  internal::TensorIntDivisor<Index> m_fastOutputPlanesRows;
  internal::TensorIntDivisor<Index> m_fastOutputPlanes;
  internal::TensorIntDivisor<Index> m_fastOutputDepth;

  Scalar m_paddingValue;

  TensorEvaluator<ArgType, Device> m_impl;
};

} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H
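
A minimal usage sketch of the evaluator above (editorial addition, not part of the patch; the tensor sizes and variable names are illustrative). With the default ColMajor layout, extract_volume_patches(pz, py, px) turns a rank-5 tensor laid out as (depth, planes, rows, cols, batch) into a rank-6 tensor (depth, pz, py, px, number_of_patches, batch), filling out-of-range voxels with the padding value:

  Eigen::Tensor<float, 5> input(16, 8, 8, 8, 4);
  input.setRandom();
  Eigen::Tensor<float, 6> patches = input.extract_volume_patches(2, 2, 2);
  // patches.dimension(4) is the number of 2x2x2 patches extracted per batch entry.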

@@ -125,6 +125,7 @@ if(EIGEN_TEST_CXX11)
  ei_add_test(cxx11_tensor_padding "-std=c++0x")
  ei_add_test(cxx11_tensor_patch "-std=c++0x")
  ei_add_test(cxx11_tensor_image_patch "-std=c++0x")
  ei_add_test(cxx11_tensor_volume_patch "-std=c++0x")
  ei_add_test(cxx11_tensor_reduction "-std=c++0x")
  ei_add_test(cxx11_tensor_shuffling "-std=c++0x")
  ei_add_test(cxx11_tensor_striding "-std=c++0x")
@@ -136,10 +137,12 @@ if(EIGEN_TEST_CXX11)
  ei_add_test(cxx11_tensor_layout_swap "-std=c++0x")
  ei_add_test(cxx11_tensor_io "-std=c++0x")
  ei_add_test(cxx11_tensor_generator "-std=c++0x")
  ei_add_test(cxx11_tensor_custom_op "-std=c++0x")

  # These tests need nvcc
  # ei_add_test(cxx11_tensor_device "-std=c++0x")
  # ei_add_test(cxx11_tensor_cuda "-std=c++0x")
  # ei_add_test(cxx11_tensor_contract_cuda "-std=c++0x")
  # ei_add_test(cxx11_tensor_reduction_cuda "-std=c++0x")

endif()
@@ -354,7 +354,3 @@ void test_cxx11_meta()
  CALL_SUBTEST(test_array_zip_and_apply());
  CALL_SUBTEST(test_array_misc());
}

/*
 * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
 */
107  unsupported/test/cxx11_tensor_custom_op.cpp  Normal file
@@ -0,0 +1,107 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#include "main.h"

#include <Eigen/CXX11/Tensor>

using Eigen::Tensor;


struct InsertZeros {
  DSizes<DenseIndex, 2> dimensions(const Tensor<float, 2>& input) const {
    DSizes<DenseIndex, 2> result;
    result[0] = input.dimension(0) * 2;
    result[1] = input.dimension(1) * 2;
    return result;
  }

  template <typename Output, typename Device>
  void eval(const Tensor<float, 2>& input, Output& output, const Device& device) const
  {
    array<DenseIndex, 2> strides{{2, 2}};
    output.stride(strides).device(device) = input;

    Eigen::DSizes<DenseIndex, 2> offsets(1,1);
    Eigen::DSizes<DenseIndex, 2> extents(output.dimension(0)-1, output.dimension(1)-1);
    output.slice(offsets, extents).stride(strides).device(device) = input.constant(0.0f);
  }
};

static void test_custom_unary_op()
{
  Tensor<float, 2> tensor(3,5);
  tensor.setRandom();

  Tensor<float, 2> result = tensor.customOp(InsertZeros());
  VERIFY_IS_EQUAL(result.dimension(0), 6);
  VERIFY_IS_EQUAL(result.dimension(1), 10);

  for (int i = 0; i < 6; i+=2) {
    for (int j = 0; j < 10; j+=2) {
      VERIFY_IS_EQUAL(result(i, j), tensor(i/2, j/2));
    }
  }
  for (int i = 1; i < 6; i+=2) {
    for (int j = 1; j < 10; j+=2) {
      VERIFY_IS_EQUAL(result(i, j), 0);
    }
  }
}


struct BatchMatMul {
  DSizes<DenseIndex, 3> dimensions(const Tensor<float, 3>& input1, const Tensor<float, 3>& input2) const {
    DSizes<DenseIndex, 3> result;
    result[0] = input1.dimension(0);
    result[1] = input2.dimension(1);
    result[2] = input2.dimension(2);
    return result;
  }

  template <typename Output, typename Device>
  void eval(const Tensor<float, 3>& input1, const Tensor<float, 3>& input2,
            Output& output, const Device& device) const
  {
    typedef Tensor<float, 3>::DimensionPair DimPair;
    array<DimPair, 1> dims({{DimPair(1, 0)}});
    for (int i = 0; i < output.dimension(2); ++i) {
      output.template chip<2>(i).device(device) = input1.chip<2>(i).contract(input2.chip<2>(i), dims);
    }
  }
};


static void test_custom_binary_op()
{
  Tensor<float, 3> tensor1(2,3,5);
  tensor1.setRandom();
  Tensor<float, 3> tensor2(3,7,5);
  tensor2.setRandom();

  Tensor<float, 3> result = tensor1.customOp(tensor2, BatchMatMul());
  for (int i = 0; i < 5; ++i) {
    typedef Tensor<float, 3>::DimensionPair DimPair;
    array<DimPair, 1> dims({{DimPair(1, 0)}});
    Tensor<float, 2> reference = tensor1.chip<2>(i).contract(tensor2.chip<2>(i), dims);
    TensorRef<Tensor<float, 2>> val = result.chip<2>(i);
    for (int j = 0; j < 2; ++j) {
      for (int k = 0; k < 7; ++k) {
        VERIFY_IS_APPROX(val(j, k), reference(j, k));
      }
    }
  }
}


void test_cxx11_tensor_custom_op()
{
  CALL_SUBTEST(test_custom_unary_op());
  CALL_SUBTEST(test_custom_binary_op());
}
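
As the two functors above illustrate, customOp accepts any functor that exposes a dimensions() method reporting the output shape and a templated eval() method that writes the output on the given device. A minimal sketch of that contract (editorial addition, not part of the patch; the functor name is illustrative):

  // Copies its input unchanged; the smallest functor satisfying the customOp contract.
  struct IdentityOp {
    DSizes<DenseIndex, 2> dimensions(const Tensor<float, 2>& input) const {
      DSizes<DenseIndex, 2> result;
      result[0] = input.dimension(0);
      result[1] = input.dimension(1);
      return result;
    }
    template <typename Output, typename Device>
    void eval(const Tensor<float, 2>& input, Output& output, const Device& device) const {
      output.device(device) = input;
    }
  };
  // Usage: Tensor<float, 2> copy = some_tensor.customOp(IdentityOp());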
55  unsupported/test/cxx11_tensor_reduction_cuda.cpp  Normal file
@@ -0,0 +1,55 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda
#define EIGEN_USE_GPU

#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>


template<int DataLayout>
static void test_full_reductions() {

  Eigen::GpuDevice gpu_device;

  const int num_rows = internal::random<int>(1024, 5*1024);
  const int num_cols = internal::random<int>(1024, 5*1024);

  Tensor<float, 2, DataLayout> in(num_rows, num_cols);
  in.setRandom();

  Tensor<float, 1, DataLayout> full_redux(1);
  full_redux = in.sum();

  std::size_t in_bytes = in.size() * sizeof(float);
  std::size_t out_bytes = full_redux.size() * sizeof(float);
  float* gpu_in_ptr = static_cast<float*>(gpu_device.allocate(in_bytes));
  float* gpu_out_ptr = static_cast<float*>(gpu_device.allocate(out_bytes));
  gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);

  TensorMap<Tensor<float, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols);
  TensorMap<Tensor<float, 1, DataLayout> > out_gpu(gpu_out_ptr, 1);

  out_gpu.device(gpu_device) = in_gpu.sum();

  Tensor<float, 1, DataLayout> full_redux_gpu(1);
  gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
  gpu_device.synchronize();

  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux(0), full_redux_gpu(0));
}

void test_cxx11_tensor_reduction_cuda() {
  CALL_SUBTEST(test_full_reductions<ColMajor>());
  CALL_SUBTEST(test_full_reductions<RowMajor>());
}
@@ -228,6 +228,29 @@ static void test_multithread_contraction_agrees_with_singlethread() {
}


template<int DataLayout>
static void test_multithreaded_reductions() {
  const int num_threads = internal::random<int>(3, 11);
  ThreadPool thread_pool(num_threads);
  Eigen::ThreadPoolDevice thread_pool_device(&thread_pool, num_threads);

  const int num_rows = internal::random<int>(13, 732);
  const int num_cols = internal::random<int>(13, 732);
  Tensor<float, 2, DataLayout> t1(num_rows, num_cols);
  t1.setRandom();

  Tensor<float, 1, DataLayout> full_redux(1);
  full_redux = t1.sum();

  Tensor<float, 1, DataLayout> full_redux_tp(1);
  full_redux_tp.device(thread_pool_device) = t1.sum();

  // Check that the single threaded and the multi threaded reductions return
  // the same result.
  VERIFY_IS_APPROX(full_redux(0), full_redux_tp(0));
}


static void test_memcpy() {

  for (int i = 0; i < 5; ++i) {
@@ -271,6 +294,9 @@ void test_cxx11_tensor_thread_pool()
  CALL_SUBTEST(test_contraction_corner_cases<ColMajor>());
  CALL_SUBTEST(test_contraction_corner_cases<RowMajor>());

  CALL_SUBTEST(test_multithreaded_reductions<ColMajor>());
  CALL_SUBTEST(test_multithreaded_reductions<RowMajor>());

  CALL_SUBTEST(test_memcpy());

  CALL_SUBTEST(test_multithread_random());
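
A brief editorial note (not part of the patch): the same ThreadPoolDevice also evaluates partial reductions. A minimal sketch, reusing the names from test_multithreaded_reductions above and assuming the usual reduction-axis API:

  array<ptrdiff_t, 1> reduce_rows;
  reduce_rows[0] = 0;  // sum over the row dimension
  Tensor<float, 1, DataLayout> col_sums(num_cols);
  col_sums.device(thread_pool_device) = t1.sum(reduce_rows);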
112  unsupported/test/cxx11_tensor_volume_patch.cpp  Normal file
@@ -0,0 +1,112 @@
#include "main.h"

#include <Eigen/CXX11/Tensor>

using Eigen::Tensor;

static void test_single_voxel_patch()
{
  Tensor<float, 5> tensor(4,2,3,5,7);
  tensor.setRandom();
  Tensor<float, 5, RowMajor> tensor_row_major = tensor.swap_layout();

  Tensor<float, 6> single_voxel_patch;
  single_voxel_patch = tensor.extract_volume_patches(1, 1, 1);
  VERIFY_IS_EQUAL(single_voxel_patch.dimension(0), 4);
  VERIFY_IS_EQUAL(single_voxel_patch.dimension(1), 1);
  VERIFY_IS_EQUAL(single_voxel_patch.dimension(2), 1);
  VERIFY_IS_EQUAL(single_voxel_patch.dimension(3), 1);
  VERIFY_IS_EQUAL(single_voxel_patch.dimension(4), 2 * 3 * 5);
  VERIFY_IS_EQUAL(single_voxel_patch.dimension(5), 7);

  Tensor<float, 6, RowMajor> single_voxel_patch_row_major;
  single_voxel_patch_row_major = tensor_row_major.extract_volume_patches(1, 1, 1);
  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(0), 7);
  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(1), 2 * 3 * 5);
  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(2), 1);
  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(3), 1);
  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(4), 1);
  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(5), 4);

  for (int i = 0; i < tensor.size(); ++i) {
    VERIFY_IS_EQUAL(tensor.data()[i], single_voxel_patch.data()[i]);
    VERIFY_IS_EQUAL(tensor_row_major.data()[i], single_voxel_patch_row_major.data()[i]);
    VERIFY_IS_EQUAL(tensor.data()[i], tensor_row_major.data()[i]);
  }
}


static void test_entire_volume_patch()
{
  const int depth = 4;
  const int patch_z = 2;
  const int patch_y = 3;
  const int patch_x = 5;
  const int batch = 7;

  Tensor<float, 5> tensor(depth, patch_z, patch_y, patch_x, batch);
  tensor.setRandom();
  Tensor<float, 5, RowMajor> tensor_row_major = tensor.swap_layout();

  Tensor<float, 6> entire_volume_patch;
  entire_volume_patch = tensor.extract_volume_patches(patch_z, patch_y, patch_x);
  VERIFY_IS_EQUAL(entire_volume_patch.dimension(0), depth);
  VERIFY_IS_EQUAL(entire_volume_patch.dimension(1), patch_z);
  VERIFY_IS_EQUAL(entire_volume_patch.dimension(2), patch_y);
  VERIFY_IS_EQUAL(entire_volume_patch.dimension(3), patch_x);
  VERIFY_IS_EQUAL(entire_volume_patch.dimension(4), patch_z * patch_y * patch_x);
  VERIFY_IS_EQUAL(entire_volume_patch.dimension(5), batch);

  Tensor<float, 6, RowMajor> entire_volume_patch_row_major;
  entire_volume_patch_row_major = tensor_row_major.extract_volume_patches(patch_z, patch_y, patch_x);
  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(0), batch);
  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(1), patch_z * patch_y * patch_x);
  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(2), patch_x);
  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(3), patch_y);
  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(4), patch_z);
  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(5), depth);

  const int dz = patch_z - 1;
  const int dy = patch_y - 1;
  const int dx = patch_x - 1;

  const int forward_pad_z = dz - dz / 2;
  const int forward_pad_y = dy - dy / 2;
  const int forward_pad_x = dx - dx / 2;

  for (int pz = 0; pz < patch_z; pz++) {
    for (int py = 0; py < patch_y; py++) {
      for (int px = 0; px < patch_x; px++) {
        const int patchId = pz + patch_z * (py + px * patch_y);
        for (int z = 0; z < patch_z; z++) {
          for (int y = 0; y < patch_y; y++) {
            for (int x = 0; x < patch_x; x++) {
              for (int b = 0; b < batch; b++) {
                for (int d = 0; d < depth; d++) {
                  float expected = 0.0f;
                  float expected_row_major = 0.0f;
                  const int eff_z = z - forward_pad_z + pz;
                  const int eff_y = y - forward_pad_y + py;
                  const int eff_x = x - forward_pad_x + px;
                  if (eff_z >= 0 && eff_y >= 0 && eff_x >= 0 &&
                      eff_z < patch_z && eff_y < patch_y && eff_x < patch_x) {
                    expected = tensor(d, eff_z, eff_y, eff_x, b);
                    expected_row_major = tensor_row_major(b, eff_x, eff_y, eff_z, d);
                  }
                  VERIFY_IS_EQUAL(entire_volume_patch(d, z, y, x, patchId, b), expected);
                  VERIFY_IS_EQUAL(entire_volume_patch_row_major(b, patchId, x, y, z, d), expected_row_major);
                }
              }
            }
          }
        }
      }
    }
  }
}

void test_cxx11_tensor_volume_patch()
{
  CALL_SUBTEST(test_single_voxel_patch());
  CALL_SUBTEST(test_entire_volume_patch());
}