diff --git a/Eigen/src/Core/ArrayBase.h b/Eigen/src/Core/ArrayBase.h index 82c12076e..151c05526 100644 --- a/Eigen/src/Core/ArrayBase.h +++ b/Eigen/src/Core/ArrayBase.h @@ -83,22 +83,10 @@ template class ArrayBase #endif // not EIGEN_PARSED_BY_DOXYGEN #ifndef EIGEN_PARSED_BY_DOXYGEN - /** \internal the plain matrix type corresponding to this expression. Note that is not necessarily - * exactly the return type of eval(): in the case of plain matrices, the return type of eval() is a const - * reference to a matrix, not a matrix! It is however guaranteed that the return type of eval() is either - * PlainObject or const PlainObject&. - */ - typedef Array::Scalar, - internal::traits::RowsAtCompileTime, - internal::traits::ColsAtCompileTime, - AutoAlign | (internal::traits::Flags&RowMajorBit ? RowMajor : ColMajor), - internal::traits::MaxRowsAtCompileTime, - internal::traits::MaxColsAtCompileTime - > PlainObject; - + typedef typename Base::PlainObject PlainObject; /** \internal Represents a matrix with all coefficients equal to one another*/ - typedef CwiseNullaryOp,Derived> ConstantReturnType; + typedef CwiseNullaryOp,PlainObject> ConstantReturnType; #endif // not EIGEN_PARSED_BY_DOXYGEN #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::ArrayBase diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index 4622e2759..97938fdba 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -756,6 +756,26 @@ EIGEN_DEVICE_FUNC void call_assignment_no_alias(Dst& dst, const Src& src) call_assignment_no_alias(dst, src, internal::assign_op()); } +template +EIGEN_DEVICE_FUNC void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src, const Func& func) +{ + Index dstRows = src.rows(); + Index dstCols = src.cols(); + if((dst.rows()!=dstRows) || (dst.cols()!=dstCols)) + dst.resize(dstRows, dstCols); + + // TODO check whether this is the right place to perform these checks: + EIGEN_STATIC_ASSERT_LVALUE(Dst) + EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Dst,Src) + + Assignment::run(dst, src, func); +} +template +EIGEN_DEVICE_FUNC void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src) +{ + call_assignment_no_alias_no_transpose(dst, src, internal::assign_op()); +} + // forward declaration template void check_for_aliasing(const Dst &dst, const Src &src); @@ -783,7 +803,6 @@ struct Assignment EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &/*func*/) { eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); - src.evalTo(dst); } }; diff --git a/Eigen/src/Core/Assign_MKL.h b/Eigen/src/Core/Assign_MKL.h old mode 100644 new mode 100755 index d4ef1d3a4..897187a30 --- a/Eigen/src/Core/Assign_MKL.h +++ b/Eigen/src/Core/Assign_MKL.h @@ -1,6 +1,7 @@ /* Copyright (c) 2011, Intel Corporation. All rights reserved. - + Copyright (C) 2015 Gael Guennebaud + Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -37,17 +38,13 @@ namespace Eigen { namespace internal { -template struct vml_call -{ enum { IsSupported = 0 }; }; - -template +template class vml_assign_traits { private: enum { DstHasDirectAccess = Dst::Flags & DirectAccessBit, SrcHasDirectAccess = Src::Flags & DirectAccessBit, - StorageOrdersAgree = (int(Dst::IsRowMajor) == int(Src::IsRowMajor)), InnerSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::SizeAtCompileTime) : int(Dst::Flags)&RowMajorBit ? 
int(Dst::ColsAtCompileTime) @@ -57,173 +54,118 @@ class vml_assign_traits : int(Dst::MaxRowsAtCompileTime), MaxSizeAtCompileTime = Dst::SizeAtCompileTime, - MightEnableVml = vml_call::IsSupported && StorageOrdersAgree && DstHasDirectAccess && SrcHasDirectAccess - && Src::InnerStrideAtCompileTime==1 && Dst::InnerStrideAtCompileTime==1, + MightEnableVml = StorageOrdersAgree && DstHasDirectAccess && SrcHasDirectAccess && Src::InnerStrideAtCompileTime==1 && Dst::InnerStrideAtCompileTime==1, MightLinearize = MightEnableVml && (int(Dst::Flags) & int(Src::Flags) & LinearAccessBit), VmlSize = MightLinearize ? MaxSizeAtCompileTime : InnerMaxSize, - LargeEnough = VmlSize==Dynamic || VmlSize>=EIGEN_MKL_VML_THRESHOLD, - MayEnableVml = MightEnableVml && LargeEnough, - MayLinearize = MayEnableVml && MightLinearize + LargeEnough = VmlSize==Dynamic || VmlSize>=EIGEN_MKL_VML_THRESHOLD }; public: enum { - Traversal = MayLinearize ? LinearVectorizedTraversal - : MayEnableVml ? InnerVectorizedTraversal - : DefaultTraversal + EnableVml = MightEnableVml && LargeEnough, + Traversal = MightLinearize ? LinearTraversal : DefaultTraversal }; }; -template::Traversal > -struct vml_assign_impl - : assign_impl,Traversal,Unrolling,BuiltIn> -{ -}; - -template -struct vml_assign_impl -{ - typedef typename Derived1::Scalar Scalar; - static inline void run(Derived1& dst, const CwiseUnaryOp& src) - { - // in case we want to (or have to) skip VML at runtime we can call: - // assign_impl,Traversal,Unrolling,BuiltIn>::run(dst,src); - const Index innerSize = dst.innerSize(); - const Index outerSize = dst.outerSize(); - for(Index outer = 0; outer < outerSize; ++outer) { - const Scalar *src_ptr = src.IsRowMajor ? &(src.nestedExpression().coeffRef(outer,0)) : - &(src.nestedExpression().coeffRef(0, outer)); - Scalar *dst_ptr = dst.IsRowMajor ? 
&(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer)); - vml_call::run(src.functor(), innerSize, src_ptr, dst_ptr ); - } - } -}; - -template -struct vml_assign_impl -{ - static inline void run(Derived1& dst, const CwiseUnaryOp& src) - { - // in case we want to (or have to) skip VML at runtime we can call: - // assign_impl,Traversal,Unrolling,BuiltIn>::run(dst,src); - vml_call::run(src.functor(), dst.size(), src.nestedExpression().data(), dst.data() ); - } -}; - -// Macroses - -#define EIGEN_MKL_VML_SPECIALIZE_ASSIGN(TRAVERSAL,UNROLLING) \ - template \ - struct assign_impl, TRAVERSAL, UNROLLING, Specialized> { \ - static inline void run(Derived1 &dst, const Eigen::CwiseUnaryOp &src) { \ - vml_assign_impl::run(dst, src); \ - } \ - }; - -EIGEN_MKL_VML_SPECIALIZE_ASSIGN(DefaultTraversal,NoUnrolling) -EIGEN_MKL_VML_SPECIALIZE_ASSIGN(DefaultTraversal,CompleteUnrolling) -EIGEN_MKL_VML_SPECIALIZE_ASSIGN(DefaultTraversal,InnerUnrolling) -EIGEN_MKL_VML_SPECIALIZE_ASSIGN(LinearTraversal,NoUnrolling) -EIGEN_MKL_VML_SPECIALIZE_ASSIGN(LinearTraversal,CompleteUnrolling) -EIGEN_MKL_VML_SPECIALIZE_ASSIGN(InnerVectorizedTraversal,NoUnrolling) -EIGEN_MKL_VML_SPECIALIZE_ASSIGN(InnerVectorizedTraversal,CompleteUnrolling) -EIGEN_MKL_VML_SPECIALIZE_ASSIGN(InnerVectorizedTraversal,InnerUnrolling) -EIGEN_MKL_VML_SPECIALIZE_ASSIGN(LinearVectorizedTraversal,CompleteUnrolling) -EIGEN_MKL_VML_SPECIALIZE_ASSIGN(LinearVectorizedTraversal,NoUnrolling) -EIGEN_MKL_VML_SPECIALIZE_ASSIGN(SliceVectorizedTraversal,NoUnrolling) - - +#define EIGEN_PP_EXPAND(ARG) ARG #if !defined (EIGEN_FAST_MATH) || (EIGEN_FAST_MATH != 1) -#define EIGEN_MKL_VML_MODE VML_HA +#define EIGEN_VMLMODE_EXPAND_LA , VML_HA #else -#define EIGEN_MKL_VML_MODE VML_LA +#define EIGEN_VMLMODE_EXPAND_LA , VML_LA #endif -#define EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE) \ - template<> struct vml_call< scalar_##EIGENOP##_op > { \ - enum { IsSupported = 1 }; \ - static inline void run( const scalar_##EIGENOP##_op& /*func*/, \ - int size, const EIGENTYPE* src, EIGENTYPE* dst) { \ - VMLOP(size, (const VMLTYPE*)src, (VMLTYPE*)dst); \ - } \ +#define EIGEN_VMLMODE_EXPAND__ + +#define EIGEN_VMLMODE_PREFIX_LA vm +#define EIGEN_VMLMODE_PREFIX__ v +#define EIGEN_VMLMODE_PREFIX(VMLMODE) EIGEN_CAT(EIGEN_VMLMODE_PREFIX_,VMLMODE) + +#define EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE, VMLMODE) \ + template< typename DstXprType, typename SrcXprNested> \ + struct Assignment, SrcXprNested>, assign_op, \ + Dense2Dense, typename enable_if::EnableVml,EIGENTYPE>::type> { \ + typedef CwiseUnaryOp, SrcXprNested> SrcXprType; \ + static void run(DstXprType &dst, const SrcXprType &src, const assign_op &/*func*/) { \ + eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \ + if(vml_assign_traits::Traversal==LinearTraversal) { \ + VMLOP(dst.size(), (const VMLTYPE*)src.nestedExpression().data(), \ + (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE) ); \ + } else { \ + const Index outerSize = dst.outerSize(); \ + for(Index outer = 0; outer < outerSize; ++outer) { \ + const EIGENTYPE *src_ptr = src.IsRowMajor ? &(src.nestedExpression().coeffRef(outer,0)) : \ + &(src.nestedExpression().coeffRef(0, outer)); \ + EIGENTYPE *dst_ptr = dst.IsRowMajor ? 
&(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer)); \ + VMLOP( dst.innerSize(), (const VMLTYPE*)src_ptr, \ + (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE)); \ + } \ + } \ + } \ + }; \ + + +#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(EIGENOP, VMLOP, VMLMODE) \ + EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, EIGEN_CAT(EIGEN_VMLMODE_PREFIX(VMLMODE),s##VMLOP), float, float, VMLMODE) \ + EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, EIGEN_CAT(EIGEN_VMLMODE_PREFIX(VMLMODE),d##VMLOP), double, double, VMLMODE) + +#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_CPLX(EIGENOP, VMLOP, VMLMODE) \ + EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, EIGEN_CAT(EIGEN_VMLMODE_PREFIX(VMLMODE),c##VMLOP), scomplex, MKL_Complex8, VMLMODE) \ + EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, EIGEN_CAT(EIGEN_VMLMODE_PREFIX(VMLMODE),z##VMLOP), dcomplex, MKL_Complex16, VMLMODE) + +#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS(EIGENOP, VMLOP, VMLMODE) \ + EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(EIGENOP, VMLOP, VMLMODE) \ + EIGEN_MKL_VML_DECLARE_UNARY_CALLS_CPLX(EIGENOP, VMLOP, VMLMODE) + + +EIGEN_MKL_VML_DECLARE_UNARY_CALLS(sin, Sin, LA) +EIGEN_MKL_VML_DECLARE_UNARY_CALLS(asin, Asin, LA) +EIGEN_MKL_VML_DECLARE_UNARY_CALLS(sinh, Sinh, LA) +EIGEN_MKL_VML_DECLARE_UNARY_CALLS(cos, Cos, LA) +EIGEN_MKL_VML_DECLARE_UNARY_CALLS(acos, Acos, LA) +EIGEN_MKL_VML_DECLARE_UNARY_CALLS(cosh, Cosh, LA) +EIGEN_MKL_VML_DECLARE_UNARY_CALLS(tan, Tan, LA) +EIGEN_MKL_VML_DECLARE_UNARY_CALLS(atan, Atan, LA) +EIGEN_MKL_VML_DECLARE_UNARY_CALLS(tanh, Tanh, LA) +// EIGEN_MKL_VML_DECLARE_UNARY_CALLS(abs, Abs, _) +EIGEN_MKL_VML_DECLARE_UNARY_CALLS(exp, Exp, LA) +EIGEN_MKL_VML_DECLARE_UNARY_CALLS(log, Ln, LA) +EIGEN_MKL_VML_DECLARE_UNARY_CALLS(log10, Log10, LA) +EIGEN_MKL_VML_DECLARE_UNARY_CALLS(sqrt, Sqrt, _) + +EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(square, Sqr, _) +EIGEN_MKL_VML_DECLARE_UNARY_CALLS_CPLX(arg, Arg, _) +EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(round, Round, _) +EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(floor, Floor, _) +EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil, Ceil, _) + +#define EIGEN_MKL_VML_DECLARE_POW_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE, VMLMODE) \ + template< typename DstXprType, typename SrcXprNested> \ + struct Assignment, SrcXprNested>, assign_op, \ + Dense2Dense, typename enable_if::EnableVml,EIGENTYPE>::type> { \ + typedef CwiseUnaryOp, SrcXprNested> SrcXprType; \ + static void run(DstXprType &dst, const SrcXprType &src, const assign_op &/*func*/) { \ + eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \ + VMLTYPE exponent = reinterpret_cast(src.functor().m_exponent); \ + if(vml_assign_traits::Traversal==LinearTraversal) \ + { \ + VMLOP( dst.size(), (const VMLTYPE*)src.nestedExpression().data(), exponent, \ + (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE) ); \ + } else { \ + const Index outerSize = dst.outerSize(); \ + for(Index outer = 0; outer < outerSize; ++outer) { \ + const EIGENTYPE *src_ptr = src.IsRowMajor ? &(src.nestedExpression().coeffRef(outer,0)) : \ + &(src.nestedExpression().coeffRef(0, outer)); \ + EIGENTYPE *dst_ptr = dst.IsRowMajor ? 
&(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer)); \ + VMLOP( dst.innerSize(), (const VMLTYPE*)src_ptr, exponent, \ + (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE)); \ + } \ + } \ + } \ }; - -#define EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE) \ - template<> struct vml_call< scalar_##EIGENOP##_op > { \ - enum { IsSupported = 1 }; \ - static inline void run( const scalar_##EIGENOP##_op& /*func*/, \ - int size, const EIGENTYPE* src, EIGENTYPE* dst) { \ - MKL_INT64 vmlMode = EIGEN_MKL_VML_MODE; \ - VMLOP(size, (const VMLTYPE*)src, (VMLTYPE*)dst, vmlMode); \ - } \ - }; - -#define EIGEN_MKL_VML_DECLARE_POW_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE) \ - template<> struct vml_call< scalar_##EIGENOP##_op > { \ - enum { IsSupported = 1 }; \ - static inline void run( const scalar_##EIGENOP##_op& func, \ - int size, const EIGENTYPE* src, EIGENTYPE* dst) { \ - EIGENTYPE exponent = func.m_exponent; \ - MKL_INT64 vmlMode = EIGEN_MKL_VML_MODE; \ - VMLOP(&size, (const VMLTYPE*)src, (const VMLTYPE*)&exponent, \ - (VMLTYPE*)dst, &vmlMode); \ - } \ - }; - -#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(EIGENOP, VMLOP) \ - EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, vs##VMLOP, float, float) \ - EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, vd##VMLOP, double, double) - -#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_COMPLEX(EIGENOP, VMLOP) \ - EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, vc##VMLOP, scomplex, MKL_Complex8) \ - EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, vz##VMLOP, dcomplex, MKL_Complex16) - -#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS(EIGENOP, VMLOP) \ - EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(EIGENOP, VMLOP) \ - EIGEN_MKL_VML_DECLARE_UNARY_CALLS_COMPLEX(EIGENOP, VMLOP) - - -#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL_LA(EIGENOP, VMLOP) \ - EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, vms##VMLOP, float, float) \ - EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, vmd##VMLOP, double, double) - -#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_COMPLEX_LA(EIGENOP, VMLOP) \ - EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, vmc##VMLOP, scomplex, MKL_Complex8) \ - EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, vmz##VMLOP, dcomplex, MKL_Complex16) - -#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(EIGENOP, VMLOP) \ - EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL_LA(EIGENOP, VMLOP) \ - EIGEN_MKL_VML_DECLARE_UNARY_CALLS_COMPLEX_LA(EIGENOP, VMLOP) - - -EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(sin, Sin) -EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(asin, Asin) -EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(sinh, Sinh) -EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(cos, Cos) -EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(acos, Acos) -EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(cosh, Cosh) -EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(tan, Tan) -EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(atan, Atan) -EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(tanh, Tanh) -//EIGEN_MKL_VML_DECLARE_UNARY_CALLS(abs, Abs) -EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(exp, Exp) -EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(log, Ln) -EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(log10, Log10) -EIGEN_MKL_VML_DECLARE_UNARY_CALLS(sqrt, Sqrt) - -EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(square, Sqr) -EIGEN_MKL_VML_DECLARE_UNARY_CALLS_COMPLEX(arg, Arg) -EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(round, Round) -EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(floor, Floor) -EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil, Ceil) - -// The vm*powx functions are not avaibale in the windows version of MKL. 
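A minimal usage sketch of the VML dispatch declared above, assuming an MKL-enabled build in which EIGEN_USE_MKL_ALL (or EIGEN_USE_MKL_VML) is defined before including Eigen; the array size and the exp/sqrt/pow calls are illustrative only and not part of the patch:

#include <Eigen/Dense>   // assumes EIGEN_USE_MKL_ALL was defined beforehand

int main()
{
  Eigen::ArrayXd x = Eigen::ArrayXd::Random(10000);
  Eigen::ArrayXd y(10000);
  y = x.exp();     // expected to be routed to vmdExp (VML_LA or VML_HA depending on EIGEN_FAST_MATH)
  y = x.sqrt();    // sqrt is declared with the default "_" accuracy mode, i.e. plain vdSqrt
  y = x.pow(2.5);  // expected to be routed to the vmdPowx specialization declared above
  return 0;
}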
-#ifndef _WIN32 -EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmspowx_, float, float) -EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmdpowx_, double, double) -EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmcpowx_, scomplex, MKL_Complex8) -EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmzpowx_, dcomplex, MKL_Complex16) -#endif + +EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmsPowx, float, float, LA) +EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmdPowx, double, double, LA) +EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmcPowx, scomplex, MKL_Complex8, LA) +EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmzPowx, dcomplex, MKL_Complex16, LA) } // end namespace internal diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h index 87ea14bac..395861c0c 100644 --- a/Eigen/src/Core/CwiseNullaryOp.h +++ b/Eigen/src/Core/CwiseNullaryOp.h @@ -113,10 +113,10 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp template -EIGEN_STRONG_INLINE const CwiseNullaryOp +EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> DenseBase::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func) { - return CwiseNullaryOp(rows, cols, func); + return CwiseNullaryOp(rows, cols, func); } /** \returns an expression of a matrix defined by a custom functor \a func @@ -139,12 +139,12 @@ DenseBase::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& f */ template template -EIGEN_STRONG_INLINE const CwiseNullaryOp +EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> DenseBase::NullaryExpr(Index size, const CustomNullaryOp& func) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - if(RowsAtCompileTime == 1) return CwiseNullaryOp(1, size, func); - else return CwiseNullaryOp(size, 1, func); + if(RowsAtCompileTime == 1) return CwiseNullaryOp(1, size, func); + else return CwiseNullaryOp(size, 1, func); } /** \returns an expression of a matrix defined by a custom functor \a func @@ -158,10 +158,10 @@ DenseBase::NullaryExpr(Index size, const CustomNullaryOp& func) */ template template -EIGEN_STRONG_INLINE const CwiseNullaryOp +EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> DenseBase::NullaryExpr(const CustomNullaryOp& func) { - return CwiseNullaryOp(RowsAtCompileTime, ColsAtCompileTime, func); + return CwiseNullaryOp(RowsAtCompileTime, ColsAtCompileTime, func); } /** \returns an expression of a constant matrix of value \a value diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h index 4e03e4a56..d486f3fd5 100644 --- a/Eigen/src/Core/DenseBase.h +++ b/Eigen/src/Core/DenseBase.h @@ -49,6 +49,8 @@ template class DenseBase public: using internal::special_scalar_op_base::Scalar, typename NumTraits::Scalar>::Real>::operator*; + using internal::special_scalar_op_base::Scalar, + typename NumTraits::Scalar>::Real>::operator/; /** Inner iterator type to iterate over the coefficients of a row or column. @@ -178,6 +180,35 @@ template class DenseBase }; enum { IsPlainObjectBase = 0 }; + + /** The plain matrix type corresponding to this expression. + * \sa PlainObject */ + typedef Matrix::Scalar, + internal::traits::RowsAtCompileTime, + internal::traits::ColsAtCompileTime, + AutoAlign | (internal::traits::Flags&RowMajorBit ? RowMajor : ColMajor), + internal::traits::MaxRowsAtCompileTime, + internal::traits::MaxColsAtCompileTime + > PlainMatrix; + + /** The plain array type corresponding to this expression. + * \sa PlainObject */ + typedef Array::Scalar, + internal::traits::RowsAtCompileTime, + internal::traits::ColsAtCompileTime, + AutoAlign | (internal::traits::Flags&RowMajorBit ? 
RowMajor : ColMajor), + internal::traits::MaxRowsAtCompileTime, + internal::traits::MaxColsAtCompileTime + > PlainArray; + + /** \brief The plain matrix or array type corresponding to this expression. + * + * This is not necessarily exactly the return type of eval(). In the case of plain matrices, + * the return type of eval() is a const reference to a matrix, not a matrix! It is however guaranteed + * that the return type of eval() is either PlainObject or const PlainObject&. + */ + typedef typename internal::conditional::XprKind,MatrixXpr >::value, + PlainMatrix, PlainArray>::type PlainObject; /** \returns the number of nonzero coefficients which is in practice the number * of stored coefficients. */ @@ -237,13 +268,12 @@ template class DenseBase } #ifndef EIGEN_PARSED_BY_DOXYGEN - /** \internal Represents a matrix with all coefficients equal to one another*/ - typedef CwiseNullaryOp,Derived> ConstantReturnType; + typedef CwiseNullaryOp,PlainObject> ConstantReturnType; /** \internal Represents a vector with linearly spaced coefficients that allows sequential access only. */ - typedef CwiseNullaryOp,Derived> SequentialLinSpacedReturnType; + typedef CwiseNullaryOp,PlainObject> SequentialLinSpacedReturnType; /** \internal Represents a vector with linearly spaced coefficients that allows random access. */ - typedef CwiseNullaryOp,Derived> RandomAccessLinSpacedReturnType; + typedef CwiseNullaryOp,PlainObject> RandomAccessLinSpacedReturnType; /** \internal the return type of MatrixBase::eigenvalues() */ typedef Matrix::Scalar>::Real, internal::traits::ColsAtCompileTime, 1> EigenvaluesReturnType; @@ -322,13 +352,13 @@ template class DenseBase LinSpaced(const Scalar& low, const Scalar& high); template EIGEN_DEVICE_FUNC - static const CwiseNullaryOp + static const CwiseNullaryOp NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func); template EIGEN_DEVICE_FUNC - static const CwiseNullaryOp + static const CwiseNullaryOp NullaryExpr(Index size, const CustomNullaryOp& func); template EIGEN_DEVICE_FUNC - static const CwiseNullaryOp + static const CwiseNullaryOp NullaryExpr(const CustomNullaryOp& func); EIGEN_DEVICE_FUNC static const ConstantReturnType Zero(Index rows, Index cols); @@ -466,9 +496,10 @@ template class DenseBase ConstColwiseReturnType colwise() const; ColwiseReturnType colwise(); - static const CwiseNullaryOp,Derived> Random(Index rows, Index cols); - static const CwiseNullaryOp,Derived> Random(Index size); - static const CwiseNullaryOp,Derived> Random(); + typedef CwiseNullaryOp,PlainObject> RandomReturnType; + static const RandomReturnType Random(Index rows, Index cols); + static const RandomReturnType Random(Index size); + static const RandomReturnType Random(); template const Select diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index dfc651908..408f76b1a 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -349,6 +349,7 @@ struct hypot_retval template struct cast_impl { + EIGEN_DEVICE_FUNC static inline NewType run(const OldType& x) { return static_cast(x); @@ -360,6 +361,7 @@ struct cast_impl template inline NewType cast(const OldType& x) { + EIGEN_DEVICE_FUNC return cast_impl::run(x); } diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h index 1686933e4..8549cf83c 100644 --- a/Eigen/src/Core/MatrixBase.h +++ b/Eigen/src/Core/MatrixBase.h @@ -81,6 +81,7 @@ template class MatrixBase using Base::operator*=; using Base::operator/=; using Base::operator*; + using Base::operator/; typedef typename 
Base::CoeffReturnType CoeffReturnType; typedef typename Base::ConstTransposeReturnType ConstTransposeReturnType; @@ -101,23 +102,11 @@ template class MatrixBase EIGEN_DEVICE_FUNC inline Index diagonalSize() const { return (std::min)(rows(),cols()); } - /** \brief The plain matrix type corresponding to this expression. - * - * This is not necessarily exactly the return type of eval(). In the case of plain matrices, - * the return type of eval() is a const reference to a matrix, not a matrix! It is however guaranteed - * that the return type of eval() is either PlainObject or const PlainObject&. - */ - typedef Matrix::Scalar, - internal::traits::RowsAtCompileTime, - internal::traits::ColsAtCompileTime, - AutoAlign | (internal::traits::Flags&RowMajorBit ? RowMajor : ColMajor), - internal::traits::MaxRowsAtCompileTime, - internal::traits::MaxColsAtCompileTime - > PlainObject; + typedef typename Base::PlainObject PlainObject; #ifndef EIGEN_PARSED_BY_DOXYGEN /** \internal Represents a matrix with all coefficients equal to one another*/ - typedef CwiseNullaryOp,Derived> ConstantReturnType; + typedef CwiseNullaryOp,PlainObject> ConstantReturnType; /** \internal the return type of MatrixBase::adjoint() */ typedef typename internal::conditional::IsComplex, CwiseUnaryOp, ConstTransposeReturnType>, @@ -126,7 +115,7 @@ template class MatrixBase /** \internal Return type of eigenvalues() */ typedef Matrix, internal::traits::ColsAtCompileTime, 1, ColMajor> EigenvaluesReturnType; /** \internal the return type of identity */ - typedef CwiseNullaryOp,Derived> IdentityReturnType; + typedef CwiseNullaryOp,PlainObject> IdentityReturnType; /** \internal the return type of unit vectors */ typedef Block, SquareMatrixType>, internal::traits::RowsAtCompileTime, diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h old mode 100644 new mode 100755 index 2f199bc29..43ba86193 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -751,7 +751,6 @@ struct product_evaluator, ProductTag, DiagonalSha using Base::m_diagImpl; using Base::m_matImpl; using Base::coeff; - using Base::packet_impl; typedef typename Base::Scalar Scalar; typedef typename Base::PacketScalar PacketScalar; @@ -776,7 +775,8 @@ struct product_evaluator, ProductTag, DiagonalSha template EIGEN_STRONG_INLINE PacketScalar packet(Index row, Index col) const { - // NVCC complains about template keyword, so we disable this function in CUDA mode + // FIXME: NVCC used to complain about the template keyword, but we have to check whether this is still the case. + // See also similar calls below. 
return this->template packet_impl(row,col, row, typename internal::conditional::type()); } @@ -798,7 +798,6 @@ struct product_evaluator, ProductTag, DenseShape, using Base::m_diagImpl; using Base::m_matImpl; using Base::coeff; - using Base::packet_impl; typedef typename Base::Scalar Scalar; typedef typename Base::PacketScalar PacketScalar; diff --git a/Eigen/src/Core/Random.h b/Eigen/src/Core/Random.h index cf2a82877..02038e9e3 100644 --- a/Eigen/src/Core/Random.h +++ b/Eigen/src/Core/Random.h @@ -53,7 +53,7 @@ struct functor_traits > * \sa DenseBase::setRandom(), DenseBase::Random(Index), DenseBase::Random() */ template -inline const CwiseNullaryOp::Scalar>, Derived> +inline const typename DenseBase::RandomReturnType DenseBase::Random(Index rows, Index cols) { return NullaryExpr(rows, cols, internal::scalar_random_op()); @@ -84,7 +84,7 @@ DenseBase::Random(Index rows, Index cols) * \sa DenseBase::setRandom(), DenseBase::Random(Index,Index), DenseBase::Random() */ template -inline const CwiseNullaryOp::Scalar>, Derived> +inline const typename DenseBase::RandomReturnType DenseBase::Random(Index size) { return NullaryExpr(size, internal::scalar_random_op()); @@ -110,7 +110,7 @@ DenseBase::Random(Index size) * \sa DenseBase::setRandom(), DenseBase::Random(Index,Index), DenseBase::Random(Index) */ template -inline const CwiseNullaryOp::Scalar>, Derived> +inline const typename DenseBase::RandomReturnType DenseBase::Random() { return NullaryExpr(RowsAtCompileTime, ColsAtCompileTime, internal::scalar_random_op()); diff --git a/Eigen/src/Core/Reverse.h b/Eigen/src/Core/Reverse.h index 993458aa5..ef301e66d 100644 --- a/Eigen/src/Core/Reverse.h +++ b/Eigen/src/Core/Reverse.h @@ -70,10 +70,6 @@ template class Reverse typedef typename internal::remove_all::type NestedExpression; using Base::IsRowMajor; - // next line is necessary because otherwise const version of operator() - // is hidden by non-const version defined in this file - using Base::operator(); - protected: enum { PacketSize = internal::packet_traits::size, @@ -101,69 +97,6 @@ template class Reverse return -m_matrix.innerStride(); } - EIGEN_DEVICE_FUNC inline Scalar& operator()(Index row, Index col) - { - eigen_assert(row >= 0 && row < rows() && col >= 0 && col < cols()); - return coeffRef(row, col); - } - - EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col) - { - return m_matrix.const_cast_derived().coeffRef(ReverseRow ? m_matrix.rows() - row - 1 : row, - ReverseCol ? m_matrix.cols() - col - 1 : col); - } - - EIGEN_DEVICE_FUNC inline CoeffReturnType coeff(Index row, Index col) const - { - return m_matrix.coeff(ReverseRow ? m_matrix.rows() - row - 1 : row, - ReverseCol ? m_matrix.cols() - col - 1 : col); - } - - EIGEN_DEVICE_FUNC inline CoeffReturnType coeff(Index index) const - { - return m_matrix.coeff(m_matrix.size() - index - 1); - } - - EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index) - { - return m_matrix.const_cast_derived().coeffRef(m_matrix.size() - index - 1); - } - - EIGEN_DEVICE_FUNC inline Scalar& operator()(Index index) - { - eigen_assert(index >= 0 && index < m_matrix.size()); - return coeffRef(index); - } - - template - inline const PacketScalar packet(Index row, Index col) const - { - return reverse_packet::run(m_matrix.template packet( - ReverseRow ? m_matrix.rows() - row - OffsetRow : row, - ReverseCol ? 
m_matrix.cols() - col - OffsetCol : col)); - } - - template - inline void writePacket(Index row, Index col, const PacketScalar& x) - { - m_matrix.const_cast_derived().template writePacket( - ReverseRow ? m_matrix.rows() - row - OffsetRow : row, - ReverseCol ? m_matrix.cols() - col - OffsetCol : col, - reverse_packet::run(x)); - } - - template - inline const PacketScalar packet(Index index) const - { - return internal::preverse(m_matrix.template packet( m_matrix.size() - index - PacketSize )); - } - - template - inline void writePacket(Index index, const PacketScalar& x) - { - m_matrix.const_cast_derived().template writePacket(m_matrix.size() - index - PacketSize, internal::preverse(x)); - } - EIGEN_DEVICE_FUNC const typename internal::remove_all::type& nestedExpression() const { diff --git a/Eigen/src/Core/StableNorm.h b/Eigen/src/Core/StableNorm.h index 83a973365..f9cd01b7e 100644 --- a/Eigen/src/Core/StableNorm.h +++ b/Eigen/src/Core/StableNorm.h @@ -157,6 +157,7 @@ inline typename NumTraits::Scalar>::Real MatrixBase::stableNorm() const { using std::sqrt; + using std::abs; const Index blockSize = 4096; RealScalar scale(0); RealScalar invScale(1); @@ -164,12 +165,18 @@ MatrixBase::stableNorm() const enum { Alignment = (int(Flags)&DirectAccessBit) || (int(Flags)&AlignedBit) ? 1 : 0 }; + typedef typename internal::conditional, Aligned>, + typename Base::ConstSegmentReturnType>::type SegmentWrapper; Index n = size(); + + if(n==1) + return abs(this->coeff(0)); + Index bi = internal::first_aligned(derived()); if (bi>0) internal::stable_norm_kernel(this->head(bi), ssq, scale, invScale); for (; bisegment(bi,numext::mini(blockSize, n - bi)).template forceAlignedAccessIf(), ssq, scale, invScale); + internal::stable_norm_kernel(SegmentWrapper(this->segment(bi,numext::mini(blockSize, n - bi))), ssq, scale, invScale); return scale * sqrt(ssq); } diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h index 85e605889..5dee6247e 100644 --- a/Eigen/src/Core/functors/BinaryFunctors.h +++ b/Eigen/src/Core/functors/BinaryFunctors.h @@ -392,6 +392,18 @@ template struct functor_traits > { enum { Cost = 2 * NumTraits::MulCost, PacketAccess = packet_traits::HasDiv }; }; +template +struct scalar_quotient2_op { + typedef typename scalar_product_traits::ReturnType result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_quotient2_op(const scalar_quotient2_op& other) : m_other(other.m_other) { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_quotient2_op(const Scalar2& other) : m_other(other) { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar1& a) const { return a / m_other; } + typename add_const_on_value_type::Nested>::type m_other; +}; +template +struct functor_traits > +{ enum { Cost = 2 * NumTraits::MulCost, PacketAccess = false }; }; + // In Eigen, any binary op (Product, CwiseBinaryOp) require the Lhs and Rhs to have the same scalar type, except for multiplication // where the mixing of different types is handled by scalar_product_traits // In particular, real * complex is allowed. 
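The scalar_quotient2_op functor just added backs the expression-level operator/ imported earlier in this patch (the `using Base::operator/` lines in DenseBase and MatrixBase, with the operator itself defined in special_scalar_op_base further below); a rough usage sketch, with sizes and values chosen for illustration only:

#include <Eigen/Dense>

int main()
{
  Eigen::MatrixXcd m = Eigen::MatrixXcd::Random(4,4);
  double s = 2.0;
  // Multiplying a complex expression by a real scalar already went through
  // scalar_multiple2_op; scalar_quotient2_op gives division the same treatment,
  // dividing each complex coefficient by the real scalar directly instead of
  // first promoting the scalar to std::complex<double>.
  Eigen::MatrixXcd a = m * s;
  Eigen::MatrixXcd b = m / s;
  return 0;
}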
diff --git a/Eigen/src/Core/products/GeneralMatrixVector_MKL.h b/Eigen/src/Core/products/GeneralMatrixVector_MKL.h old mode 100644 new mode 100755 index 1cb9fe6b5..12c3d13bd --- a/Eigen/src/Core/products/GeneralMatrixVector_MKL.h +++ b/Eigen/src/Core/products/GeneralMatrixVector_MKL.h @@ -46,38 +46,37 @@ namespace internal { // gemv specialization -template -struct general_matrix_vector_product_gemv : - general_matrix_vector_product {}; +template +struct general_matrix_vector_product_gemv; #define EIGEN_MKL_GEMV_SPECIALIZE(Scalar) \ template \ -struct general_matrix_vector_product { \ +struct general_matrix_vector_product,ColMajor,ConjugateLhs,Scalar,const_blas_data_mapper,ConjugateRhs,Specialized> { \ static void run( \ Index rows, Index cols, \ - const Scalar* lhs, Index lhsStride, \ - const Scalar* rhs, Index rhsIncr, \ + const const_blas_data_mapper &lhs, \ + const const_blas_data_mapper &rhs, \ Scalar* res, Index resIncr, Scalar alpha) \ { \ if (ConjugateLhs) { \ - general_matrix_vector_product::run( \ - rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \ + general_matrix_vector_product,ColMajor,ConjugateLhs,Scalar,const_blas_data_mapper,ConjugateRhs,BuiltIn>::run( \ + rows, cols, lhs, rhs, res, resIncr, alpha); \ } else { \ general_matrix_vector_product_gemv::run( \ - rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \ + rows, cols, lhs.data(), lhs.stride(), rhs.data(), rhs.stride(), res, resIncr, alpha); \ } \ } \ }; \ template \ -struct general_matrix_vector_product { \ +struct general_matrix_vector_product,RowMajor,ConjugateLhs,Scalar,const_blas_data_mapper,ConjugateRhs,Specialized> { \ static void run( \ Index rows, Index cols, \ - const Scalar* lhs, Index lhsStride, \ - const Scalar* rhs, Index rhsIncr, \ + const const_blas_data_mapper &lhs, \ + const const_blas_data_mapper &rhs, \ Scalar* res, Index resIncr, Scalar alpha) \ { \ general_matrix_vector_product_gemv::run( \ - rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \ + rows, cols, lhs.data(), lhs.stride(), rhs.data(), rhs.stride(), res, resIncr, alpha); \ } \ }; \ diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h b/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h old mode 100644 new mode 100755 index 4cc56a42f..d9e7cf852 --- a/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h @@ -122,7 +122,7 @@ struct product_triangular_matrix_matrix_trmm > lhsMap(_lhs,rows,depth,OuterStride<>(lhsStride)); \ MatrixLhs aa_tmp=lhsMap.template triangularView(); \ MKL_INT aStride = aa_tmp.outerStride(); \ - gemm_blocking_space gemm_blocking(_rows,_cols,_depth); \ + gemm_blocking_space gemm_blocking(_rows,_cols,_depth, 1, true); \ general_matrix_matrix_product::run( \ rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, resStride, alpha, gemm_blocking, 0); \ \ @@ -236,7 +236,7 @@ struct product_triangular_matrix_matrix_trmm > rhsMap(_rhs,depth,cols, OuterStride<>(rhsStride)); \ MatrixRhs aa_tmp=rhsMap.template triangularView(); \ MKL_INT aStride = aa_tmp.outerStride(); \ - gemm_blocking_space gemm_blocking(_rows,_cols,_depth); \ + gemm_blocking_space gemm_blocking(_rows,_cols,_depth, 1, true); \ general_matrix_matrix_product::run( \ rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, resStride, alpha, gemm_blocking, 0); \ \ diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h old mode 100644 new mode 100755 index ffeb5ac5f..934948ebd --- a/Eigen/src/Core/util/BlasUtil.h 
+++ b/Eigen/src/Core/util/BlasUtil.h @@ -224,6 +224,7 @@ class blas_data_mapper { } const Index stride() const { return m_stride; } + const Scalar* data() const { return m_data; } Index firstAligned(Index size) const { if (size_t(m_data)%sizeof(Scalar)) { diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h index ba75d25af..7c20fed5e 100644 --- a/Eigen/src/Core/util/ForwardDeclarations.h +++ b/Eigen/src/Core/util/ForwardDeclarations.h @@ -213,6 +213,7 @@ template struct scalar_identity_op; template struct scalar_product_op; template struct scalar_multiple2_op; template struct scalar_quotient_op; +template struct scalar_quotient2_op; } // end namespace internal diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 8efe2f4ca..8cffb3f89 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -645,7 +645,7 @@ namespace Eigen { // just an empty macro ! #define EIGEN_EMPTY -#if EIGEN_COMP_MSVC_STRICT && EIGEN_COMP_MSVC < 1800 // for older MSVC versions using the base operator is sufficient (cf Bug 1000) +#if EIGEN_COMP_MSVC_STRICT && EIGEN_COMP_MSVC < 1900 // for older MSVC versions using the base operator is sufficient (cf Bug 1000) #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ using Base::operator =; #elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653) diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 2cd78576e..8c280432b 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -427,7 +427,9 @@ struct special_scalar_op_base : public DenseCoeffsBase { // dummy operator* so that the // "using special_scalar_op_base::operator*" compiles - void operator*() const; + struct dummy {}; + void operator*(dummy) const; + void operator/(dummy) const; }; template @@ -451,6 +453,16 @@ struct special_scalar_op_base : public DenseCo #endif return static_cast(matrix).operator*(scalar); } + + const CwiseUnaryOp, Derived> + operator/(const OtherScalar& scalar) const + { +#ifdef EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN + EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN +#endif + return CwiseUnaryOp, Derived> + (*static_cast(this), scalar_quotient2_op(scalar)); + } }; template struct cast_return_type diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h index e46c16391..872866850 100644 --- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h @@ -486,10 +486,11 @@ ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag while (end>0) { + EIGEN_ASM_COMMENT("beginabs"); for (Index i = start; i0 && subdiag[end-1]==0) { diff --git a/Eigen/src/Eigenvalues/Tridiagonalization.h b/Eigen/src/Eigenvalues/Tridiagonalization.h index a6fb00b21..f21331312 100644 --- a/Eigen/src/Eigenvalues/Tridiagonalization.h +++ b/Eigen/src/Eigenvalues/Tridiagonalization.h @@ -464,9 +464,10 @@ struct tridiagonalization_inplace_selector static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ) { using std::sqrt; + const RealScalar tol = (std::numeric_limits::min)(); diag[0] = mat(0,0); RealScalar v1norm2 = numext::abs2(mat(2,0)); - if(v1norm2 == RealScalar(0)) + if(v1norm2 <= tol) { diag[1] = mat(1,1); diag[2] = mat(2,2); diff --git a/Eigen/src/Geometry/OrthoMethods.h b/Eigen/src/Geometry/OrthoMethods.h index 6b2e57392..39b64b869 100644 --- a/Eigen/src/Geometry/OrthoMethods.h +++ 
b/Eigen/src/Geometry/OrthoMethods.h @@ -18,6 +18,10 @@ namespace Eigen { * \returns the cross product of \c *this and \a other * * Here is a very good explanation of cross-product: http://xkcd.com/199/ + * + * With complex numbers, the cross product is implemented as + * \f$ (\mathbf{a}+i\mathbf{b}) \times (\mathbf{c}+i\mathbf{d}) = (\mathbf{a} \times \mathbf{c} - \mathbf{b} \times \mathbf{d}) - i(\mathbf{a} \times \mathbf{d} - \mathbf{b} \times \mathbf{c})\f$ + * * \sa MatrixBase::cross3() */ template diff --git a/Eigen/src/Householder/Householder.h b/Eigen/src/Householder/Householder.h index 32112af9b..4c1f499a1 100644 --- a/Eigen/src/Householder/Householder.h +++ b/Eigen/src/Householder/Householder.h @@ -75,8 +75,9 @@ void MatrixBase::makeHouseholder( RealScalar tailSqNorm = size()==1 ? RealScalar(0) : tail.squaredNorm(); Scalar c0 = coeff(0); + const RealScalar tol = (std::numeric_limits::min)(); - if(tailSqNorm == RealScalar(0) && numext::imag(c0)==RealScalar(0)) + if(tailSqNorm <= tol && numext::abs2(numext::imag(c0))<=tol) { tau = RealScalar(0); beta = numext::real(c0); diff --git a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h index be98993f0..a34ee7628 100644 --- a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +++ b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h @@ -136,6 +136,12 @@ struct traits > * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations * and NumTraits::epsilon() for the tolerance. * + * The tolerance corresponds to the relative residual error: |Ax-b|/|b| + * + * \b Performance: when using sparse matrices, best performance is achied for a row-major sparse matrix format. + * Moreover, in this case multi-threading can be exploited if the user code is compiled with OpenMP enabled. + * See \ref TopicMultiThreading for details. + * * This class can be used as the direct solver classes. Here is a typical usage example: * \include BiCGSTAB_simple.cpp * diff --git a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h index 9e7dd1404..8f33c446d 100644 --- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +++ b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h @@ -114,20 +114,28 @@ struct traits > * * \tparam _MatrixType the type of the matrix A, can be a dense or a sparse matrix. * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower, - * Upper, or Lower|Upper in which the full matrix entries will be considered. Default is Lower. + * \c Upper, or \c Lower|Upper in which the full matrix entries will be considered. + * Default is \c Lower, best performance is \c Lower|Upper. * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner * * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations() * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations * and NumTraits::epsilon() for the tolerance. * + * The tolerance corresponds to the relative residual error: |Ax-b|/|b| + * + * \b Performance: Even though the default value of \c _UpLo is \c Lower, significantly higher performance is + * achieved when using a complete matrix and \b Lower|Upper as the \a _UpLo template parameter. Moreover, in this + * case multi-threading can be exploited if the user code is compiled with OpenMP enabled. + * See \ref TopicMultiThreading for details. 
+ * * This class can be used as the direct solver classes. Here is a typical usage example: \code int n = 10000; VectorXd x(n), b(n); SparseMatrix A(n,n); // fill A and b - ConjugateGradient > cg; + ConjugateGradient, Lower|Upper> cg; cg.compute(A); x = cg.solve(b); std::cout << "#iterations: " << cg.iterations() << std::endl; @@ -183,10 +191,13 @@ public: template void _solve_with_guess_impl(const Rhs& b, Dest& x) const { + typedef Ref MatRef; + typedef typename internal::conditional::IsComplex), + Transpose, MatRef const&>::type RowMajorWrapper; typedef typename internal::conditional&, - typename Ref::template ConstSelfAdjointViewReturnType::Type - >::type MatrixWrapperType; + RowMajorWrapper, + typename MatRef::template ConstSelfAdjointViewReturnType::Type + >::type SelfAdjointWrapper; m_iterations = Base::maxIterations(); m_error = Base::m_tolerance; @@ -196,7 +207,8 @@ public: m_error = Base::m_tolerance; typename Dest::ColXpr xj(x,j); - internal::conjugate_gradient(MatrixWrapperType(mp_matrix), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error); + RowMajorWrapper row_mat(mp_matrix); + internal::conjugate_gradient(SelfAdjointWrapper(row_mat), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error); } m_isInitialized = true; diff --git a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h index 6477b9de2..22c602a91 100644 --- a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +++ b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h @@ -126,10 +126,16 @@ public: /** \internal */ Index cols() const { return mp_matrix.cols(); } - /** \returns the tolerance threshold used by the stopping criteria */ + /** \returns the tolerance threshold used by the stopping criteria. + * \sa setTolerance() + */ RealScalar tolerance() const { return m_tolerance; } - /** Sets the tolerance threshold used by the stopping criteria */ + /** Sets the tolerance threshold used by the stopping criteria. + * + * This value is used as an upper bound to the relative residual error: |Ax-b|/|b|. + * The default value is the machine precision given by NumTraits::epsilon() + */ Derived& setTolerance(const RealScalar& tolerance) { m_tolerance = tolerance; @@ -167,7 +173,9 @@ public: return m_iterations; } - /** \returns the tolerance error reached during the last solve */ + /** \returns the tolerance error reached during the last solve. + * It is a close approximation of the true relative residual error |Ax-b|/|b|. 
+ */ RealScalar error() const { eigen_assert(m_isInitialized && "ConjugateGradient is not initialized."); diff --git a/Eigen/src/PardisoSupport/PardisoSupport.h b/Eigen/src/PardisoSupport/PardisoSupport.h old mode 100644 new mode 100755 index 7ab2e3e6b..234e3213b --- a/Eigen/src/PardisoSupport/PardisoSupport.h +++ b/Eigen/src/PardisoSupport/PardisoSupport.h @@ -54,7 +54,7 @@ namespace internal template<> struct pardiso_run_selector { - typedef long long int IndexTypeType; + typedef long long int IndexType; static IndexType run( _MKL_DSS_HANDLE_t pt, IndexType maxfct, IndexType mnum, IndexType type, IndexType phase, IndexType n, void *a, IndexType *ia, IndexType *ja, IndexType *perm, IndexType nrhs, IndexType *iparm, IndexType msglvl, void *b, void *x) { @@ -93,19 +93,19 @@ namespace internal typedef typename _MatrixType::StorageIndex StorageIndex; }; -} +} // end namespace internal template -class PardisoImpl : public SparseSolveBase +class PardisoImpl : public SparseSolverBase { protected: - typedef SparseSolveBase Base; + typedef SparseSolverBase Base; using Base::derived; using Base::m_isInitialized; typedef internal::pardiso_traits Traits; public: - using base::_solve_impl; + using Base::_solve_impl; typedef typename Traits::MatrixType MatrixType; typedef typename Traits::Scalar Scalar; @@ -173,16 +173,17 @@ class PardisoImpl : public SparseSolveBase Derived& compute(const MatrixType& matrix); - template - bool _solve_impl(const MatrixBase &b, MatrixBase& x) const; + template + void _solve_impl(const MatrixBase &b, MatrixBase &dest) const; protected: void pardisoRelease() { if(m_isInitialized) // Factorization ran at least once { - internal::pardiso_run_selector::run(m_pt, 1, 1, m_type, -1, m_size, 0, 0, 0, m_perm.data(), 0, - m_iparm.data(), m_msglvl, 0, 0); + internal::pardiso_run_selector::run(m_pt, 1, 1, m_type, -1, m_size,0, 0, 0, m_perm.data(), 0, + m_iparm.data(), m_msglvl, NULL, NULL); + m_isInitialized = false; } } @@ -217,12 +218,14 @@ class PardisoImpl : public SparseSolveBase m_iparm[27] = (sizeof(RealScalar) == 4) ? 1 : 0; m_iparm[34] = 1; // C indexing m_iparm[59] = 1; // Automatic switch between In-Core and Out-of-Core modes + + memset(m_pt, 0, sizeof(m_pt)); } protected: // cached data to reduce reallocation, etc. 
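A minimal solver round-trip consistent with the reworked PardisoImpl interface above (SparseSolverBase base class, void _solve_impl reporting failures through info()); it assumes an MKL/PARDISO-enabled build, and the matrix and right-hand side are illustrative only:

#include <Eigen/Sparse>
#include <Eigen/PardisoSupport>   // requires linking against MKL PARDISO

int main()
{
  typedef Eigen::SparseMatrix<double> SpMat;
  SpMat A(3,3);
  A.insert(0,0) = 4; A.insert(1,1) = 5; A.insert(2,2) = 6;
  A.makeCompressed();   // getMatrix() now also compresses its internal copy
  Eigen::VectorXd b = Eigen::VectorXd::Ones(3);

  Eigen::PardisoLU<SpMat> solver;
  solver.compute(A);
  if(solver.info() != Eigen::Success) return 1;     // factorization failed
  Eigen::VectorXd x = solver.solve(b);
  return solver.info() == Eigen::Success ? 0 : 1;   // solve errors are now reported via info(),
                                                    // not via a bool return from _solve_impl
}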
- void manageErrorCode(Index error) + void manageErrorCode(Index error) const { switch(error) { @@ -239,7 +242,7 @@ class PardisoImpl : public SparseSolveBase } mutable SparseMatrixType m_matrix; - ComputationInfo m_info; + mutable ComputationInfo m_info; bool m_analysisIsOk, m_factorizationIsOk; Index m_type, m_msglvl; mutable void *m_pt[64]; @@ -256,7 +259,6 @@ Derived& PardisoImpl::compute(const MatrixType& a) eigen_assert(a.rows() == a.cols()); pardisoRelease(); - memset(m_pt, 0, sizeof(m_pt)); m_perm.setZero(m_size); derived().getMatrix(a); @@ -279,7 +281,6 @@ Derived& PardisoImpl::analyzePattern(const MatrixType& a) eigen_assert(m_size == a.cols()); pardisoRelease(); - memset(m_pt, 0, sizeof(m_pt)); m_perm.setZero(m_size); derived().getMatrix(a); @@ -313,12 +314,15 @@ Derived& PardisoImpl::factorize(const MatrixType& a) return derived(); } -template +template template -bool PardisoImpl::_solve_impl(const MatrixBase &b, MatrixBase& x) const +void PardisoImpl::_solve_impl(const MatrixBase &b, MatrixBase& x) const { if(m_iparm[0] == 0) // Factorization was not computed - return false; + { + m_info = InvalidInput; + return; + } //Index n = m_matrix.rows(); Index nrhs = Index(b.cols()); @@ -353,7 +357,7 @@ bool PardisoImpl::_solve_impl(const MatrixBase &b, MatrixBase class PardisoLU : public PardisoImpl< PardisoLU > { protected: - typedef PardisoImpl< PardisoLU > Base; + typedef PardisoImpl Base; typedef typename Base::Scalar Scalar; typedef typename Base::RealScalar RealScalar; using Base::pardisoInit; @@ -401,6 +405,7 @@ class PardisoLU : public PardisoImpl< PardisoLU > void getMatrix(const MatrixType& matrix) { m_matrix = matrix; + m_matrix.makeCompressed(); } }; @@ -424,7 +429,6 @@ class PardisoLLT : public PardisoImpl< PardisoLLT > protected: typedef PardisoImpl< PardisoLLT > Base; typedef typename Base::Scalar Scalar; - typedef typename Base::StorageIndex StorageIndex; typedef typename Base::RealScalar RealScalar; using Base::pardisoInit; using Base::m_matrix; @@ -432,9 +436,9 @@ class PardisoLLT : public PardisoImpl< PardisoLLT > public: + typedef typename Base::StorageIndex StorageIndex; enum { UpLo = _UpLo }; using Base::compute; - using Base::solve; PardisoLLT() : Base() @@ -457,6 +461,7 @@ class PardisoLLT : public PardisoImpl< PardisoLLT > PermutationMatrix p_null; m_matrix.resize(matrix.rows(), matrix.cols()); m_matrix.template selfadjointView() = matrix.template selfadjointView().twistedBy(p_null); + m_matrix.makeCompressed(); } }; @@ -482,7 +487,6 @@ class PardisoLDLT : public PardisoImpl< PardisoLDLT > protected: typedef PardisoImpl< PardisoLDLT > Base; typedef typename Base::Scalar Scalar; - typedef typename Base::StorageIndex StorageIndex; typedef typename Base::RealScalar RealScalar; using Base::pardisoInit; using Base::m_matrix; @@ -490,8 +494,8 @@ class PardisoLDLT : public PardisoImpl< PardisoLDLT > public: + typedef typename Base::StorageIndex StorageIndex; using Base::compute; - using Base::solve; enum { UpLo = Options&(Upper|Lower) }; PardisoLDLT() @@ -513,6 +517,7 @@ class PardisoLDLT : public PardisoImpl< PardisoLDLT > PermutationMatrix p_null; m_matrix.resize(matrix.rows(), matrix.cols()); m_matrix.template selfadjointView() = matrix.template selfadjointView().twistedBy(p_null); + m_matrix.makeCompressed(); } }; diff --git a/Eigen/src/SparseCore/CompressedStorage.h b/Eigen/src/SparseCore/CompressedStorage.h index d667944ce..2199848e9 100644 --- a/Eigen/src/SparseCore/CompressedStorage.h +++ b/Eigen/src/SparseCore/CompressedStorage.h @@ -50,9 +50,9 @@ class 
CompressedStorage CompressedStorage& operator=(const CompressedStorage& other) { + resize(other.size()); if(other.size()>0) { - resize(other.size()); internal::smart_copy(other.m_values, other.m_values + m_size, m_values); internal::smart_copy(other.m_indices, other.m_indices + m_size, m_indices); } diff --git a/Eigen/src/SparseCore/SparseAssign.h b/Eigen/src/SparseCore/SparseAssign.h index 469c2b188..1ce60f385 100644 --- a/Eigen/src/SparseCore/SparseAssign.h +++ b/Eigen/src/SparseCore/SparseAssign.h @@ -16,8 +16,7 @@ template template Derived& SparseMatrixBase::operator=(const EigenBase &other) { - // TODO use the evaluator mechanism - other.derived().evalTo(derived()); + internal::call_assignment_no_alias(derived(), other.derived()); return derived(); } @@ -182,6 +181,39 @@ struct Assignment, internal::assign_op struct AssignmentKind { typedef Diagonal2Sparse Kind; }; + +template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar> +struct Assignment +{ + typedef typename DstXprType::StorageIndex StorageIndex; + typedef Array ArrayXI; + typedef Array ArrayXS; + template + static void run(SparseMatrix &dst, const SrcXprType &src, const internal::assign_op &/*func*/) + { + Index size = src.diagonal().size(); + dst.makeCompressed(); + dst.resizeNonZeros(size); + Map(dst.innerIndexPtr(), size).setLinSpaced(0,StorageIndex(size)-1); + Map(dst.outerIndexPtr(), size+1).setLinSpaced(0,StorageIndex(size)); + Map(dst.valuePtr(), size) = src.diagonal(); + } + + template + static void run(SparseMatrixBase &dst, const SrcXprType &src, const internal::assign_op &/*func*/) + { + dst.diagonal() = src.diagonal(); + } + + static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op &/*func*/) + { dst.diagonal() += src.diagonal(); } + + static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op &/*func*/) + { dst.diagonal() -= src.diagonal(); } +}; } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h index 096af7fb0..78d16892d 100644 --- a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +++ b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h @@ -390,6 +390,22 @@ SparseMatrixBase::operator+=(const SparseMatrixBase& othe return derived() = derived() + other.derived(); } +template +template +Derived& SparseMatrixBase::operator+=(const DiagonalBase& other) +{ + call_assignment_no_alias(derived(), other.derived(), internal::add_assign_op()); + return derived(); +} + +template +template +Derived& SparseMatrixBase::operator-=(const DiagonalBase& other) +{ + call_assignment_no_alias(derived(), other.derived(), internal::sub_assign_op()); + return derived(); +} + template template EIGEN_STRONG_INLINE const EIGEN_SPARSE_CWISE_PRODUCT_RETURN_TYPE diff --git a/Eigen/src/SparseCore/SparseDenseProduct.h b/Eigen/src/SparseCore/SparseDenseProduct.h index 731d40f29..b8d71d3f8 100644 --- a/Eigen/src/SparseCore/SparseDenseProduct.h +++ b/Eigen/src/SparseCore/SparseDenseProduct.h @@ -30,23 +30,48 @@ struct sparse_time_dense_product_impl::type Rhs; typedef typename internal::remove_all::type Res; typedef typename evaluator::InnerIterator LhsInnerIterator; + typedef typename evaluator::type LhsEval; static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha) { - typename evaluator::type lhsEval(lhs); + LhsEval lhsEval(lhs); + + Index n = lhs.outerSize(); +#ifdef EIGEN_HAS_OPENMP + Eigen::initParallel(); + 
Index threads = Eigen::nbThreads(); +#endif + for(Index c=0; c1 && lhs.nonZeros() > 20000) { - typename Res::Scalar tmp(0); - for(LhsInnerIterator it(lhsEval,j); it ;++it) - tmp += it.value() * rhs.coeff(it.index(),c); - res.coeffRef(j,c) += alpha * tmp; + #pragma omp parallel for schedule(static) num_threads(threads) + for(Index i=0; i struct scalar_product_traits > { diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index fb05fb4b6..f18829866 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h @@ -97,8 +97,8 @@ class SparseMatrix using Base::isCompressed; using Base::nonZeros; _EIGEN_SPARSE_PUBLIC_INTERFACE(SparseMatrix) - EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, +=) - EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, -=) + using Base::operator+=; + using Base::operator-=; typedef MappedSparseMatrix Map; typedef Diagonal DiagonalReturnType; @@ -695,6 +695,15 @@ class SparseMatrix initAssignment(other); other.evalTo(*this); } + + /** \brief Copy constructor with in-place evaluation */ + template + explicit SparseMatrix(const DiagonalBase& other) + : Base(), m_outerSize(0), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0) + { + check_template_parameters(); + *this = other.derived(); + } /** Swaps the content of two sparse matrices of the same type. * This is a fast operation that simply swaps the underlying pointers and parameters. */ diff --git a/Eigen/src/SparseCore/SparseMatrixBase.h b/Eigen/src/SparseCore/SparseMatrixBase.h index f1b5d2a97..4e720904e 100644 --- a/Eigen/src/SparseCore/SparseMatrixBase.h +++ b/Eigen/src/SparseCore/SparseMatrixBase.h @@ -242,6 +242,11 @@ template class SparseMatrixBase : public EigenBase Derived& operator+=(const SparseMatrixBase& other); template Derived& operator-=(const SparseMatrixBase& other); + + template + Derived& operator+=(const DiagonalBase& other); + template + Derived& operator-=(const DiagonalBase& other); Derived& operator*=(const Scalar& other); Derived& operator/=(const Scalar& other); @@ -367,6 +372,8 @@ template class SparseMatrixBase : public EigenBase static inline StorageIndex convert_index(const Index idx) { return internal::convert_index(idx); } + private: + template void evalTo(Dest &) const; }; } // end namespace Eigen diff --git a/Eigen/src/SparseCore/SparseSelfAdjointView.h b/Eigen/src/SparseCore/SparseSelfAdjointView.h index 3da856799..dec96efb5 100644 --- a/Eigen/src/SparseCore/SparseSelfAdjointView.h +++ b/Eigen/src/SparseCore/SparseSelfAdjointView.h @@ -45,8 +45,13 @@ template class SparseSelfAdjointView { public: - enum { Mode = _Mode }; + enum { + Mode = _Mode, + RowsAtCompileTime = internal::traits::RowsAtCompileTime, + ColsAtCompileTime = internal::traits::ColsAtCompileTime + }; + typedef EigenBase Base; typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::StorageIndex StorageIndex; typedef Matrix VectorI; @@ -116,20 +121,6 @@ template class SparseSelfAdjointView template SparseSelfAdjointView& rankUpdate(const SparseMatrixBase& u, const Scalar& alpha = Scalar(1)); - /** \internal triggered by sparse_matrix = SparseSelfadjointView; */ - template void evalTo(SparseMatrix& _dest) const - { - internal::permute_symm_to_fullsymm(m_matrix, _dest); - } - - template void evalTo(DynamicSparseMatrix& _dest) const - { - // TODO directly evaluate into _dest; - SparseMatrix tmp(_dest.rows(),_dest.cols()); - internal::permute_symm_to_fullsymm(m_matrix, tmp); - _dest = tmp; - } - /** \returns an expression of P H P^-1 
*/ // TODO implement twists in a more evaluator friendly fashion SparseSymmetricPermutationProduct<_MatrixTypeNested,Mode> twistedBy(const PermutationMatrix& perm) const @@ -140,7 +131,7 @@ template class SparseSelfAdjointView template SparseSelfAdjointView& operator=(const SparseSymmetricPermutationProduct& permutedMatrix) { - permutedMatrix.evalTo(*this); + internal::call_assignment_no_alias_no_transpose(*this, permutedMatrix); return *this; } @@ -157,11 +148,21 @@ template class SparseSelfAdjointView return *this = src.twistedBy(pnull); } + void resize(Index rows, Index cols) + { + EIGEN_ONLY_USED_FOR_DEBUG(rows); + EIGEN_ONLY_USED_FOR_DEBUG(cols); + eigen_assert(rows == this->rows() && cols == this->cols() + && "SparseSelfadjointView::resize() does not actually allow to resize."); + } + protected: typename MatrixType::Nested m_matrix; //mutable VectorI m_countPerRow; //mutable VectorI m_countPerCol; + private: + template void evalTo(Dest &) const; }; /*************************************************************************** @@ -200,6 +201,47 @@ SparseSelfAdjointView::rankUpdate(const SparseMatrixBase +// in the future selfadjoint-ness should be defined by the expression traits +// such that Transpose > is valid. (currently TriangularBase::transpose() is overloaded to make it work) +template +struct evaluator_traits > +{ + typedef typename storage_kind_to_evaluator_kind::Kind Kind; + typedef SparseSelfAdjointShape Shape; + + static const int AssumeAliasing = 0; +}; + +struct SparseSelfAdjoint2Sparse {}; + +template<> struct AssignmentKind { typedef SparseSelfAdjoint2Sparse Kind; }; +template<> struct AssignmentKind { typedef Sparse2Sparse Kind; }; + +template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar> +struct Assignment +{ + typedef typename DstXprType::StorageIndex StorageIndex; + template + static void run(SparseMatrix &dst, const SrcXprType &src, const internal::assign_op &/*func*/) + { + internal::permute_symm_to_fullsymm(src.matrix(), dst); + } + + template + static void run(DynamicSparseMatrix& dst, const SrcXprType &src, const internal::assign_op &/*func*/) + { + // TODO directly evaluate into dst; + SparseMatrix tmp(dst.rows(),dst.cols()); + internal::permute_symm_to_fullsymm(src.matrix(), tmp); + dst = tmp; + } +}; + +} // end namespace internal + /*************************************************************************** * Implementation of sparse self-adjoint time dense matrix ***************************************************************************/ @@ -252,18 +294,7 @@ inline void sparse_selfadjoint_time_dense_product(const SparseLhsType& lhs, cons res.row(j) += i.value() * rhs.row(j); } } - -// TODO currently a selfadjoint expression has the form SelfAdjointView<.,.> -// in the future selfadjoint-ness should be defined by the expression traits -// such that Transpose > is valid. 
(currently TriangularBase::transpose() is overloaded to make it work) -template -struct evaluator_traits > -{ - typedef typename storage_kind_to_evaluator_kind::Kind Kind; - typedef SparseSelfAdjointShape Shape; - - static const int AssumeAliasing = 0; -}; + template struct generic_product_impl @@ -519,12 +550,16 @@ class SparseSymmetricPermutationProduct public: typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::StorageIndex StorageIndex; + enum { + RowsAtCompileTime = internal::traits::RowsAtCompileTime, + ColsAtCompileTime = internal::traits::ColsAtCompileTime + }; protected: typedef PermutationMatrix Perm; public: typedef Matrix VectorI; typedef typename MatrixType::Nested MatrixTypeNested; - typedef typename internal::remove_all::type _MatrixTypeNested; + typedef typename internal::remove_all::type NestedExpression; SparseSymmetricPermutationProduct(const MatrixType& mat, const Perm& perm) : m_matrix(mat), m_perm(perm) @@ -532,20 +567,9 @@ class SparseSymmetricPermutationProduct inline Index rows() const { return m_matrix.rows(); } inline Index cols() const { return m_matrix.cols(); } - - template - void evalTo(SparseMatrix& _dest) const - { -// internal::permute_symm_to_fullsymm(m_matrix,_dest,m_perm.indices().data()); - SparseMatrix tmp; - internal::permute_symm_to_fullsymm(m_matrix,tmp,m_perm.indices().data()); - _dest = tmp; - } - - template void evalTo(SparseSelfAdjointView& dest) const - { - internal::permute_symm_to_symm(m_matrix,dest.matrix(),m_perm.indices().data()); - } + + const NestedExpression& matrix() const { return m_matrix; } + const Perm& perm() const { return m_perm; } protected: MatrixTypeNested m_matrix; @@ -553,6 +577,31 @@ class SparseSymmetricPermutationProduct }; +namespace internal { + +template +struct Assignment, internal::assign_op, Sparse2Sparse> +{ + typedef SparseSymmetricPermutationProduct SrcXprType; + typedef typename DstXprType::StorageIndex DstIndex; + template + static void run(SparseMatrix &dst, const SrcXprType &src, const internal::assign_op &) + { + // internal::permute_symm_to_fullsymm(m_matrix,_dest,m_perm.indices().data()); + SparseMatrix tmp; + internal::permute_symm_to_fullsymm(src.matrix(),tmp,src.perm().indices().data()); + dst = tmp; + } + + template + static void run(SparseSelfAdjointView& dst, const SrcXprType &src, const internal::assign_op &) + { + internal::permute_symm_to_symm(src.matrix(),dst.matrix(),src.perm().indices().data()); + } +}; + +} // end namespace internal + } // end namespace Eigen #endif // EIGEN_SPARSE_SELFADJOINTVIEW_H diff --git a/Eigen/src/SparseCore/SparseUtil.h b/Eigen/src/SparseCore/SparseUtil.h index 215601aa6..d53a9cb17 100644 --- a/Eigen/src/SparseCore/SparseUtil.h +++ b/Eigen/src/SparseCore/SparseUtil.h @@ -37,11 +37,7 @@ EIGEN_STRONG_INLINE Derived& operator Op(const Other& scalar) \ } #define EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATORS(Derived) \ -EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(Derived, =) \ -EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(Derived, +=) \ -EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(Derived, -=) \ -EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, *=) \ -EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, /=) +EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(Derived, =) // TODO this is mostly the same as EIGEN_GENERIC_PUBLIC_INTERFACE #define _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) \ diff --git a/Eigen/src/SparseLU/SparseLU_Memory.h b/Eigen/src/SparseLU/SparseLU_Memory.h index 1cf8bebc7..4dc42e87b 100644 --- a/Eigen/src/SparseLU/SparseLU_Memory.h +++ 
b/Eigen/src/SparseLU/SparseLU_Memory.h @@ -152,8 +152,8 @@ Index SparseLUImpl::memInit(Index m, Index n, Index annz, I { Index& num_expansions = glu.num_expansions; //No memory expansions so far num_expansions = 0; - glu.nzumax = glu.nzlumax = (std::min)(fillratio * annz / n, m) * n; // estimated number of nonzeros in U - glu.nzlmax = (std::max)(Index(4), fillratio) * annz / 4; // estimated nnz in L factor + glu.nzumax = glu.nzlumax = (std::min)(fillratio * (annz+1) / n, m) * n; // estimated number of nonzeros in U + glu.nzlmax = (std::max)(Index(4), fillratio) * (annz+1) / 4; // estimated nnz in L factor // Return the estimated size to the user if necessary Index tempSpace; tempSpace = (2*panel_size + 4 + LUNoMarker) * m * sizeof(Index) + (panel_size + 1) * m * sizeof(Scalar); diff --git a/Eigen/src/SparseQR/SparseQR.h b/Eigen/src/SparseQR/SparseQR.h index ce4a70454..548b3f9b0 100644 --- a/Eigen/src/SparseQR/SparseQR.h +++ b/Eigen/src/SparseQR/SparseQR.h @@ -23,6 +23,10 @@ namespace internal { typedef typename SparseQRType::MatrixType ReturnType; typedef typename ReturnType::StorageIndex StorageIndex; typedef typename ReturnType::StorageKind StorageKind; + enum { + RowsAtCompileTime = Dynamic, + ColsAtCompileTime = Dynamic + }; }; template struct traits > { @@ -235,8 +239,9 @@ class SparseQR : public SparseSolverBase > return m_info; } - protected: - inline void sort_matrix_Q() + + /** \internal */ + inline void _sort_matrix_Q() { if(this->m_isQSorted) return; // The matrix Q is sorted during the transposition @@ -267,7 +272,6 @@ class SparseQR : public SparseSolverBase > bool m_isEtreeOk; // whether the elimination tree match the initial input matrix template friend struct SparseQR_QProduct; - template friend struct SparseQRMatrixQReturnType; }; @@ -635,6 +639,10 @@ struct SparseQRMatrixQReturnType : public EigenBase DenseMatrix; + enum { + RowsAtCompileTime = Dynamic, + ColsAtCompileTime = Dynamic + }; explicit SparseQRMatrixQReturnType(const SparseQRType& qr) : m_qr(qr) {} template SparseQR_QProduct operator*(const MatrixBase& other) @@ -652,19 +660,6 @@ struct SparseQRMatrixQReturnType : public EigenBase(m_qr); } - template void evalTo(MatrixBase& dest) const - { - dest.derived() = m_qr.matrixQ() * Dest::Identity(m_qr.rows(), m_qr.rows()); - } - template void evalTo(SparseMatrixBase& dest) const - { - Dest idMat(m_qr.rows(), m_qr.rows()); - idMat.setIdentity(); - // Sort the sparse householder reflectors if needed - const_cast(&m_qr)->sort_matrix_Q(); - dest.derived() = SparseQR_QProduct(m_qr, idMat, false); - } - const SparseQRType& m_qr; }; @@ -680,6 +675,47 @@ struct SparseQRMatrixQTransposeReturnType const SparseQRType& m_qr; }; +namespace internal { + +template +struct evaluator_traits > +{ + typedef typename SparseQRType::MatrixType MatrixType; + typedef typename storage_kind_to_evaluator_kind::Kind Kind; + typedef SparseShape Shape; + static const int AssumeAliasing = 0; +}; + +template< typename DstXprType, typename SparseQRType> +struct Assignment, internal::assign_op, Sparse2Sparse> +{ + typedef SparseQRMatrixQReturnType SrcXprType; + typedef typename DstXprType::Scalar Scalar; + typedef typename DstXprType::StorageIndex StorageIndex; + static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &/*func*/) + { + typename DstXprType::PlainObject idMat(src.m_qr.rows(), src.m_qr.rows()); + idMat.setIdentity(); + // Sort the sparse householder reflectors if needed + const_cast(&src.m_qr)->_sort_matrix_Q(); + dst = SparseQR_QProduct(src.m_qr, idMat, false); 
+ } +}; + +template< typename DstXprType, typename SparseQRType> +struct Assignment, internal::assign_op, Sparse2Dense> +{ + typedef SparseQRMatrixQReturnType SrcXprType; + typedef typename DstXprType::Scalar Scalar; + typedef typename DstXprType::StorageIndex StorageIndex; + static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &/*func*/) + { + dst = src.m_qr.matrixQ() * DstXprType::Identity(src.m_qr.rows(), src.m_qr.rows()); + } +}; + +} // end namespace internal + } // end namespace Eigen #endif diff --git a/cmake/EigenDetermineVSServicePack.cmake b/cmake/EigenDetermineVSServicePack.cmake index a1a7348f6..fed78194d 100644 --- a/cmake/EigenDetermineVSServicePack.cmake +++ b/cmake/EigenDetermineVSServicePack.cmake @@ -26,6 +26,8 @@ function(EigenDetermineVSServicePack _pack) set(_sp "vc110sp2") elseif(${_cl_version} VERSION_EQUAL "17.00.60610.1") set(_sp "vc110sp3") + else() + set(_sp ${CMAKE_CXX_COMPILER_VERSION}) endif() endif() diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index f8aec777d..0774b5d81 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -367,7 +367,7 @@ macro(ei_get_compilerver VAR) # on all other system we rely on ${CMAKE_CXX_COMPILER} # supporting a "--version" or "/version" flag - if(WIN32 AND NOT CYGWIN) + if(WIN32 AND NOT CYGWIN AND NOT MINGW) set(EIGEN_CXX_FLAG_VERSION "/version") else() set(EIGEN_CXX_FLAG_VERSION "--version") diff --git a/doc/QuickReference.dox b/doc/QuickReference.dox index a4be0f68a..c90c9e9ce 100644 --- a/doc/QuickReference.dox +++ b/doc/QuickReference.dox @@ -13,17 +13,17 @@ The Eigen library is divided in a Core module and several additional modules. Ea - + - - - - - - - - - + + + + + + + + +
Module | Header file | Contents
\link Core_Module Core \endlink | \code#include <Eigen/Core>\endcode | Matrix and Array classes, basic linear algebra (including triangular and selfadjoint products), array manipulation
\link Core_Module Core \endlink | \code#include <Eigen/Core>\endcode | Matrix and Array classes, basic linear algebra (including triangular and selfadjoint products), array manipulation
\link Geometry_Module Geometry \endlink | \code#include <Eigen/Geometry>\endcode | Transform, Translation, Scaling, Rotation2D and 3D rotations (Quaternion, AngleAxis)
\link LU_Module LU \endlink | \code#include <Eigen/LU>\endcode | Inverse, determinant, LU decompositions with solver (FullPivLU, PartialPivLU)
\link Cholesky_Module Cholesky \endlink | \code#include <Eigen/Cholesky>\endcode | LLT and LDLT Cholesky factorization with solver
\link Householder_Module Householder \endlink | \code#include <Eigen/Householder>\endcode | Householder transformations; this module is used by several linear algebra modules
\link SVD_Module SVD \endlink | \code#include <Eigen/SVD>\endcode | SVD decomposition with least-squares solver (JacobiSVD)
\link QR_Module QR \endlink | \code#include <Eigen/QR>\endcode | QR decomposition with solver (HouseholderQR, ColPivHouseholderQR, FullPivHouseholderQR)
\link Eigenvalues_Module Eigenvalues \endlink | \code#include <Eigen/Eigenvalues>\endcode | Eigenvalue, eigenvector decompositions (EigenSolver, SelfAdjointEigenSolver, ComplexEigenSolver)
\link Sparse_modules Sparse \endlink | \code#include <Eigen/Sparse>\endcode | %Sparse matrix storage and related basic linear algebra (SparseMatrix, DynamicSparseMatrix, SparseVector)
\code#include <Eigen/Dense>\endcode | Includes Core, Geometry, LU, Cholesky, SVD, QR, and Eigenvalues header files
\code#include <Eigen/Eigen>\endcode | Includes %Dense and %Sparse header files (the whole Eigen library)
\link LU_Module LU \endlink | \code#include <Eigen/LU>\endcode | Inverse, determinant, LU decompositions with solver (FullPivLU, PartialPivLU)
\link Cholesky_Module Cholesky \endlink | \code#include <Eigen/Cholesky>\endcode | LLT and LDLT Cholesky factorization with solver
\link Householder_Module Householder \endlink | \code#include <Eigen/Householder>\endcode | Householder transformations; this module is used by several linear algebra modules
\link SVD_Module SVD \endlink | \code#include <Eigen/SVD>\endcode | SVD decompositions with least-squares solver (JacobiSVD, BDCSVD)
\link QR_Module QR \endlink | \code#include <Eigen/QR>\endcode | QR decomposition with solver (HouseholderQR, ColPivHouseholderQR, FullPivHouseholderQR)
\link Eigenvalues_Module Eigenvalues \endlink | \code#include <Eigen/Eigenvalues>\endcode | Eigenvalue, eigenvector decompositions (EigenSolver, SelfAdjointEigenSolver, ComplexEigenSolver)
\link Sparse_modules Sparse \endlink | \code#include <Eigen/Sparse>\endcode | %Sparse matrix storage and related basic linear algebra (SparseMatrix, SparseVector) \n (see \ref SparseQuickRefPage for details on sparse modules)
\code#include <Eigen/Dense>\endcode | Includes Core, Geometry, LU, Cholesky, SVD, QR, and Eigenvalues header files
\code#include <Eigen/Eigen>\endcode | Includes %Dense and %Sparse header files (the whole Eigen library)
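As a minimal illustration of the table above (a sketch using only the headers listed there, not tied to the changes in this patch), the following program pulls in just the Core and Cholesky modules and solves a small symmetric positive definite system:
\code
#include <Eigen/Core>
#include <Eigen/Cholesky>   // brings in LLT/LDLT on top of the Core module
#include <iostream>

int main()
{
  Eigen::Matrix3d A;
  A << 4, 1, 0,
       1, 3, 1,
       0, 1, 2;                            // small symmetric positive definite matrix
  Eigen::Vector3d b(1, 2, 3);
  Eigen::Vector3d x = A.ldlt().solve(b);   // LDLT is provided by the Cholesky module
  std::cout << x.transpose() << std::endl;
}
\endcode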
top @@ -364,32 +364,10 @@ vec3 = vec1.cross(vec2);\endcode top \section QuickRef_Coeffwise Coefficient-wise \& Array operators -Coefficient-wise operators for matrices and vectors: - - - -
Matrix API \matrixworld | Via Array conversions
\code -mat1.cwiseMin(mat2) -mat1.cwiseMax(mat2) -mat1.cwiseAbs2() -mat1.cwiseAbs() -mat1.cwiseSqrt() -mat1.cwiseProduct(mat2) -mat1.cwiseQuotient(mat2)\endcode -\code -mat1.array().min(mat2.array()) -mat1.array().max(mat2.array()) -mat1.array().abs2() -mat1.array().abs() -mat1.array().sqrt() -mat1.array() * mat2.array() -mat1.array() / mat2.array() -\endcode
-It is also very simple to apply any user defined function \c foo using DenseBase::unaryExpr together with std::ptr_fun: -\code mat1.unaryExpr(std::ptr_fun(foo))\endcode - -Array operators:\arrayworld +In addition to the aforementioned operators, Eigen supports numerous coefficient-wise operators and functions. +Most of them unambiguously make sense in array-world\arrayworld. The following operators are readily available for arrays, +or available through .array() for vectors and matrices: -
Arithmetic operators\code @@ -400,28 +378,107 @@ array1 + scalar array1 - scalar array1 += scalar array1 -= scalar array1 < array2 array1 > array2 array1 < scalar array1 > scalar array1 <= array2 array1 >= array2 array1 <= scalar array1 >= scalar array1 == array2 array1 != array2 array1 == scalar array1 != scalar +array1.min(array2) array1.max(array2) array1.min(scalar) array1.max(scalar) \endcode
Trigo, power, and \n misc functions \n and the STL variants\code -array1.min(array2) -array1.max(array2) +
Trigo, power, and \n misc functions \n and the STL-like variants\code array1.abs2() array1.abs() abs(array1) array1.sqrt() sqrt(array1) array1.log() log(array1) +array1.log10() log10(array1) array1.exp() exp(array1) -array1.pow(exponent) pow(array1,exponent) +array1.pow(array2) pow(array1,array2) +array1.pow(scalar) pow(array1,scalar) array1.square() array1.cube() array1.inverse() + array1.sin() sin(array1) array1.cos() cos(array1) array1.tan() tan(array1) array1.asin() asin(array1) array1.acos() acos(array1) +array1.atan() atan(array1) +array1.sinh() sinh(array1) +array1.cosh() cosh(array1) +array1.tanh() tanh(array1) +array1.arg() arg(array1) + +array1.floor() floor(array1) +array1.ceil() ceil(array1) +array1.round() round(array1) + +array1.isFinite() isfinite(array1) +array1.isInf() isinf(array1) +array1.isNaN() isnan(array1) \endcode
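To make the list above concrete, here is a small self-contained sketch (plain Eigen API, independent of the changes in this patch) exercising a few of these coefficient-wise functions on a one-dimensional array:
\code
#include <Eigen/Core>
#include <iostream>

int main()
{
  Eigen::ArrayXd array1 = Eigen::ArrayXd::LinSpaced(5, 1.0, 5.0);   // 1 2 3 4 5
  Eigen::ArrayXd array2 = array1.sqrt() + array1.square();          // coefficient-wise sqrt and square
  std::cout << array1.exp().transpose() << "\n"                     // coefficient-wise exponential
            << array2.min(10.0).transpose() << std::endl;           // coefficient-wise min against a scalar
}
\endcode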
+ +The following coefficient-wise operators are available for all kinds of expressions (matrices, vectors, and arrays), and for both real and complex scalar types: + + + +
Eigen's API | STL-like APIs\arrayworld | Comments
\code +mat1.real() +mat1.imag() +mat1.conjugate() +\endcode +\code +real(array1) +imag(array1) +conj(array1) +\endcode + +\code + // read-write, no-op for real expressions + // read-only for real, read-write for complexes + // no-op for real expressions +\endcode +
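For instance, on a complex matrix these accessors can be used as in the following sketch (standard Eigen API, shown here only for illustration):
\code
#include <Eigen/Core>
#include <iostream>
#include <complex>

int main()
{
  Eigen::Matrix2cd m;
  m << std::complex<double>(1, 2), std::complex<double>(0, -1),
       std::complex<double>(3, 0), std::complex<double>(2, 5);
  std::cout << m.real()      << "\n\n"   // real parts (read-write for complex expressions)
            << m.imag()      << "\n\n"   // imaginary parts
            << m.conjugate() << std::endl;
}
\endcode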
+ +Some coefficient-wise operators are readily available for matrices and vectors through the following cwise* methods: + + + +
Matrix API \matrixworld | Via Array conversions
\code +mat1.cwiseMin(mat2) mat1.cwiseMin(scalar) +mat1.cwiseMax(mat2) mat1.cwiseMax(scalar) +mat1.cwiseAbs2() +mat1.cwiseAbs() +mat1.cwiseSqrt() +mat1.cwiseInverse() +mat1.cwiseProduct(mat2) +mat1.cwiseQuotient(mat2) +mat1.cwiseEqual(mat2) mat1.cwiseEqual(scalar) +mat1.cwiseNotEqual(mat2) +\endcode +\code +mat1.array().min(mat2.array()) mat1.array().min(scalar) +mat1.array().max(mat2.array()) mat1.array().max(scalar) +mat1.array().abs2() +mat1.array().abs() +mat1.array().sqrt() +mat1.array().inverse() +mat1.array() * mat2.array() +mat1.array() / mat2.array() +mat1.array() == mat2.array() mat1.array() == scalar +mat1.array() != mat2.array() +\endcode
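The equivalence spelled out in the next paragraph can be checked with a short sketch (ordinary Eigen API, not tied to this patch): both forms below produce the same coefficients, the cwise* call staying in the matrix world and the .array() version going through the array world:
\code
#include <Eigen/Core>
#include <iostream>

int main()
{
  Eigen::Matrix2d mat1, mat2;
  mat1 << 1, 2, 3, 4;
  mat2 << 5, 6, 7, 8;

  Eigen::Matrix2d p1 = mat1.cwiseProduct(mat2);                // matrix-world expression
  Eigen::Matrix2d p2 = (mat1.array() * mat2.array()).matrix(); // array-world expression, converted back
  std::cout << (p1 - p2).norm() << std::endl;                  // prints 0: identical coefficients
}
\endcode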
+The main difference between the two APIs is that the one based on cwise* methods returns an expression in the matrix world, +while the second one (based on .array()) returns an array expression. +Recall that .array() has no cost; it only changes the available API and interpretation of the data. + +It is also very simple to apply any user defined function \c foo using DenseBase::unaryExpr together with std::ptr_fun (c++03), std::ref (c++11), or lambdas (c++11): +\code +mat1.unaryExpr(std::ptr_fun(foo)); +mat1.unaryExpr(std::ref(foo)); +mat1.unaryExpr([](double x) { return foo(x); }); +\endcode + + top \section QuickRef_Reductions Reductions diff --git a/doc/SparseLinearSystems.dox b/doc/SparseLinearSystems.dox index 13741280a..48c18f46f 100644 --- a/doc/SparseLinearSystems.dox +++ b/doc/SparseLinearSystems.dox @@ -21,7 +21,7 @@ They are summarized in the following table: ConjugateGradient\link IterativeLinearSolvers_Module IterativeLinearSolvers \endlinkClassic iterative CGSPDPreconditionning built-in, MPL2 Recommended for large symmetric problems (e.g., 3D Poisson eq.) -LSCG\link IterativeLinearSolvers_Module IterativeLinearSolvers \endlinkCG for rectangular least-square problemRectangularPreconditionning +LeastSquaresConjugateGradient\link IterativeLinearSolvers_Module IterativeLinearSolvers \endlinkCG for rectangular least-square problemRectangularPreconditioning built-in, MPL2 Solve for min |A'Ax-b|^2 without forming A'A BiCGSTAB\link IterativeLinearSolvers_Module IterativeLinearSolvers \endlinkIterative stabilized bi-conjugate gradientSquarePreconditionning diff --git a/doc/TopicMultithreading.dox b/doc/TopicMultithreading.dox index e22e1c613..66028d7a8 100644 --- a/doc/TopicMultithreading.dox +++ b/doc/TopicMultithreading.dox @@ -22,8 +22,12 @@ n = Eigen::nbThreads( ); You can disable Eigen's multi threading at compile time by defining the EIGEN_DONT_PARALLELIZE preprocessor token. Currently, the following algorithms can make use of multi-threading: - * general matrix - matrix products - * PartialPivLU + - general dense matrix - matrix products + - PartialPivLU + - row-major-sparse * dense vector/matrix products + - ConjugateGradient with \c Lower|Upper as the \c UpLo template parameter. + - BiCGSTAB with a row-major sparse matrix format.
+ - LeastSquaresConjugateGradient \section TopicMultiThreading_UsingEigenWithMT Using Eigen in a multi-threaded application diff --git a/test/array.cpp b/test/array.cpp index b0210c682..8cd4fd888 100644 --- a/test/array.cpp +++ b/test/array.cpp @@ -124,8 +124,11 @@ template void comparisons(const ArrayType& m) c = internal::random(0, cols-1); ArrayType m1 = ArrayType::Random(rows, cols), - m2 = ArrayType::Random(rows, cols), - m3(rows, cols); + m2 = ArrayType::Random(rows, cols), + m3(rows, cols), + m4 = m1; + + m4 = (m4.abs()==Scalar(0)).select(1,m4); VERIFY(((m1 + Scalar(1)) > m1).all()); VERIFY(((m1 - Scalar(1)) < m1).all()); @@ -197,7 +200,10 @@ template void array_real(const ArrayType& m) ArrayType m1 = ArrayType::Random(rows, cols), m2 = ArrayType::Random(rows, cols), - m3(rows, cols); + m3(rows, cols), + m4 = m1; + + m4 = (m4.abs()==Scalar(0)).select(1,m4); Scalar s1 = internal::random(); @@ -215,9 +221,9 @@ template void array_real(const ArrayType& m) VERIFY_IS_APPROX(m1.round(), round(m1)); VERIFY_IS_APPROX(m1.floor(), floor(m1)); VERIFY_IS_APPROX(m1.ceil(), ceil(m1)); - VERIFY((m1.isNaN() == isnan(m1)).all()); - VERIFY((m1.isInf() == isinf(m1)).all()); - VERIFY((m1.isFinite() == isfinite(m1)).all()); + VERIFY((m1.isNaN() == Eigen::isnan(m1)).all()); + VERIFY((m1.isInf() == Eigen::isinf(m1)).all()); + VERIFY((m1.isFinite() == Eigen::isfinite(m1)).all()); VERIFY_IS_APPROX(m1.inverse(), inverse(m1)); VERIFY_IS_APPROX(m1.abs(), abs(m1)); VERIFY_IS_APPROX(m1.abs2(), abs2(m1)); @@ -243,9 +249,9 @@ template void array_real(const ArrayType& m) VERIFY_IS_APPROX(tanh(m1), (0.5*(exp(m1)-exp(-m1)))/(0.5*(exp(m1)+exp(-m1)))); VERIFY_IS_APPROX(arg(m1), ((ArrayType)(m1<0))*std::acos(-1.0)); VERIFY((round(m1) <= ceil(m1) && round(m1) >= floor(m1)).all()); - VERIFY(isnan(m1*0.0/0.0).all()); - VERIFY(isinf(m1/0.0).all()); - VERIFY((isfinite(m1) && !isfinite(m1*0.0/0.0) && !isfinite(m1/0.0)).all()); + VERIFY(Eigen::isnan((m1*0.0)/0.0).all()); + VERIFY(Eigen::isinf(m4/0.0).all()); + VERIFY((Eigen::isfinite(m1) && (!Eigen::isfinite(m1*0.0/0.0)) && (!Eigen::isfinite(m4/0.0))).all()); VERIFY_IS_APPROX(inverse(inverse(m1)),m1); VERIFY((abs(m1) == m1 || abs(m1) == -m1).all()); VERIFY_IS_APPROX(m3, sqrt(abs2(m1))); @@ -299,7 +305,11 @@ template void array_complex(const ArrayType& m) Index cols = m.cols(); ArrayType m1 = ArrayType::Random(rows, cols), - m2(rows, cols); + m2(rows, cols), + m4 = m1; + + m4.real() = (m4.real().abs()==RealScalar(0)).select(RealScalar(1),m4.real()); + m4.imag() = (m4.imag().abs()==RealScalar(0)).select(RealScalar(1),m4.imag()); Array m3(rows, cols); @@ -317,9 +327,9 @@ template void array_complex(const ArrayType& m) VERIFY_IS_APPROX(m1.cosh(), cosh(m1)); VERIFY_IS_APPROX(m1.tanh(), tanh(m1)); VERIFY_IS_APPROX(m1.arg(), arg(m1)); - VERIFY((m1.isNaN() == isnan(m1)).all()); - VERIFY((m1.isInf() == isinf(m1)).all()); - VERIFY((m1.isFinite() == isfinite(m1)).all()); + VERIFY((m1.isNaN() == Eigen::isnan(m1)).all()); + VERIFY((m1.isInf() == Eigen::isinf(m1)).all()); + VERIFY((m1.isFinite() == Eigen::isfinite(m1)).all()); VERIFY_IS_APPROX(m1.inverse(), inverse(m1)); VERIFY_IS_APPROX(m1.log(), log(m1)); VERIFY_IS_APPROX(m1.log10(), log10(m1)); @@ -345,20 +355,20 @@ template void array_complex(const ArrayType& m) VERIFY_IS_APPROX(arg(m1), m3); std::complex zero(0.0,0.0); - VERIFY(isnan(m1*zero/zero).all()); + VERIFY(Eigen::isnan(m1*zero/zero).all()); #if EIGEN_COMP_CLANG // clang's complex division is notoriously broken - if(numext::isinf(m1(0,0)/Scalar(0))) { + 
if(numext::isinf(m4(0,0)/RealScalar(0))) { #endif - VERIFY(isinf(m1/zero).all()); + VERIFY(Eigen::isinf(m4/zero).all()); #if EIGEN_COMP_CLANG } else { - VERIFY(isinf(m1.real()/zero.real()).all()); + VERIFY(Eigen::isinf(m4.real()/zero.real()).all()); } #endif - VERIFY((isfinite(m1) && !isfinite(m1*zero/zero) && !isfinite(m1/zero)).all()); + VERIFY((Eigen::isfinite(m1) && (!Eigen::isfinite(m1*zero/zero)) && (!Eigen::isfinite(m1/zero))).all()); VERIFY_IS_APPROX(inverse(inverse(m1)),m1); VERIFY_IS_APPROX(conj(m1.conjugate()), m1); diff --git a/test/basicstuff.cpp b/test/basicstuff.cpp index 3d7d74d4e..c6bbf19ee 100644 --- a/test/basicstuff.cpp +++ b/test/basicstuff.cpp @@ -223,7 +223,7 @@ void fixedSizeMatrixConstruction() for(int k=0; k<2; ++k) VERIFY(m2(k) == DenseIndex(raw[k])); for(int k=0; k<2; ++k) VERIFY(a2(k) == DenseIndex(raw[k])); for(int k=0; k<2; ++k) VERIFY(m3(k) == int(raw[k])); - for(int k=0; k<2; ++k) VERIFY(m4(k) == float(raw[k])); + for(int k=0; k<2; ++k) VERIFY((m4(k)) == Scalar(float(raw[k]))); } { Matrix m(raw), m1(raw[0]), m2( (DenseIndex(raw[0])) ), m3( (int(raw[0])) ); diff --git a/test/bdcsvd.cpp b/test/bdcsvd.cpp index 52a02b697..f9f687aac 100644 --- a/test/bdcsvd.cpp +++ b/test/bdcsvd.cpp @@ -69,8 +69,8 @@ void test_bdcsvd() CALL_SUBTEST_7(( svd_verify_assert >(MatrixXf(10,12)) )); CALL_SUBTEST_8(( svd_verify_assert >(MatrixXcd(7,5)) )); - CALL_SUBTEST_1(( svd_all_trivial_2x2(bdcsvd) )); - CALL_SUBTEST_1(( svd_all_trivial_2x2(bdcsvd) )); + CALL_SUBTEST_101(( svd_all_trivial_2x2(bdcsvd) )); + CALL_SUBTEST_102(( svd_all_trivial_2x2(bdcsvd) )); for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_3(( bdcsvd() )); @@ -104,8 +104,8 @@ void test_bdcsvd() CALL_SUBTEST_7( BDCSVD(10,10) ); // Check that preallocation avoids subsequent mallocs - CALL_SUBTEST_9( svd_preallocate() ); + CALL_SUBTEST_9( svd_preallocate() ); - CALL_SUBTEST_2( svd_underoverflow() ); + CALL_SUBTEST_2( svd_underoverflow() ); } diff --git a/test/jacobisvd.cpp b/test/jacobisvd.cpp index f9de6b708..3d8d0203d 100644 --- a/test/jacobisvd.cpp +++ b/test/jacobisvd.cpp @@ -114,7 +114,7 @@ void test_jacobisvd() CALL_SUBTEST_7( JacobiSVD(10,10) ); // Check that preallocation avoids subsequent mallocs - CALL_SUBTEST_9( svd_preallocate() ); + CALL_SUBTEST_9( svd_preallocate() ); - CALL_SUBTEST_2( svd_underoverflow() ); + CALL_SUBTEST_2( svd_underoverflow() ); } diff --git a/test/linearstructure.cpp b/test/linearstructure.cpp index 8e3cc9a86..3c7cdbe41 100644 --- a/test/linearstructure.cpp +++ b/test/linearstructure.cpp @@ -88,6 +88,10 @@ template void real_complex(DenseIndex rows = MatrixType::Ro g_called = false; VERIFY_IS_APPROX(m1*s, m1*Scalar(s)); VERIFY(g_called && "matrix * real not properly optimized"); + + g_called = false; + VERIFY_IS_APPROX(m1/s, m1/Scalar(s)); + VERIFY(g_called && "matrix / real not properly optimized"); } void test_linearstructure() diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 2f1accd1b..74b4d4279 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -315,9 +315,29 @@ template void packetmath_real() CHECK_CWISE1_IF(internal::packet_traits::HasExp, std::exp, internal::pexp); { data1[0] = std::numeric_limits::quiet_NaN(); + data1[1] = std::numeric_limits::epsilon(); packet_helper::HasExp,Packet> h; - h.store(data2, internal::pexp(h.load(data1))); + h.store(data2, internal::pexp(h.load(data1))); VERIFY(numext::isnan(data2[0])); + VERIFY_IS_EQUAL(std::exp(std::numeric_limits::epsilon()), data2[1]); + + data1[0] = -std::numeric_limits::epsilon(); + data1[1] = 0; + 
h.store(data2, internal::pexp(h.load(data1))); + VERIFY_IS_EQUAL(std::exp(-std::numeric_limits::epsilon()), data2[0]); + VERIFY_IS_EQUAL(std::exp(0), data2[1]); + + data1[0] = (std::numeric_limits::min)(); + data1[1] = -(std::numeric_limits::min)(); + h.store(data2, internal::pexp(h.load(data1))); + VERIFY_IS_EQUAL(std::exp((std::numeric_limits::min)()), data2[0]); + VERIFY_IS_EQUAL(std::exp(-(std::numeric_limits::min)()), data2[1]); + + data1[0] = std::numeric_limits::denorm_min(); + data1[1] = -std::numeric_limits::denorm_min(); + h.store(data2, internal::pexp(h.load(data1))); + VERIFY_IS_EQUAL(std::exp(std::numeric_limits::denorm_min()), data2[0]); + VERIFY_IS_EQUAL(std::exp(-std::numeric_limits::denorm_min()), data2[1]); } for (int i=0; i void packetmath_real() CHECK_CWISE1_IF(internal::packet_traits::HasLog, std::log, internal::plog); { data1[0] = std::numeric_limits::quiet_NaN(); + data1[1] = std::numeric_limits::epsilon(); packet_helper::HasLog,Packet> h; h.store(data2, internal::plog(h.load(data1))); - VERIFY(numext::isnan(data2[0])); + VERIFY(std::isnan(data2[0])); + // VERIFY_IS_EQUAL(std::log(std::numeric_limits::epsilon()), data2[1]); + + data1[0] = -std::numeric_limits::epsilon(); + data1[1] = 0; + h.store(data2, internal::plog(h.load(data1))); + VERIFY(std::isnan(data2[0])); + // VERIFY_IS_EQUAL(std::log(0), data2[1]); + + data1[0] = (std::numeric_limits::min)(); + data1[1] = -(std::numeric_limits::min)(); + h.store(data2, internal::plog(h.load(data1))); + VERIFY_IS_EQUAL(std::log((std::numeric_limits::min)()), data2[0]); + // VERIFY(std::isnan(data2[1])); + + data1[0] = std::numeric_limits::denorm_min(); + data1[1] = -std::numeric_limits::denorm_min(); + h.store(data2, internal::plog(h.load(data1))); + // VERIFY_IS_EQUAL(std::log(std::numeric_limits::denorm_min()), data2[0]); + // VERIFY(std::isnan(data2[1])); + data1[0] = -1.0f; h.store(data2, internal::plog(h.load(data1))); - VERIFY(numext::isnan(data2[0])); + VERIFY(std::isnan(data2[0])); #if !EIGEN_FAST_MATH h.store(data2, internal::psqrt(h.load(data1))); VERIFY(numext::isnan(data2[0])); diff --git a/test/qr_colpivoting.cpp b/test/qr_colpivoting.cpp index eb3feac01..e7abd3725 100644 --- a/test/qr_colpivoting.cpp +++ b/test/qr_colpivoting.cpp @@ -23,8 +23,8 @@ template void qr() MatrixType m1; createRandomPIMatrixOfRank(rank,rows,cols,m1); ColPivHouseholderQR qr(m1); - VERIFY(rank == qr.rank()); - VERIFY(cols - qr.rank() == qr.dimensionOfKernel()); + VERIFY_IS_EQUAL(rank, qr.rank()); + VERIFY_IS_EQUAL(cols - qr.rank(), qr.dimensionOfKernel()); VERIFY(!qr.isInjective()); VERIFY(!qr.isInvertible()); VERIFY(!qr.isSurjective()); @@ -51,11 +51,11 @@ template void qr_fixedsize() Matrix m1; createRandomPIMatrixOfRank(rank,Rows,Cols,m1); ColPivHouseholderQR > qr(m1); - VERIFY(rank == qr.rank()); - VERIFY(Cols - qr.rank() == qr.dimensionOfKernel()); - VERIFY(qr.isInjective() == (rank == Rows)); - VERIFY(qr.isSurjective() == (rank == Cols)); - VERIFY(qr.isInvertible() == (qr.isInjective() && qr.isSurjective())); + VERIFY_IS_EQUAL(rank, qr.rank()); + VERIFY_IS_EQUAL(Cols - qr.rank(), qr.dimensionOfKernel()); + VERIFY_IS_EQUAL(qr.isInjective(), (rank == Rows)); + VERIFY_IS_EQUAL(qr.isSurjective(), (rank == Cols)); + VERIFY_IS_EQUAL(qr.isInvertible(), (qr.isInjective() && qr.isSurjective())); Matrix r = qr.matrixQR().template triangularView(); Matrix c = qr.householderQ() * r * qr.colsPermutation().inverse(); diff --git a/test/qr_fullpivoting.cpp b/test/qr_fullpivoting.cpp index 601773404..d82e123d0 100644 --- 
a/test/qr_fullpivoting.cpp +++ b/test/qr_fullpivoting.cpp @@ -23,8 +23,8 @@ template void qr() MatrixType m1; createRandomPIMatrixOfRank(rank,rows,cols,m1); FullPivHouseholderQR qr(m1); - VERIFY(rank == qr.rank()); - VERIFY(cols - qr.rank() == qr.dimensionOfKernel()); + VERIFY_IS_EQUAL(rank, qr.rank()); + VERIFY_IS_EQUAL(cols - qr.rank(), qr.dimensionOfKernel()); VERIFY(!qr.isInjective()); VERIFY(!qr.isInvertible()); VERIFY(!qr.isSurjective()); diff --git a/test/ref.cpp b/test/ref.cpp index 18a89f4e9..1341dfef7 100644 --- a/test/ref.cpp +++ b/test/ref.cpp @@ -221,6 +221,12 @@ int test_ref_overload_fun1(Ref ) { return 3; } int test_ref_overload_fun2(Ref ) { return 4; } int test_ref_overload_fun2(Ref ) { return 5; } +void test_ref_ambiguous(const Ref &A, Ref B) +{ + B = A; + B = A - A; +} + // See also bug 969 void test_ref_overloads() { @@ -233,6 +239,9 @@ void test_ref_overloads() VERIFY( test_ref_overload_fun2(Ad)==4 ); VERIFY( test_ref_overload_fun2(Ad+Bd)==4 ); VERIFY( test_ref_overload_fun2(Af+Bf)==5 ); + + ArrayXd A, B; + test_ref_ambiguous(A, B); } void test_ref() diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp index 75f29a2b4..492b3a4f2 100644 --- a/test/sparse_basic.cpp +++ b/test/sparse_basic.cpp @@ -365,6 +365,20 @@ template void sparse_basic(const SparseMatrixType& re VERIFY_IS_APPROX(m2, refMat2); } + // test diagonal to sparse + { + DenseVector d = DenseVector::Random(rows); + DenseMatrix refMat2 = d.asDiagonal(); + SparseMatrixType m2(rows, rows); + m2 = d.asDiagonal(); + VERIFY_IS_APPROX(m2, refMat2); + SparseMatrixType m3(d.asDiagonal()); + VERIFY_IS_APPROX(m3, refMat2); + refMat2 += d.asDiagonal(); + m2 += d.asDiagonal(); + VERIFY_IS_APPROX(m2, refMat2); + } + // test conservative resize { std::vector< std::pair > inc; diff --git a/test/sparse_solver.h b/test/sparse_solver.h index 1d3d25b53..64f6ac4fe 100644 --- a/test/sparse_solver.h +++ b/test/sparse_solver.h @@ -272,6 +272,7 @@ template void check_sparse_spd_solving(Solver& solver, int maxS DenseVector b = it.rhs(); DenseVector refX = it.refX(); PermutationMatrix pnull; + halfA.resize(A.rows(), A.cols()); if(Solver::UpLo == (Lower|Upper)) halfA = A; else diff --git a/test/sparseqr.cpp b/test/sparseqr.cpp index 8e6887dd3..50d1fcdf2 100644 --- a/test/sparseqr.cpp +++ b/test/sparseqr.cpp @@ -89,6 +89,11 @@ template void test_sparseqr_scalar() QtQ = Q * Q.adjoint(); idM.resize(Q.rows(), Q.rows()); idM.setIdentity(); VERIFY(idM.isApprox(QtQ)); + + // Q to dense + DenseMat dQ; + dQ = solver.matrixQ(); + VERIFY_IS_APPROX(Q, dQ); } void test_sparseqr() { diff --git a/test/svd_common.h b/test/svd_common.h index b06a8a0f2..d8611b541 100644 --- a/test/svd_common.h +++ b/test/svd_common.h @@ -33,6 +33,7 @@ void svd_check_full(const MatrixType& m, const SvdType& svd) }; typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::RealScalar RealScalar; typedef Matrix MatrixUType; typedef Matrix MatrixVType; @@ -40,7 +41,10 @@ void svd_check_full(const MatrixType& m, const SvdType& svd) sigma.diagonal() = svd.singularValues().template cast(); MatrixUType u = svd.matrixU(); MatrixVType v = svd.matrixV(); - VERIFY_IS_APPROX(m, u * sigma * v.adjoint()); + RealScalar scaling = m.cwiseAbs().maxCoeff(); + if(scaling<=(std::numeric_limits::min)()) + scaling = RealScalar(1); + VERIFY_IS_APPROX(m/scaling, u * (sigma/scaling) * v.adjoint()); VERIFY_IS_UNITARY(u); VERIFY_IS_UNITARY(v); } @@ -307,6 +311,7 @@ void svd_inf_nan() // Regression test for bug 286: JacobiSVD loops indefinitely with some // matrices containing 
denormal numbers. +template void svd_underoverflow() { #if defined __INTEL_COMPILER @@ -384,6 +389,7 @@ void svd_all_trivial_2x2( void (*cb)(const MatrixType&,bool) ) } while((id void svd_preallocate() { Vector3f v(3.f, 2.f, 1.f); diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 54d3cc18b..3f9aa1026 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -59,8 +59,10 @@ #include "Eigen/Core" #include "unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h" @@ -80,6 +82,7 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h" @@ -88,6 +91,7 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h" diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h index 77b92ee9b..2244e40c2 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h @@ -23,7 +23,7 @@ template class array { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { return values[index]; } - static const std::size_t size = n; + static const std::size_t size() { return n; } T values[n]; diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index 4dbdbfb3e..32936d4be 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -375,6 +375,28 @@ class Tensor : public TensorBase + EIGEN_DEVICE_FUNC + void resize(const Sizes& dimensions) { + array dims; + for (std::size_t i = 0; i < NumIndices; ++i) { + dims[i] = static_cast(dimensions[i]); + } + resize(dims); + } +#else + template + EIGEN_DEVICE_FUNC + void resize(const Sizes& dimensions) { + array dims; + for (std::size_t i = 0; i < NumIndices; ++i) { + dims[i] = static_cast(dimensions[i]); + } + resize(dims); + } +#endif + protected: bool checkIndexRange(const array& indices) const diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index c7cfbfce0..1c31b9d28 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -108,6 +108,12 @@ class TensorBase return unaryExpr(internal::scalar_inverse_op()); } + 
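  // A minimal usage sketch for the coefficient-wise tanh() added below (an
  // illustration only, assuming a rank-2 float tensor t):
  //   Eigen::Tensor<float, 2> t(2, 3);
  //   t.setRandom();
  //   Eigen::Tensor<float, 2> u = t.tanh();  // hyperbolic tangent applied to every coefficient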
EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + tanh() const { + return unaryExpr(internal::scalar_tanh_op()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> exp() const { @@ -295,11 +301,10 @@ class TensorBase return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::SumReducer()); } - const TensorReductionOp, const array, const Derived> + const TensorReductionOp, const DimensionList, const Derived> sum() const { - array in_dims; - for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i; - return TensorReductionOp, const array, const Derived>(derived(), in_dims, internal::SumReducer()); + DimensionList in_dims; + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::SumReducer()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -308,11 +313,10 @@ class TensorBase return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MeanReducer()); } - const TensorReductionOp, const array, const Derived> + const TensorReductionOp, const DimensionList, const Derived> mean() const { - array in_dims; - for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i; - return TensorReductionOp, const array, const Derived>(derived(), in_dims, internal::MeanReducer()); + DimensionList in_dims; + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MeanReducer()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -321,11 +325,10 @@ class TensorBase return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::ProdReducer()); } - const TensorReductionOp, const array, const Derived> + const TensorReductionOp, const DimensionList, const Derived> prod() const { - array in_dims; - for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i; - return TensorReductionOp, const array, const Derived>(derived(), in_dims, internal::ProdReducer()); + DimensionList in_dims; + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::ProdReducer()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -334,11 +337,10 @@ class TensorBase return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MaxReducer()); } - const TensorReductionOp, const array, const Derived> + const TensorReductionOp, const DimensionList, const Derived> maximum() const { - array in_dims; - for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i; - return TensorReductionOp, const array, const Derived>(derived(), in_dims, internal::MaxReducer()); + DimensionList in_dims; + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MaxReducer()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -347,11 +349,10 @@ class TensorBase return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MinReducer()); } - const TensorReductionOp, const array, const Derived> + const TensorReductionOp, const DimensionList, const Derived> minimum() const { - array in_dims; - for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i; - return TensorReductionOp, const array, const Derived>(derived(), in_dims, internal::MinReducer()); + DimensionList in_dims; + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MinReducer()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -413,6 +414,26 @@ class TensorBase padding_type); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorVolumePatchOp + extract_volume_patches(const 
Index patch_planes, const Index patch_rows, const Index patch_cols, + const Index plane_stride = 1, const Index row_stride = 1, const Index col_stride = 1, + const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = 0) const { + return TensorVolumePatchOp(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, 1, 1, 1, padding_type, padding_value); + } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorVolumePatchOp + extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols, + const Index plane_stride, const Index row_stride, const Index col_stride, + const Index plane_inflate_stride, const Index row_inflate_stride, const Index col_inflate_stride, + const Index padding_top_z, const Index padding_bottom_z, + const Index padding_top, const Index padding_bottom, + const Index padding_left, const Index padding_right, const Scalar padding_value = 0) const { + return TensorVolumePatchOp(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, plane_inflate_stride, row_inflate_stride, col_inflate_stride, padding_top_z, padding_bottom_z, padding_top, padding_bottom, padding_left, padding_right, padding_value); + } + // Morphing operators. EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorLayoutSwapOp @@ -460,6 +481,18 @@ class TensorBase return TensorStridingOp(derived(), strides); } + // Added support for custom unary and binary operations + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCustomUnaryOp customOp(const CustomUnaryFunc& op) const { + return TensorCustomUnaryOp(derived(), op); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCustomBinaryOp customOp(const OtherDerived& other, const CustomBinaryFunc& op) const { + return TensorCustomBinaryOp(derived(), other, op); + } + // Force the evaluation of the expression. 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorForcedEvalOp eval() const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 3b99ef069..2ef5ff205 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -106,8 +106,7 @@ class TensorChippingOp : public TensorBase > { typedef TensorAssignOp Assign; Assign assign(*this, other); - static const bool Vectorize = TensorEvaluator::PacketAccess; - internal::TensorExecutor::run(assign, DefaultDevice()); + internal::TensorExecutor::run(assign, DefaultDevice()); return *this; } @@ -117,8 +116,7 @@ class TensorChippingOp : public TensorBase > { typedef TensorAssignOp Assign; Assign assign(*this, other); - static const bool Vectorize = TensorEvaluator::PacketAccess; - internal::TensorExecutor::run(assign, DefaultDevice()); + internal::TensorExecutor::run(assign, DefaultDevice()); return *this; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 6979fb4ec..759e8208f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -88,7 +88,7 @@ class TensorConcatenationOp : public TensorBase Assign; Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); + internal::TensorExecutor::run(assign, DefaultDevice()); return *this; } @@ -98,7 +98,7 @@ class TensorConcatenationOp : public TensorBase Assign; Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); + internal::TensorExecutor::run(assign, DefaultDevice()); return *this; } @@ -334,7 +334,7 @@ templatedimensions().TotalSize()); EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; - PacketReturnType rslt = internal::pstore(values, x); + internal::pstore(values, x); for (int i = 0; i < packetSize; ++i) { coeffRef(index+i) = values[i]; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 270383020..fd2f3abc4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -364,14 +364,6 @@ class TensorContractionInputMapper struct max_n_1 { - static const size_t size = n; -}; -template <> struct max_n_1<0> { - static const size_t size = 1; -}; - - template struct traits > { @@ -459,19 +451,6 @@ class TensorContractionOp : public TensorBase struct Cond {}; - -template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -const T1& choose(Cond, const T1& first, const T2&) { - return first; -} - -template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -const T2& choose(Cond, const T1&, const T2& second) { - return second; -} - - template struct TensorContractionEvaluatorBase { @@ -508,13 +487,13 @@ struct TensorContractionEvaluatorBase static const int RDims = internal::array_size::Dimensions>::value; static const unsigned int ContractDims = internal::array_size::value; - static const int NumDims = internal::max_n_1::size; + static const int NumDims = max_n_1::size; typedef array left_dim_mapper_t; typedef array right_dim_mapper_t; typedef array contract_t; - typedef array::size> left_nocontract_t; - typedef array::size> right_nocontract_t; + typedef array::size> left_nocontract_t; + typedef array::size> right_nocontract_t; typedef DSizes Dimensions; @@ -869,10 +848,10 @@ struct TensorEvaluator right_dim_mapper_t; typedef 
array contract_t; - typedef array::size> left_nocontract_t; - typedef array::size> right_nocontract_t; + typedef array::size> left_nocontract_t; + typedef array::size> right_nocontract_t; - static const int NumDims = internal::max_n_1::size; + static const int NumDims = max_n_1::size; // Could we use NumDimensions here? typedef DSizes Dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index 588770bb4..f6bd949bd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -1241,10 +1241,10 @@ struct TensorEvaluator right_dim_mapper_t; typedef array contract_t; - typedef array::size> left_nocontract_t; - typedef array::size> right_nocontract_t; + typedef array::size> left_nocontract_t; + typedef array::size> right_nocontract_t; - static const int NumDims = internal::max_n_1::size; + static const int NumDims = max_n_1::size; typedef DSizes Dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index ed87d3100..57030229d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -93,10 +93,10 @@ struct TensorEvaluator right_dim_mapper_t; typedef array contract_t; - typedef array::size> left_nocontract_t; - typedef array::size> right_nocontract_t; + typedef array::size> left_nocontract_t; + typedef array::size> right_nocontract_t; - static const int NumDims = internal::max_n_1::size; + static const int NumDims = max_n_1::size; typedef DSizes Dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 6b8f71b96..88db9d410 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -510,7 +510,8 @@ struct TensorEvaluator EvalTo; EvalTo evalToTmp(local, m_kernelArg); - internal::TensorExecutor::PacketAccess>::run(evalToTmp, m_device); + const bool PacketAccess = internal::IsVectorizable::value; + internal::TensorExecutor::run(evalToTmp, m_device); m_kernel = local; m_local_kernel = true; @@ -815,7 +816,8 @@ struct TensorEvaluator EvalTo; EvalTo evalToTmp(local, m_kernelArg); - internal::TensorExecutor::PacketAccess>::run(evalToTmp, m_device); + const bool PacketAccess = internal::IsVectorizable::value; + internal::TensorExecutor::run(evalToTmp, m_device); m_kernel = local; m_local_kernel = true; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h new file mode 100644 index 000000000..0157f6fab --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h @@ -0,0 +1,310 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H +#define EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H + +namespace Eigen { + +/** \class TensorCustomUnaryOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor custom class. 
+ * + * + */ +namespace internal { +template +struct traits > +{ + typedef typename XprType::Scalar Scalar; + typedef typename packet_traits::type Packet; + typedef typename XprType::StorageKind StorageKind; + typedef typename XprType::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = traits::NumDimensions; + static const int Layout = traits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorCustomUnaryOp& type; +}; + +template +struct nested > +{ + typedef TensorCustomUnaryOp type; +}; + +} // end namespace internal + + + +template +class TensorCustomUnaryOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename internal::traits::Scalar Scalar; + typedef typename internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomUnaryOp(const XprType& expr, const CustomUnaryFunc& func) + : m_expr(expr), m_func(func) {} + + EIGEN_DEVICE_FUNC + const CustomUnaryFunc& func() const { return m_func; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_expr; } + + protected: + typename XprType::Nested m_expr; + const CustomUnaryFunc m_func; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorCustomUnaryOp ArgType; + typedef typename internal::traits::Index Index; + static const int NumDims = internal::traits::NumDimensions; + typedef DSizes Dimensions; + typedef + typename internal::remove_const::type Scalar; + + enum { + IsAligned = false, + PacketAccess = (internal::packet_traits::size > 1), + BlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device) + : m_op(op), m_device(device), m_result(NULL) + { + m_dimensions = op.func().dimensions(op.expression()); + } + + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + if (data) { + evalTo(data); + return false; + } else { + m_result = static_cast( + m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + evalTo(m_result); + return true; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + if (m_result != NULL) { + m_device.deallocate(m_result); + m_result = NULL; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_result[index]; + } + + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return internal::ploadt(m_result + index); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; } + + protected: + EIGEN_DEVICE_FUNC void evalTo(Scalar* data) { + TensorMap > result( + data, m_dimensions); + m_op.func().eval(m_op.expression(), result, m_device); + } + + Dimensions m_dimensions; + const ArgType m_op; + const Device& m_device; + CoeffReturnType* 
m_result; +}; + + + +/** \class TensorCustomBinaryOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor custom class. + * + * + */ +namespace internal { +template +struct traits > +{ + typedef typename internal::promote_storage_type::ret Scalar; + typedef typename packet_traits::type Packet; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename internal::promote_storage_type::ret PacketReturnType; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; + static const int NumDimensions = traits::NumDimensions; + static const int Layout = traits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorCustomBinaryOp& type; +}; + +template +struct nested > +{ + typedef TensorCustomBinaryOp type; +}; + +} // end namespace internal + + + +template +class TensorCustomBinaryOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename internal::traits::Scalar Scalar; + typedef typename internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::traits::CoeffReturnType CoeffReturnType; + typedef typename internal::traits::PacketReturnType PacketReturnType; + typedef typename internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const CustomBinaryFunc& func) + + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_func(func) {} + + EIGEN_DEVICE_FUNC + const CustomBinaryFunc& func() const { return m_func; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + lhsExpression() const { return m_lhs_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const CustomBinaryFunc m_func; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorCustomBinaryOp XprType; + typedef typename internal::traits::Index Index; + static const int NumDims = internal::traits::NumDimensions; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = false, + PacketAccess = (internal::packet_traits::size > 1), + BlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_op(op), m_device(device), m_result(NULL) + { + m_dimensions = op.func().dimensions(op.lhsExpression(), op.rhsExpression()); + } + + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + if (data) { + evalTo(data); + return false; + } else { + m_result = static_cast(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + evalTo(m_result); + return 
true; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + if (m_result != NULL) { + m_device.deallocate(m_result); + m_result = NULL; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_result[index]; + } + + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return internal::ploadt(m_result + index); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; } + + protected: + EIGEN_DEVICE_FUNC void evalTo(Scalar* data) { + TensorMap > result(data, m_dimensions); + m_op.func().eval(m_op.lhsExpression(), m_op.rhsExpression(), result, m_device); + } + + Dimensions m_dimensions; + const XprType m_op; + const Device& m_device; + CoeffReturnType* m_result; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h index 1018395a1..a4419c665 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h @@ -28,8 +28,25 @@ struct DefaultDevice { ::memset(buffer, c, n); } - EIGEN_STRONG_INLINE size_t numThreads() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { +#ifndef __CUDA_ARCH__ + // Running on the host CPU return 1; +#else + // Running on a CUDA device + return 32; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { +#ifndef __CUDA_ARCH__ + // Running single threaded on the host CPU + // Should return an enum that encodes the ISA supported by the CPU + return 1; +#else + // Running on a CUDA device + return __CUDA_ARCH__ / 100; +#endif } }; @@ -38,10 +55,19 @@ struct DefaultDevice { // We should really use a thread pool here but first we need to find a portable thread pool library. #ifdef EIGEN_USE_THREADS +// This defines an interface that ThreadPoolDevice can take to use +// custom thread pools underneath. +class ThreadPoolInterface { + public: + virtual void Schedule(std::function fn) = 0; + + virtual ~ThreadPoolInterface() {} +}; + // The implementation of the ThreadPool type ensures that the Schedule method // runs the functions it is provided in FIFO order when the scheduling is done // by a single thread. -class ThreadPool { +class ThreadPool : public ThreadPoolInterface { public: // Construct a pool that contains "num_threads" threads. explicit ThreadPool(int num_threads) { @@ -182,7 +208,7 @@ static EIGEN_STRONG_INLINE void wait_until_ready(Notification* n) { // Build a thread pool device on top the an existing pool of threads. struct ThreadPoolDevice { - ThreadPoolDevice(ThreadPool* pool, size_t num_cores) : pool_(pool), num_threads_(num_cores) { } + ThreadPoolDevice(ThreadPoolInterface* pool, size_t num_cores) : pool_(pool), num_threads_(num_cores) { } EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { return internal::aligned_malloc(num_bytes); @@ -204,6 +230,11 @@ struct ThreadPoolDevice { return num_threads_; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { + // Should return an enum that encodes the ISA supported by the CPU + return 1; + } + template EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... 
args) const { Notification* n = new Notification(); @@ -219,7 +250,7 @@ struct ThreadPoolDevice { } private: - ThreadPool* pool_; + ThreadPoolInterface* pool_; size_t num_threads_; }; @@ -260,9 +291,12 @@ static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) { assert(status == cudaSuccess); } +// Cuda stream to use when no stream is specified explicitely. +static const cudaStream_t default_stream = cudaStreamDefault; + struct GpuDevice { // The cudastream is not owned: the caller is responsible for its initialization and eventual destruction. - GpuDevice(const cudaStream_t* stream) : stream_(stream) { eigen_assert(stream); } + GpuDevice(const cudaStream_t* stream = &default_stream) : stream_(stream) { eigen_assert(stream); } EIGEN_STRONG_INLINE const cudaStream_t& stream() const { return *stream_; } @@ -308,6 +342,8 @@ struct GpuDevice { return 32; } + inline int majorDeviceVersion() const { return m_deviceProperties.major; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { cudaStreamSynchronize(*stream_); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h new file mode 100644 index 000000000..9773afccf --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h @@ -0,0 +1,235 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H +#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H + +namespace Eigen { + +/** \internal + * + * \class TensorDimensionList + * \ingroup CXX11_Tensor_Module + * + * \brief Special case of tensor index list used to list all the dimensions of a tensor of rank n. 
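+ *
+ * A minimal usage sketch (illustrative only; assumes the two template
+ * parameters are the index type and the rank): listing every dimension of a
+ * rank-3 tensor so that a reduction collapses all of them:
+ * \code
+ * Eigen::Tensor<float, 3> t(2, 3, 4);
+ * t.setRandom();
+ * Eigen::DimensionList<Eigen::DenseIndex, 3> all_dims;  // behaves like {0, 1, 2}
+ * Eigen::Tensor<float, 0> total = t.sum(all_dims);      // full reduction
+ * \endcode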
+ * + * \sa Tensor + */ + +template struct DimensionList { + const Index operator[] (const Index i) const { return i; } +}; + +namespace internal { + +template struct array_size > { + static const size_t value = Rank; +}; +template struct array_size > { + static const size_t value = Rank; +}; + +template const Index array_get(DimensionList&) { + return n; +} +template const Index array_get(const DimensionList&) { + return n; +} + + +#if defined(EIGEN_HAS_CONSTEXPR) +template +struct index_known_statically > { + constexpr bool operator() (const DenseIndex) const { + return true; + } +}; +template +struct index_known_statically > { + constexpr bool operator() (const DenseIndex) const { + return true; + } +}; + +template +struct all_indices_known_statically > { + constexpr bool operator() () const { + return true; + } +}; +template +struct all_indices_known_statically > { + constexpr bool operator() () const { + return true; + } +}; + +template +struct indices_statically_known_to_increase > { + constexpr bool operator() () const { + return true; + } +}; +template +struct indices_statically_known_to_increase > { + constexpr bool operator() () const { + return true; + } +}; + +template +struct index_statically_eq > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return i == value; + } +}; +template +struct index_statically_eq > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return i == value; + } +}; + +template +struct index_statically_ne > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return i != value; + } +}; +template +struct index_statically_ne > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return i != value; + } +}; + +template +struct index_statically_gt > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return i > value; + } +}; +template +struct index_statically_gt > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return i > value; + } +}; + +template +struct index_statically_lt > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return i < value; + } +}; +template +struct index_statically_lt > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return i < value; + } +}; + +#else +template +struct index_known_statically > { + EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex) const { + return true; + } +}; +template +struct index_known_statically > { + EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex) const { + return true; + } +}; + +template +struct all_indices_known_statically > { + EIGEN_ALWAYS_INLINE bool operator() () const { + return true; + } +}; +template +struct all_indices_known_statically > { + EIGEN_ALWAYS_INLINE bool operator() () const { + return true; + } +}; + +template +struct indices_statically_known_to_increase > { + EIGEN_ALWAYS_INLINE bool operator() () const { + return true; + } +}; +template +struct indices_statically_known_to_increase > { + EIGEN_ALWAYS_INLINE bool operator() () const { + return true; + } +}; + +template +struct index_statically_eq > { + EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex, const DenseIndex) const { + return false; + } +}; +template +struct index_statically_eq > { + EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex, const DenseIndex) const { + return false; + } +}; + +template +struct index_statically_ne > { + 
EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex, const DenseIndex) const { + return false; + } +}; +template +struct index_statically_ne > { + EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex, const DenseIndex) const { + return false; + } +}; + +template +struct index_statically_gt > { + EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex, const DenseIndex) const { + return false; + } +}; +template +struct index_statically_gt > { + EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex, const DenseIndex) const { + return false; + } +}; + +template +struct index_statically_lt > { + EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex, const DenseIndex) const { + return false; + } +}; +template +struct index_statically_lt > { + EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex, const DenseIndex) const { + return false; + } +}; +#endif + +} // end namespace internal +} // end namespace Eigen + + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 836daea65..5928f0b0c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -69,6 +69,31 @@ struct fixed_size_tensor_index_linearization_helper +struct fixed_size_tensor_index_extraction_helper +{ + template EIGEN_DEVICE_FUNC + static inline Index run(const Index index, + const Dimensions& dimensions) + { + const Index mult = (index == n) ? 1 : 0; + return array_get(dimensions) * mult + + fixed_size_tensor_index_extraction_helper::run(index, dimensions); + } +}; + +template +struct fixed_size_tensor_index_extraction_helper +{ + template EIGEN_DEVICE_FUNC + static inline Index run(const Index index, + const Dimensions& dimensions) + { + const Index mult = (index == 0) ? 
1 : 0; + return array_get<0>(dimensions) * mult; + } +}; + } // end namespace internal @@ -99,6 +124,10 @@ struct Sizes : internal::numeric_list { } #endif + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const int index) const { + return internal::fixed_size_tensor_index_extraction_helper::run(index, *this); + } + template Sizes& operator = (const T& /*other*/) { // add assertion failure if the size of other is different return *this; @@ -114,10 +143,12 @@ struct Sizes : internal::numeric_list { } }; +namespace internal { template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes&) { return Sizes::total_size; } +} #else @@ -166,6 +197,24 @@ template ::value; + case 1: + return internal::get<1, Base>::value; + case 2: + return internal::get<2, Base>::value; + case 3: + return internal::get<3, Base>::value; + case 4: + return internal::get<4, Base>::value; + default: + eigen_assert(false && "index overflow"); + return static_cast(-1); + } + } + template Sizes& operator = (const T&) { // to do: check the size of other return *this; @@ -181,10 +230,12 @@ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes&) { return Sizes::total_size; } +} #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index 883e6cab1..ff4373f59 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -113,9 +113,9 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* scalar) { + eigen_assert(scalar == NULL); + return m_impl.evalSubExprsIfNeeded(m_buffer); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 6ea588e4b..24606b0c8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -22,13 +22,8 @@ namespace Eigen { */ namespace internal { -template -struct IsVectorizable { - static const bool value = TensorEvaluator::PacketAccess; -}; - // Default strategy: the expression is evaluated with a single cpu thread. 
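+// The executor is normally reached through assignment: the operator=
+// overloads elsewhere in this patch wrap the expression in a TensorAssignOp
+// and run it on a device, roughly as follows (illustrative sketch;
+// Derived/OtherDerived stand for the concrete expression types):
+//
+//   typedef TensorAssignOp<Derived, const OtherDerived> Assign;
+//   Assign assign(*this, other);
+//   internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+//
+// The Vectorizable default now comes from internal::IsVectorizable, which
+// this patch moves to TensorForwardDeclarations.h.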
-template::value> +template class TensorExecutor { public: @@ -198,10 +193,6 @@ EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) { } } -template -struct IsVectorizable { - static const bool value = TensorEvaluator::PacketAccess && TensorEvaluator::IsAligned; -}; template class TensorExecutor diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index 41a36cb75..d253b70f3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -116,7 +116,8 @@ struct TensorEvaluator, Device> } typedef TensorEvalToOp EvalTo; EvalTo evalToTmp(m_buffer, m_op); - internal::TensorExecutor::PacketAccess>::run(evalToTmp, m_device); + const bool PacketAccess = internal::IsVectorizable::value; + internal::TensorExecutor::run(evalToTmp, m_device); m_impl.cleanup(); return true; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index a4224c372..b3bc16bc4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -29,6 +29,7 @@ template class TensorConversionOp; template class TensorConvolutionOp; template class TensorPatchOp; template class TensorImagePatchOp; +template class TensorVolumePatchOp; template class TensorBroadcastingOp; template class TensorChippingOp; template class TensorReshapingOp; @@ -41,14 +42,36 @@ template class TensorStridingOp; template class TensorGeneratorOp; template class TensorAssignOp; +template class TensorCustomUnaryOp; +template class TensorCustomBinaryOp; + template class TensorEvalToOp; template class TensorForcedEvalOp; template class TensorDevice; template struct TensorEvaluator; +class DefaultDevice; +class ThreadPoolDevice; +class GpuDevice; + namespace internal { -template class TensorExecutor; + +template +struct IsVectorizable { + static const bool value = TensorEvaluator::PacketAccess; +}; + +template +struct IsVectorizable { + static const bool value = TensorEvaluator::PacketAccess && + TensorEvaluator::IsAligned; +}; + +template ::value> +class TensorExecutor; + } // end namespace internal } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index 1b031b7a1..33e8c01c2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -17,6 +17,7 @@ namespace internal { template struct SumReducer { static const bool PacketAccess = true; + static const bool IsStateful = false; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { (*accum) += t; @@ -49,6 +50,8 @@ template struct SumReducer template struct MeanReducer { static const bool PacketAccess = true; + static const bool IsStateful = true; + MeanReducer() : scalarCount_(0), packetCount_(0) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) { @@ -88,6 +91,7 @@ template struct MeanReducer template struct MaxReducer { static const bool PacketAccess = true; + static const bool IsStateful = false; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { if (t > *accum) { *accum = t; } @@ -120,6 +124,7 @@ template struct MaxReducer template struct MinReducer { static const bool PacketAccess = true; + static const bool IsStateful = false; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 
void reduce(const T t, T* accum) const { if (t < *accum) { *accum = t; } @@ -153,6 +158,7 @@ template struct MinReducer template struct ProdReducer { static const bool PacketAccess = true; + static const bool IsStateful = false; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { (*accum) *= t; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index 054ecf7b5..ee66ae192 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -90,7 +90,7 @@ class TensorLayoutSwapOp : public TensorBase, WriteA { typedef TensorAssignOp Assign; Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); + internal::TensorExecutor::run(assign, DefaultDevice()); return *this; } @@ -100,7 +100,7 @@ class TensorLayoutSwapOp : public TensorBase, WriteA { typedef TensorAssignOp Assign; Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); + internal::TensorExecutor::run(assign, DefaultDevice()); return *this; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h new file mode 100644 index 000000000..78feb85cd --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -0,0 +1,36 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_H +#define EIGEN_CXX11_TENSOR_TENSOR_META_H + +namespace Eigen { + +template struct Cond {}; + +template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +const T1& choose(Cond, const T1& first, const T2&) { + return first; +} + +template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +const T2& choose(Cond, const T1&, const T2& second) { + return second; +} + +template struct max_n_1 { + static const size_t size = n; +}; +template <> struct max_n_1<0> { + static const size_t size = 1; +}; + +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_META_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index fa1e6931c..34be9b908 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -78,7 +78,7 @@ class TensorReshapingOp : public TensorBase Assign; Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); + internal::TensorExecutor::run(assign, DefaultDevice()); return *this; } @@ -88,7 +88,7 @@ class TensorReshapingOp : public TensorBase Assign; Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); + internal::TensorExecutor::run(assign, DefaultDevice()); return *this; } @@ -262,7 +262,7 @@ class TensorSlicingOp : public TensorBase Assign; Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); + internal::TensorExecutor::run(assign, DefaultDevice()); return *this; } @@ -271,7 +271,7 @@ class TensorSlicingOp : public TensorBase Assign; Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); + internal::TensorExecutor::run(assign, DefaultDevice()); return *this; } @@ -411,7 +411,7 @@ struct TensorEvaluator, Devi { const 
int packetSize = internal::unpacket_traits::size; EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + eigen_assert(index+packetSize-1 < internal::array_prod(dimensions())); Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 95116aaee..3b3955094 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -44,6 +44,38 @@ struct nested, 1, typename eval struct DimInitializer { + template EIGEN_DEVICE_FUNC + static void run(const InputDims& input_dims, + const array::value>& reduced, + OutputDims* output_dims, ReducedDims* reduced_dims) { + const int NumInputDims = internal::array_size::value; + int outputIndex = 0; + int reduceIndex = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (reduced[i]) { + (*reduced_dims)[reduceIndex] = input_dims[i]; + ++reduceIndex; + } else { + (*output_dims)[outputIndex] = input_dims[i]; + ++outputIndex; + } + } + } +}; + +template <> struct DimInitializer > { + template EIGEN_DEVICE_FUNC + static void run(const InputDims& input_dims, const array&, + Sizes<1>*, array* reduced_dims) { + const int NumInputDims = internal::array_size::value; + for (int i = 0; i < NumInputDims; ++i) { + (*reduced_dims)[i] = input_dims[i]; + } + } +}; + + template struct are_inner_most_dims { static const bool value = false; @@ -144,7 +176,7 @@ template struct InnerMostDimPreserver { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) { EIGEN_STATIC_ASSERT(DimIndex > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) { + for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) { const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex]; InnerMostDimPreserver::reduce(self, input, reducer, accum); } @@ -154,13 +186,325 @@ struct InnerMostDimPreserver { template struct InnerMostDimPreserver<0, Self, Op, true> { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) { - for (int j = 0; j < self.m_reducedDims[0]; ++j) { + for (typename Self::Index j = 0; j < self.m_reducedDims[0]; ++j) { const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0]; reducer.reducePacket(self.m_impl.template packet(input), accum); } } }; +// Default full reducer +template +struct FullReducer { + static const bool HasOptimizedImplementation = false; + + static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) { + const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions()); + *output = InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); + } +}; + + +#ifdef EIGEN_USE_THREADS +// Multithreaded full reducers +template +struct FullReducerShard { + static void run(const Eval& eval, typename Eval::Index firstIndex, typename Eval::Index numValuesToReduce, Op& reducer, FullReducerShard* shard) { + + shard->saccum = reducer.initialize(); + for (typename Eval::Index j = 0; j < numValuesToReduce; ++j) { + reducer.reduce(eval.m_impl.coeff(firstIndex + j), &shard->saccum); + } + } + + typename Eval::CoeffReturnType saccum; 
+}; + +template +struct FullReducerShard { + static void run(const Eval& eval, typename Eval::Index firstIndex, typename Eval::Index numValuesToReduce, Op& reducer, FullReducerShard* shard) { + + const int packetSize = internal::unpacket_traits::size; + const typename Eval::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize; + + shard->paccum = reducer.template initializePacket(); + for (typename Eval::Index j = 0; j < VectorizedSize; j += packetSize) { + reducer.reducePacket(eval.m_impl.template packet(firstIndex + j), &shard->paccum); + } + shard->saccum = reducer.initialize(); + for (typename Eval::Index j = VectorizedSize; j < numValuesToReduce; ++j) { + reducer.reduce(eval.m_impl.coeff(firstIndex + j), &shard->saccum); + } + } + + typename Eval::PacketReturnType paccum; + typename Eval::CoeffReturnType saccum; +}; + + +template +struct FullReducer { + static const bool HasOptimizedImplementation = !Op::IsStateful; + + // launch one reducer per thread and accumulate the result. + static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device, typename Self::CoeffReturnType* output) { + typedef typename Self::Index Index; + const Index num_coeffs = array_prod(self.m_impl.dimensions()); + const Index blocksize = std::floor(static_cast(num_coeffs)/device.numThreads()); + const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0; + eigen_assert(num_coeffs >= numblocks * blocksize); + + std::vector results; + results.reserve(numblocks); + std::vector > shards; + shards.resize(numblocks); + for (Index i = 0; i < numblocks; ++i) { + results.push_back(device.enqueue(&FullReducerShard::run, self, i*blocksize, blocksize, reducer, &shards[i])); + } + + FullReducerShard finalShard; + if (numblocks * blocksize < num_coeffs) { + FullReducerShard::run(self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer, &finalShard); + } else { + finalShard.saccum = reducer.initialize(); + } + + for (Index i = 0; i < numblocks; ++i) { + wait_until_ready(results[i]); + delete results[i]; + } + + for (Index i = 0; i < numblocks; ++i) { + reducer.reduce(shards[i].saccum, &finalShard.saccum); + } + *output = reducer.finalize(finalShard.saccum); + } +}; + +template +struct FullReducer { + static const bool HasOptimizedImplementation = !Op::IsStateful; + + // launch one reducer per thread and accumulate the result. + static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device, typename Self::CoeffReturnType* output) { + typedef typename Self::Index Index; + const Index num_coeffs = array_prod(self.m_impl.dimensions()); + const Index blocksize = std::floor(static_cast(num_coeffs)/device.numThreads()); + const Index numblocks = blocksize > 0 ? 
num_coeffs / blocksize : 0; + eigen_assert(num_coeffs >= numblocks * blocksize); + + std::vector results; + results.reserve(numblocks); + std::vector > shards; + shards.resize(numblocks); + for (Index i = 0; i < numblocks; ++i) { + results.push_back(device.enqueue(&FullReducerShard::run, self, i*blocksize, blocksize, reducer, &shards[i])); + } + + FullReducerShard finalShard; + if (numblocks * blocksize < num_coeffs) { + FullReducerShard::run(self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer, &finalShard); + } else { + finalShard.paccum = reducer.template initializePacket(); + finalShard.saccum = reducer.initialize(); + } + + for (Index i = 0; i < numblocks; ++i) { + wait_until_ready(results[i]); + delete results[i]; + } + + for (Index i = 0; i < numblocks; ++i) { + reducer.reducePacket(shards[i].paccum, &finalShard.paccum); + reducer.reduce(shards[i].saccum, &finalShard.saccum); + } + + *output = reducer.finalizeBoth(finalShard.saccum, finalShard.paccum); + } +}; +#endif + + +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +// Full reducers for GPU, don't vectorize for now + +// Reducer function that enables multiple cuda thread to safely accumulate at the same +// output address. It basically reads the current value of the output variable, and +// attempts to update it with the new value. If in the meantime another cuda thread +// updated the content of the output address it will try again. +template +__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) { +#if __CUDA_ARCH__ >= 300 + if (sizeof(T) == 4) + { + unsigned int oldval = *reinterpret_cast(output); + unsigned int newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + unsigned int readback; + while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + } + } + else if (sizeof(T) == 8) { + unsigned long long oldval = *reinterpret_cast(output); + unsigned long long newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + unsigned long long readback; + while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + } + } + else { + assert(0 && "Wordsize not supported"); + } +#else + assert(0 && "Shouldn't be called on unsupported device"); +#endif +} + +template +__device__ inline void atomicReduce(T* output, T accum, SumReducer&) { +#if __CUDA_ARCH__ >= 300 + atomicAdd(output, accum); +#else + assert(0 && "Shouldn't be called on unsupported device"); +#endif +} + +template +__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs, + typename Self::CoeffReturnType* output) { + const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x; + + if (first_index == 0) { + *output = reducer.initialize(); + } + + typename Self::CoeffReturnType accum = reducer.initialize(); + for (Index i = 0; i < NumPerThread; ++i) { + const Index index = first_index + i * BlockSize; + if (index >= num_coeffs) { + break; + } + typename Self::CoeffReturnType val = input.m_impl.coeff(index); + reducer.reduce(val, &accum); + } + + for (int offset = warpSize/2; offset > 0; offset /= 2) { + reducer.reduce(__shfl_down(accum, offset), &accum); + } + 
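+  // After the __shfl_down tree above, each lane has folded the partial
+  // accumulators of the lanes `offset` positions above it into `accum`;
+  // lane 0 of every warp therefore holds the complete warp-level result and
+  // is the only lane that publishes it to global memory via atomicReduce().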
+ if ((threadIdx.x & (warpSize - 1)) == 0) { + atomicReduce(output, accum, reducer); + } +} + + +template +struct FullReducer { + // Unfortunately nvidia doesn't support well exotic types such as complex, + // so reduce the scope of the optimized version of the code to the simple case + // of floats. + static const bool HasOptimizedImplementation = !Op::IsStateful && + internal::is_same::value; + + template + static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { + assert(false && "Should only be called on floats"); + } + + static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { + typedef typename Self::Index Index; + + const Index num_coeffs = array_prod(self.m_impl.dimensions()); + const int block_size = 256; + const int num_per_thread = 128; + const int num_blocks = std::ceil(static_cast(num_coeffs) / (block_size * num_per_thread)); + LAUNCH_CUDA_KERNEL((FullReductionKernel), + num_blocks, block_size, 0, device, reducer, self, num_coeffs, output); + } +}; + +#endif + + +template +class BlockReducer { + public: + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::CoeffReturnType CoeffReturnType; + explicit BlockReducer(const Op& reducer) : op_(reducer) { + accum_ = op_.initialize(); + } + void Reduce(Index index, Index num_values_to_reduce, Scalar* data) { + for (Index i = 0; i < num_values_to_reduce; ++i) { + op_.reduce(data[index + i], &accum_); + } + } + CoeffReturnType Finalize() { + return op_.finalize(accum_); + } + + private: + CoeffReturnType accum_; + Op op_; +}; + + +template +class BlockReducer { + public: + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::CoeffReturnType CoeffReturnType; + typedef typename Self::PacketReturnType PacketReturnType; + explicit BlockReducer(const Op& reducer) : op_(reducer) { + vaccum_ = op_.template initializePacket(); + accum_ = op_.initialize(); + } + void Reduce(Index index, Index num_values_to_reduce, Scalar* data) { + const int packet_size = internal::unpacket_traits::size; + const typename Self::Index vectorized_size = (num_values_to_reduce / + packet_size) * packet_size; + for (typename Self::Index i = 0; i < vectorized_size; i += packet_size) { + op_.reducePacket(internal::ploadt( + &data[index + i]), &vaccum_); + } + + for (typename Self::Index i = vectorized_size; + i < num_values_to_reduce; ++i) { + op_.reduce(data[index + i], &accum_); + } + } + typename Self::CoeffReturnType Finalize() { + return op_.finalizeBoth(accum_, vaccum_); + } + + private: + typename Self::PacketReturnType vaccum_; + typename Self::CoeffReturnType accum_; + Op op_; +}; + } // end namespace internal @@ -179,6 +523,7 @@ class TensorReductionOp : public TensorBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReductionOp(const XprType& expr, const Dims& dims) : m_expr(expr), m_dims(dims) { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer) : m_expr(expr), m_dims(dims), m_reducer(reducer) { } @@ -186,6 +531,7 @@ class TensorReductionOp : public TensorBase const XprType& expression() const { return m_expr; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dims& dims() const { return m_dims; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& reducer() const { return m_reducer; } protected: @@ -201,10 +547,11 @@ struct TensorEvaluator, Device> { typedef TensorReductionOp XprType; typedef typename XprType::Index Index; - static const 
int NumInputDims = internal::array_size::Dimensions>::value; + typedef typename TensorEvaluator::Dimensions InputDimensions; + static const int NumInputDims = internal::array_size::value; static const int NumReducedDims = internal::array_size::value; static const int NumOutputDims = (NumInputDims==NumReducedDims) ? 1 : NumInputDims - NumReducedDims; - typedef DSizes Dimensions; + typedef typename internal::conditional, DSizes >::type Dimensions; typedef typename XprType::Scalar Scalar; typedef TensorEvaluator, Device> Self; static const bool InputPacketAccess = TensorEvaluator::PacketAccess; @@ -218,9 +565,10 @@ struct TensorEvaluator, Device> static const bool ReducingInnerMostDims = internal::are_inner_most_dims::value; static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims::value; + static const bool RunningFullReduction = (NumInputDims==NumReducedDims); EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_reducer(op.reducer()) + : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device) { EIGEN_STATIC_ASSERT(NumInputDims >= NumReducedDims, YOU_MADE_A_PROGRAMMING_MISTAKE); EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)), @@ -238,17 +586,7 @@ struct TensorEvaluator, Device> } const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - int outputIndex = 0; - int reduceIndex = 0; - for (int i = 0; i < NumInputDims; ++i) { - if (reduced[i]) { - m_reducedDims[reduceIndex] = input_dims[i]; - ++reduceIndex; - } else { - m_dimensions[outputIndex] = input_dims[i]; - ++outputIndex; - } - } + internal::DimInitializer::run(input_dims, reduced, &m_dimensions, &m_reducedDims); // Precompute output strides. if (static_cast(Layout) == static_cast(ColMajor)) { @@ -277,8 +615,8 @@ struct TensorEvaluator, Device> } } - outputIndex = 0; - reduceIndex = 0; + int outputIndex = 0; + int reduceIndex = 0; for (int i = 0; i < NumInputDims; ++i) { if (reduced[i]) { m_reducedStrides[reduceIndex] = input_strides[i]; @@ -291,27 +629,50 @@ struct TensorEvaluator, Device> // Special case for full reductions if (NumInputDims == NumReducedDims) { - m_dimensions[0] = 1; + eigen_assert(m_dimensions[0] == 1); m_preservedStrides[0] = internal::array_prod(input_dims); } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename internal::remove_const::type PacketReturnType; + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { m_impl.evalSubExprsIfNeeded(NULL); + + // Use the FullReducer if possible. 
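+    // The optimized full reduction is taken only when the reducer reports an
+    // optimized implementation (i.e. it is stateless) and the dispatch is
+    // worthwhile: either we are on a GPU with compute capability >= 3.0
+    // (required by the __shfl_down based kernel), or the input holds more
+    // than 1024*1024 coefficients, which in practice is when the
+    // multithreaded sharding pays off.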
+ if (RunningFullReduction && internal::FullReducer::HasOptimizedImplementation && + ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || + (internal::array_prod(m_impl.dimensions()) > 1024 * 1024))) { + + bool need_assign = false; + if (!data) { + m_result = static_cast(m_device.allocate(sizeof(CoeffReturnType))); + data = m_result; + need_assign = true; + } + + Op reducer(m_reducer); + internal::FullReducer::run(*this, reducer, m_device, data); + return need_assign; + } return true; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); + if (m_result) { + m_device.deallocate(m_result); + } } - typedef typename internal::remove_const::type CoeffReturnType; - typedef typename internal::remove_const::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + if (RunningFullReduction && m_result) { + return *m_result; + } Op reducer(m_reducer); if (ReducingInnerMostDims) { const Index num_values_to_reduce = @@ -372,6 +733,13 @@ struct TensorEvaluator, Device> template friend struct internal::GenericDimReducer; template friend struct internal::InnerMostDimReducer; template friend struct internal::InnerMostDimPreserver; + template friend struct internal::FullReducer; +#ifdef EIGEN_USE_THREADS + template friend struct internal::FullReducerShard; +#endif +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) + template friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*); +#endif // Returns the Index in the input tensor of the first value that needs to be // used to compute the reduction at output index "index". @@ -392,7 +760,12 @@ struct TensorEvaluator, Device> startInput += idx * m_preservedStrides[i]; index -= idx * m_outputStrides[i]; } - startInput += index * m_preservedStrides[0]; + if (PreservingInnerMostDims) { + eigen_assert(m_preservedStrides[0] == 1); + startInput += index; + } else { + startInput += index * m_preservedStrides[0]; + } } else { for (int i = 0; i < NumOutputDims - 1; ++i) { // This is index_i in the output tensor. @@ -400,7 +773,12 @@ struct TensorEvaluator, Device> startInput += idx * m_preservedStrides[i]; index -= idx * m_outputStrides[i]; } - startInput += index * m_preservedStrides[NumOutputDims - 1]; + if (PreservingInnerMostDims) { + eigen_assert(m_preservedStrides[NumOutputDims - 1] == 1); + startInput += index; + } else { + startInput += index * m_preservedStrides[NumOutputDims - 1]; + } } return startInput; } @@ -425,6 +803,16 @@ struct TensorEvaluator, Device> // Operation to apply for computing the reduction. 
Op m_reducer; + + // For full reductions +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) + static const bool RunningOnGPU = internal::is_same::value; +#else + static const bool RunningOnGPU = false; +#endif + CoeffReturnType* m_result; + + const Device& m_device; }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index 16bef2ad3..52f95b2a2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -80,7 +80,7 @@ class TensorReverseOp : public TensorBase Assign; Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); + internal::TensorExecutor::run(assign, DefaultDevice()); return *this; } @@ -90,7 +90,7 @@ class TensorReverseOp : public TensorBase Assign; Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); + internal::TensorExecutor::run(assign, DefaultDevice()); return *this; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 1012ecd69..02f73dd37 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -78,7 +78,7 @@ class TensorShufflingOp : public TensorBase { typedef TensorAssignOp Assign; Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); + internal::TensorExecutor::run(assign, DefaultDevice()); return *this; } @@ -88,7 +88,7 @@ class TensorShufflingOp : public TensorBase { typedef TensorAssignOp Assign; Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); + internal::TensorExecutor::run(assign, DefaultDevice()); return *this; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 00cb8e373..dd913fbae 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -78,7 +78,7 @@ class TensorStridingOp : public TensorBase > { typedef TensorAssignOp Assign; Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); + internal::TensorExecutor::run(assign, DefaultDevice()); return *this; } @@ -88,7 +88,7 @@ class TensorStridingOp : public TensorBase > { typedef TensorAssignOp Assign; Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); + internal::TensorExecutor::run(assign, DefaultDevice()); return *this; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h index 757d6cb38..2e6b93bfb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -156,9 +156,9 @@ struct eval, Eigen::Dense> }; // TODO nested<> does not exist anymore in Eigen/Core, and it thus has to be removed in favor of ref_selector. -template struct nested -{ - typedef typename ref_selector::type type; +template struct nested +{ + typedef typename ref_selector::type type; }; template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h new file mode 100644 index 000000000..00eabdf5a --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -0,0 +1,677 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
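+//
+// Illustrative usage sketch (assumes the extract_volume_patches() helper on
+// TensorBase that accompanies this op, and a ColMajor input laid out as
+// depth x planes x rows x cols x batch):
+//
+//   Eigen::Tensor<float, 5> input(3, 8, 8, 8, 16);
+//   input.setRandom();
+//   // 2x2x2 patches with unit strides:
+//   Eigen::Tensor<float, 6> patches = input.extract_volume_patches(2, 2, 2);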
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H +#define EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H + +namespace Eigen { + +/** \class TensorVolumePatch + * \ingroup CXX11_Tensor_Module + * + * \brief Patch extraction specialized for processing of volumetric data. + * This assumes that the input has a least 4 dimensions ordered as follows: + * - channels + * - planes + * - rows + * - columns + * - (optional) additional dimensions such as time or batch size. + * Calling the volume patch code with patch_planes, patch_rows, and patch_cols + * is equivalent to calling the regular patch extraction code with parameters + * d, patch_planes, patch_rows, patch_cols, and 1 for all the additional + * dimensions. + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename internal::remove_const::type Scalar; + typedef traits XprTraits; + typedef typename packet_traits::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions + 1; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorVolumePatchOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorVolumePatchOp type; +}; + +} // end namespace internal + +template +class TensorVolumePatchOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp(const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols, + DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides, + DenseIndex in_plane_strides, DenseIndex in_row_strides, DenseIndex in_col_strides, + DenseIndex plane_inflate_strides, DenseIndex row_inflate_strides, DenseIndex col_inflate_strides, + PaddingType padding_type, Scalar padding_value) + : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols), + m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides), + m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), + m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), + m_padding_explicit(false), m_padding_top_z(0), m_padding_bottom_z(0), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0), + m_padding_type(padding_type), m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp(const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols, + DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides, + DenseIndex in_plane_strides, DenseIndex in_row_strides, DenseIndex in_col_strides, + DenseIndex plane_inflate_strides, DenseIndex row_inflate_strides, DenseIndex col_inflate_strides, + 
DenseIndex padding_top_z, DenseIndex padding_bottom_z, + DenseIndex padding_top, DenseIndex padding_bottom, + DenseIndex padding_left, DenseIndex padding_right, + Scalar padding_value) + : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols), + m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides), + m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), + m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), + m_padding_explicit(true), m_padding_top_z(padding_top_z), m_padding_bottom_z(padding_bottom_z), m_padding_top(padding_top), m_padding_bottom(padding_bottom), + m_padding_left(padding_left), m_padding_right(padding_right), + m_padding_type(PADDING_VALID), m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC + DenseIndex patch_planes() const { return m_patch_planes; } + EIGEN_DEVICE_FUNC + DenseIndex patch_rows() const { return m_patch_rows; } + EIGEN_DEVICE_FUNC + DenseIndex patch_cols() const { return m_patch_cols; } + EIGEN_DEVICE_FUNC + DenseIndex plane_strides() const { return m_plane_strides; } + EIGEN_DEVICE_FUNC + DenseIndex row_strides() const { return m_row_strides; } + EIGEN_DEVICE_FUNC + DenseIndex col_strides() const { return m_col_strides; } + EIGEN_DEVICE_FUNC + DenseIndex in_plane_strides() const { return m_in_plane_strides; } + EIGEN_DEVICE_FUNC + DenseIndex in_row_strides() const { return m_in_row_strides; } + EIGEN_DEVICE_FUNC + DenseIndex in_col_strides() const { return m_in_col_strides; } + EIGEN_DEVICE_FUNC + DenseIndex plane_inflate_strides() const { return m_plane_inflate_strides; } + EIGEN_DEVICE_FUNC + DenseIndex row_inflate_strides() const { return m_row_inflate_strides; } + EIGEN_DEVICE_FUNC + DenseIndex col_inflate_strides() const { return m_col_inflate_strides; } + EIGEN_DEVICE_FUNC + bool padding_explicit() const { return m_padding_explicit; } + EIGEN_DEVICE_FUNC + DenseIndex padding_top_z() const { return m_padding_top_z; } + EIGEN_DEVICE_FUNC + DenseIndex padding_bottom_z() const { return m_padding_bottom_z; } + EIGEN_DEVICE_FUNC + DenseIndex padding_top() const { return m_padding_top; } + EIGEN_DEVICE_FUNC + DenseIndex padding_bottom() const { return m_padding_bottom; } + EIGEN_DEVICE_FUNC + DenseIndex padding_left() const { return m_padding_left; } + EIGEN_DEVICE_FUNC + DenseIndex padding_right() const { return m_padding_right; } + EIGEN_DEVICE_FUNC + PaddingType padding_type() const { return m_padding_type; } + EIGEN_DEVICE_FUNC + Scalar padding_value() const { return m_padding_value; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const DenseIndex m_patch_planes; + const DenseIndex m_patch_rows; + const DenseIndex m_patch_cols; + const DenseIndex m_plane_strides; + const DenseIndex m_row_strides; + const DenseIndex m_col_strides; + const DenseIndex m_in_plane_strides; + const DenseIndex m_in_row_strides; + const DenseIndex m_in_col_strides; + const DenseIndex m_plane_inflate_strides; + const DenseIndex m_row_inflate_strides; + const DenseIndex m_col_inflate_strides; + const bool m_padding_explicit; + const DenseIndex m_padding_top_z; + const DenseIndex m_padding_bottom_z; + const DenseIndex m_padding_top; + const DenseIndex m_padding_bottom; + const DenseIndex m_padding_left; + const DenseIndex m_padding_right; + const PaddingType 
m_padding_type; + const Scalar m_padding_value; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorVolumePatchOp XprType; + typedef typename XprType::Index Index; + static const int NumInputDims = internal::array_size::Dimensions>::value; + static const int NumDims = NumInputDims + 1; + typedef DSizes Dimensions; + typedef typename internal::remove_const::type Scalar; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = NumDims == 6, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + EIGEN_STATIC_ASSERT(NumDims >= 5, YOU_MADE_A_PROGRAMMING_MISTAKE); + + m_paddingValue = op.padding_value(); + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + + // Cache a few variables. + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputDepth = input_dims[0]; + m_inputPlanes = input_dims[1]; + m_inputRows = input_dims[2]; + m_inputCols = input_dims[3]; + } else { + m_inputDepth = input_dims[NumInputDims-1]; + m_inputPlanes = input_dims[NumInputDims-2]; + m_inputRows = input_dims[NumInputDims-3]; + m_inputCols = input_dims[NumInputDims-4]; + } + + m_plane_strides = op.plane_strides(); + m_row_strides = op.row_strides(); + m_col_strides = op.col_strides(); + + // Input strides and effective input/patch size + m_in_plane_strides = op.in_plane_strides(); + m_in_row_strides = op.in_row_strides(); + m_in_col_strides = op.in_col_strides(); + m_plane_inflate_strides = op.plane_inflate_strides(); + m_row_inflate_strides = op.row_inflate_strides(); + m_col_inflate_strides = op.col_inflate_strides(); + + // The "effective" spatial size after inflating data with zeros. 
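+    // For an input extent n inflated with stride s, zeros are inserted
+    // between the original samples, giving an effective extent of
+    // (n - 1) * s + 1. Similarly, a patch extent p sampled with input
+    // stride d covers an effective window of p + (p - 1) * (d - 1) samples.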
+ m_input_planes_eff = (m_inputPlanes - 1) * m_plane_inflate_strides + 1; + m_input_rows_eff = (m_inputRows - 1) * m_row_inflate_strides + 1; + m_input_cols_eff = (m_inputCols - 1) * m_col_inflate_strides + 1; + m_patch_planes_eff = op.patch_planes() + (op.patch_planes() - 1) * (m_in_plane_strides - 1); + m_patch_rows_eff = op.patch_rows() + (op.patch_rows() - 1) * (m_in_row_strides - 1); + m_patch_cols_eff = op.patch_cols() + (op.patch_cols() - 1) * (m_in_col_strides - 1); + + if (op.padding_explicit()) { + m_outputPlanes = numext::ceil((m_input_planes_eff + op.padding_top_z() + op.padding_bottom_z() - m_patch_planes_eff + 1.f) / static_cast(m_plane_strides)); + m_outputRows = numext::ceil((m_input_rows_eff + op.padding_top() + op.padding_bottom() - m_patch_rows_eff + 1.f) / static_cast(m_row_strides)); + m_outputCols = numext::ceil((m_input_cols_eff + op.padding_left() + op.padding_right() - m_patch_cols_eff + 1.f) / static_cast(m_col_strides)); + m_planePaddingTop = op.padding_top_z(); + m_rowPaddingTop = op.padding_top(); + m_colPaddingLeft = op.padding_left(); + } else { + // Computing padding from the type + switch (op.padding_type()) { + case PADDING_VALID: + m_outputPlanes = numext::ceil((m_input_planes_eff - m_patch_planes_eff + 1.f) / static_cast(m_plane_strides)); + m_outputRows = numext::ceil((m_input_rows_eff - m_patch_rows_eff + 1.f) / static_cast(m_row_strides)); + m_outputCols = numext::ceil((m_input_cols_eff - m_patch_cols_eff + 1.f) / static_cast(m_col_strides)); + m_planePaddingTop = 0; + m_rowPaddingTop = 0; + m_colPaddingLeft = 0; + break; + case PADDING_SAME: { + m_outputPlanes = numext::ceil(m_input_planes_eff / static_cast(m_plane_strides)); + m_outputRows = numext::ceil(m_input_rows_eff / static_cast(m_row_strides)); + m_outputCols = numext::ceil(m_input_cols_eff / static_cast(m_col_strides)); + const Index dz = m_outputPlanes * m_plane_strides + m_patch_planes_eff - 1 - m_input_planes_eff; + const Index dy = m_outputRows * m_row_strides + m_patch_rows_eff - 1 - m_input_rows_eff; + const Index dx = m_outputCols * m_col_strides + m_patch_cols_eff - 1 - m_input_cols_eff; + m_planePaddingTop = dz - dz / 2; + m_rowPaddingTop = dy - dy / 2; + m_colPaddingLeft = dx - dx / 2; + break; + } + default: + eigen_assert(false && "unexpected padding"); + } + } + eigen_assert(m_outputRows > 0); + eigen_assert(m_outputCols > 0); + eigen_assert(m_outputPlanes > 0); + + // Dimensions for result of extraction. + if (static_cast(Layout) == static_cast(ColMajor)) { + // ColMajor + // 0: depth + // 1: patch_planes + // 2: patch_rows + // 3: patch_cols + // 4: number of patches + // 5 and beyond: anything else (such as batch). + m_dimensions[0] = input_dims[0]; + m_dimensions[1] = op.patch_planes(); + m_dimensions[2] = op.patch_rows(); + m_dimensions[3] = op.patch_cols(); + m_dimensions[4] = m_outputPlanes * m_outputRows * m_outputCols; + for (int i = 5; i < NumDims; ++i) { + m_dimensions[i] = input_dims[i-1]; + } + } else { + // RowMajor + // NumDims-1: depth + // NumDims-2: patch_planes + // NumDims-3: patch_rows + // NumDims-4: patch_cols + // NumDims-5: number of patches + // NumDims-6 and beyond: anything else (such as batch). 
+ m_dimensions[NumDims-1] = input_dims[NumInputDims-1]; + m_dimensions[NumDims-2] = op.patch_planes(); + m_dimensions[NumDims-3] = op.patch_rows(); + m_dimensions[NumDims-4] = op.patch_cols(); + m_dimensions[NumDims-5] = m_outputPlanes * m_outputRows * m_outputCols; + for (int i = NumDims-6; i >= 0; --i) { + m_dimensions[i] = input_dims[i]; + } + } + + // Strides for the output tensor. + if (static_cast(Layout) == static_cast(ColMajor)) { + m_rowStride = m_dimensions[1]; + m_colStride = m_dimensions[2] * m_rowStride; + m_patchStride = m_colStride * m_dimensions[3] * m_dimensions[0]; + m_otherStride = m_patchStride * m_dimensions[4]; + } else { + m_rowStride = m_dimensions[NumDims-2]; + m_colStride = m_dimensions[NumDims-3] * m_rowStride; + m_patchStride = m_colStride * m_dimensions[NumDims-4] * m_dimensions[NumDims-1]; + m_otherStride = m_patchStride * m_dimensions[NumDims-5]; + } + + // Strides for navigating through the input tensor. + m_planeInputStride = m_inputDepth; + m_rowInputStride = m_inputDepth * m_inputPlanes; + m_colInputStride = m_inputDepth * m_inputRows * m_inputPlanes; + m_otherInputStride = m_inputDepth * m_inputRows * m_inputCols * m_inputPlanes; + + m_outputPlanesRows = m_outputPlanes * m_outputRows; + + // Fast representations of different variables. + m_fastOtherStride = internal::TensorIntDivisor(m_otherStride); + m_fastPatchStride = internal::TensorIntDivisor(m_patchStride); + m_fastColStride = internal::TensorIntDivisor(m_colStride); + m_fastRowStride = internal::TensorIntDivisor(m_rowStride); + m_fastInputRowStride = internal::TensorIntDivisor(m_row_inflate_strides); + m_fastInputColStride = internal::TensorIntDivisor(m_col_inflate_strides); + m_fastInputPlaneStride = internal::TensorIntDivisor(m_plane_inflate_strides); + m_fastInputColsEff = internal::TensorIntDivisor(m_input_cols_eff); + m_fastOutputPlanes = internal::TensorIntDivisor(m_outputPlanes); + m_fastOutputPlanesRows = internal::TensorIntDivisor(m_outputPlanesRows); + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[0]); + } else { + m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[NumDims-1]); + } + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + // Patch index corresponding to the passed in index. + const Index patchIndex = index / m_fastPatchStride; + + // Spatial offset within the patch. This has to be translated into 3D + // coordinates within the patch. + const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastOutputDepth; + + // Batch, etc. + const Index otherIndex = (NumDims == 5) ? 0 : index / m_fastOtherStride; + const Index patch3DIndex = (NumDims == 5) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride; + + // Calculate column index in the input original tensor. 
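+    // patch3DIndex enumerates the output patch positions with planes varying
+    // fastest, then rows, then columns, so the column position is recovered
+    // by dividing by outputPlanes * outputRows; the row and plane positions
+    // are then peeled off from the remainder below.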
+ const Index colIndex = patch3DIndex / m_fastOutputPlanesRows; + const Index colOffset = patchOffset / m_fastColStride; + const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft; + const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); + if (inputCol < 0 || inputCol >= m_input_cols_eff || + ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) { + return Scalar(m_paddingValue); + } + + // Calculate row index in the original input tensor. + const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes; + const Index rowOffset = (patchOffset - colOffset * m_colStride) / m_fastRowStride; + const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop; + const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0); + if (inputRow < 0 || inputRow >= m_input_rows_eff || + ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) { + return Scalar(m_paddingValue); + } + + // Calculate plane index in the original input tensor. + const Index planeIndex = (patch3DIndex - m_outputPlanes * (colIndex * m_outputRows + rowIndex)); + const Index planeOffset = patchOffset - colOffset * m_colStride - rowOffset * m_rowStride; + const Index inputPlane = planeIndex * m_plane_strides + planeOffset * m_in_plane_strides - m_planePaddingTop; + const Index origInputPlane = (m_plane_inflate_strides == 1) ? inputPlane : ((inputPlane >= 0) ? (inputPlane / m_fastInputPlaneStride) : 0); + if (inputPlane < 0 || inputPlane >= m_input_planes_eff || + ((m_plane_inflate_strides != 1) && (inputPlane != origInputPlane * m_plane_inflate_strides))) { + return Scalar(m_paddingValue); + } + + const int depth_index = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - 1; + const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; + + const Index inputIndex = depth + + origInputRow * m_rowInputStride + + origInputCol * m_colInputStride + + origInputPlane * m_planeInputStride + + otherIndex * m_otherInputStride; + + return m_impl.coeff(inputIndex); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const Index packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1 || + m_in_plane_strides != 1 || m_plane_inflate_strides != 1) { + return packetWithPossibleZero(index); + } + + const Index indices[2] = {index, index + packetSize - 1}; + const Index patchIndex = indices[0] / m_fastPatchStride; + if (patchIndex != indices[1] / m_fastPatchStride) { + return packetWithPossibleZero(index); + } + const Index otherIndex = (NumDims == 5) ? 0 : indices[0] / m_fastOtherStride; + eigen_assert(otherIndex == indices[1] / m_fastOtherStride); + + // Find the offset of the element wrt the location of the first element. + const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth, + (indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth}; + + const Index patch3DIndex = (NumDims == 5) ? 
patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride;
+    eigen_assert(patch3DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride);
+
+    const Index colIndex = patch3DIndex / m_fastOutputPlanesRows;
+    const Index colOffsets[2] = {
+      patchOffsets[0] / m_fastColStride,
+      patchOffsets[1] / m_fastColStride};
+
+    // Calculate col indices in the original input tensor.
+    const Index inputCols[2] = {
+      colIndex * m_col_strides + colOffsets[0] - m_colPaddingLeft,
+      colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft};
+    if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) {
+      return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
+    }
+
+    if (inputCols[0] != inputCols[1]) {
+      return packetWithPossibleZero(index);
+    }
+
+    const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes;
+    const Index rowOffsets[2] = {
+      (patchOffsets[0] - colOffsets[0] * m_colStride) / m_fastRowStride,
+      (patchOffsets[1] - colOffsets[1] * m_colStride) / m_fastRowStride};
+    eigen_assert(rowOffsets[0] <= rowOffsets[1]);
+    // Calculate row indices in the original input tensor.
+    const Index inputRows[2] = {
+      rowIndex * m_row_strides + rowOffsets[0] - m_rowPaddingTop,
+      rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop};
+
+    if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) {
+      return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
+    }
+
+    if (inputRows[0] != inputRows[1]) {
+      return packetWithPossibleZero(index);
+    }
+
+    const Index planeIndex = (patch3DIndex - m_outputPlanes * (colIndex * m_outputRows + rowIndex));
+    const Index planeOffsets[2] = {
+      patchOffsets[0] - colOffsets[0] * m_colStride - rowOffsets[0] * m_rowStride,
+      patchOffsets[1] - colOffsets[1] * m_colStride - rowOffsets[1] * m_rowStride};
+    eigen_assert(planeOffsets[0] <= planeOffsets[1]);
+    const Index inputPlanes[2] = {
+      planeIndex * m_plane_strides + planeOffsets[0] - m_planePaddingTop,
+      planeIndex * m_plane_strides + planeOffsets[1] - m_planePaddingTop};
+
+    if (inputPlanes[1] < 0 || inputPlanes[0] >= m_inputPlanes) {
+      return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
+    }
+
+    if (inputPlanes[0] >= 0 && inputPlanes[1] < m_inputPlanes) {
+      // no padding
+      const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ?
0 : NumDims - 1; + const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; + const Index inputIndex = depth + + inputRows[0] * m_rowInputStride + + inputCols[0] * m_colInputStride + + m_planeInputStride * inputPlanes[0] + + otherIndex * m_otherInputStride; + return m_impl.template packet(inputIndex); + } + + return packetWithPossibleZero(index); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + const TensorEvaluator& impl() const { return m_impl; } + + Index planePaddingTop() const { return m_planePaddingTop; } + Index rowPaddingTop() const { return m_rowPaddingTop; } + Index colPaddingLeft() const { return m_colPaddingLeft; } + Index outputPlanes() const { return m_outputPlanes; } + Index outputRows() const { return m_outputRows; } + Index outputCols() const { return m_outputCols; } + Index userPlaneStride() const { return m_plane_strides; } + Index userRowStride() const { return m_row_strides; } + Index userColStride() const { return m_col_strides; } + Index userInPlaneStride() const { return m_in_plane_strides; } + Index userInRowStride() const { return m_in_row_strides; } + Index userInColStride() const { return m_in_col_strides; } + Index planeInflateStride() const { return m_plane_inflate_strides; } + Index rowInflateStride() const { return m_row_inflate_strides; } + Index colInflateStride() const { return m_col_inflate_strides; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const + { + // ColMajor + // 0: depth, 1: patch_planes, 2: patch_rows, 3: patch_cols, 4: number of patches, 5: batches + // RowMajor + // 0: batches, 1: number of patches, 2: patch_cols , 3: patch_rows, 4: patch_planes, 5: depth + const Index patch3DIndex = coords[static_cast(Layout) == static_cast(ColMajor) ? 4 : 1]; + const Index colOffset = coords[static_cast(Layout) == static_cast(ColMajor) ? 3 : 2]; + const Index rowOffset= coords[static_cast(Layout) == static_cast(ColMajor) ? 2 : 3]; + const Index planeOffset = coords[static_cast(Layout) == static_cast(ColMajor) ? 1 : 4]; + + array inputCoords; + + const Index colIndex = patch3DIndex / m_fastOutputPlanesRows; + const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft; + const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); + if (inputCol < 0 || inputCol >= m_input_cols_eff || + ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) { + return Scalar(m_paddingValue); + } + + const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes; + const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop; + const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0); + if (inputRow < 0 || inputRow >= m_input_rows_eff || + ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) { + return Scalar(m_paddingValue); + } + + const Index planeIndex = patch3DIndex - colIndex * m_outputPlanesRows - rowIndex * m_outputRows; + const Index inputPlane = planeIndex * m_plane_strides + planeOffset * m_in_plane_strides - m_planePaddingTop; + const Index origInputPlane = (m_plane_inflate_strides == 1) ? inputPlane : ((inputPlane >= 0) ? 
(inputPlane / m_fastInputPlaneStride) : 0); + if (inputPlane < 0 || inputPlane >= m_input_planes_eff || + ((m_plane_inflate_strides != 1) && (inputPlane != origInputPlane * m_plane_inflate_strides))) { + return Scalar(m_paddingValue); + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + inputCoords[0] = coords[0]; // depth + inputCoords[1] = origInputPlane; + inputCoords[2] = origInputRow; + inputCoords[3] = origInputCol; + inputCoords[4] = coords[5]; // batch + } else { + inputCoords[4] = coords[5]; // depth + inputCoords[3] = origInputPlane; + inputCoords[2] = origInputRow; + inputCoords[1] = origInputCol; + inputCoords[0] = coords[0]; // batch + } + if (TensorEvaluator::CoordAccess) { + return m_impl.coeff(inputCoords); + } else { + Index inputIndex; + if (static_cast(Layout) == static_cast(ColMajor)) { + inputIndex = + inputCoords[4] * m_otherInputStride + + inputCoords[3] * m_colInputStride + + inputCoords[2] * m_rowInputStride + + inputCoords[1] * m_planeInputStride + + inputCoords[0]; + } else { + inputIndex = + inputCoords[0] * m_otherInputStride + + inputCoords[1] * m_colInputStride + + inputCoords[2] * m_rowInputStride + + inputCoords[3] * m_planeInputStride + + inputCoords[4]; + } + return m_impl.coeff(inputIndex); + } + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const + { + const int packetSize = internal::unpacket_traits::size; + EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + Dimensions m_dimensions; + + // Parameters passed to the costructor. + Index m_plane_strides; + Index m_row_strides; + Index m_col_strides; + + Index m_outputPlanes; + Index m_outputRows; + Index m_outputCols; + + Index m_planePaddingTop; + Index m_rowPaddingTop; + Index m_colPaddingLeft; + + Index m_in_plane_strides; + Index m_in_row_strides; + Index m_in_col_strides; + + Index m_plane_inflate_strides; + Index m_row_inflate_strides; + Index m_col_inflate_strides; + + // Cached input size. + Index m_inputDepth; + Index m_inputPlanes; + Index m_inputRows; + Index m_inputCols; + + // Other cached variables. + Index m_outputPlanesRows; + + // Effective input/patch post-inflation size. + Index m_input_planes_eff; + Index m_input_rows_eff; + Index m_input_cols_eff; + Index m_patch_planes_eff; + Index m_patch_rows_eff; + Index m_patch_cols_eff; + + // Strides for the output tensor. + Index m_otherStride; + Index m_patchStride; + Index m_rowStride; + Index m_colStride; + + // Strides for the input tensor. 
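+  // These reduce to m_planeInputStride = depth, m_rowInputStride = depth * planes,
+  // m_colInputStride = depth * planes * rows and m_otherInputStride =
+  // depth * planes * rows * cols (see the constructor above); they are exactly the
+  // factors used to rebuild inputIndex in coeff() and packet().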
+ Index m_planeInputStride; + Index m_rowInputStride; + Index m_colInputStride; + Index m_otherInputStride; + + internal::TensorIntDivisor m_fastOtherStride; + internal::TensorIntDivisor m_fastPatchStride; + internal::TensorIntDivisor m_fastColStride; + internal::TensorIntDivisor m_fastRowStride; + internal::TensorIntDivisor m_fastInputPlaneStride; + internal::TensorIntDivisor m_fastInputRowStride; + internal::TensorIntDivisor m_fastInputColStride; + internal::TensorIntDivisor m_fastInputColsEff; + internal::TensorIntDivisor m_fastOutputPlanesRows; + internal::TensorIntDivisor m_fastOutputPlanes; + internal::TensorIntDivisor m_fastOutputDepth; + + Scalar m_paddingValue; + + TensorEvaluator m_impl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index f438d4107..155bfcd76 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -125,6 +125,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_padding "-std=c++0x") ei_add_test(cxx11_tensor_patch "-std=c++0x") ei_add_test(cxx11_tensor_image_patch "-std=c++0x") + ei_add_test(cxx11_tensor_volume_patch "-std=c++0x") ei_add_test(cxx11_tensor_reduction "-std=c++0x") ei_add_test(cxx11_tensor_shuffling "-std=c++0x") ei_add_test(cxx11_tensor_striding "-std=c++0x") @@ -136,10 +137,12 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_layout_swap "-std=c++0x") ei_add_test(cxx11_tensor_io "-std=c++0x") ei_add_test(cxx11_tensor_generator "-std=c++0x") + ei_add_test(cxx11_tensor_custom_op "-std=c++0x") # These tests needs nvcc # ei_add_test(cxx11_tensor_device "-std=c++0x") # ei_add_test(cxx11_tensor_cuda "-std=c++0x") # ei_add_test(cxx11_tensor_contract_cuda "-std=c++0x") +# ei_add_test(cxx11_tensor_reduction_cuda "-std=c++0x") endif() diff --git a/unsupported/test/cxx11_meta.cpp b/unsupported/test/cxx11_meta.cpp index af5cadbf9..4f45e1dd3 100644 --- a/unsupported/test/cxx11_meta.cpp +++ b/unsupported/test/cxx11_meta.cpp @@ -354,7 +354,3 @@ void test_cxx11_meta() CALL_SUBTEST(test_array_zip_and_apply()); CALL_SUBTEST(test_array_misc()); } - -/* - * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; - */ diff --git a/unsupported/test/cxx11_tensor_custom_op.cpp b/unsupported/test/cxx11_tensor_custom_op.cpp new file mode 100644 index 000000000..7e33c9580 --- /dev/null +++ b/unsupported/test/cxx11_tensor_custom_op.cpp @@ -0,0 +1,107 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
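+//
+// The functors defined below follow the interface Tensor::customOp() expects:
+// dimensions() returns the shape of the result, and the device-templated eval()
+// writes the result through the supplied output expression. The unary form is
+// invoked as tensor.customOp(functor), the binary form as
+// tensor1.customOp(tensor2, functor).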
+ +#include "main.h" + +#include + +using Eigen::Tensor; + + +struct InsertZeros { + DSizes dimensions(const Tensor& input) const { + DSizes result; + result[0] = input.dimension(0) * 2; + result[1] = input.dimension(1) * 2; + return result; + } + + template + void eval(const Tensor& input, Output& output, const Device& device) const + { + array strides{{2, 2}}; + output.stride(strides).device(device) = input; + + Eigen::DSizes offsets(1,1); + Eigen::DSizes extents(output.dimension(0)-1, output.dimension(1)-1); + output.slice(offsets, extents).stride(strides).device(device) = input.constant(0.0f); + } +}; + +static void test_custom_unary_op() +{ + Tensor tensor(3,5); + tensor.setRandom(); + + Tensor result = tensor.customOp(InsertZeros()); + VERIFY_IS_EQUAL(result.dimension(0), 6); + VERIFY_IS_EQUAL(result.dimension(1), 10); + + for (int i = 0; i < 6; i+=2) { + for (int j = 0; j < 10; j+=2) { + VERIFY_IS_EQUAL(result(i, j), tensor(i/2, j/2)); + } + } + for (int i = 1; i < 6; i+=2) { + for (int j = 1; j < 10; j+=2) { + VERIFY_IS_EQUAL(result(i, j), 0); + } + } +} + + +struct BatchMatMul { + DSizes dimensions(const Tensor& input1, const Tensor& input2) const { + DSizes result; + result[0] = input1.dimension(0); + result[1] = input2.dimension(1); + result[2] = input2.dimension(2); + return result; + } + + template + void eval(const Tensor& input1, const Tensor& input2, + Output& output, const Device& device) const + { + typedef Tensor::DimensionPair DimPair; + array dims({{DimPair(1, 0)}}); + for (int i = 0; i < output.dimension(2); ++i) { + output.template chip<2>(i).device(device) = input1.chip<2>(i).contract(input2.chip<2>(i), dims); + } + } +}; + + +static void test_custom_binary_op() +{ + Tensor tensor1(2,3,5); + tensor1.setRandom(); + Tensor tensor2(3,7,5); + tensor2.setRandom(); + + Tensor result = tensor1.customOp(tensor2, BatchMatMul()); + for (int i = 0; i < 5; ++i) { + typedef Tensor::DimensionPair DimPair; + array dims({{DimPair(1, 0)}}); + Tensor reference = tensor1.chip<2>(i).contract(tensor2.chip<2>(i), dims); + TensorRef> val = result.chip<2>(i); + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(val(j, k), reference(j, k)); + } + } + } +} + + +void test_cxx11_tensor_custom_op() +{ + CALL_SUBTEST(test_custom_unary_op()); + CALL_SUBTEST(test_custom_binary_op()); +} diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cpp b/unsupported/test/cxx11_tensor_reduction_cuda.cpp new file mode 100644 index 000000000..a7eb7ac75 --- /dev/null +++ b/unsupported/test/cxx11_tensor_reduction_cuda.cpp @@ -0,0 +1,55 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
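+//
+// The test below follows the usual device-offload pattern: compute the reference
+// sum on the host, allocate device buffers through GpuDevice::allocate(), copy the
+// input with memcpyHostToDevice(), evaluate the reduction into a device-side
+// TensorMap via .device(gpu_device), copy the result back with memcpyDeviceToHost(),
+// synchronize, and compare against the host result.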
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda +#define EIGEN_USE_GPU + +#include "main.h" +#include + + +template +static void test_full_reductions() { + + Eigen::GpuDevice gpu_device; + + const int num_rows = internal::random(1024, 5*1024); + const int num_cols = internal::random(1024, 5*1024); + + Tensor in(num_rows, num_cols); + in.setRandom(); + + Tensor full_redux(1); + full_redux = in.sum(); + + std::size_t in_bytes = in.size() * sizeof(float); + std::size_t out_bytes = full_redux.size() * sizeof(float); + float* gpu_in_ptr = static_cast(gpu_device.allocate(in_bytes)); + float* gpu_out_ptr = static_cast(gpu_device.allocate(out_bytes)); + gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes); + + TensorMap > in_gpu(gpu_in_ptr, num_rows, num_cols); + TensorMap > out_gpu(gpu_out_ptr, 1); + + out_gpu.device(gpu_device) = in_gpu.sum(); + + Tensor full_redux_gpu(1); + gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes); + gpu_device.synchronize(); + + // Check that the CPU and GPU reductions return the same result. + VERIFY_IS_APPROX(full_redux(0), full_redux_gpu(0)); +} + +void test_cxx11_tensor_reduction_cuda() { + CALL_SUBTEST(test_full_reductions()); + CALL_SUBTEST(test_full_reductions()); +} diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index 0a20a01a4..5ec7c8bf4 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -228,6 +228,29 @@ static void test_multithread_contraction_agrees_with_singlethread() { } +template +static void test_multithreaded_reductions() { + const int num_threads = internal::random(3, 11); + ThreadPool thread_pool(num_threads); + Eigen::ThreadPoolDevice thread_pool_device(&thread_pool, num_threads); + + const int num_rows = internal::random(13, 732); + const int num_cols = internal::random(13, 732); + Tensor t1(num_rows, num_cols); + t1.setRandom(); + + Tensor full_redux(1); + full_redux = t1.sum(); + + Tensor full_redux_tp(1); + full_redux_tp.device(thread_pool_device) = t1.sum(); + + // Check that the single threaded and the multi threaded reductions return + // the same result. 
+ VERIFY_IS_APPROX(full_redux(0), full_redux_tp(0)); +} + + static void test_memcpy() { for (int i = 0; i < 5; ++i) { @@ -271,6 +294,9 @@ void test_cxx11_tensor_thread_pool() CALL_SUBTEST(test_contraction_corner_cases()); CALL_SUBTEST(test_contraction_corner_cases()); + CALL_SUBTEST(test_multithreaded_reductions()); + CALL_SUBTEST(test_multithreaded_reductions()); + CALL_SUBTEST(test_memcpy()); CALL_SUBTEST(test_multithread_random()); diff --git a/unsupported/test/cxx11_tensor_volume_patch.cpp b/unsupported/test/cxx11_tensor_volume_patch.cpp new file mode 100644 index 000000000..ca6840f3b --- /dev/null +++ b/unsupported/test/cxx11_tensor_volume_patch.cpp @@ -0,0 +1,112 @@ +#include "main.h" + +#include + +using Eigen::Tensor; + +static void test_single_voxel_patch() +{ + Tensor tensor(4,2,3,5,7); + tensor.setRandom(); + Tensor tensor_row_major = tensor.swap_layout(); + + Tensor single_voxel_patch; + single_voxel_patch = tensor.extract_volume_patches(1, 1, 1); + VERIFY_IS_EQUAL(single_voxel_patch.dimension(0), 4); + VERIFY_IS_EQUAL(single_voxel_patch.dimension(1), 1); + VERIFY_IS_EQUAL(single_voxel_patch.dimension(2), 1); + VERIFY_IS_EQUAL(single_voxel_patch.dimension(3), 1); + VERIFY_IS_EQUAL(single_voxel_patch.dimension(4), 2 * 3 * 5); + VERIFY_IS_EQUAL(single_voxel_patch.dimension(5), 7); + + Tensor single_voxel_patch_row_major; + single_voxel_patch_row_major = tensor_row_major.extract_volume_patches(1, 1, 1); + VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(0), 7); + VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(1), 2 * 3 * 5); + VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(2), 1); + VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(3), 1); + VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(4), 1); + VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(5), 4); + + for (int i = 0; i < tensor.size(); ++i) { + VERIFY_IS_EQUAL(tensor.data()[i], single_voxel_patch.data()[i]); + VERIFY_IS_EQUAL(tensor_row_major.data()[i], single_voxel_patch_row_major.data()[i]); + VERIFY_IS_EQUAL(tensor.data()[i], tensor_row_major.data()[i]); + } +} + + +static void test_entire_volume_patch() +{ + const int depth = 4; + const int patch_z = 2; + const int patch_y = 3; + const int patch_x = 5; + const int batch = 7; + + Tensor tensor(depth, patch_z, patch_y, patch_x, batch); + tensor.setRandom(); + Tensor tensor_row_major = tensor.swap_layout(); + + Tensor entire_volume_patch; + entire_volume_patch = tensor.extract_volume_patches(patch_z, patch_y, patch_x); + VERIFY_IS_EQUAL(entire_volume_patch.dimension(0), depth); + VERIFY_IS_EQUAL(entire_volume_patch.dimension(1), patch_z); + VERIFY_IS_EQUAL(entire_volume_patch.dimension(2), patch_y); + VERIFY_IS_EQUAL(entire_volume_patch.dimension(3), patch_x); + VERIFY_IS_EQUAL(entire_volume_patch.dimension(4), patch_z * patch_y * patch_x); + VERIFY_IS_EQUAL(entire_volume_patch.dimension(5), batch); + + Tensor entire_volume_patch_row_major; + entire_volume_patch_row_major = tensor_row_major.extract_volume_patches(patch_z, patch_y, patch_x); + VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(0), batch); + VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(1), patch_z * patch_y * patch_x); + VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(2), patch_x); + VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(3), patch_y); + VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(4), patch_z); + VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(5), depth); + + const int dz = patch_z - 1; + const int dy = 
patch_y - 1; + const int dx = patch_x - 1; + + const int forward_pad_z = dz - dz / 2; + const int forward_pad_y = dy - dy / 2; + const int forward_pad_x = dx - dx / 2; + + for (int pz = 0; pz < patch_z; pz++) { + for (int py = 0; py < patch_y; py++) { + for (int px = 0; px < patch_x; px++) { + const int patchId = pz + patch_z * (py + px * patch_y); + for (int z = 0; z < patch_z; z++) { + for (int y = 0; y < patch_y; y++) { + for (int x = 0; x < patch_x; x++) { + for (int b = 0; b < batch; b++) { + for (int d = 0; d < depth; d++) { + float expected = 0.0f; + float expected_row_major = 0.0f; + const int eff_z = z - forward_pad_z + pz; + const int eff_y = y - forward_pad_y + py; + const int eff_x = x - forward_pad_x + px; + if (eff_z >= 0 && eff_y >= 0 && eff_x >= 0 && + eff_z < patch_z && eff_y < patch_y && eff_x < patch_x) { + expected = tensor(d, eff_z, eff_y, eff_x, b); + expected_row_major = tensor_row_major(b, eff_x, eff_y, eff_z, d); + } + VERIFY_IS_EQUAL(entire_volume_patch(d, z, y, x, patchId, b), expected); + VERIFY_IS_EQUAL(entire_volume_patch_row_major(b, patchId, x, y, z, d), expected_row_major); + } + } + } + } + } + } + } + } +} + +void test_cxx11_tensor_volume_patch() +{ + CALL_SUBTEST(test_single_voxel_patch()); + CALL_SUBTEST(test_entire_volume_patch()); +}
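+
+// A minimal usage sketch (illustrative only; this helper is not registered with
+// CALL_SUBTEST). With the default SAME padding and unit strides exercised above,
+// one patch is produced per input voxel, so the patch dimension equals
+// planes * rows * cols of the input regardless of the patch size.
+void example_volume_patch_count()
+{
+  Tensor<float, 5> input(3, 5, 6, 7, 2);  // depth, planes, rows, cols, batch
+  input.setRandom();
+  Tensor<float, 6> patches = input.extract_volume_patches(2, 2, 2);
+  VERIFY_IS_EQUAL(patches.dimension(4), 5 * 6 * 7);  // one patch per input voxel
+  VERIFY_IS_EQUAL(patches.dimension(0), 3);          // depth is preserved
+  VERIFY_IS_EQUAL(patches.dimension(5), 2);          // so is the batch dimension
+}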