Mirror of https://gitlab.com/libeigen/eigen.git (synced 2025-06-04 18:54:00 +08:00)

Merge with upstream eigen/default

This commit is contained in commit c144bb355b.
@@ -10,7 +10,7 @@
#ifndef EIGEN_CHOLMODSUPPORT_H
#define EIGEN_CHOLMODSUPPORT_H
namespace Eigen {
namespace Eigen {
namespace internal {

@@ -79,12 +79,12 @@ cholmod_sparse viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_StorageIndex> >
res.dtype = 0;
res.stype = -1;
if (internal::is_same<_StorageIndex,int>::value)
{
res.itype = CHOLMOD_INT;
}
else if (internal::is_same<_StorageIndex,long>::value)
else if (internal::is_same<_StorageIndex,SuiteSparse_long>::value)
{
res.itype = CHOLMOD_LONG;
}

@@ -95,9 +95,9 @@ cholmod_sparse viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_StorageIndex> >
// setup res.xtype
internal::cholmod_configure_matrix<_Scalar>::run(res);
res.stype = 0;
return res;
}

@@ -121,7 +121,7 @@ template<typename _Scalar, int _Options, typename _Index, unsigned int UpLo>
cholmod_sparse viewAsCholmod(const SparseSelfAdjointView<const SparseMatrix<_Scalar,_Options,_Index>, UpLo>& mat)
{
cholmod_sparse res = viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_Index> >(mat.matrix().const_cast_derived()));
if(UpLo==Upper) res.stype = 1;
if(UpLo==Lower) res.stype = -1;
// swap stype for rowmajor matrices (only works for real matrices)

@@ -168,11 +168,11 @@ namespace internal {
#define EIGEN_CHOLMOD_SPECIALIZE0(ret, name) \
template<typename _StorageIndex> inline ret cm_ ## name (cholmod_common &Common) { return cholmod_ ## name (&Common); } \
template<> inline ret cm_ ## name<long> (cholmod_common &Common) { return cholmod_l_ ## name (&Common); }
template<> inline ret cm_ ## name<SuiteSparse_long> (cholmod_common &Common) { return cholmod_l_ ## name (&Common); }
#define EIGEN_CHOLMOD_SPECIALIZE1(ret, name, t1, a1) \
template<typename _StorageIndex> inline ret cm_ ## name (t1& a1, cholmod_common &Common) { return cholmod_ ## name (&a1, &Common); } \
template<> inline ret cm_ ## name<long> (t1& a1, cholmod_common &Common) { return cholmod_l_ ## name (&a1, &Common); }
template<> inline ret cm_ ## name<SuiteSparse_long> (t1& a1, cholmod_common &Common) { return cholmod_l_ ## name (&a1, &Common); }
EIGEN_CHOLMOD_SPECIALIZE0(int, start)
EIGEN_CHOLMOD_SPECIALIZE0(int, finish)

@@ -184,15 +184,15 @@ EIGEN_CHOLMOD_SPECIALIZE1(int, free_sparse, cholmod_sparse*, A)
EIGEN_CHOLMOD_SPECIALIZE1(cholmod_factor*, analyze, cholmod_sparse, A)
template<typename _StorageIndex> inline cholmod_dense* cm_solve (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_solve (sys, &L, &B, &Common); }
template<> inline cholmod_dense* cm_solve<long> (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_l_solve (sys, &L, &B, &Common); }
template<> inline cholmod_dense* cm_solve<SuiteSparse_long> (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_l_solve (sys, &L, &B, &Common); }
template<typename _StorageIndex> inline cholmod_sparse* cm_spsolve (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_spsolve (sys, &L, &B, &Common); }
template<> inline cholmod_sparse* cm_spsolve<long> (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_l_spsolve (sys, &L, &B, &Common); }
template<> inline cholmod_sparse* cm_spsolve<SuiteSparse_long> (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_l_spsolve (sys, &L, &B, &Common); }
template<typename _StorageIndex>
inline int cm_factorize_p (cholmod_sparse* A, double beta[2], _StorageIndex* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_factorize_p (A, beta, fset, fsize, L, &Common); }
template<>
inline int cm_factorize_p<long> (cholmod_sparse* A, double beta[2], long* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_l_factorize_p (A, beta, fset, fsize, L, &Common); }
inline int cm_factorize_p<SuiteSparse_long> (cholmod_sparse* A, double beta[2], SuiteSparse_long* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_l_factorize_p (A, beta, fset, fsize, L, &Common); }
#undef EIGEN_CHOLMOD_SPECIALIZE0
#undef EIGEN_CHOLMOD_SPECIALIZE1
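For context, a minimal usage sketch of the CHOLMOD backends whose index dispatch is defined above (an illustration, not part of the commit; it assumes SuiteSparse/CHOLMOD is installed and linked). With the default int StorageIndex the cm_* helpers call the plain cholmod_* routines; a SuiteSparse_long index type routes through cholmod_l_* instead:

    #include <Eigen/SparseCore>
    #include <Eigen/CholmodSupport>

    int main() {
      typedef Eigen::SparseMatrix<double> SpMat;   // StorageIndex defaults to int
      SpMat A(3, 3);
      A.insert(0, 0) = 4.0; A.insert(1, 1) = 3.0; A.insert(2, 2) = 2.0;
      A.insert(0, 1) = 1.0; A.insert(1, 0) = 1.0;  // symmetric positive definite
      A.makeCompressed();
      Eigen::VectorXd b = Eigen::VectorXd::Ones(3);

      Eigen::CholmodSupernodalLLT<SpMat> solver;   // factorization dispatches to cholmod_* (not cholmod_l_*)
      solver.compute(A);
      Eigen::VectorXd x = solver.solve(b);
      return (solver.info() == Eigen::Success && x.size() == 3) ? 0 : 1;
    }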
@@ -254,10 +254,10 @@ class CholmodBase : public SparseSolverBase<Derived>
internal::cm_free_factor<StorageIndex>(m_cholmodFactor, m_cholmod);
internal::cm_finish<StorageIndex>(m_cholmod);
}
inline StorageIndex cols() const { return internal::convert_index<StorageIndex, Index>(m_cholmodFactor->n); }
inline StorageIndex rows() const { return internal::convert_index<StorageIndex, Index>(m_cholmodFactor->n); }
/** \brief Reports whether previous computation was successful.
*
* \returns \c Success if computation was successful,

@@ -276,11 +276,11 @@ class CholmodBase : public SparseSolverBase<Derived>
factorize(matrix);
return derived();
}
/** Performs a symbolic decomposition on the sparsity pattern of \a matrix.
*
* This function is particularly useful when solving for several problems having the same structure.
*
*
* \sa factorize()
*/
void analyzePattern(const MatrixType& matrix)

@@ -292,13 +292,13 @@ class CholmodBase : public SparseSolverBase<Derived>
}
cholmod_sparse A = viewAsCholmod(matrix.template selfadjointView<UpLo>());
m_cholmodFactor = internal::cm_analyze<StorageIndex>(A, m_cholmod);
this->m_isInitialized = true;
this->m_info = Success;
m_analysisIsOk = true;
m_factorizationIsOk = false;
}
/** Performs a numeric decomposition of \a matrix
*
* The given matrix must have the same sparsity pattern as the matrix on which the symbolic decomposition has been performed.

@@ -315,11 +315,11 @@ class CholmodBase : public SparseSolverBase<Derived>
this->m_info = (m_cholmodFactor->minor == m_cholmodFactor->n ? Success : NumericalIssue);
m_factorizationIsOk = true;
}
/** Returns a reference to the Cholmod's configuration structure to get a full control over the performed operations.
* See the Cholmod user guide for details. */
cholmod_common& cholmod() { return m_cholmod; }
#ifndef EIGEN_PARSED_BY_DOXYGEN
/** \internal */
template<typename Rhs,typename Dest>

@@ -329,7 +329,7 @@ class CholmodBase : public SparseSolverBase<Derived>
const Index size = m_cholmodFactor->n;
EIGEN_UNUSED_VARIABLE(size);
eigen_assert(size==b.rows());
// Cholmod needs column-major storage without inner-stride, which corresponds to the default behavior of Ref.
Ref<const Matrix<typename Rhs::Scalar,Dynamic,Dynamic,ColMajor> > b_ref(b.derived());

@@ -345,7 +345,7 @@ class CholmodBase : public SparseSolverBase<Derived>
dest = Matrix<Scalar,Dest::RowsAtCompileTime,Dest::ColsAtCompileTime>::Map(reinterpret_cast<Scalar*>(x_cd->x),b.rows(),b.cols());
internal::cm_free_dense<StorageIndex>(x_cd, m_cholmod);
}
/** \internal */
template<typename RhsDerived, typename DestDerived>
void _solve_impl(const SparseMatrixBase<RhsDerived> &b, SparseMatrixBase<DestDerived> &dest) const

@@ -370,8 +370,8 @@ class CholmodBase : public SparseSolverBase<Derived>
internal::cm_free_sparse<StorageIndex>(x_cs, m_cholmod);
}
#endif // EIGEN_PARSED_BY_DOXYGEN
/** Sets the shift parameter that will be used to adjust the diagonal coefficients during the numerical factorization.
*
* During the numerical factorization, an offset term is added to the diagonal coefficients:\n

@@ -386,7 +386,7 @@ class CholmodBase : public SparseSolverBase<Derived>
m_shiftOffset[0] = double(offset);
return derived();
}
/** \returns the determinant of the underlying matrix from the current factorization */
Scalar determinant() const
{

@@ -441,7 +441,7 @@ class CholmodBase : public SparseSolverBase<Derived>
template<typename Stream>
void dumpMemory(Stream& /*s*/)
{}
protected:
mutable cholmod_common m_cholmod;
cholmod_factor* m_cholmodFactor;

@@ -478,11 +478,11 @@ class CholmodSimplicialLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimpl
{
typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLLT> Base;
using Base::m_cholmod;
public:
typedef _MatrixType MatrixType;
CholmodSimplicialLLT() : Base() { init(); }
CholmodSimplicialLLT(const MatrixType& matrix) : Base()

@@ -529,11 +529,11 @@ class CholmodSimplicialLDLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimp
{
typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLDLT> Base;
using Base::m_cholmod;
public:
typedef _MatrixType MatrixType;
CholmodSimplicialLDLT() : Base() { init(); }
CholmodSimplicialLDLT(const MatrixType& matrix) : Base()

@@ -578,11 +578,11 @@ class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSuper
{
typedef CholmodBase<_MatrixType, _UpLo, CholmodSupernodalLLT> Base;
using Base::m_cholmod;
public:
typedef _MatrixType MatrixType;
CholmodSupernodalLLT() : Base() { init(); }
CholmodSupernodalLLT(const MatrixType& matrix) : Base()

@@ -629,11 +629,11 @@ class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecom
{
typedef CholmodBase<_MatrixType, _UpLo, CholmodDecomposition> Base;
using Base::m_cholmod;
public:
typedef _MatrixType MatrixType;
CholmodDecomposition() : Base() { init(); }
CholmodDecomposition(const MatrixType& matrix) : Base()

@@ -643,7 +643,7 @@ class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecom
}
~CholmodDecomposition() {}
void setMode(CholmodMode mode)
{
switch(mode)
@@ -66,6 +66,7 @@ namespace Eigen
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh,scalar_sinh_op,hyperbolic sine,\sa ArrayBase::sinh)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op,hyperbolic cosine,\sa ArrayBase::cosh)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op,hyperbolic tangent,\sa ArrayBase::tanh)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(logistic,scalar_logistic_op,logistic function,\sa ArrayBase::logistic)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op,natural logarithm of the gamma function,\sa ArrayBase::lgamma)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op,derivative of lgamma,\sa ArrayBase::digamma)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf,scalar_erf_op,error function,\sa ArrayBase::erf)

@@ -89,7 +90,7 @@ namespace Eigen
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isinf,scalar_isinf_op,infinite value test,\sa Eigen::isnan DOXCOMMA Eigen::isfinite DOXCOMMA ArrayBase::isinf)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isfinite,scalar_isfinite_op,finite value test,\sa Eigen::isinf DOXCOMMA Eigen::isnan DOXCOMMA ArrayBase::isfinite)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sign,scalar_sign_op,sign (or 0),\sa ArrayBase::sign)
/** \returns an expression of the coefficient-wise power of \a x to the given constant \a exponent.
*
* \tparam ScalarExponent is the scalar type of \a exponent. It must be compatible with the scalar type of the given expression (\c Derived::Scalar).

@@ -124,21 +125,21 @@ namespace Eigen
*
* Example: \include Cwise_array_power_array.cpp
* Output: \verbinclude Cwise_array_power_array.out
*
*
* \sa ArrayBase::pow()
*
* \relates ArrayBase
*/
template<typename Derived,typename ExponentDerived>
inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>
pow(const Eigen::ArrayBase<Derived>& x, const Eigen::ArrayBase<ExponentDerived>& exponents)
pow(const Eigen::ArrayBase<Derived>& x, const Eigen::ArrayBase<ExponentDerived>& exponents)
{
return Eigen::CwiseBinaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>(
x.derived(),
exponents.derived()
);
}
/** \returns an expression of the coefficient-wise power of the scalar \a x to the given array of \a exponents.
*
* This function computes the coefficient-wise power between a scalar and an array of exponents.

@@ -147,7 +148,7 @@ namespace Eigen
*
* Example: \include Cwise_scalar_power_array.cpp
* Output: \verbinclude Cwise_scalar_power_array.out
*
*
* \sa ArrayBase::pow()
*
* \relates ArrayBase
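For context, a minimal usage sketch of the array-array pow overload documented above (illustrative only):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::ArrayXd x(3), e(3);
      x << 2.0, 3.0, 4.0;
      e << 1.0, 2.0, 0.5;
      Eigen::ArrayXd y = Eigen::pow(x, e);     // coefficient-wise x^e, same as x.pow(e)
      std::cout << y.transpose() << std::endl; // 2 9 2
      return 0;
    }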
@@ -43,6 +43,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
enum {
RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
InnerStrideAtCompileTime = internal::traits<Derived>::InnerStrideAtCompileTime,
SizeAtCompileTime = Base::SizeAtCompileTime
};

@@ -187,8 +188,11 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
void checkSanity(typename internal::enable_if<(internal::traits<T>::Alignment>0),void*>::type = 0) const
{
#if EIGEN_MAX_ALIGN_BYTES>0
// innerStride() is not set yet when this function is called, so we optimistically assume the lowest plausible value:
const Index minInnerStride = InnerStrideAtCompileTime == Dynamic ? 1 : Index(InnerStrideAtCompileTime);
EIGEN_ONLY_USED_FOR_DEBUG(minInnerStride);
eigen_assert(( ((internal::UIntPtr(m_data) % internal::traits<Derived>::Alignment) == 0)
|| (cols() * rows() * innerStride() * sizeof(Scalar)) < internal::traits<Derived>::Alignment ) && "data is not aligned");
|| (cols() * rows() * minInnerStride * sizeof(Scalar)) < internal::traits<Derived>::Alignment ) && "data is not aligned");
#endif
}

@@ -634,13 +634,13 @@ template<> EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a)
template<> EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a)
{
// _mm512_abs_ps intrinsic not found, so hack around it
return (__m512)_mm512_and_si512((__m512i)a, _mm512_set1_epi32(0x7fffffff));
return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(a), _mm512_set1_epi32(0x7fffffff)));
}
template <>
EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
// _mm512_abs_ps intrinsic not found, so hack around it
return (__m512d)_mm512_and_si512((__m512i)a,
_mm512_set1_epi64(0x7fffffffffffffff));
return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a),
_mm512_set1_epi64(0x7fffffffffffffff)));
}
#ifdef EIGEN_VECTORIZE_AVX512DQ

@@ -701,7 +701,7 @@ template<typename Scalar> struct scalar_isnan_op {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const {
#if defined(__SYCL_DEVICE_ONLY__)
return numext::isnan(a);
#else
#else
return (numext::isnan)(a);
#endif
}

@@ -815,7 +815,7 @@ struct scalar_sign_op<Scalar,true> {
template<typename Scalar>
struct functor_traits<scalar_sign_op<Scalar> >
{ enum {
Cost =
Cost =
NumTraits<Scalar>::IsComplex
? ( 8*NumTraits<Scalar>::MulCost ) // roughly
: ( 3*NumTraits<Scalar>::AddCost),

@@ -823,6 +823,34 @@ struct functor_traits<scalar_sign_op<Scalar> >
};
};
/** \internal
* \brief Template functor to compute the logistic function of a scalar
* \sa class CwiseUnaryOp, ArrayBase::logistic()
*/
template <typename T>
struct scalar_logistic_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_logistic_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const {
const T one = T(1);
return one / (one + numext::exp(-x));
}
template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Packet packetOp(const Packet& x) const {
const Packet one = pset1<Packet>(T(1));
return pdiv(one, padd(one, pexp(pnegate(x))));
}
};
template <typename T>
struct functor_traits<scalar_logistic_op<T> > {
enum {
Cost = NumTraits<T>::AddCost * 2 + NumTraits<T>::MulCost * 6,
PacketAccess = packet_traits<T>::HasAdd && packet_traits<T>::HasDiv &&
packet_traits<T>::HasNegate && packet_traits<T>::HasExp
};
};
} // end namespace internal
} // end namespace Eigen
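The new scalar_logistic_op above is what backs the ArrayBase::logistic() method added further below in this commit. A minimal sketch of the equivalence, using only the public Array API (illustrative only):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::ArrayXf v(4);
      v << -2.0f, 0.0f, 1.0f, 5.0f;
      Eigen::ArrayXf s = v.logistic();                       // applies scalar_logistic_op coefficient-wise
      Eigen::ArrayXf ref = 1.0f / (1.0f + (-v).exp());       // explicit 1 / (1 + exp(-x))
      std::cout << (s - ref).abs().maxCoeff() << std::endl;  // ~0
      return 0;
    }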
@@ -405,7 +405,7 @@ template<typename T> struct plain_matrix_type_row_major
typedef Matrix<typename traits<T>::Scalar,
Rows,
Cols,
(MaxCols==1&&MaxRows!=1) ? RowMajor : ColMajor,
(MaxCols==1&&MaxRows!=1) ? ColMajor : RowMajor,
MaxRows,
MaxCols
> type;

@@ -297,8 +297,8 @@ SluMatrix asSluMatrix(MatrixType& mat)
template<typename Scalar, int Flags, typename Index>
MappedSparseMatrix<Scalar,Flags,Index> map_superlu(SluMatrix& sluMat)
{
eigen_assert((Flags&RowMajor)==RowMajor && sluMat.Stype == SLU_NR
|| (Flags&ColMajor)==ColMajor && sluMat.Stype == SLU_NC);
eigen_assert(((Flags&RowMajor)==RowMajor && sluMat.Stype == SLU_NR)
|| ((Flags&ColMajor)==ColMajor && sluMat.Stype == SLU_NC));
Index outerSize = (Flags&RowMajor)==RowMajor ? sluMat.ncol : sluMat.nrow;

@@ -21,6 +21,7 @@ typedef CwiseUnaryOp<internal::scalar_acos_op<Scalar>, const Derived> AcosReturn
typedef CwiseUnaryOp<internal::scalar_asin_op<Scalar>, const Derived> AsinReturnType;
typedef CwiseUnaryOp<internal::scalar_atan_op<Scalar>, const Derived> AtanReturnType;
typedef CwiseUnaryOp<internal::scalar_tanh_op<Scalar>, const Derived> TanhReturnType;
typedef CwiseUnaryOp<internal::scalar_logistic_op<Scalar>, const Derived> LogisticReturnType;
typedef CwiseUnaryOp<internal::scalar_sinh_op<Scalar>, const Derived> SinhReturnType;
typedef CwiseUnaryOp<internal::scalar_cosh_op<Scalar>, const Derived> CoshReturnType;
typedef CwiseUnaryOp<internal::scalar_square_op<Scalar>, const Derived> SquareReturnType;

@@ -335,6 +336,15 @@ cosh() const
return CoshReturnType(derived());
}
/** \returns an expression of the coefficient-wise logistic of *this.
*/
EIGEN_DEVICE_FUNC
inline const LogisticReturnType
logistic() const
{
return LogisticReturnType(derived());
}
/** \returns an expression of the coefficient-wise inverse of *this.
*
* Example: \include Cwise_inverse.cpp
@@ -1,7 +1,6 @@
typedef Matrix<double,4,Dynamic> Matrix4Xd;
Matrix4Xd M = Matrix4Xd::Random(4,5);
Projective3d P(Matrix4d::Random());
cout << "The matrix M is:" << endl << M << endl << endl;
cout << "M.colwise().hnormalized():" << endl << M.colwise().hnormalized() << endl << endl;
cout << "P*M:" << endl << P*M << endl << endl;
cout << "(P*M).colwise().hnormalized():" << endl << (P*M).colwise().hnormalized() << endl << endl;
cout << "(P*M).colwise().hnormalized():" << endl << (P*M).colwise().hnormalized() << endl << endl;

@@ -1,7 +1,6 @@
typedef Matrix<double,3,Dynamic> Matrix3Xd;
Matrix3Xd M = Matrix3Xd::Random(3,5);
Projective3d P(Matrix4d::Random());
cout << "The matrix M is:" << endl << M << endl << endl;
cout << "M.colwise().homogeneous():" << endl << M.colwise().homogeneous() << endl << endl;
cout << "P * M.colwise().homogeneous():" << endl << P * M.colwise().homogeneous() << endl << endl;
cout << "P * M.colwise().homogeneous().hnormalized(): " << endl << (P * M.colwise().homogeneous()).colwise().hnormalized() << endl << endl;
cout << "P * M.colwise().homogeneous().hnormalized(): " << endl << (P * M.colwise().homogeneous()).colwise().hnormalized() << endl << endl;

@@ -231,6 +231,7 @@ template<typename ArrayType> void array_real(const ArrayType& m)
VERIFY_IS_APPROX(m1.sinh(), sinh(m1));
VERIFY_IS_APPROX(m1.cosh(), cosh(m1));
VERIFY_IS_APPROX(m1.tanh(), tanh(m1));
VERIFY_IS_APPROX(m1.logistic(), logistic(m1));
VERIFY_IS_APPROX(m1.arg(), arg(m1));
VERIFY_IS_APPROX(m1.round(), round(m1));

@@ -266,6 +267,7 @@ template<typename ArrayType> void array_real(const ArrayType& m)
VERIFY_IS_APPROX(sinh(m1), 0.5*(exp(m1)-exp(-m1)));
VERIFY_IS_APPROX(cosh(m1), 0.5*(exp(m1)+exp(-m1)));
VERIFY_IS_APPROX(tanh(m1), (0.5*(exp(m1)-exp(-m1)))/(0.5*(exp(m1)+exp(-m1))));
VERIFY_IS_APPROX(logistic(m1), (1.0/(1.0+exp(-m1))));
VERIFY_IS_APPROX(arg(m1), ((m1<0).template cast<Scalar>())*std::acos(-1.0));
VERIFY((round(m1) <= ceil(m1) && round(m1) >= floor(m1)).all());
VERIFY((Eigen::isnan)((m1*0.0)/0.0).all());

@@ -345,6 +347,7 @@ template<typename ArrayType> void array_complex(const ArrayType& m)
VERIFY_IS_APPROX(m1.sinh(), sinh(m1));
VERIFY_IS_APPROX(m1.cosh(), cosh(m1));
VERIFY_IS_APPROX(m1.tanh(), tanh(m1));
VERIFY_IS_APPROX(m1.logistic(), logistic(m1));
VERIFY_IS_APPROX(m1.arg(), arg(m1));
VERIFY((m1.isNaN() == (Eigen::isnan)(m1)).all());
VERIFY((m1.isInf() == (Eigen::isinf)(m1)).all());

@@ -368,6 +371,7 @@ template<typename ArrayType> void array_complex(const ArrayType& m)
VERIFY_IS_APPROX(sinh(m1), 0.5*(exp(m1)-exp(-m1)));
VERIFY_IS_APPROX(cosh(m1), 0.5*(exp(m1)+exp(-m1)));
VERIFY_IS_APPROX(tanh(m1), (0.5*(exp(m1)-exp(-m1)))/(0.5*(exp(m1)+exp(-m1))));
VERIFY_IS_APPROX(logistic(m1), (1.0/(1.0 + exp(-m1))));
for (Index i = 0; i < m.rows(); ++i)
for (Index j = 0; j < m.cols(); ++j)

@@ -290,6 +290,8 @@ template<typename PlainObjectType> void check_const_correctness(const PlainObjec
// Regression for bug 1573
struct MovableClass {
// The following line is a workaround for gcc 4.7 and 4.8 (see bug 1573 comments).
static_assert(std::is_nothrow_move_constructible<Quaternionf>::value,"");
MovableClass() = default;
MovableClass(const MovableClass&) = default;
MovableClass(MovableClass&&) noexcept = default;

@@ -125,7 +125,7 @@ inline void on_temporary_creation(long int size) {
if(nb_temporaries!=(N)) { std::cerr << "nb_temporaries == " << nb_temporaries << "\n"; }\
VERIFY( (#XPR) && nb_temporaries==(N) ); \
}
#endif
#include "split_test_helper.h"

@@ -328,7 +328,7 @@ namespace Eigen
#define VERIFY_RAISES_STATIC_ASSERT(a) \
std::cout << "Can't VERIFY_RAISES_STATIC_ASSERT( " #a " ) with exceptions disabled\n";
#endif
#if !defined(__CUDACC__) && !defined(__HIPCC__) && !defined(__SYCL_DEVICE_ONLY__)
#define EIGEN_USE_CUSTOM_ASSERT
#endif

@@ -845,4 +845,4 @@ int main(int argc, char *argv[])
#ifdef _MSC_VER
// 4503 - decorated name length exceeded, name was truncated
#pragma warning( disable : 4503)
#endif
#endif
@@ -102,7 +102,13 @@ EIGEN_DECLARE_TEST(meta)
}
STATIC_CHECK(( !internal::is_convertible<MyInterface, MyImpl>::value ));
#if (!EIGEN_COMP_GNUC_STRICT) || (EIGEN_GNUC_AT_LEAST(4,8))
// GCC prior to 4.8 fails to compile this test:
// error: cannot allocate an object of abstract type 'MyInterface'
// In other word, it does not obey SFINAE.
// Nevertheless, we don't really care about supporting abstract type as scalar type!
STATIC_CHECK(( !internal::is_convertible<MyImpl, MyInterface>::value ));
#endif
STATIC_CHECK(( internal::is_convertible<MyImpl, const MyInterface&>::value ));
{
int i;

@@ -44,17 +44,27 @@
#include <thread>
#include <functional>
#include <memory>
#include "src/util/CXX11Meta.h"
#include "src/util/MaxSizeVector.h"
#include "src/ThreadPool/ThreadLocal.h"
#ifndef EIGEN_THREAD_LOCAL
// There are non-parenthesized calls to "max" in the <unordered_map> header,
// which trigger a check in test/main.h causing compilation to fail.
// We work around the check here by removing the check for max in
// the case where we have to emulate thread_local.
#ifdef max
#undef max
#endif
#include <unordered_map>
#endif
#include "src/ThreadPool/ThreadYield.h"
#include "src/ThreadPool/ThreadCancel.h"
#include "src/ThreadPool/EventCount.h"
#include "src/ThreadPool/RunQueue.h"
#include "src/ThreadPool/ThreadPoolInterface.h"
#include "src/ThreadPool/ThreadEnvironment.h"
#include "src/ThreadPool/Barrier.h"
#include "src/ThreadPool/NonBlockingThreadPool.h"
#endif

@@ -62,4 +72,3 @@
#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
#endif // EIGEN_CXX11_THREADPOOL_MODULE

@@ -189,7 +189,7 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(TensorBlock* block) {
if (TensorEvaluator<LeftArgType, Device>::RawAccess &&
m_leftImpl.data() != nullptr) {
m_leftImpl.data() != NULL) {
TensorBlock left_block(block->first_coeff_index(), block->block_sizes(),
block->tensor_strides(), block->tensor_strides(),
m_leftImpl.data() + block->first_coeff_index());

@@ -200,9 +200,9 @@ class TensorBase<Derived, ReadOnlyAccessors>
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sigmoid_op<Scalar>, const Derived>
EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_logistic_op<Scalar>, const Derived>
sigmoid() const {
return unaryExpr(internal::scalar_sigmoid_op<Scalar>());
return unaryExpr(internal::scalar_logistic_op<Scalar>());
}
EIGEN_DEVICE_FUNC
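Tensor expressions keep the sigmoid() name but now evaluate it through scalar_logistic_op. A minimal sketch (illustrative, not part of the commit):

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>

    int main() {
      Eigen::Tensor<float, 2> t(2, 2);
      t.setValues({{-1.0f, 0.0f}, {1.0f, 2.0f}});
      Eigen::Tensor<float, 2> s = t.sigmoid();  // coefficient-wise 1 / (1 + exp(-x))
      std::cout << s(0, 1) << std::endl;        // 0.5
      return 0;
    }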
@@ -62,7 +62,7 @@ struct cond<RowMajor> {
*/
enum TensorBlockShapeType {
kUniformAllDims,
kSkewedInnerDims,
kSkewedInnerDims
};
struct TensorOpResourceRequirements {

@@ -73,7 +73,7 @@ struct TensorOpResourceRequirements {
// expression tree (like reductions) to communicate resources
// requirements based on local state (like the total number of reductions
// to be computed).
TensorOpResourceRequirements(internal::TensorBlockShapeType shape,
TensorOpResourceRequirements(TensorBlockShapeType shape,
const Index size)
: block_shape(shape), block_total_size(size) {}
};

@@ -90,9 +90,9 @@ EIGEN_STRONG_INLINE void MergeResourceRequirements(
*block_shape = resources[0].block_shape;
*block_total_size = resources[0].block_total_size;
for (std::vector<TensorOpResourceRequirements>::size_type i = 1; i < resources.size(); ++i) {
if (resources[i].block_shape == TensorBlockShapeType::kSkewedInnerDims &&
*block_shape != TensorBlockShapeType::kSkewedInnerDims) {
*block_shape = TensorBlockShapeType::kSkewedInnerDims;
if (resources[i].block_shape == kSkewedInnerDims &&
*block_shape != kSkewedInnerDims) {
*block_shape = kSkewedInnerDims;
}
*block_total_size =
numext::maxi(*block_total_size, resources[i].block_total_size);

@@ -152,11 +152,11 @@ struct TensorBlockCopyOp {
const Scalar* src_base = &src_data[src_index];
Scalar* dst_base = &dst_data[dst_index];
typedef const Eigen::Array<Scalar, Dynamic, 1> Src;
typedef Eigen::Array<Scalar, Dynamic, 1> Dst;
typedef const Array<Scalar, Dynamic, 1> Src;
typedef Array<Scalar, Dynamic, 1> Dst;
typedef Eigen::Map<Src, 0, InnerStride<>> SrcMap;
typedef Eigen::Map<Dst, 0, InnerStride<>> DstMap;
typedef Map<Src, 0, InnerStride<> > SrcMap;
typedef Map<Dst, 0, InnerStride<> > DstMap;
const SrcMap src(src_base, num_coeff_to_copy, InnerStride<>(src_stride));
DstMap dst(dst_base, num_coeff_to_copy, InnerStride<>(dst_stride));

@@ -178,10 +178,8 @@ template <typename Scalar, typename StorageIndex, int NumDims, int Layout,
bool BlockRead>
class TensorBlockIO {
public:
typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout>
TensorBlock;
typedef typename internal::TensorBlockCopyOp<Scalar, StorageIndex>
TensorBlockCopyOp;
typedef TensorBlock<Scalar, StorageIndex, NumDims, Layout> Block;
typedef TensorBlockCopyOp<Scalar, StorageIndex> BlockCopyOp;
protected:
struct BlockIteratorState {

@@ -194,7 +192,7 @@ class TensorBlockIO {
};
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Copy(
const TensorBlock& block, StorageIndex first_coeff_index,
const Block& block, StorageIndex first_coeff_index,
const array<StorageIndex, NumDims>& tensor_to_block_dim_map,
const array<StorageIndex, NumDims>& tensor_strides, const Scalar* src_data,
Scalar* dst_data) {

@@ -214,11 +212,11 @@ class TensorBlockIO {
num_size_one_inner_dims, NumDims - num_size_one_inner_dims - 1);
const StorageIndex block_dim_for_tensor_stride1_dim =
NumDims == 0 ? 1 : tensor_to_block_dim_map[tensor_stride1_dim];
Index block_inner_dim_size =
StorageIndex block_inner_dim_size =
NumDims == 0 ? 1
: block.block_sizes()[block_dim_for_tensor_stride1_dim];
for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) {
const int dim = cond<Layout>()(i, NumDims - i - 1);
for (Index i = num_size_one_inner_dims + 1; i < NumDims; ++i) {
const Index dim = cond<Layout>()(i, NumDims - i - 1);
const StorageIndex block_stride =
block.block_strides()[tensor_to_block_dim_map[dim]];
if (block_inner_dim_size == block_stride &&

@@ -260,8 +258,8 @@ class TensorBlockIO {
// Initialize block iterator state. Squeeze away any dimension of size 1.
int num_squeezed_dims = 0;
for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
const int dim = cond<Layout>()(i + 1, NumDims - i - 2);
for (Index i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
const Index dim = cond<Layout>()(i + 1, NumDims - i - 2);
const StorageIndex size = block.block_sizes()[tensor_to_block_dim_map[dim]];
if (size == 1) {
continue;

@@ -290,8 +288,8 @@ class TensorBlockIO {
const StorageIndex block_total_size =
NumDims == 0 ? 1 : block.block_sizes().TotalSize();
for (StorageIndex i = 0; i < block_total_size; i += block_inner_dim_size) {
TensorBlockCopyOp::Run(block_inner_dim_size, outputIndex, output_stride,
dst_data, inputIndex, input_stride, src_data);
BlockCopyOp::Run(block_inner_dim_size, outputIndex, output_stride,
dst_data, inputIndex, input_stride, src_data);
// Update index.
for (int j = 0; j < num_squeezed_dims; ++j) {
if (++block_iter_state[j].count < block_iter_state[j].size) {

@@ -320,13 +318,11 @@ template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
class TensorBlockReader : public TensorBlockIO<Scalar, StorageIndex, NumDims,
Layout, /*BlockRead=*/true> {
public:
typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout>
TensorBlock;
typedef TensorBlockIO<Scalar, StorageIndex, NumDims, Layout, /*BlockRead=*/true>
Base;
typedef TensorBlock<Scalar, StorageIndex, NumDims, Layout> Block;
typedef TensorBlockIO<Scalar, StorageIndex, NumDims, Layout, /*BlockRead=*/true> Base;
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
TensorBlock* block, const Scalar* src_data) {
Block* block, const Scalar* src_data) {
array<StorageIndex, NumDims> tensor_to_block_dim_map;
for (int i = 0; i < NumDims; ++i) {
tensor_to_block_dim_map[i] = i;

@@ -336,7 +332,7 @@ class TensorBlockReader : public TensorBlockIO<Scalar, StorageIndex, NumDims,
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
TensorBlock* block, StorageIndex first_coeff_index,
Block* block, StorageIndex first_coeff_index,
const array<StorageIndex, NumDims>& tensor_to_block_dim_map,
const array<StorageIndex, NumDims>& tensor_strides, const Scalar* src_data) {
Base::Copy(*block, first_coeff_index, tensor_to_block_dim_map,

@@ -357,13 +353,11 @@ template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
class TensorBlockWriter : public TensorBlockIO<Scalar, StorageIndex, NumDims,
Layout, /*BlockRead=*/false> {
public:
typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout>
TensorBlock;
typedef TensorBlockIO<Scalar, StorageIndex, NumDims, Layout, /*BlockRead=*/false>
Base;
typedef TensorBlock<Scalar, StorageIndex, NumDims, Layout> Block;
typedef TensorBlockIO<Scalar, StorageIndex, NumDims, Layout, /*BlockRead=*/false> Base;
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
const TensorBlock& block, Scalar* dst_data) {
const Block& block, Scalar* dst_data) {
array<StorageIndex, NumDims> tensor_to_block_dim_map;
for (int i = 0; i < NumDims; ++i) {
tensor_to_block_dim_map[i] = i;

@@ -373,7 +367,7 @@ class TensorBlockWriter : public TensorBlockIO<Scalar, StorageIndex, NumDims,
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
const TensorBlock& block, StorageIndex first_coeff_index,
const Block& block, StorageIndex first_coeff_index,
const array<StorageIndex, NumDims>& tensor_to_block_dim_map,
const array<StorageIndex, NumDims>& tensor_strides, Scalar* dst_data) {
Base::Copy(block, first_coeff_index, tensor_to_block_dim_map,
@@ -542,13 +536,13 @@ struct TensorBlockCwiseBinaryOp {
const StorageIndex left_stride, const LeftScalar* left_data,
const StorageIndex right_index, const StorageIndex right_stride,
const RightScalar* right_data) {
typedef const Eigen::Array<LeftScalar, Dynamic, 1> Lhs;
typedef const Eigen::Array<RightScalar, Dynamic, 1> Rhs;
typedef Eigen::Array<OutputScalar, Dynamic, 1> Out;
typedef const Array<LeftScalar, Dynamic, 1> Lhs;
typedef const Array<RightScalar, Dynamic, 1> Rhs;
typedef Array<OutputScalar, Dynamic, 1> Out;
typedef Eigen::Map<Lhs, 0, InnerStride<>> LhsMap;
typedef Eigen::Map<Rhs, 0, InnerStride<>> RhsMap;
typedef Eigen::Map<Out, 0, InnerStride<>> OutMap;
typedef Map<Lhs, 0, InnerStride<> > LhsMap;
typedef Map<Rhs, 0, InnerStride<> > RhsMap;
typedef Map<Out, 0, InnerStride<> > OutMap;
const LeftScalar* lhs_base = &left_data[left_index];
const RightScalar* rhs_base = &right_data[right_index];

@@ -558,8 +552,7 @@ struct TensorBlockCwiseBinaryOp {
const RhsMap rhs(rhs_base, num_coeff, InnerStride<>(right_stride));
OutMap out(out_base, num_coeff, InnerStride<>(output_stride));
out =
Eigen::CwiseBinaryOp<BinaryFunctor, LhsMap, RhsMap>(lhs, rhs, functor);
out = CwiseBinaryOp<BinaryFunctor, LhsMap, RhsMap>(lhs, rhs, functor);
}
};

@@ -575,8 +568,7 @@ struct TensorBlockCwiseBinaryOp {
template <typename BinaryFunctor, typename StorageIndex, typename OutputScalar,
int NumDims, int Layout>
struct TensorBlockCwiseBinaryIO {
typedef typename internal::TensorBlock<OutputScalar, StorageIndex, NumDims,
Layout>::Dimensions Dimensions;
typedef typename TensorBlock<OutputScalar, StorageIndex, NumDims, Layout>::Dimensions Dimensions;
struct BlockIteratorState {
StorageIndex output_stride, output_span;

@@ -642,7 +634,7 @@ struct TensorBlockCwiseBinaryIO {
if (size == 1) {
continue;
}
auto& state = block_iter_state[num_squeezed_dims];
BlockIteratorState& state = block_iter_state[num_squeezed_dims];
state.output_stride = block_strides[dim];
state.left_stride = left_strides[dim];
state.right_stride = right_strides[dim];

@@ -664,7 +656,7 @@ struct TensorBlockCwiseBinaryIO {
right_stride, right_data);
// Update index.
for (int j = 0; j < num_squeezed_dims; ++j) {
auto& state = block_iter_state[j];
BlockIteratorState& state = block_iter_state[j];
if (++state.count < state.size) {
output_index += state.output_stride;
left_index += state.left_stride;

@@ -768,15 +760,14 @@ struct TensorBlockView {
template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
class TensorBlockMapper {
public:
typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout>
TensorBlock;
typedef TensorBlock<Scalar, StorageIndex, NumDims, Layout> Block;
typedef DSizes<StorageIndex, NumDims> Dimensions;
TensorBlockMapper(const Dimensions& dims,
const TensorBlockShapeType block_shape,
Index min_target_size)
: m_dimensions(dims),
m_block_dim_sizes(BlockDimensions(dims, block_shape, min_target_size)) {
m_block_dim_sizes(BlockDimensions(dims, block_shape, internal::convert_index<StorageIndex>(min_target_size))) {
// Calculate block counts by dimension and total block count.
DSizes<StorageIndex, NumDims> block_count;
for (Index i = 0; i < block_count.rank(); ++i) {

@@ -804,7 +795,7 @@ class TensorBlockMapper {
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Block
GetBlockForIndex(StorageIndex block_index, Scalar* data) const {
StorageIndex first_coeff_index = 0;
DSizes<StorageIndex, NumDims> coords;

@@ -852,8 +843,7 @@ class TensorBlockMapper {
}
}
return TensorBlock(first_coeff_index, sizes, strides, m_tensor_strides,
data);
return Block(first_coeff_index, sizes, strides, m_tensor_strides, data);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex total_block_count() const {

@@ -868,8 +858,8 @@ class TensorBlockMapper {
private:
static Dimensions BlockDimensions(const Dimensions& tensor_dims,
const TensorBlockShapeType block_shape,
Index min_target_size) {
min_target_size = numext::maxi<Index>(1, min_target_size);
StorageIndex min_target_size) {
min_target_size = numext::maxi<StorageIndex>(1, min_target_size);
// If tensor fully fits into the target size, we'll treat it a single block.
Dimensions block_dim_sizes = tensor_dims;

@@ -883,12 +873,12 @@ class TensorBlockMapper {
block_dim_sizes[i] = 1;
}
} else if (block_dim_sizes.TotalSize() > min_target_size) {
if (block_shape == TensorBlockShapeType::kUniformAllDims) {
if (block_shape == kUniformAllDims) {
// Tensor will not fit within 'min_target_size' budget: calculate tensor
// block dimension sizes based on "square" dimension size target.
const Index dim_size_target = static_cast<Index>(
std::pow(static_cast<float>(min_target_size),
1.0 / static_cast<float>(block_dim_sizes.rank())));
const StorageIndex dim_size_target = internal::convert_index<StorageIndex>(
std::pow(static_cast<float>(min_target_size),
1.0f / static_cast<float>(block_dim_sizes.rank())));
for (Index i = 0; i < block_dim_sizes.rank(); ++i) {
// TODO(andydavis) Adjust the inner most 'block_dim_size' to make it
// a multiple of the packet size. Note that reducing

@@ -913,7 +903,7 @@ class TensorBlockMapper {
total_size = total_size_other_dims * block_dim_sizes[dim];
}
}
} else if (block_shape == TensorBlockShapeType::kSkewedInnerDims) {
} else if (block_shape == kSkewedInnerDims) {
StorageIndex coeff_to_allocate = min_target_size;
for (int i = 0; i < NumDims; ++i) {
const int dim = cond<Layout>()(i, NumDims - i - 1);

@@ -929,8 +919,9 @@ class TensorBlockMapper {
}
}
eigen_assert(block_dim_sizes.TotalSize() >=
numext::mini<Index>(min_target_size, tensor_dims.TotalSize()));
eigen_assert(
block_dim_sizes.TotalSize() >=
numext::mini<Index>(min_target_size, tensor_dims.TotalSize()));
return block_dim_sizes;
}
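For the kUniformAllDims branch above, the per-dimension target is simply the rank-th root of the coefficient budget. A standalone sketch of that sizing rule (the 48K budget is a hypothetical value, not one taken from the code):

    #include <cmath>
    #include <cstdio>

    int main() {
      const long min_target_size = 48 * 1024;  // hypothetical block budget, in coefficients
      const int rank = 3;
      const long dim_size_target = static_cast<long>(
          std::pow(static_cast<float>(min_target_size), 1.0f / static_cast<float>(rank)));
      std::printf("per-dimension target: %ld (%ld^%d ~= %ld coefficients)\n",
                  dim_size_target, dim_size_target, rank,
                  dim_size_target * dim_size_target * dim_size_target);
      return 0;
    }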
@@ -957,8 +948,7 @@ class TensorBlockMapper {
template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
class TensorSliceBlockMapper {
public:
typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout>
TensorBlock;
typedef TensorBlock<Scalar, StorageIndex, NumDims, Layout> Block;
typedef DSizes<StorageIndex, NumDims> Dimensions;
TensorSliceBlockMapper(const Dimensions& tensor_dims,

@@ -974,7 +964,7 @@ class TensorSliceBlockMapper {
m_total_block_count(1) {
// Calculate block counts by dimension and total block count.
DSizes<StorageIndex, NumDims> block_count;
for (size_t i = 0; i < block_count.rank(); ++i) {
for (Index i = 0; i < block_count.rank(); ++i) {
block_count[i] = divup(m_tensor_slice_extents[i], m_block_dim_sizes[i]);
}
m_total_block_count = array_prod(block_count);

@@ -999,7 +989,7 @@ class TensorSliceBlockMapper {
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Block
GetBlockForIndex(StorageIndex block_index, Scalar* data) const {
StorageIndex first_coeff_index = 0;
DSizes<StorageIndex, NumDims> coords;

@@ -1056,8 +1046,7 @@ class TensorSliceBlockMapper {
}
}
return TensorBlock(first_coeff_index, sizes, strides, m_tensor_strides,
data);
return Block(first_coeff_index, sizes, strides, m_tensor_strides, data);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex total_block_count() const {
@@ -105,7 +105,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
bool isCopy= false, nByOne = false, oneByN = false;
bool isCopy, nByOne, oneByN;
enum {
IsAligned = true,

@@ -134,9 +134,10 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
const Device& device)
: m_device(device),
m_broadcast(op.broadcast()),
m_impl(op.expression(), device) {
: isCopy(false), nByOne(false), oneByN(false),
m_device(device), m_broadcast(op.broadcast()), m_impl(op.expression(), device)
{
// The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar
// and store the result in a scalar. Instead one should reshape the scalar into a a N-D
// tensor with N >= 1 of 1 element first and then broadcast.

@@ -152,13 +152,7 @@ struct TensorContractionParams {
// 1. Elementwise Relu transformation following Conv2D.
// 2. AddBias to the Conv2D output channels dimension.
//
// See expected implementation in NoOpOutputKernel.
struct OutputKernel {
template <typename Index, typename Scalar>
using OutputMapper = internal::blas_data_mapper<Scalar, Index, ColMajor>;
};
// Output kernel that does absolutely nothing.
// The NoOpOutputKernel implements an output kernel that does absolutely nothing.
struct NoOpOutputKernel {
/**
* Tensor contraction evaluator calls this kernel after finishing each block

@@ -177,7 +171,7 @@ struct NoOpOutputKernel {
*/
template <typename Index, typename Scalar>
EIGEN_ALWAYS_INLINE void operator()(
const OutputKernel::OutputMapper<Index, Scalar>& /*output_mapper*/,
const internal::blas_data_mapper<Scalar, Index, ColMajor>& /*output_mapper*/,
const TensorContractionParams& /*params*/, Index /*i*/,
Index /*j*/, Index /*num_rows*/, Index /*num_cols*/) const {}
};

@@ -354,7 +348,7 @@ struct TensorContractionEvaluatorBase
// dimensions and right non-contracting dimensions.
m_lhs_inner_dim_contiguous = true;
int dim_idx = 0;
unsigned int nocontract_idx = 0;
Index nocontract_idx = 0;
for (int i = 0; i < LDims; i++) {
// find if we are contracting on index i of left tensor

@@ -667,7 +661,7 @@ struct TensorContractionEvaluatorBase
// call gebp (matrix kernel)
// The parameters here are copied from Eigen's GEMM implementation
const auto output_mapper = output.getSubMapper(i2, j2);
const OutputMapper output_mapper = output.getSubMapper(i2, j2);
gebp(output_mapper, blockA, blockB, actual_mc, actual_kc, actual_nc,
Scalar(1), -1, -1, 0, 0);
@@ -88,6 +88,7 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
typedef typename PointerType<CoeffReturnType, Device>::Type PointerT;
enum {
IsAligned = false,

@@ -107,12 +108,12 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(PointerT data) {
if (data) {
evalTo(data);
return false;
} else {
m_result = static_cast<CoeffReturnType*>(
m_result = static_cast<PointerT>(
m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)));
evalTo(m_result);
return true;

@@ -140,23 +141,22 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
}
EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return m_result; }
EIGEN_DEVICE_FUNC PointerT data() const { return m_result; }
#ifdef EIGEN_USE_SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const { return m_device; }
#endif
protected:
EIGEN_DEVICE_FUNC void evalTo(Scalar* data) {
TensorMap<Tensor<CoeffReturnType, NumDims, Layout, Index> > result(
data, m_dimensions);
EIGEN_DEVICE_FUNC void evalTo(PointerT data) {
TensorMap<Tensor<CoeffReturnType, NumDims, Layout, Index> > result(data, m_dimensions);
m_op.func().eval(m_op.expression(), result, m_device);
}
Dimensions m_dimensions;
const ArgType m_op;
const Device& m_device;
CoeffReturnType* m_result;
PointerT m_result;
};

@@ -251,6 +251,7 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
typedef typename PointerType<CoeffReturnType, Device>::Type PointerT;
enum {
IsAligned = false,

@@ -270,12 +271,12 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(PointerT data) {
if (data) {
evalTo(data);
return false;
} else {
m_result = static_cast<Scalar *>(m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)));
m_result = static_cast<PointerT>(m_device.allocate_temp(dimensions().TotalSize() * sizeof(CoeffReturnType)));
evalTo(m_result);
return true;
}

@@ -302,22 +303,22 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
}
EIGEN_DEVICE_FUNC typename internal::traits<XprType>::PointerType data() const { return m_result; }
EIGEN_DEVICE_FUNC PointerT data() const { return m_result; }
#ifdef EIGEN_USE_SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const { return m_device; }
#endif
protected:
EIGEN_DEVICE_FUNC void evalTo(Scalar* data) {
TensorMap<Tensor<Scalar, NumDims, Layout> > result(data, m_dimensions);
EIGEN_DEVICE_FUNC void evalTo(PointerT data) {
TensorMap<Tensor<CoeffReturnType, NumDims, Layout> > result(data, m_dimensions);
m_op.func().eval(m_op.lhsExpression(), m_op.rhsExpression(), result, m_device);
}
Dimensions m_dimensions;
const XprType m_op;
const Device& m_device;
CoeffReturnType* m_result;
PointerT m_result;
};
};
|
||||
|
||||
|
||||
|
@ -12,56 +12,6 @@
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
// Barrier is an object that allows one or more threads to wait until
|
||||
// Notify has been called a specified number of times.
|
||||
class Barrier {
|
||||
public:
|
||||
Barrier(unsigned int count) : state_(count << 1), notified_(false) {
|
||||
eigen_assert(((count << 1) >> 1) == count);
|
||||
}
|
||||
~Barrier() {
|
||||
eigen_assert((state_>>1) == 0);
|
||||
}
|
||||
|
||||
void Notify() {
|
||||
unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2;
|
||||
if (v != 1) {
|
||||
eigen_assert(((v + 2) & ~1) != 0);
|
||||
return; // either count has not dropped to 0, or waiter is not waiting
|
||||
}
|
||||
std::unique_lock<std::mutex> l(mu_);
|
||||
eigen_assert(!notified_);
|
||||
notified_ = true;
|
||||
cv_.notify_all();
|
||||
}
|
||||
|
||||
void Wait() {
|
||||
unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel);
|
||||
if ((v >> 1) == 0) return;
|
||||
std::unique_lock<std::mutex> l(mu_);
|
||||
while (!notified_) {
|
||||
cv_.wait(l);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
std::mutex mu_;
|
||||
std::condition_variable cv_;
|
||||
std::atomic<unsigned int> state_; // low bit is waiter flag
|
||||
bool notified_;
|
||||
};
|
||||
|
||||
|
||||
// Notification is an object that allows a user to to wait for another
|
||||
// thread to signal a notification that an event has occurred.
|
||||
//
|
||||
// Multiple threads can wait on the same Notification object,
|
||||
// but only one caller must call Notify() on the object.
|
||||
struct Notification : Barrier {
|
||||
Notification() : Barrier(1) {};
|
||||
};
|
||||
|
||||
|
||||
// Runs an arbitrary function and then calls Notify() on the passed in
|
||||
// Notification.
|
||||
template <typename Function, typename... Args> struct FunctionWrapperWithNotification
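The Barrier removed from this file is the same class the ThreadPool module header now pulls in via "src/ThreadPool/Barrier.h". A minimal usage sketch (illustrative; assumes the unsupported ThreadPool module is available):

    #define EIGEN_USE_THREADS
    #include <unsupported/Eigen/CXX11/ThreadPool>
    #include <thread>
    #include <vector>

    int main() {
      const unsigned int kTasks = 4;
      Eigen::Barrier barrier(kTasks);        // released once Notify() has been called kTasks times
      std::vector<std::thread> workers;
      for (unsigned int i = 0; i < kTasks; ++i) {
        workers.emplace_back([&barrier] {
          // ... do some work ...
          barrier.Notify();                  // each worker signals completion exactly once
        });
      }
      barrier.Wait();                        // returns only after all kTasks notifications
      for (std::thread& w : workers) w.join();
      return 0;
    }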
@@ -102,7 +52,7 @@ class Allocator {
// Build a thread pool device on top the an existing pool of threads.
struct ThreadPoolDevice {
// The ownership of the thread pool remains with the caller.
ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores, Allocator* allocator = nullptr)
ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores, Allocator* allocator = NULL)
: pool_(pool), num_threads_(num_cores), allocator_(allocator) { }
EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {

@@ -282,7 +232,7 @@ struct ThreadPoolDevice {
// Convenience wrapper for parallelFor that does not align blocks.
void parallelFor(Index n, const TensorOpCost& cost,
std::function<void(Index, Index)> f) const {
parallelFor(n, cost, nullptr, std::move(f));
parallelFor(n, cost, NULL, std::move(f));
}
// Thread pool accessor.
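A minimal sketch of the convenience parallelFor overload above (the TensorOpCost figures are illustrative values, not taken from the commit):

    #define EIGEN_USE_THREADS
    #include <unsupported/Eigen/CXX11/Tensor>
    #include <vector>

    int main() {
      Eigen::ThreadPool pool(4);
      Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);

      const Eigen::Index n = 1 << 20;
      std::vector<float> data(n, 1.0f);

      Eigen::TensorOpCost cost(/*bytes_loaded=*/sizeof(float), /*bytes_stored=*/sizeof(float),
                               /*compute_cycles=*/1);
      device.parallelFor(n, cost, [&data](Eigen::Index first, Eigen::Index last) {
        for (Eigen::Index i = first; i < last; ++i) data[i] *= 2.0f;  // runs over [first, last) per block
      });
      return 0;
    }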
|
||||
|
@ -32,12 +32,12 @@ namespace Eigen {
|
||||
// Boilerplate code
|
||||
namespace internal {
|
||||
|
||||
template<std::size_t n, typename Dimension> struct dget {
|
||||
template<std::ptrdiff_t n, typename Dimension> struct dget {
|
||||
static const std::ptrdiff_t value = get<n, Dimension>::value;
|
||||
};
|
||||
|
||||
|
||||
template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor>
|
||||
template<typename Index, std::ptrdiff_t NumIndices, std::ptrdiff_t n, bool RowMajor>
|
||||
struct fixed_size_tensor_index_linearization_helper
|
||||
{
|
||||
template <typename Dimensions> EIGEN_DEVICE_FUNC
|
||||
@ -50,7 +50,7 @@ struct fixed_size_tensor_index_linearization_helper
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Index, std::size_t NumIndices, bool RowMajor>
|
||||
template<typename Index, std::ptrdiff_t NumIndices, bool RowMajor>
|
||||
struct fixed_size_tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor>
|
||||
{
|
||||
template <typename Dimensions> EIGEN_DEVICE_FUNC
|
||||
@ -60,7 +60,7 @@ struct fixed_size_tensor_index_linearization_helper<Index, NumIndices, 0, RowMaj
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Index, std::size_t n>
|
||||
template<typename Index, std::ptrdiff_t n>
|
||||
struct fixed_size_tensor_index_extraction_helper
|
||||
{
|
||||
template <typename Dimensions> EIGEN_DEVICE_FUNC
|
||||
@ -94,7 +94,7 @@ struct Sizes {
|
||||
typedef internal::numeric_list<std::ptrdiff_t, Indices...> Base;
|
||||
const Base t = Base();
|
||||
static const std::ptrdiff_t total_size = internal::arg_prod(Indices...);
|
||||
static const size_t count = Base::count;
|
||||
static const ptrdiff_t count = Base::count;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t rank() const {
|
||||
return Base::count;
|
||||
@ -121,16 +121,16 @@ struct Sizes {
|
||||
return *this;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const std::size_t index) const {
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const std::ptrdiff_t index) const {
|
||||
return internal::fixed_size_tensor_index_extraction_helper<std::ptrdiff_t, Base::count>::run(index, t);
|
||||
}
|
||||
|
||||
template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
size_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
|
||||
ptrdiff_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
|
||||
return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, t);
|
||||
}
|
||||
template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
size_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
|
||||
ptrdiff_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
|
||||
return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, t);
|
||||
}
|
||||
};
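// A minimal sketch (assuming the unsupported CXX11 Tensor module) of the
// fixed-size Sizes<> interface touched above, now expressed in std::ptrdiff_t:
#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  typedef Eigen::Sizes<2, 3, 4> Dims;
  Eigen::TensorFixedSize<float, Dims> t;
  t.setZero();

  Dims d;
  std::ptrdiff_t r = d.rank();   // 3
  std::ptrdiff_t n1 = d[1];      // 3
  // Column-major linear index of element (1, 2, 3): 1 + 2*2 + 3*6 = 23.
  std::ptrdiff_t idx = d.IndexOfColMajor(Eigen::array<std::ptrdiff_t, 3>{{1, 2, 3}});
  std::cout << r << " " << n1 << " " << idx << "\n";
  return 0;
}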
|
||||
@ -144,25 +144,25 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes<Indi
|
||||
|
||||
#else
|
||||
|
||||
template <std::size_t n>
|
||||
template <std::ptrdiff_t n>
|
||||
struct non_zero_size {
|
||||
typedef internal::type2val<std::size_t, n> type;
|
||||
typedef internal::type2val<std::ptrdiff_t, n> type;
|
||||
};
|
||||
template <>
|
||||
struct non_zero_size<0> {
|
||||
typedef internal::null_type type;
|
||||
};
|
||||
|
||||
template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0, std::size_t V5=0> struct Sizes {
|
||||
template <std::ptrdiff_t V1=0, std::ptrdiff_t V2=0, std::ptrdiff_t V3=0, std::ptrdiff_t V4=0, std::ptrdiff_t V5=0> struct Sizes {
|
||||
typedef typename internal::make_type_list<typename non_zero_size<V1>::type, typename non_zero_size<V2>::type, typename non_zero_size<V3>::type, typename non_zero_size<V4>::type, typename non_zero_size<V5>::type >::type Base;
|
||||
static const size_t count = Base::count;
|
||||
static const std::size_t total_size = internal::arg_prod<Base>::value;
|
||||
static const std::ptrdiff_t count = Base::count;
|
||||
static const std::ptrdiff_t total_size = internal::arg_prod<Base>::value;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const {
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptrdiff_t rank() const {
|
||||
return count;
|
||||
}
|
||||
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptrdiff_t TotalSize() {
|
||||
return internal::arg_prod<Base>::value;
|
||||
}
|
||||
|
||||
@ -178,7 +178,7 @@ template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0
|
||||
|
||||
#if EIGEN_HAS_VARIADIC_TEMPLATES
|
||||
template <typename... DenseIndex> Sizes(DenseIndex... /*indices*/) { }
|
||||
explicit Sizes(std::initializer_list<std::size_t>) {
|
||||
explicit Sizes(std::initializer_list<std::ptrdiff_t>) {
|
||||
// todo: add assertion
|
||||
}
|
||||
#else
|
||||
@ -213,18 +213,18 @@ template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0
|
||||
}
|
||||
|
||||
template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
size_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
|
||||
ptrdiff_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
|
||||
return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, *reinterpret_cast<const Base*>(this));
|
||||
}
|
||||
template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
size_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
|
||||
ptrdiff_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
|
||||
return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, *reinterpret_cast<const Base*>(this));
|
||||
}
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes<V1, V2, V3, V4, V5>&) {
|
||||
template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes<V1, V2, V3, V4, V5>&) {
|
||||
return Sizes<V1, V2, V3, V4, V5>::total_size;
|
||||
}
|
||||
}
|
||||
@ -233,7 +233,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes<V1, V2,
|
||||
|
||||
// Boilerplate
|
||||
namespace internal {
|
||||
template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor>
|
||||
template<typename Index, std::ptrdiff_t NumIndices, std::ptrdiff_t n, bool RowMajor>
|
||||
struct tensor_index_linearization_helper
|
||||
{
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
@ -245,7 +245,7 @@ struct tensor_index_linearization_helper
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Index, std::size_t NumIndices, bool RowMajor>
|
||||
template<typename Index, std::ptrdiff_t NumIndices, bool RowMajor>
|
||||
struct tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor>
|
||||
{
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
@ -264,7 +264,7 @@ struct DSizes : array<DenseIndex, NumDims> {
|
||||
typedef array<DenseIndex, NumDims> Base;
|
||||
static const int count = NumDims;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const {
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const {
|
||||
return NumDims;
|
||||
}
|
||||
|
||||
@ -298,7 +298,7 @@ struct DSizes : array<DenseIndex, NumDims> {
|
||||
}
|
||||
}
|
||||
#else
|
||||
template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5>
|
||||
template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5>
|
||||
EIGEN_DEVICE_FUNC DSizes(const Sizes<V1, V2, V3, V4, V5>& a) {
|
||||
for (int i = 0 ; i < NumDims; ++i) {
|
||||
(*this)[i] = a[i];
|
||||
@ -359,7 +359,7 @@ struct DSizes : array<DenseIndex, NumDims> {
|
||||
|
||||
// Boilerplate
|
||||
namespace internal {
|
||||
template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor>
|
||||
template<typename Index, std::ptrdiff_t NumIndices, std::ptrdiff_t n, bool RowMajor>
|
||||
struct tensor_vsize_index_linearization_helper
|
||||
{
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
@ -371,7 +371,7 @@ struct tensor_vsize_index_linearization_helper
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Index, std::size_t NumIndices, bool RowMajor>
|
||||
template<typename Index, std::ptrdiff_t NumIndices, bool RowMajor>
|
||||
struct tensor_vsize_index_linearization_helper<Index, NumIndices, 0, RowMajor>
|
||||
{
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
@ -386,10 +386,10 @@ struct tensor_vsize_index_linearization_helper<Index, NumIndices, 0, RowMajor>
|
||||
namespace internal {
|
||||
|
||||
template <typename DenseIndex, int NumDims> struct array_size<const DSizes<DenseIndex, NumDims> > {
|
||||
static const size_t value = NumDims;
|
||||
static const ptrdiff_t value = NumDims;
|
||||
};
|
||||
template <typename DenseIndex, int NumDims> struct array_size<DSizes<DenseIndex, NumDims> > {
|
||||
static const size_t value = NumDims;
|
||||
static const ptrdiff_t value = NumDims;
|
||||
};
|
||||
#ifndef EIGEN_EMULATE_CXX11_META_H
|
||||
template <typename std::ptrdiff_t... Indices> struct array_size<const Sizes<Indices...> > {
|
||||
@ -399,33 +399,33 @@ template <typename std::ptrdiff_t... Indices> struct array_size<Sizes<Indices...
|
||||
static const std::ptrdiff_t value = Sizes<Indices...>::count;
|
||||
};
|
||||
template <std::ptrdiff_t n, typename std::ptrdiff_t... Indices> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<Indices...>&) {
|
||||
return get<n, internal::numeric_list<std::size_t, Indices...> >::value;
|
||||
return get<n, internal::numeric_list<std::ptrdiff_t, Indices...> >::value;
|
||||
}
|
||||
template <std::ptrdiff_t n> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<>&) {
|
||||
eigen_assert(false && "should never be called");
|
||||
return -1;
|
||||
}
|
||||
#else
|
||||
template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> struct array_size<const Sizes<V1,V2,V3,V4,V5> > {
|
||||
static const size_t value = Sizes<V1,V2,V3,V4,V5>::count;
|
||||
template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5> struct array_size<const Sizes<V1,V2,V3,V4,V5> > {
|
||||
static const ptrdiff_t value = Sizes<V1,V2,V3,V4,V5>::count;
|
||||
};
|
||||
template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> struct array_size<Sizes<V1,V2,V3,V4,V5> > {
|
||||
static const size_t value = Sizes<V1,V2,V3,V4,V5>::count;
|
||||
template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5> struct array_size<Sizes<V1,V2,V3,V4,V5> > {
|
||||
static const ptrdiff_t value = Sizes<V1,V2,V3,V4,V5>::count;
|
||||
};
|
||||
template <std::size_t n, std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes<V1,V2,V3,V4,V5>&) {
|
||||
template <std::ptrdiff_t n, std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<V1,V2,V3,V4,V5>&) {
|
||||
return get<n, typename Sizes<V1,V2,V3,V4,V5>::Base>::value;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
template <typename Dims1, typename Dims2, size_t n, size_t m>
|
||||
template <typename Dims1, typename Dims2, ptrdiff_t n, ptrdiff_t m>
|
||||
struct sizes_match_below_dim {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
template <typename Dims1, typename Dims2, size_t n>
|
||||
template <typename Dims1, typename Dims2, ptrdiff_t n>
|
||||
struct sizes_match_below_dim<Dims1, Dims2, n, n> {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1& dims1, Dims2& dims2) {
|
||||
return (array_get<n-1>(dims1) == array_get<n-1>(dims2)) &
|
||||
|
@ -133,7 +133,7 @@ class TensorExecutor<Expression, DefaultDevice, Vectorizable,
|
||||
if (needs_assign) {
|
||||
// Size tensor blocks to fit in cache (or requested target block size).
|
||||
Index block_total_size = numext::mini(cache_size, total_size);
|
||||
TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims;
|
||||
TensorBlockShapeType block_shape = kSkewedInnerDims;
|
||||
// Query expression tree for desired block size/shape.
|
||||
std::vector<TensorOpResourceRequirements> resources;
|
||||
evaluator.getResourceRequirements(&resources);
|
||||
@ -229,12 +229,8 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> {
|
||||
typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
|
||||
|
||||
Evaluator evaluator(expr, device);
|
||||
const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
|
||||
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
|
||||
if (needs_assign) {
|
||||
const StorageIndex PacketSize =
|
||||
Vectorizable
|
||||
? unpacket_traits<typename Evaluator::PacketReturnType>::size
|
||||
: 1;
|
||||
const StorageIndex size = array_prod(evaluator.dimensions());
|
||||
device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
|
||||
EvalRange::alignBlockSize,
|
||||
@ -259,12 +255,11 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr
|
||||
|
||||
static EIGEN_STRONG_INLINE void run(const Expression& expr,
|
||||
const ThreadPoolDevice& device) {
|
||||
typedef TensorBlock<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout> TensorBlock;
|
||||
typedef TensorBlockMapper<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout> TensorBlockMapper;
|
||||
|
||||
Evaluator evaluator(expr, device);
|
||||
StorageIndex total_size = array_prod(evaluator.dimensions());
|
||||
StorageIndex cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
|
||||
Index total_size = array_prod(evaluator.dimensions());
|
||||
Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
|
||||
if (total_size < cache_size) {
|
||||
// TODO(andydavis) Reduce block management overhead for small tensors.
|
||||
internal::TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
|
||||
@ -273,9 +268,9 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr
|
||||
return;
|
||||
}
|
||||
|
||||
const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
|
||||
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
|
||||
if (needs_assign) {
|
||||
TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims;
|
||||
TensorBlockShapeType block_shape = kSkewedInnerDims;
|
||||
Index block_total_size = 0;
|
||||
// Query expression tree for desired block size/shape.
|
||||
std::vector<internal::TensorOpResourceRequirements> resources;
|
||||
|
@ -24,6 +24,14 @@ template<typename T> struct MakePointer {
|
||||
typedef T ScalarType;
|
||||
};
|
||||
|
||||
// The PointerType class is a container of the device-specific pointer
// used for referring to a Pointer on the TensorEvaluator class. While the TensorExpression
// is a device-agnostic type and needs the MakePointer class for type conversion,
// the TensorEvaluator class can be specialized for a device, hence it is possible
// to construct different types of temporary storage memory in TensorEvaluator
// for different devices by specializing the following PointerType class.
template<typename T, typename Device> struct PointerType : MakePointer<T>{};
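// A hypothetical sketch (the names MyGpuDevice and MyDevicePtr are made up,
// not part of Eigen) of how a device could customize the pointer type seen by
// TensorEvaluator by specializing PointerType:
#include <unsupported/Eigen/CXX11/Tensor>

struct MyGpuDevice;                         // assumed device tag
template <typename T> struct MyDevicePtr {  // assumed device-side pointer wrapper
  typedef T* Type;
};

namespace Eigen {
template <typename T>
struct PointerType<T, MyGpuDevice> {
  typedef typename MyDevicePtr<T>::Type Type;  // what evaluators use as "pointer"
  typedef T ScalarType;
};
}  // namespace Eigen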
|
||||
|
||||
namespace internal{
|
||||
template<typename A, typename B> struct Pointer_type_promotion {
|
||||
static const bool val=false;
|
||||
|
@ -54,36 +54,6 @@ struct functor_traits<scalar_fmod_op<Scalar> > {
|
||||
PacketAccess = false };
|
||||
};
|
||||
|
||||
|
||||
/** \internal
|
||||
* \brief Template functor to compute the sigmoid of a scalar
|
||||
* \sa class CwiseUnaryOp, ArrayBase::sigmoid()
|
||||
*/
|
||||
template <typename T>
|
||||
struct scalar_sigmoid_op {
|
||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_op)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const {
|
||||
const T one = T(1);
|
||||
return one / (one + numext::exp(-x));
|
||||
}
|
||||
|
||||
template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Packet packetOp(const Packet& x) const {
|
||||
const Packet one = pset1<Packet>(T(1));
|
||||
return pdiv(one, padd(one, pexp(pnegate(x))));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct functor_traits<scalar_sigmoid_op<T> > {
|
||||
enum {
|
||||
Cost = NumTraits<T>::AddCost * 2 + NumTraits<T>::MulCost * 6,
|
||||
PacketAccess = packet_traits<T>::HasAdd && packet_traits<T>::HasDiv &&
|
||||
packet_traits<T>::HasNegate && packet_traits<T>::HasExp
|
||||
};
|
||||
};
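// A standalone illustration (plain C++, not Eigen-specific) of the formula the
// functor above implements, sigmoid(x) = 1 / (1 + exp(-x)); the packet path
// mirrors the same expression with pdiv/padd/pexp/pnegate element-wise.
#include <cmath>
#include <cstdio>

static double sigmoid(double x) { return 1.0 / (1.0 + std::exp(-x)); }

int main() {
  std::printf("%f %f %f\n", sigmoid(-2.0), sigmoid(0.0), sigmoid(2.0));
  // prints approximately 0.119203 0.500000 0.880797
  return 0;
}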
|
||||
|
||||
|
||||
template<typename Reducer, typename Device>
|
||||
struct reducer_traits {
|
||||
enum {
|
||||
|
@ -84,7 +84,7 @@ template<DenseIndex n> struct NumTraits<type2index<n> >
|
||||
namespace internal {
|
||||
template <typename T>
|
||||
EIGEN_DEVICE_FUNC void update_value(T& val, DenseIndex new_val) {
|
||||
val = new_val;
|
||||
val = internal::convert_index<T>(new_val);
|
||||
}
|
||||
template <DenseIndex n>
|
||||
EIGEN_DEVICE_FUNC void update_value(type2index<n>& val, DenseIndex new_val) {
|
||||
|
@ -527,7 +527,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices())
|
||||
{
|
||||
for (std::size_t i = 0; i < internal::array_size<Dimensions>::value; ++i) {
|
||||
for (Index i = 0; i < internal::array_size<Dimensions>::value; ++i) {
|
||||
eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]);
|
||||
}
|
||||
|
||||
@ -985,7 +985,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
|
||||
// Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero
|
||||
DSizes<Index,NumDims> startIndicesClamped, stopIndicesClamped;
|
||||
m_is_identity = true;
|
||||
for (size_t i = 0; i < internal::array_size<Dimensions>::value; ++i) {
|
||||
for (Index i = 0; i < internal::array_size<Dimensions>::value; ++i) {
|
||||
if (m_strides[i] != 1 || op.startIndices()[i] != 0 ||
|
||||
op.stopIndices()[i] != (m_impl.dimensions()[i] - 1)) {
|
||||
m_is_identity = false;
|
||||
|
64
unsupported/Eigen/CXX11/src/ThreadPool/Barrier.h
Normal file
@ -0,0 +1,64 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2018 Rasmus Munk Larsen <rmlarsen@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// Barrier is an object that allows one or more threads to wait until
// Notify has been called a specified number of times.

#ifndef EIGEN_CXX11_THREADPOOL_BARRIER_H
#define EIGEN_CXX11_THREADPOOL_BARRIER_H

namespace Eigen {

class Barrier {
 public:
  Barrier(unsigned int count) : state_(count << 1), notified_(false) {
    eigen_assert(((count << 1) >> 1) == count);
  }
  ~Barrier() { eigen_plain_assert((state_ >> 1) == 0); }

  void Notify() {
    unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2;
    if (v != 1) {
      eigen_assert(((v + 2) & ~1) != 0);
      return;  // either count has not dropped to 0, or waiter is not waiting
    }
    std::unique_lock<std::mutex> l(mu_);
    eigen_assert(!notified_);
    notified_ = true;
    cv_.notify_all();
  }

  void Wait() {
    unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel);
    if ((v >> 1) == 0) return;
    std::unique_lock<std::mutex> l(mu_);
    while (!notified_) {
      cv_.wait(l);
    }
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  std::atomic<unsigned int> state_;  // low bit is waiter flag
  bool notified_;
};

// Notification is an object that allows a user to wait for another
// thread to signal a notification that an event has occurred.
//
// Multiple threads can wait on the same Notification object,
// but only one caller must call Notify() on the object.
struct Notification : Barrier {
  Notification() : Barrier(1){};
};

}  // namespace Eigen

#endif  // EIGEN_CXX11_THREADPOOL_BARRIER_H
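// A minimal usage sketch (assuming the ThreadPool module header pulls in the
// new Barrier.h; thread and task counts are illustrative): workers call
// Notify(), the main thread blocks in Wait() until the count reaches zero.
#include <unsupported/Eigen/CXX11/ThreadPool>
#include <thread>
#include <vector>

int main() {
  const unsigned kTasks = 4;
  Eigen::Barrier barrier(kTasks);  // expects Notify() to be called kTasks times
  Eigen::Notification done;        // single-shot: Barrier(1)

  std::vector<std::thread> workers;
  for (unsigned i = 0; i < kTasks; ++i)
    workers.emplace_back([&] { /* ... do some work ... */ barrier.Notify(); });

  barrier.Wait();  // returns once all kTasks notifications have arrived
  done.Notify();   // signal a separate one-off event
  done.Wait();     // returns immediately: already notified

  for (std::thread& t : workers) t.join();
  return 0;
}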
|
@ -58,7 +58,7 @@ class EventCount {
|
||||
|
||||
~EventCount() {
|
||||
// Ensure there are no waiters.
|
||||
eigen_assert((state_.load() & (kStackMask | kWaiterMask)) == kStackMask);
|
||||
eigen_plain_assert((state_.load() & (kStackMask | kWaiterMask)) == kStackMask);
|
||||
}
|
||||
|
||||
// Prewait prepares for waiting.
|
||||
@ -169,7 +169,8 @@ class EventCount {
|
||||
|
||||
class Waiter {
|
||||
friend class EventCount;
|
||||
// Align to 128 byte boundary to prevent false sharing with other Waiter objects in the same vector.
|
||||
// Align to 128 byte boundary to prevent false sharing with other Waiter
|
||||
// objects in the same vector.
|
||||
EIGEN_ALIGN_TO_BOUNDARY(128) std::atomic<Waiter*> next;
|
||||
std::mutex mu;
|
||||
std::condition_variable cv;
|
||||
|
@ -10,7 +10,6 @@
|
||||
#ifndef EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
|
||||
#define EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
|
||||
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
template <typename Environment>
|
||||
@ -23,7 +22,7 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
|
||||
: ThreadPoolTempl(num_threads, true, env) {}
|
||||
|
||||
ThreadPoolTempl(int num_threads, bool allow_spinning,
|
||||
Environment env = Environment())
|
||||
Environment env = Environment())
|
||||
: env_(env),
|
||||
num_threads_(num_threads),
|
||||
allow_spinning_(allow_spinning),
|
||||
@ -58,12 +57,18 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
|
||||
coprimes_.push_back(i);
|
||||
}
|
||||
}
|
||||
queues_.resize(num_threads_);
|
||||
#ifndef EIGEN_THREAD_LOCAL
|
||||
init_barrier_.reset(new Barrier(num_threads_));
|
||||
#endif
|
||||
for (int i = 0; i < num_threads_; i++) {
|
||||
queues_.push_back(new Queue());
|
||||
}
|
||||
for (int i = 0; i < num_threads_; i++) {
|
||||
threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); }));
|
||||
threads_.emplace_back(env_.CreateThread([this, i]() { WorkerLoop(i); }));
|
||||
}
|
||||
#ifndef EIGEN_THREAD_LOCAL
|
||||
// Wait for workers to initialize per_thread_map_. Otherwise we might race
|
||||
// with them in Schedule or CurrentThreadId.
|
||||
init_barrier_->Wait();
|
||||
#endif
|
||||
}
|
||||
|
||||
~ThreadPoolTempl() {
|
||||
@ -78,13 +83,13 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
|
||||
// Since we were cancelled, there might be entries in the queues.
|
||||
// Empty them to prevent their destructor from asserting.
|
||||
for (size_t i = 0; i < queues_.size(); i++) {
|
||||
queues_[i]->Flush();
|
||||
queues_[i].Flush();
|
||||
}
|
||||
}
|
||||
|
||||
// Join threads explicitly to avoid destruction order issues.
|
||||
for (size_t i = 0; i < num_threads_; i++) delete threads_[i];
|
||||
for (size_t i = 0; i < num_threads_; i++) delete queues_[i];
|
||||
threads_.resize(0);
|
||||
queues_.resize(0);
|
||||
}
|
||||
|
||||
void Schedule(std::function<void()> fn) {
|
||||
@ -92,13 +97,13 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
|
||||
PerThread* pt = GetPerThread();
|
||||
if (pt->pool == this) {
|
||||
// Worker thread of this pool, push onto the thread's queue.
|
||||
Queue* q = queues_[pt->thread_id];
|
||||
t = q->PushFront(std::move(t));
|
||||
Queue& q = queues_[pt->thread_id];
|
||||
t = q.PushFront(std::move(t));
|
||||
} else {
|
||||
// A free-standing thread (or worker of another pool), push onto a random
|
||||
// queue.
|
||||
Queue* q = queues_[Rand(&pt->rand) % queues_.size()];
|
||||
t = q->PushBack(std::move(t));
|
||||
Queue& q = queues_[Rand(&pt->rand) % queues_.size()];
|
||||
t = q.PushBack(std::move(t));
|
||||
}
|
||||
// Note: below we touch this after making w available to worker threads.
|
||||
// Strictly speaking, this can lead to a racy-use-after-free. Consider that
|
||||
@ -109,8 +114,7 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
|
||||
// this is kept alive while any threads can potentially be in Schedule.
|
||||
if (!t.f) {
|
||||
ec_.Notify(false);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
env_.ExecuteTask(t); // Push failed, execute directly.
|
||||
}
|
||||
}
|
||||
@ -130,13 +134,10 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
|
||||
ec_.Notify(true);
|
||||
}
|
||||
|
||||
int NumThreads() const final {
|
||||
return num_threads_;
|
||||
}
|
||||
int NumThreads() const final { return num_threads_; }
|
||||
|
||||
int CurrentThreadId() const final {
|
||||
const PerThread* pt =
|
||||
const_cast<ThreadPoolTempl*>(this)->GetPerThread();
|
||||
const PerThread* pt = const_cast<ThreadPoolTempl*>(this)->GetPerThread();
|
||||
if (pt->pool == this) {
|
||||
return pt->thread_id;
|
||||
} else {
|
||||
@ -148,17 +149,21 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
|
||||
typedef typename Environment::EnvThread Thread;
|
||||
|
||||
struct PerThread {
|
||||
constexpr PerThread() : pool(NULL), rand(0), thread_id(-1) { }
|
||||
constexpr PerThread() : pool(NULL), rand(0), thread_id(-1) {}
|
||||
ThreadPoolTempl* pool; // Parent pool, or null for normal threads.
|
||||
uint64_t rand; // Random generator state.
|
||||
int thread_id; // Worker thread index in pool.
|
||||
uint64_t rand; // Random generator state.
|
||||
int thread_id; // Worker thread index in pool.
|
||||
#ifndef EIGEN_THREAD_LOCAL
|
||||
// Prevent false sharing.
|
||||
char pad_[128];
|
||||
#endif
|
||||
};
|
||||
|
||||
Environment env_;
|
||||
const int num_threads_;
|
||||
const bool allow_spinning_;
|
||||
MaxSizeVector<Thread*> threads_;
|
||||
MaxSizeVector<Queue*> queues_;
|
||||
MaxSizeVector<std::unique_ptr<Thread> > threads_;
|
||||
MaxSizeVector<Queue> queues_;
|
||||
MaxSizeVector<unsigned> coprimes_;
|
||||
MaxSizeVector<EventCount::Waiter> waiters_;
|
||||
std::atomic<unsigned> blocked_;
|
||||
@ -166,14 +171,27 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
|
||||
std::atomic<bool> done_;
|
||||
std::atomic<bool> cancelled_;
|
||||
EventCount ec_;
|
||||
#ifndef EIGEN_THREAD_LOCAL
|
||||
std::unique_ptr<Barrier> init_barrier_;
|
||||
std::mutex per_thread_map_mutex_; // Protects per_thread_map_.
|
||||
std::unordered_map<uint64_t, std::unique_ptr<PerThread>> per_thread_map_;
|
||||
#endif
|
||||
|
||||
// Main worker thread loop.
|
||||
void WorkerLoop(int thread_id) {
|
||||
#ifndef EIGEN_THREAD_LOCAL
|
||||
std::unique_ptr<PerThread> new_pt(new PerThread());
|
||||
per_thread_map_mutex_.lock();
|
||||
eigen_assert(per_thread_map_.emplace(GlobalThreadIdHash(), std::move(new_pt)).second);
|
||||
per_thread_map_mutex_.unlock();
|
||||
init_barrier_->Notify();
|
||||
init_barrier_->Wait();
|
||||
#endif
|
||||
PerThread* pt = GetPerThread();
|
||||
pt->pool = this;
|
||||
pt->rand = std::hash<std::thread::id>()(std::this_thread::get_id());
|
||||
pt->rand = GlobalThreadIdHash();
|
||||
pt->thread_id = thread_id;
|
||||
Queue* q = queues_[thread_id];
|
||||
Queue& q = queues_[thread_id];
|
||||
EventCount::Waiter* waiter = &waiters_[thread_id];
|
||||
// TODO(dvyukov,rmlarsen): The time spent in Steal() is proportional
|
||||
// to num_threads_ and we assume that new work is scheduled at a
|
||||
@ -189,10 +207,10 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
|
||||
// counter-productive for the types of I/O workloads the single thread
|
||||
// pools tend to be used for.
|
||||
while (!cancelled_) {
|
||||
Task t = q->PopFront();
|
||||
Task t = q.PopFront();
|
||||
for (int i = 0; i < spin_count && !t.f; i++) {
|
||||
if (!cancelled_.load(std::memory_order_relaxed)) {
|
||||
t = q->PopFront();
|
||||
t = q.PopFront();
|
||||
}
|
||||
}
|
||||
if (!t.f) {
|
||||
@ -206,7 +224,7 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
|
||||
}
|
||||
} else {
|
||||
while (!cancelled_) {
|
||||
Task t = q->PopFront();
|
||||
Task t = q.PopFront();
|
||||
if (!t.f) {
|
||||
t = Steal();
|
||||
if (!t.f) {
|
||||
@ -243,7 +261,7 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
|
||||
unsigned inc = coprimes_[r % coprimes_.size()];
|
||||
unsigned victim = r % size;
|
||||
for (unsigned i = 0; i < size; i++) {
|
||||
Task t = queues_[victim]->PopBack();
|
||||
Task t = queues_[victim].PopBack();
|
||||
if (t.f) {
|
||||
return t;
|
||||
}
|
||||
@ -270,7 +288,7 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
|
||||
if (cancelled_) {
|
||||
return false;
|
||||
} else {
|
||||
*t = queues_[victim]->PopBack();
|
||||
*t = queues_[victim].PopBack();
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@ -278,7 +296,8 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
|
||||
// If we are shutting down and all worker threads blocked without work,
|
||||
// that means we are done.
|
||||
blocked_++;
|
||||
if (done_ && blocked_ == num_threads_) {
|
||||
// TODO is blocked_ required to be unsigned?
|
||||
if (done_ && blocked_ == static_cast<unsigned>(num_threads_)) {
|
||||
ec_.CancelWait(waiter);
|
||||
// Almost done, but need to re-check queues.
|
||||
// Consider that all queues are empty and all worker threads are preempted
|
||||
@ -311,7 +330,7 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
|
||||
unsigned inc = coprimes_[r % coprimes_.size()];
|
||||
unsigned victim = r % size;
|
||||
for (unsigned i = 0; i < size; i++) {
|
||||
if (!queues_[victim]->Empty()) {
|
||||
if (!queues_[victim].Empty()) {
|
||||
return victim;
|
||||
}
|
||||
victim += inc;
|
||||
@ -322,10 +341,24 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
|
||||
return -1;
|
||||
}
|
||||
|
||||
static EIGEN_STRONG_INLINE PerThread* GetPerThread() {
|
||||
static EIGEN_STRONG_INLINE uint64_t GlobalThreadIdHash() {
|
||||
return std::hash<std::thread::id>()(std::this_thread::get_id());
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE PerThread* GetPerThread() {
|
||||
#ifndef EIGEN_THREAD_LOCAL
|
||||
static PerThread dummy;
|
||||
auto it = per_thread_map_.find(GlobalThreadIdHash());
|
||||
if (it == per_thread_map_.end()) {
|
||||
return &dummy;
|
||||
} else {
|
||||
return it->second.get();
|
||||
}
|
||||
#else
|
||||
EIGEN_THREAD_LOCAL PerThread per_thread_;
|
||||
PerThread* pt = &per_thread_;
|
||||
return pt;
|
||||
#endif
|
||||
}
|
||||
|
||||
static EIGEN_STRONG_INLINE unsigned Rand(uint64_t* state) {
|
||||
@ -333,7 +366,8 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
|
||||
// Update the internal state
|
||||
*state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
|
||||
// Generate the random output (using the PCG-XSH-RS scheme)
|
||||
return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
|
||||
return static_cast<unsigned>((current ^ (current >> 22)) >>
|
||||
(22 + (current >> 61)));
|
||||
}
|
||||
};
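// A minimal sketch of driving the pool above (assuming the unsupported CXX11
// ThreadPool module, where Eigen::ThreadPool is the non-blocking
// ThreadPoolTempl instantiation; the counts below are illustrative):
#include <unsupported/Eigen/CXX11/ThreadPool>
#include <atomic>
#include <cstdio>

int main() {
  Eigen::ThreadPool pool(4);  // 4 worker threads
  std::atomic<int> sum(0);
  Eigen::Barrier barrier(8);  // one Notify() per scheduled task

  for (int i = 1; i <= 8; ++i) {
    pool.Schedule([i, &sum, &barrier] {
      sum.fetch_add(i, std::memory_order_relaxed);
      barrier.Notify();
    });
  }
  barrier.Wait();                         // all 8 tasks have run
  std::printf("sum = %d\n", sum.load());  // 36
  return 0;
}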
|
||||
|
||||
|
@ -10,7 +10,6 @@
|
||||
#ifndef EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_
|
||||
#define EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_
|
||||
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
// RunQueue is a fixed-size, partially non-blocking deque of Work items.
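// A hypothetical single-threaded illustration of the queue's contract (this is
// an internal class; the owner thread works on the front, other threads steal
// from the back, and a full queue hands the item back to the caller):
#include <unsupported/Eigen/CXX11/ThreadPool>
#include <cassert>

int main() {
  Eigen::RunQueue<int, 4> q;  // capacity must be a power of two
  assert(q.Empty());

  q.PushFront(1);             // owner side; returns a default-constructed int on success
  q.PushFront(2);

  int stolen = q.PopBack();   // "thief" side: the oldest item, here 1
  int own = q.PopFront();     // owner side: the newest item, here 2
  assert(stolen == 1 && own == 2);
  assert(q.Empty());
  return 0;
}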
@ -47,7 +46,7 @@ class RunQueue {
|
||||
array_[i].state.store(kEmpty, std::memory_order_relaxed);
|
||||
}
|
||||
|
||||
~RunQueue() { eigen_assert(Size() == 0); }
|
||||
~RunQueue() { eigen_plain_assert(Size() == 0); }
|
||||
|
||||
// PushFront inserts w at the beginning of the queue.
|
||||
// If queue is full returns w, otherwise returns default-constructed Work.
|
||||
@ -131,9 +130,8 @@ class RunQueue {
|
||||
Elem* e = &array_[mid & kMask];
|
||||
uint8_t s = e->state.load(std::memory_order_relaxed);
|
||||
if (n == 0) {
|
||||
if (s != kReady ||
|
||||
!e->state.compare_exchange_strong(s, kBusy,
|
||||
std::memory_order_acquire))
|
||||
if (s != kReady || !e->state.compare_exchange_strong(
|
||||
s, kBusy, std::memory_order_acquire))
|
||||
continue;
|
||||
start = mid;
|
||||
} else {
|
||||
|
@ -10,13 +10,45 @@
|
||||
#ifndef EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
|
||||
#define EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
|
||||
|
||||
// Try to come up with a portable implementation of thread local variables
|
||||
#if EIGEN_COMP_GNUC && EIGEN_GNUC_AT_MOST(4, 7)
|
||||
#define EIGEN_THREAD_LOCAL static __thread
|
||||
#elif EIGEN_COMP_CLANG
|
||||
#define EIGEN_THREAD_LOCAL static __thread
|
||||
#else
|
||||
#if EIGEN_MAX_CPP_VER >= 11 && \
|
||||
((EIGEN_COMP_GNUC && EIGEN_GNUC_AT_LEAST(4, 8)) || \
|
||||
__has_feature(cxx_thread_local))
|
||||
#define EIGEN_THREAD_LOCAL static thread_local
|
||||
#endif
|
||||
|
||||
// Disable TLS for Apple and Android builds with older toolchains.
|
||||
#if defined(__APPLE__)
|
||||
// Included for TARGET_OS_IPHONE, __IPHONE_OS_VERSION_MIN_REQUIRED,
|
||||
// __IPHONE_8_0.
|
||||
#include <Availability.h>
|
||||
#include <TargetConditionals.h>
|
||||
#endif
|
||||
// Checks whether C++11's `thread_local` storage duration specifier is
|
||||
// supported.
|
||||
#if defined(__apple_build_version__) && \
|
||||
((__apple_build_version__ < 8000042) || \
|
||||
(TARGET_OS_IPHONE && __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_9_0))
|
||||
// Notes: Xcode's clang did not support `thread_local` until version
|
||||
// 8, and even then not for all iOS < 9.0.
|
||||
#undef EIGEN_THREAD_LOCAL
|
||||
|
||||
#elif defined(__ANDROID__) && EIGEN_COMP_CLANG
|
||||
// There are platforms for which TLS should not be used even though the compiler
|
||||
// makes it seem like it's supported (Android NDK < r12b for example).
|
||||
// This is primarily because of linker problems and toolchain misconfiguration:
|
||||
// TLS isn't supported until NDK r12b per
|
||||
// https://developer.android.com/ndk/downloads/revision_history.html
|
||||
// Since NDK r16, `__NDK_MAJOR__` and `__NDK_MINOR__` are defined in
|
||||
// <android/ndk-version.h>. For NDK < r16, users should define these macros,
|
||||
// e.g. `-D__NDK_MAJOR__=11 -D__NKD_MINOR__=0` for NDK r11.
|
||||
#if __has_include(<android/ndk-version.h>)
|
||||
#include <android/ndk-version.h>
|
||||
#endif // __has_include(<android/ndk-version.h>)
|
||||
#if defined(__ANDROID__) && defined(__clang__) && defined(__NDK_MAJOR__) && \
|
||||
defined(__NDK_MINOR__) && \
|
||||
((__NDK_MAJOR__ < 12) || ((__NDK_MAJOR__ == 12) && (__NDK_MINOR__ < 1)))
|
||||
#undef EIGEN_THREAD_LOCAL
|
||||
#endif
|
||||
#endif // defined(__ANDROID__) && defined(__clang__)
|
||||
|
||||
#endif // EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
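// A minimal sketch of what the macro is for (assuming a toolchain where it
// survives the checks above; the counter is illustrative): declare a
// function-local variable with per-thread storage, with a fallback when
// EIGEN_THREAD_LOCAL ends up undefined.
#include <unsupported/Eigen/CXX11/ThreadPool>  // defines EIGEN_THREAD_LOCAL when supported

int BumpPerThreadCounter() {
#ifdef EIGEN_THREAD_LOCAL
  EIGEN_THREAD_LOCAL int counter = 0;  // expands to `static thread_local` (or `static __thread`)
#else
  static int counter = 0;              // no TLS available: shared across threads
#endif
  return ++counter;
}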
|
||||
|
@ -25,6 +25,11 @@ template <typename T, size_t n> class array {
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { return values[index]; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE T& at(size_t index) { eigen_assert(index < size()); return values[index]; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE const T& at(size_t index) const { eigen_assert(index < size()); return values[index]; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE T& front() { return values[0]; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
@ -202,16 +207,16 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array<T,N>& a) {
|
||||
}
|
||||
|
||||
template<class T, std::size_t N> struct array_size<array<T,N> > {
|
||||
static const size_t value = N;
|
||||
enum { value = N };
|
||||
};
|
||||
template<class T, std::size_t N> struct array_size<array<T,N>& > {
|
||||
static const size_t value = N;
|
||||
enum { value = N };
|
||||
};
|
||||
template<class T, std::size_t N> struct array_size<const array<T,N> > {
|
||||
static const size_t value = N;
|
||||
enum { value = N };
|
||||
};
|
||||
template<class T, std::size_t N> struct array_size<const array<T,N>& > {
|
||||
static const size_t value = N;
|
||||
enum { value = N };
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
@ -35,7 +35,6 @@ class MaxSizeVector {
|
||||
explicit MaxSizeVector(size_t n)
|
||||
: reserve_(n), size_(0),
|
||||
data_(static_cast<T*>(internal::aligned_malloc(n * sizeof(T)))) {
|
||||
for (size_t i = 0; i < n; ++i) { new (&data_[i]) T; }
|
||||
}
|
||||
|
||||
// Construct a new MaxSizeVector, reserve and resize to n.
|
||||
@ -44,35 +43,55 @@ class MaxSizeVector {
|
||||
MaxSizeVector(size_t n, const T& init)
|
||||
: reserve_(n), size_(n),
|
||||
data_(static_cast<T*>(internal::aligned_malloc(n * sizeof(T)))) {
|
||||
for (size_t i = 0; i < n; ++i) { new (&data_[i]) T(init); }
|
||||
size_t i = 0;
|
||||
EIGEN_TRY
|
||||
{
|
||||
for(; i < size_; ++i) { new (&data_[i]) T(init); }
|
||||
}
|
||||
EIGEN_CATCH(...)
|
||||
{
|
||||
// Construction failed, destruct in reverse order:
|
||||
for(; (i+1) > 0; --i) { data_[i-1].~T(); }
|
||||
internal::aligned_free(data_);
|
||||
EIGEN_THROW;
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
~MaxSizeVector() {
|
||||
for (size_t i = 0; i < size_; ++i) {
|
||||
data_[i].~T();
|
||||
for (size_t i = size_; i > 0; --i) {
|
||||
data_[i-1].~T();
|
||||
}
|
||||
internal::aligned_free(data_);
|
||||
}
|
||||
|
||||
void resize(size_t n) {
|
||||
eigen_assert(n <= reserve_);
|
||||
for (size_t i = size_; i < n; ++i) {
|
||||
new (&data_[i]) T;
|
||||
for (; size_ < n; ++size_) {
|
||||
new (&data_[size_]) T;
|
||||
}
|
||||
for (size_t i = n; i < size_; ++i) {
|
||||
data_[i].~T();
|
||||
for (; size_ > n; --size_) {
|
||||
data_[size_-1].~T();
|
||||
}
|
||||
size_ = n;
|
||||
eigen_assert(size_ == n);
|
||||
}
|
||||
|
||||
// Append new elements (up to reserved size).
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void push_back(const T& t) {
|
||||
eigen_assert(size_ < reserve_);
|
||||
data_[size_++] = t;
|
||||
new (&data_[size_++]) T(t);
|
||||
}
|
||||
|
||||
// For C++03 compatibility this only takes one argument
|
||||
template<class X>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void emplace_back(const X& x) {
|
||||
eigen_assert(size_ < reserve_);
|
||||
new (&data_[size_++]) T(x);
|
||||
}
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const T& operator[] (size_t i) const {
|
||||
eigen_assert(i < size_);
|
||||
@ -99,11 +118,8 @@ class MaxSizeVector {
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void pop_back() {
|
||||
// NOTE: This does not destroy the value at the end the way
|
||||
// std::vector's version of pop_back() does. That happens when
|
||||
// the Vector is destroyed.
|
||||
eigen_assert(size_ > 0);
|
||||
size_--;
|
||||
data_[--size_].~T();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
|
@ -289,6 +289,7 @@ class FFT
|
||||
void inv( MatrixBase<OutputDerived> & dst, const MatrixBase<ComplexDerived> & src, Index nfft=-1)
|
||||
{
|
||||
typedef typename ComplexDerived::Scalar src_type;
|
||||
typedef typename ComplexDerived::RealScalar real_type;
|
||||
typedef typename OutputDerived::Scalar dst_type;
|
||||
const bool realfft= (NumTraits<dst_type>::IsComplex == 0);
|
||||
EIGEN_STATIC_ASSERT_VECTOR_ONLY(OutputDerived)
|
||||
@ -329,9 +330,9 @@ class FFT
|
||||
tmp.head(nhead) = src.head(nhead);
|
||||
tmp.tail(ntail) = src.tail(ntail);
|
||||
if (resize_input<0) { //shrinking -- create the Nyquist bin as the average of the two bins that fold into it
|
||||
tmp(nhead) = ( src(nfft/2) + src( src.size() - nfft/2 ) )*src_type(.5);
|
||||
tmp(nhead) = ( src(nfft/2) + src( src.size() - nfft/2 ) )*real_type(.5);
|
||||
}else{ // expanding -- split the old Nyquist bin into two halves
|
||||
tmp(nhead) = src(nhead) * src_type(.5);
|
||||
tmp(nhead) = src(nhead) * real_type(.5);
|
||||
tmp(tmp.size()-nhead) = tmp(nhead);
|
||||
}
|
||||
}
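// A minimal sketch of triggering the resizing path above (assuming the
// unsupported FFT module; the lengths 16/8/32 are illustrative): asking inv()
// for a different length than the spectrum exercises the Nyquist-bin handling
// (average of the two folded bins when shrinking, old bin split when expanding).
#include <unsupported/Eigen/FFT>

int main() {
  Eigen::FFT<float> fft;
  Eigen::VectorXf signal = Eigen::VectorXf::Random(16);
  Eigen::VectorXcf spectrum;
  fft.fwd(spectrum, signal);        // full-length spectrum of a real signal

  Eigen::VectorXf shrunk, expanded;
  fft.inv(shrunk, spectrum, 8);     // shrink: Nyquist bin = average of folded bins
  fft.inv(expanded, spectrum, 32);  // expand: old Nyquist bin split into two halves
  return 0;
}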
|
||||
|
@ -184,7 +184,7 @@ inline void glRotate(const Rotation2D<float>& rot)
|
||||
}
|
||||
inline void glRotate(const Rotation2D<double>& rot)
|
||||
{
|
||||
glRotated(rot.angle()*180.0/EIGEN_PI, 0.0, 0.0, 1.0);
|
||||
glRotated(rot.angle()*180.0/double(EIGEN_PI), 0.0, 0.0, 1.0);
|
||||
}
|
||||
|
||||
template<typename Derived> void glRotate(const RotationBase<Derived,3>& rot)
|
||||
|
@ -35,6 +35,7 @@ struct get_boxes_helper {
|
||||
{
|
||||
outBoxes.insert(outBoxes.end(), boxBegin, boxEnd);
|
||||
eigen_assert(outBoxes.size() == objects.size());
|
||||
EIGEN_ONLY_USED_FOR_DEBUG(objects);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -249,15 +249,13 @@ namespace Eigen
|
||||
DenseIndex degree,
|
||||
const typename Spline<_Scalar, _Dim, _Degree>::KnotVectorType& knots)
|
||||
{
|
||||
typedef typename Spline<_Scalar, _Dim, _Degree>::BasisVectorType BasisVectorType;
|
||||
|
||||
const DenseIndex p = degree;
|
||||
const DenseIndex i = Spline::Span(u, degree, knots);
|
||||
|
||||
const KnotVectorType& U = knots;
|
||||
|
||||
BasisVectorType left(p+1); left(0) = Scalar(0);
|
||||
BasisVectorType right(p+1); right(0) = Scalar(0);
|
||||
BasisVectorType right(p+1); right(0) = Scalar(0);
|
||||
|
||||
VectorBlock<BasisVectorType,Degree>(left,1,p) = u - VectorBlock<const KnotVectorType,Degree>(U,i+1-p,p).reverse();
|
||||
VectorBlock<BasisVectorType,Degree>(right,1,p) = VectorBlock<const KnotVectorType,Degree>(U,i+1,p) - u;
|
||||
@ -380,9 +378,6 @@ namespace Eigen
|
||||
typedef Spline<_Scalar, _Dim, _Degree> SplineType;
|
||||
enum { Order = SplineTraits<SplineType>::OrderAtCompileTime };
|
||||
|
||||
typedef typename SplineTraits<SplineType>::Scalar Scalar;
|
||||
typedef typename SplineTraits<SplineType>::BasisVectorType BasisVectorType;
|
||||
|
||||
const DenseIndex span = SplineType::Span(u, p, U);
|
||||
|
||||
const DenseIndex n = (std::min)(p, order);
|
||||
|
@ -197,6 +197,7 @@ template<typename Scalar> void check_singular_cases(const Scalar& singularBeta)
|
||||
template<typename Scalar> void eulerangles_manual()
|
||||
{
|
||||
typedef Matrix<Scalar,3,1> Vector3;
|
||||
typedef Matrix<Scalar,Dynamic,1> VectorX;
|
||||
const Vector3 Zero = Vector3::Zero();
|
||||
const Scalar PI = Scalar(EIGEN_PI);
|
||||
|
||||
@ -213,13 +214,13 @@ template<typename Scalar> void eulerangles_manual()
|
||||
check_singular_cases(-PI);
|
||||
|
||||
// non-singular cases
|
||||
VectorXd alpha = VectorXd::LinSpaced(Eigen::Sequential, 20, Scalar(-0.99) * PI, PI);
|
||||
VectorXd beta = VectorXd::LinSpaced(Eigen::Sequential, 20, Scalar(-0.49) * PI, Scalar(0.49) * PI);
|
||||
VectorXd gamma = VectorXd::LinSpaced(Eigen::Sequential, 20, Scalar(-0.99) * PI, PI);
|
||||
VectorX alpha = VectorX::LinSpaced(Eigen::Sequential, 20, Scalar(-0.99) * PI, PI);
|
||||
VectorX beta = VectorX::LinSpaced(Eigen::Sequential, 20, Scalar(-0.49) * PI, Scalar(0.49) * PI);
|
||||
VectorX gamma = VectorX::LinSpaced(Eigen::Sequential, 20, Scalar(-0.99) * PI, PI);
|
||||
for (int i = 0; i < alpha.size(); ++i) {
|
||||
for (int j = 0; j < beta.size(); ++j) {
|
||||
for (int k = 0; k < gamma.size(); ++k) {
|
||||
check_all_var(Vector3d(alpha(i), beta(j), gamma(k)));
|
||||
check_all_var(Vector3(alpha(i), beta(j), gamma(k)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -10,6 +10,7 @@
|
||||
|
||||
#include "main.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <set>
|
||||
|
||||
#include <Eigen/CXX11/Tensor>
|
||||
@ -19,22 +20,21 @@ using Eigen::Index;
|
||||
using Eigen::RowMajor;
|
||||
using Eigen::ColMajor;
|
||||
|
||||
using internal::TensorBlockShapeType;
|
||||
|
||||
template<typename T>
|
||||
static const T& choose(int layout, const T& col, const T& row) {
|
||||
return layout == ColMajor ? col : row;
|
||||
}
|
||||
|
||||
static const TensorBlockShapeType RandomShape() {
|
||||
static internal::TensorBlockShapeType RandomShape() {
|
||||
return internal::random<bool>()
|
||||
? internal::TensorBlockShapeType::kUniformAllDims
|
||||
: internal::TensorBlockShapeType::kSkewedInnerDims;
|
||||
? internal::kUniformAllDims
|
||||
: internal::kSkewedInnerDims;
|
||||
}
|
||||
|
||||
template <int NumDims>
|
||||
static std::size_t RandomTargetSize(const DSizes<Index, NumDims>& dims) {
|
||||
return internal::random<int>(1, dims.TotalSize());
|
||||
static Index RandomTargetSize(const DSizes<Index, NumDims>& dims) {
|
||||
return internal::random<Index>(1, dims.TotalSize());
|
||||
}
|
||||
|
||||
template <int NumDims>
|
||||
@ -44,12 +44,12 @@ static DSizes<Index, NumDims> RandomDims() {
|
||||
dims[i] = internal::random<int>(1, 20);
|
||||
}
|
||||
return DSizes<Index, NumDims>(dims);
|
||||
};
|
||||
}
|
||||
|
||||
/** Dummy data type to test TensorBlock copy ops. */
|
||||
struct Data {
|
||||
Data() : Data(0) {}
|
||||
explicit Data(int v) { value = v; }
|
||||
Data() : value(0) {}
|
||||
explicit Data(int v) : value(v) { }
|
||||
int value;
|
||||
};
|
||||
|
||||
@ -91,21 +91,19 @@ static void Debug(DSizes<Index, NumDims> dims) {
|
||||
template <int Layout>
|
||||
static void test_block_mapper_sanity()
|
||||
{
|
||||
using T = int;
|
||||
using TensorBlock = internal::TensorBlock<T, Index, 2, Layout>;
|
||||
using TensorBlockMapper = internal::TensorBlockMapper<T, Index, 2, Layout>;
|
||||
typedef internal::TensorBlockMapper<int, Index, 2, Layout> TensorBlockMapper;
|
||||
|
||||
DSizes<Index, 2> tensor_dims(100, 100);
|
||||
|
||||
// Test uniform blocks.
|
||||
TensorBlockMapper uniform_block_mapper(
|
||||
tensor_dims, internal::TensorBlockShapeType::kUniformAllDims, 100);
|
||||
tensor_dims, internal::kUniformAllDims, 100);
|
||||
|
||||
VERIFY_IS_EQUAL(uniform_block_mapper.total_block_count(), 100);
|
||||
VERIFY_IS_EQUAL(uniform_block_mapper.block_dims_total_size(), 100);
|
||||
|
||||
// 10x10 blocks
|
||||
auto uniform_b0 = uniform_block_mapper.GetBlockForIndex(0, nullptr);
|
||||
typename TensorBlockMapper::Block uniform_b0 = uniform_block_mapper.GetBlockForIndex(0, NULL);
|
||||
VERIFY_IS_EQUAL(uniform_b0.block_sizes().at(0), 10);
|
||||
VERIFY_IS_EQUAL(uniform_b0.block_sizes().at(1), 10);
|
||||
// Depending on the layout we stride by cols or rows.
|
||||
@ -117,13 +115,13 @@ static void test_block_mapper_sanity()
|
||||
|
||||
// Test skewed to inner dims blocks.
|
||||
TensorBlockMapper skewed_block_mapper(
|
||||
tensor_dims, internal::TensorBlockShapeType::kSkewedInnerDims, 100);
|
||||
tensor_dims, internal::kSkewedInnerDims, 100);
|
||||
|
||||
VERIFY_IS_EQUAL(skewed_block_mapper.total_block_count(), 100);
|
||||
VERIFY_IS_EQUAL(skewed_block_mapper.block_dims_total_size(), 100);
|
||||
|
||||
// 1x100 (100x1) rows/cols depending on a tensor layout.
|
||||
auto skewed_b0 = skewed_block_mapper.GetBlockForIndex(0, nullptr);
|
||||
typename TensorBlockMapper::Block skewed_b0 = skewed_block_mapper.GetBlockForIndex(0, NULL);
|
||||
VERIFY_IS_EQUAL(skewed_b0.block_sizes().at(0), choose(Layout, 100, 1));
|
||||
VERIFY_IS_EQUAL(skewed_b0.block_sizes().at(1), choose(Layout, 1, 100));
|
||||
// Depending on the layout we stride by cols or rows.
|
||||
@ -140,12 +138,13 @@ template <typename T, int Layout, int NumDims>
|
||||
static void UpdateCoeffSet(
|
||||
const internal::TensorBlock<T, Index, NumDims, Layout>& block,
|
||||
Index first_coeff_index, int dim_index, std::set<Index>* visited_coeffs) {
|
||||
const DSizes<Index, NumDims> block_sizes = block.block_sizes();
|
||||
const DSizes<Index, NumDims> tensor_strides = block.tensor_strides();
|
||||
const DSizes<Index, NumDims>& block_sizes = block.block_sizes();
|
||||
const DSizes<Index, NumDims>& tensor_strides = block.tensor_strides();
|
||||
|
||||
for (int i = 0; i < block_sizes[dim_index]; ++i) {
|
||||
if (tensor_strides[dim_index] == 1) {
|
||||
auto inserted = visited_coeffs->insert(first_coeff_index + i);
|
||||
typedef std::pair<std::set<Index>::iterator, bool> ReturnType;
|
||||
ReturnType inserted = visited_coeffs->insert(first_coeff_index + i);
|
||||
VERIFY_IS_EQUAL(inserted.second, true);
|
||||
} else {
|
||||
int next_dim_index = dim_index + choose(Layout, -1, 1);
|
||||
@ -158,9 +157,8 @@ static void UpdateCoeffSet(
|
||||
|
||||
template <typename T, int NumDims, int Layout>
|
||||
static void test_block_mapper_maps_every_element() {
|
||||
using TensorBlock = internal::TensorBlock<T, Index, NumDims, Layout>;
|
||||
using TensorBlockMapper =
|
||||
internal::TensorBlockMapper<T, Index, NumDims, Layout>;
|
||||
typedef internal::TensorBlock<T, Index, NumDims, Layout> TensorBlock;
|
||||
typedef internal::TensorBlockMapper<T, Index, NumDims, Layout> TensorBlockMapper;
|
||||
|
||||
DSizes<Index, NumDims> dims = RandomDims<NumDims>();
|
||||
|
||||
@ -171,7 +169,7 @@ static void test_block_mapper_maps_every_element() {
|
||||
TensorBlockMapper block_mapper(dims, RandomShape(), RandomTargetSize(dims));
|
||||
|
||||
for (int i = 0; i < block_mapper.total_block_count(); ++i) {
|
||||
TensorBlock block = block_mapper.GetBlockForIndex(i, nullptr);
|
||||
TensorBlock block = block_mapper.GetBlockForIndex(i, NULL);
|
||||
UpdateCoeffSet<T, Layout, NumDims>(block, block.first_coeff_index(),
|
||||
choose(Layout, NumDims - 1, 0),
|
||||
&coeff_set);
|
||||
@ -180,16 +178,15 @@ static void test_block_mapper_maps_every_element() {
|
||||
// Verify that every coefficient in the original Tensor is accessible through
|
||||
// TensorBlock only once.
|
||||
Index total_coeffs = dims.TotalSize();
|
||||
VERIFY_IS_EQUAL(coeff_set.size(), total_coeffs);
|
||||
VERIFY_IS_EQUAL(Index(coeff_set.size()), total_coeffs);
|
||||
VERIFY_IS_EQUAL(*coeff_set.begin(), 0);
|
||||
VERIFY_IS_EQUAL(*coeff_set.rbegin(), total_coeffs - 1);
|
||||
}
|
||||
|
||||
template <typename T, int NumDims, int Layout>
|
||||
static void test_slice_block_mapper_maps_every_element() {
|
||||
using TensorBlock = internal::TensorBlock<T, Index, NumDims, Layout>;
|
||||
using TensorSliceBlockMapper =
|
||||
internal::TensorSliceBlockMapper<T, Index, NumDims, Layout>;
|
||||
typedef internal::TensorBlock<T, Index, NumDims, Layout> TensorBlock;
|
||||
typedef internal::TensorSliceBlockMapper<T, Index, NumDims, Layout> TensorSliceBlockMapper;
|
||||
|
||||
DSizes<Index, NumDims> tensor_dims = RandomDims<NumDims>();
|
||||
DSizes<Index, NumDims> tensor_slice_offsets = RandomDims<NumDims>();
|
||||
@ -206,12 +203,12 @@ static void test_slice_block_mapper_maps_every_element() {
|
||||
// Keep track of elements indices available via block access.
|
||||
std::set<Index> coeff_set;
|
||||
|
||||
auto total_coeffs = static_cast<int>(tensor_slice_extents.TotalSize());
|
||||
int total_coeffs = static_cast<int>(tensor_slice_extents.TotalSize());
|
||||
|
||||
// Pick random dimension sizes for the tensor blocks.
|
||||
DSizes<Index, NumDims> block_sizes;
|
||||
for (int i = 0; i < NumDims; ++i) {
|
||||
block_sizes[i] = internal::random<int>(1, tensor_slice_extents[i]);
|
||||
block_sizes[i] = internal::random<Index>(1, tensor_slice_extents[i]);
|
||||
}
|
||||
|
||||
TensorSliceBlockMapper block_mapper(tensor_dims, tensor_slice_offsets,
|
||||
@ -219,13 +216,13 @@ static void test_slice_block_mapper_maps_every_element() {
|
||||
DimensionList<Index, NumDims>());
|
||||
|
||||
for (int i = 0; i < block_mapper.total_block_count(); ++i) {
|
||||
TensorBlock block = block_mapper.GetBlockForIndex(i, nullptr);
|
||||
TensorBlock block = block_mapper.GetBlockForIndex(i, NULL);
|
||||
UpdateCoeffSet<T, Layout, NumDims>(block, block.first_coeff_index(),
|
||||
choose(Layout, NumDims - 1, 0),
|
||||
&coeff_set);
|
||||
}
|
||||
|
||||
VERIFY_IS_EQUAL(coeff_set.size(), total_coeffs);
|
||||
VERIFY_IS_EQUAL(Index(coeff_set.size()), total_coeffs);
|
||||
}
|
||||
|
||||
template <typename T, int NumDims, int Layout>
|
||||
@ -240,7 +237,7 @@ static void test_block_io_copy_data_from_source_to_target() {
|
||||
TensorBlockWriter;
|
||||
|
||||
DSizes<Index, NumDims> input_tensor_dims = RandomDims<NumDims>();
|
||||
const auto input_tensor_size = input_tensor_dims.TotalSize();
|
||||
const Index input_tensor_size = input_tensor_dims.TotalSize();
|
||||
|
||||
T* input_data = GenerateRandomData<T>(input_tensor_size);
|
||||
T* output_data = new T[input_tensor_size];
|
||||
@ -265,14 +262,14 @@ static void test_block_io_copy_data_from_source_to_target() {
|
||||
}
|
||||
|
||||
template <int Layout, int NumDims>
|
||||
static int GetInputIndex(Index output_index,
|
||||
static Index GetInputIndex(Index output_index,
|
||||
const array<Index, NumDims>& output_to_input_dim_map,
|
||||
const array<Index, NumDims>& input_strides,
|
||||
const array<Index, NumDims>& output_strides) {
|
||||
int input_index = 0;
|
||||
if (Layout == ColMajor) {
|
||||
for (int i = NumDims - 1; i > 0; --i) {
|
||||
const int idx = output_index / output_strides[i];
|
||||
const Index idx = output_index / output_strides[i];
|
||||
input_index += idx * input_strides[output_to_input_dim_map[i]];
|
||||
output_index -= idx * output_strides[i];
|
||||
}
|
||||
@ -280,7 +277,7 @@ static int GetInputIndex(Index output_index,
|
||||
output_index * input_strides[output_to_input_dim_map[0]];
|
||||
} else {
|
||||
for (int i = 0; i < NumDims - 1; ++i) {
|
||||
const int idx = output_index / output_strides[i];
|
||||
const Index idx = output_index / output_strides[i];
|
||||
input_index += idx * input_strides[output_to_input_dim_map[i]];
|
||||
output_index -= idx * output_strides[i];
|
||||
}
|
||||
@ -319,7 +316,7 @@ static void test_block_io_copy_using_reordered_dimensions() {
|
||||
TensorBlockWriter;
|
||||
|
||||
DSizes<Index, NumDims> input_tensor_dims = RandomDims<NumDims>();
|
||||
const auto input_tensor_size = input_tensor_dims.TotalSize();
|
||||
const Index input_tensor_size = input_tensor_dims.TotalSize();
|
||||
|
||||
// Create a random input tensor.
|
||||
T* input_data = GenerateRandomData<T>(input_tensor_size);
|
||||
@ -327,7 +324,7 @@ static void test_block_io_copy_using_reordered_dimensions() {
|
||||
// Create a random dimension re-ordering/shuffle.
|
||||
std::vector<Index> shuffle;
|
||||
for (int i = 0; i < NumDims; ++i) shuffle.push_back(i);
|
||||
std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937());
|
||||
std::random_shuffle(shuffle.begin(), shuffle.end());
|
||||
|
||||
DSizes<Index, NumDims> output_tensor_dims;
|
||||
array<Index, NumDims> input_to_output_dim_map;
|
||||
@ -342,8 +339,8 @@ static void test_block_io_copy_using_reordered_dimensions() {
|
||||
TensorBlockMapper block_mapper(output_tensor_dims, RandomShape(),
|
||||
RandomTargetSize(input_tensor_dims));
|
||||
|
||||
auto* block_data = new T[block_mapper.block_dims_total_size()];
|
||||
auto* output_data = new T[input_tensor_size];
|
||||
T* block_data = new T[block_mapper.block_dims_total_size()];
|
||||
T* output_data = new T[input_tensor_size];
|
||||
|
||||
array<Index, NumDims> input_tensor_strides =
|
||||
ComputeStrides<Layout, NumDims>(input_tensor_dims);
|
||||
@ -370,6 +367,40 @@ static void test_block_io_copy_using_reordered_dimensions() {
|
||||
delete[] output_data;
|
||||
}
|
||||
|
||||
template<typename Scalar, typename StorageIndex, int Dim>
|
||||
class EqualityChecker
|
||||
{
|
||||
const Scalar* input_data;
|
||||
const DSizes<StorageIndex, Dim> &input_dims, &input_strides, &output_dims, &output_strides;
|
||||
void check_recursive(const Scalar* input, const Scalar* output, int depth=0) const
|
||||
{
|
||||
if(depth==Dim)
|
||||
{
|
||||
VERIFY_IS_EQUAL(*input, *output);
|
||||
return;
|
||||
}
|
||||
|
||||
for(int i=0; i<output_dims[depth]; ++i)
|
||||
{
|
||||
check_recursive(input + i % input_dims[depth] * input_strides[depth], output + i*output_strides[depth], depth+1);
|
||||
}
|
||||
}
|
||||
public:
|
||||
EqualityChecker(const Scalar* input_data_,
|
||||
const DSizes<StorageIndex, Dim> &input_dims_, const DSizes<StorageIndex, Dim> &input_strides_,
|
||||
const DSizes<StorageIndex, Dim> &output_dims_, const DSizes<StorageIndex, Dim> &output_strides_)
|
||||
: input_data(input_data_)
|
||||
, input_dims(input_dims_), input_strides(input_strides_)
|
||||
, output_dims(output_dims_), output_strides(output_strides_)
|
||||
{}
|
||||
|
||||
void operator()(const Scalar* output_data) const
|
||||
{
|
||||
check_recursive(input_data, output_data);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template <int Layout>
static void test_block_io_zero_stride()
{
@ -385,8 +416,8 @@ static void test_block_io_zero_stride()
input_tensor_dims[0] = 1;
input_tensor_dims[2] = 1;
input_tensor_dims[4] = 1;
const auto input_tensor_size = input_tensor_dims.TotalSize();
auto* input_data = GenerateRandomData<float>(input_tensor_size);
const Index input_tensor_size = input_tensor_dims.TotalSize();
float* input_data = GenerateRandomData<float>(input_tensor_size);

DSizes<Index, 5> output_tensor_dims = rnd_dims;

@ -401,33 +432,10 @@ static void test_block_io_zero_stride()
input_tensor_strides_with_zeros[4] = 0;

// Verify that data was correctly read/written from/into the block.
const auto verify_is_equal = [&](const float* output_data) {
for (int i = 0; i < output_tensor_dims[0]; ++i) {
for (int j = 0; j < output_tensor_dims[1]; ++j) {
for (int k = 0; k < output_tensor_dims[2]; ++k) {
for (int l = 0; l < output_tensor_dims[3]; ++l) {
for (int m = 0; m < output_tensor_dims[4]; ++m) {
const Index output_offset =
i * output_tensor_strides[0] + j * output_tensor_strides[1] +
k * output_tensor_strides[2] + l * output_tensor_strides[3] +
m * output_tensor_strides[4];
const Index input_offset =
i % input_tensor_dims[0] * input_tensor_strides[0] +
j % input_tensor_dims[1] * input_tensor_strides[1] +
k % input_tensor_dims[2] * input_tensor_strides[2] +
l % input_tensor_dims[3] * input_tensor_strides[3] +
m % input_tensor_dims[4] * input_tensor_strides[4];
VERIFY_IS_EQUAL(output_data[output_offset],
input_data[input_offset]);
}
}
}
}
}
};
const EqualityChecker<float, Index, 5> verify_is_equal(input_data, input_tensor_dims, input_tensor_strides, output_tensor_dims, output_tensor_strides);

{
auto* output_data = new float[output_tensor_dims.TotalSize()];
float* output_data = new float[output_tensor_dims.TotalSize()];
TensorBlock read_block(0, output_tensor_dims, output_tensor_strides,
input_tensor_strides_with_zeros, output_data);
TensorBlockReader::Run(&read_block, input_data);
@ -436,7 +444,7 @@ static void test_block_io_zero_stride()
}

{
auto* output_data = new float[output_tensor_dims.TotalSize()];
float* output_data = new float[output_tensor_dims.TotalSize()];
TensorBlock write_block(0, output_tensor_dims,
input_tensor_strides_with_zeros,
output_tensor_strides, input_data);
@ -459,14 +467,14 @@ static void test_block_io_squeeze_ones() {
// Total size > 1.
{
DSizes<Index, 5> block_sizes(1, 2, 1, 2, 1);
const auto total_size = block_sizes.TotalSize();
const Index total_size = block_sizes.TotalSize();

// Create a random input tensor.
auto* input_data = GenerateRandomData<float>(total_size);
float* input_data = GenerateRandomData<float>(total_size);
DSizes<Index, 5> strides(ComputeStrides<Layout, 5>(block_sizes));

{
auto* output_data = new float[block_sizes.TotalSize()];
float* output_data = new float[block_sizes.TotalSize()];
TensorBlock read_block(0, block_sizes, strides, strides, output_data);
TensorBlockReader::Run(&read_block, input_data);
for (int i = 0; i < total_size; ++i) {
@ -476,7 +484,7 @@ static void test_block_io_squeeze_ones() {
}

{
auto* output_data = new float[block_sizes.TotalSize()];
float* output_data = new float[block_sizes.TotalSize()];
TensorBlock write_block(0, block_sizes, strides, strides, input_data);
TensorBlockWriter::Run(write_block, output_data);
for (int i = 0; i < total_size; ++i) {
@ -489,14 +497,14 @@ static void test_block_io_squeeze_ones() {
// Total size == 1.
{
DSizes<Index, 5> block_sizes(1, 1, 1, 1, 1);
const auto total_size = block_sizes.TotalSize();
const Index total_size = block_sizes.TotalSize();

// Create a random input tensor.
auto* input_data = GenerateRandomData<float>(total_size);
float* input_data = GenerateRandomData<float>(total_size);
DSizes<Index, 5> strides(ComputeStrides<Layout, 5>(block_sizes));

{
auto* output_data = new float[block_sizes.TotalSize()];
float* output_data = new float[block_sizes.TotalSize()];
TensorBlock read_block(0, block_sizes, strides, strides, output_data);
TensorBlockReader::Run(&read_block, input_data);
for (int i = 0; i < total_size; ++i) {
@ -506,7 +514,7 @@ static void test_block_io_squeeze_ones() {
}

{
auto* output_data = new float[block_sizes.TotalSize()];
float* output_data = new float[block_sizes.TotalSize()];
TensorBlock write_block(0, block_sizes, strides, strides, input_data);
TensorBlockWriter::Run(write_block, output_data);
for (int i = 0; i < total_size; ++i) {
@ -635,7 +643,7 @@ static void test_block_cwise_binary_io_basic() {
DSizes<Index, NumDims> block_sizes = RandomDims<NumDims>();
DSizes<Index, NumDims> strides(ComputeStrides<Layout, NumDims>(block_sizes));

const auto total_size = block_sizes.TotalSize();
const Index total_size = block_sizes.TotalSize();

// Create a random input tensors.
T* left_data = GenerateRandomData<T>(total_size);
@ -664,13 +672,13 @@ static void test_block_cwise_binary_io_squeeze_ones() {
DSizes<Index, 5> block_sizes(1, 2, 1, 3, 1);
DSizes<Index, 5> strides(ComputeStrides<Layout, 5>(block_sizes));

const auto total_size = block_sizes.TotalSize();
const Index total_size = block_sizes.TotalSize();

// Create a random input tensors.
auto* left_data = GenerateRandomData<float>(total_size);
auto* right_data = GenerateRandomData<float>(total_size);
float* left_data = GenerateRandomData<float>(total_size);
float* right_data = GenerateRandomData<float>(total_size);

auto* output_data = new float[total_size];
float* output_data = new float[total_size];
BinaryFunctor functor;
TensorBlockCwiseBinaryIO::Run(functor, block_sizes, strides, output_data,
strides, left_data, strides, right_data);
@ -711,14 +719,14 @@ static void test_block_cwise_binary_io_zero_strides() {
right_strides[3] = 0;

// Generate random data.
auto* left_data = GenerateRandomData<float>(left_sizes.TotalSize());
auto* right_data = GenerateRandomData<float>(right_sizes.TotalSize());
float* left_data = GenerateRandomData<float>(left_sizes.TotalSize());
float* right_data = GenerateRandomData<float>(right_sizes.TotalSize());

DSizes<Index, 5> output_sizes = rnd_dims;
DSizes<Index, 5> output_strides(ComputeStrides<Layout, 5>(output_sizes));

const auto output_total_size = output_sizes.TotalSize();
auto* output_data = new float[output_total_size];
const Index output_total_size = output_sizes.TotalSize();
float* output_data = new float[output_total_size];

BinaryFunctor functor;
TensorBlockCwiseBinaryIO::Run(functor, output_sizes, output_strides,
@ -755,17 +763,16 @@ static void test_block_cwise_binary_io_zero_strides() {
template <int Layout>
static void test_uniform_block_shape()
{
using T = int;
typedef internal::TensorBlock<T, Index, 5, Layout> TensorBlock;
typedef internal::TensorBlockMapper<T, Index, 5, Layout> TensorBlockMapper;
typedef internal::TensorBlock<int, Index, 5, Layout> TensorBlock;
typedef internal::TensorBlockMapper<int, Index, 5, Layout> TensorBlockMapper;

{
// Test shape 'UniformAllDims' with uniform 'max_coeff count'.
DSizes<Index, 5> dims(11, 5, 6, 17, 7);
const size_t max_coeff_count = 5 * 5 * 5 * 5 * 5;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
const Index max_coeff_count = 5 * 5 * 5 * 5 * 5;
TensorBlockMapper block_mapper(dims, internal::kUniformAllDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
TensorBlock block = block_mapper.GetBlockForIndex(0, NULL);
for (int i = 0; i < 5; ++i) {
VERIFY_IS_EQUAL(5, block.block_sizes()[i]);
}
@ -776,10 +783,10 @@ static void test_uniform_block_shape()
// partially into first inner-most dimension.
if (Layout == ColMajor) {
DSizes<Index, 5> dims(11, 5, 6, 17, 7);
const size_t max_coeff_count = 7 * 5 * 5 * 5 * 5;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
const Index max_coeff_count = 7 * 5 * 5 * 5 * 5;
TensorBlockMapper block_mapper(dims, internal::kUniformAllDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
TensorBlock block = block_mapper.GetBlockForIndex(0, NULL);
VERIFY_IS_EQUAL(7, block.block_sizes()[0]);
for (int i = 1; i < 5; ++i) {
VERIFY_IS_EQUAL(5, block.block_sizes()[i]);
@ -787,10 +794,10 @@ static void test_uniform_block_shape()
VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
} else {
DSizes<Index, 5> dims(11, 5, 6, 17, 7);
const size_t max_coeff_count = 5 * 5 * 5 * 5 * 6;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
const Index max_coeff_count = 5 * 5 * 5 * 5 * 6;
TensorBlockMapper block_mapper(dims, internal::kUniformAllDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
TensorBlock block = block_mapper.GetBlockForIndex(0, NULL);
VERIFY_IS_EQUAL(6, block.block_sizes()[4]);
for (int i = 3; i >= 0; --i) {
VERIFY_IS_EQUAL(5, block.block_sizes()[i]);
@ -802,10 +809,10 @@ static void test_uniform_block_shape()
// fully into first inner-most dimension.
if (Layout == ColMajor) {
DSizes<Index, 5> dims(11, 5, 6, 17, 7);
const size_t max_coeff_count = 11 * 5 * 5 * 5 * 5;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
const Index max_coeff_count = 11 * 5 * 5 * 5 * 5;
TensorBlockMapper block_mapper(dims, internal::kUniformAllDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
TensorBlock block = block_mapper.GetBlockForIndex(0, NULL);
VERIFY_IS_EQUAL(11, block.block_sizes()[0]);
for (int i = 1; i < 5; ++i) {
VERIFY_IS_EQUAL(5, block.block_sizes()[i]);
@ -813,10 +820,10 @@ static void test_uniform_block_shape()
VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
} else {
DSizes<Index, 5> dims(11, 5, 6, 17, 7);
const size_t max_coeff_count = 5 * 5 * 5 * 5 * 7;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
const Index max_coeff_count = 5 * 5 * 5 * 5 * 7;
TensorBlockMapper block_mapper(dims, internal::kUniformAllDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
TensorBlock block = block_mapper.GetBlockForIndex(0, NULL);
VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
for (int i = 3; i >= 0; --i) {
VERIFY_IS_EQUAL(5, block.block_sizes()[i]);
@ -828,10 +835,10 @@ static void test_uniform_block_shape()
// fully into first few inner-most dimensions.
if (Layout == ColMajor) {
DSizes<Index, 5> dims(7, 5, 6, 17, 7);
const size_t max_coeff_count = 7 * 5 * 6 * 7 * 5;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
const Index max_coeff_count = 7 * 5 * 6 * 7 * 5;
TensorBlockMapper block_mapper(dims, internal::kUniformAllDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
TensorBlock block = block_mapper.GetBlockForIndex(0, NULL);
VERIFY_IS_EQUAL(7, block.block_sizes()[0]);
VERIFY_IS_EQUAL(5, block.block_sizes()[1]);
VERIFY_IS_EQUAL(6, block.block_sizes()[2]);
@ -840,10 +847,10 @@ static void test_uniform_block_shape()
VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
} else {
DSizes<Index, 5> dims(7, 5, 6, 9, 7);
const size_t max_coeff_count = 5 * 5 * 5 * 6 * 7;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
const Index max_coeff_count = 5 * 5 * 5 * 6 * 7;
TensorBlockMapper block_mapper(dims, internal::kUniformAllDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
TensorBlock block = block_mapper.GetBlockForIndex(0, NULL);
VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
VERIFY_IS_EQUAL(6, block.block_sizes()[3]);
VERIFY_IS_EQUAL(5, block.block_sizes()[2]);
@ -855,10 +862,10 @@ static void test_uniform_block_shape()
// Test shape 'UniformAllDims' with full allocation to all dims.
if (Layout == ColMajor) {
DSizes<Index, 5> dims(7, 5, 6, 17, 7);
const size_t max_coeff_count = 7 * 5 * 6 * 17 * 7;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
const Index max_coeff_count = 7 * 5 * 6 * 17 * 7;
TensorBlockMapper block_mapper(dims, internal::kUniformAllDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
TensorBlock block = block_mapper.GetBlockForIndex(0, NULL);
VERIFY_IS_EQUAL(7, block.block_sizes()[0]);
VERIFY_IS_EQUAL(5, block.block_sizes()[1]);
VERIFY_IS_EQUAL(6, block.block_sizes()[2]);
@ -867,10 +874,10 @@ static void test_uniform_block_shape()
VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
} else {
DSizes<Index, 5> dims(7, 5, 6, 9, 7);
const size_t max_coeff_count = 7 * 5 * 6 * 9 * 7;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
const Index max_coeff_count = 7 * 5 * 6 * 9 * 7;
TensorBlockMapper block_mapper(dims, internal::kUniformAllDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
TensorBlock block = block_mapper.GetBlockForIndex(0, NULL);
VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
VERIFY_IS_EQUAL(9, block.block_sizes()[3]);
VERIFY_IS_EQUAL(6, block.block_sizes()[2]);
@ -883,17 +890,16 @@ static void test_uniform_block_shape()
template <int Layout>
static void test_skewed_inner_dim_block_shape()
{
using T = int;
typedef internal::TensorBlock<T, Index, 5, Layout> TensorBlock;
typedef internal::TensorBlockMapper<T, Index, 5, Layout> TensorBlockMapper;
typedef internal::TensorBlock<int, Index, 5, Layout> TensorBlock;
typedef internal::TensorBlockMapper<int, Index, 5, Layout> TensorBlockMapper;

// Test shape 'SkewedInnerDims' with partial allocation to inner-most dim.
if (Layout == ColMajor) {
DSizes<Index, 5> dims(11, 5, 6, 17, 7);
const size_t max_coeff_count = 10 * 1 * 1 * 1 * 1;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
const Index max_coeff_count = 10 * 1 * 1 * 1 * 1;
TensorBlockMapper block_mapper(dims, internal::kSkewedInnerDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
TensorBlock block = block_mapper.GetBlockForIndex(0, NULL);
VERIFY_IS_EQUAL(10, block.block_sizes()[0]);
for (int i = 1; i < 5; ++i) {
VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
@ -901,10 +907,10 @@ static void test_skewed_inner_dim_block_shape()
VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
} else {
DSizes<Index, 5> dims(11, 5, 6, 17, 7);
const size_t max_coeff_count = 1 * 1 * 1 * 1 * 6;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
const Index max_coeff_count = 1 * 1 * 1 * 1 * 6;
TensorBlockMapper block_mapper(dims, internal::kSkewedInnerDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
TensorBlock block = block_mapper.GetBlockForIndex(0, NULL);
VERIFY_IS_EQUAL(6, block.block_sizes()[4]);
for (int i = 3; i >= 0; --i) {
VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
@ -915,10 +921,10 @@ static void test_skewed_inner_dim_block_shape()
// Test shape 'SkewedInnerDims' with full allocation to inner-most dim.
if (Layout == ColMajor) {
DSizes<Index, 5> dims(11, 5, 6, 17, 7);
const size_t max_coeff_count = 11 * 1 * 1 * 1 * 1;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
const Index max_coeff_count = 11 * 1 * 1 * 1 * 1;
TensorBlockMapper block_mapper(dims, internal::kSkewedInnerDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
TensorBlock block = block_mapper.GetBlockForIndex(0, NULL);
VERIFY_IS_EQUAL(11, block.block_sizes()[0]);
for (int i = 1; i < 5; ++i) {
VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
@ -926,10 +932,10 @@ static void test_skewed_inner_dim_block_shape()
VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
} else {
DSizes<Index, 5> dims(11, 5, 6, 17, 7);
const size_t max_coeff_count = 1 * 1 * 1 * 1 * 7;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
const Index max_coeff_count = 1 * 1 * 1 * 1 * 7;
TensorBlockMapper block_mapper(dims, internal::kSkewedInnerDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
TensorBlock block = block_mapper.GetBlockForIndex(0, NULL);
VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
for (int i = 3; i >= 0; --i) {
VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
@ -941,10 +947,10 @@ static void test_skewed_inner_dim_block_shape()
// and partial allocation to second inner-dim.
if (Layout == ColMajor) {
DSizes<Index, 5> dims(11, 5, 6, 17, 7);
const size_t max_coeff_count = 11 * 3 * 1 * 1 * 1;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
const Index max_coeff_count = 11 * 3 * 1 * 1 * 1;
TensorBlockMapper block_mapper(dims, internal::kSkewedInnerDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
TensorBlock block = block_mapper.GetBlockForIndex(0, NULL);
VERIFY_IS_EQUAL(11, block.block_sizes()[0]);
VERIFY_IS_EQUAL(3, block.block_sizes()[1]);
for (int i = 2; i < 5; ++i) {
@ -953,10 +959,10 @@ static void test_skewed_inner_dim_block_shape()
VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
} else {
DSizes<Index, 5> dims(11, 5, 6, 17, 7);
const size_t max_coeff_count = 1 * 1 * 1 * 15 * 7;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
const Index max_coeff_count = 1 * 1 * 1 * 15 * 7;
TensorBlockMapper block_mapper(dims, internal::kSkewedInnerDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
TensorBlock block = block_mapper.GetBlockForIndex(0, NULL);
VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
VERIFY_IS_EQUAL(15, block.block_sizes()[3]);
for (int i = 2; i >= 0; --i) {
@ -969,10 +975,10 @@ static void test_skewed_inner_dim_block_shape()
// and partial allocation to third inner-dim.
if (Layout == ColMajor) {
DSizes<Index, 5> dims(11, 5, 6, 17, 7);
const size_t max_coeff_count = 11 * 5 * 5 * 1 * 1;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
const Index max_coeff_count = 11 * 5 * 5 * 1 * 1;
TensorBlockMapper block_mapper(dims, internal::kSkewedInnerDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
TensorBlock block = block_mapper.GetBlockForIndex(0, NULL);
VERIFY_IS_EQUAL(11, block.block_sizes()[0]);
VERIFY_IS_EQUAL(5, block.block_sizes()[1]);
VERIFY_IS_EQUAL(5, block.block_sizes()[2]);
@ -982,10 +988,10 @@ static void test_skewed_inner_dim_block_shape()
VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
} else {
DSizes<Index, 5> dims(11, 5, 6, 17, 7);
const size_t max_coeff_count = 1 * 1 * 5 * 17 * 7;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
const Index max_coeff_count = 1 * 1 * 5 * 17 * 7;
TensorBlockMapper block_mapper(dims, internal::kSkewedInnerDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
TensorBlock block = block_mapper.GetBlockForIndex(0, NULL);
VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
VERIFY_IS_EQUAL(17, block.block_sizes()[3]);
VERIFY_IS_EQUAL(5, block.block_sizes()[2]);
@ -998,10 +1004,10 @@ static void test_skewed_inner_dim_block_shape()
// Test shape 'SkewedInnerDims' with full allocation to all dims.
if (Layout == ColMajor) {
DSizes<Index, 5> dims(11, 5, 6, 17, 7);
const size_t max_coeff_count = 11 * 5 * 6 * 17 * 7;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
const Index max_coeff_count = 11 * 5 * 6 * 17 * 7;
TensorBlockMapper block_mapper(dims, internal::kSkewedInnerDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
TensorBlock block = block_mapper.GetBlockForIndex(0, NULL);
VERIFY_IS_EQUAL(11, block.block_sizes()[0]);
VERIFY_IS_EQUAL(5, block.block_sizes()[1]);
VERIFY_IS_EQUAL(6, block.block_sizes()[2]);
@ -1010,10 +1016,10 @@ static void test_skewed_inner_dim_block_shape()
VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
} else {
DSizes<Index, 5> dims(11, 5, 6, 17, 7);
const size_t max_coeff_count = 11 * 5 * 6 * 17 * 7;
TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
const Index max_coeff_count = 11 * 5 * 6 * 17 * 7;
TensorBlockMapper block_mapper(dims, internal::kSkewedInnerDims,
max_coeff_count);
TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
TensorBlock block = block_mapper.GetBlockForIndex(0, NULL);
VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
VERIFY_IS_EQUAL(17, block.block_sizes()[3]);
VERIFY_IS_EQUAL(6, block.block_sizes()[2]);
@ -1026,15 +1032,13 @@ static void test_skewed_inner_dim_block_shape()
template <int Layout>
static void test_empty_dims(const internal::TensorBlockShapeType block_shape)
{
using T = int;

// Test blocking of tensors with zero dimensions:
// - we must not crash on asserts and divisions by zero
// - we must not return block with zero dimensions
//   (recipe for overflows/underflows, divisions by zero and NaNs later)
// - total block count must be zero
{
typedef internal::TensorBlockMapper<T, Index, 1, Layout> TensorBlockMapper;
typedef internal::TensorBlockMapper<int, Index, 1, Layout> TensorBlockMapper;
DSizes<Index, 1> dims(0);
for (int max_coeff_count = 0; max_coeff_count < 2; ++max_coeff_count) {
TensorBlockMapper block_mapper(dims, block_shape, max_coeff_count);
@ -1044,7 +1048,7 @@ static void test_empty_dims(const internal::TensorBlockShapeType block_shape)
}

{
typedef internal::TensorBlockMapper<T, Index, 2, Layout> TensorBlockMapper;
typedef internal::TensorBlockMapper<int, Index, 2, Layout> TensorBlockMapper;
for (int dim1 = 0; dim1 < 3; ++dim1) {
for (int dim2 = 0; dim2 < 3; ++dim2) {
DSizes<Index, 2> dims(dim1, dim2);
@ -1098,9 +1102,9 @@ EIGEN_DECLARE_TEST(cxx11_tensor_block_access) {
TEST_LAYOUTS(test_block_cwise_binary_io_zero_strides);
TEST_LAYOUTS(test_uniform_block_shape);
TEST_LAYOUTS(test_skewed_inner_dim_block_shape);
TEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kUniformAllDims);
TEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kSkewedInnerDims);
TEST_LAYOUTS_WITH_ARG(test_empty_dims, internal::kUniformAllDims);
TEST_LAYOUTS_WITH_ARG(test_empty_dims, internal::kSkewedInnerDims);
}

#undef TEST_LAYOUTS
#undef TEST_LAYOUTS_WITH_ARG

@ -56,7 +56,7 @@ static void test_static_dimension_failure()
// either the code should change to
//   Tensor<int, 2>::Dimensions{{2, 3}}
// or Tensor<int, 2>::Dimensions{Tensor<int, 2>::Dimensions{{2, 3}}}
.concatenate(right.reshape(Tensor<int, 2>::Dimensions{{2, 3}}), 0);
.concatenate(right.reshape(Tensor<int, 2>::Dimensions(2, 3)), 0);
}

template<int DataLayout>

@ -514,7 +514,7 @@ static void test_const_inputs()
struct SqrtOutputKernel {
template <typename Index, typename Scalar>
EIGEN_ALWAYS_INLINE void operator()(
const OutputKernel::OutputMapper<Index, Scalar>& output_mapper,
const internal::blas_data_mapper<Scalar, Index, ColMajor>& output_mapper,
const TensorContractionParams&, Index, Index, Index num_rows,
Index num_cols) const {
for (int i = 0; i < num_rows; ++i) {
@ -553,7 +553,7 @@ static void test_large_contraction_with_output_kernel() {

m_result = m_left * m_right;

for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) {
for (std::ptrdiff_t i = 0; i < t_result.dimensions().TotalSize(); i++) {
VERIFY(&t_result.data()[i] != &m_result.data()[i]);
VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
}

@ -170,7 +170,6 @@ static void test_type2indexpair_list()
typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>, Eigen::IndexPair<DenseIndex>, Eigen::type2indexpair<2,12>> Dims2_b;
typedef Eigen::IndexPairList<Eigen::IndexPair<DenseIndex>, Eigen::type2indexpair<1,11>, Eigen::IndexPair<DenseIndex>> Dims2_c;

Dims0 d0;
Dims2_a d2_a;

Dims2_b d2_b;

@ -255,7 +255,7 @@ void test_multithread_contraction_agrees_with_singlethread() {
struct SqrtOutputKernel {
template <typename Index, typename Scalar>
EIGEN_ALWAYS_INLINE void operator()(
const OutputKernel::OutputMapper<Index, Scalar>& output_mapper,
const internal::blas_data_mapper<Scalar, Index, ColMajor>& output_mapper,
const TensorContractionParams&, Index, Index, Index num_rows,
Index num_cols) const {
for (int i = 0; i < num_rows; ++i) {
@ -300,7 +300,7 @@ static void test_multithread_contraction_with_output_kernel() {

m_result = m_left * m_right;

for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) {
for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
VERIFY(&t_result.data()[i] != &m_result.data()[i]);
VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
}
@ -428,7 +428,7 @@ void test_threadpool_allocate(TestAllocator* allocator)
void* ptr = device.allocate(512);
device.deallocate(ptr);
}
VERIFY(allocator != nullptr);
VERIFY(allocator != NULL);
VERIFY_IS_EQUAL(allocator->alloc_count(), num_allocs);
VERIFY_IS_EQUAL(allocator->dealloc_count(), num_allocs);
}
@ -460,7 +460,7 @@ EIGEN_DECLARE_TEST(cxx11_tensor_thread_pool)
CALL_SUBTEST_6(test_multithread_random());

TestAllocator test_allocator;
CALL_SUBTEST_6(test_multithread_shuffle<ColMajor>(nullptr));
CALL_SUBTEST_6(test_multithread_shuffle<ColMajor>(NULL));
CALL_SUBTEST_6(test_multithread_shuffle<RowMajor>(&test_allocator));
CALL_SUBTEST_6(test_threadpool_allocate(&test_allocator));
}

@ -9,6 +9,7 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifdef EIGEN_TEST_PART_1

#include "sparse.h"
@ -95,7 +96,7 @@ EIGEN_DECLARE_TEST(kronecker_product)
SM_a.insert(1,0) = DM_a.coeffRef(1,0) = -0.9076572187376921;
SM_a.insert(1,1) = DM_a.coeffRef(1,1) = 0.6469156566545853;
SM_a.insert(1,2) = DM_a.coeffRef(1,2) = -0.3658010398782789;

MatrixXd DM_b(3,2);
SparseMatrix<double> SM_b(3,2);
SM_b.insert(0,0) = DM_b.coeffRef(0,0) = 0.9004440976767099;
@ -165,7 +166,7 @@ EIGEN_DECLARE_TEST(kronecker_product)
SM_a.insert(0,3) = -0.2;
SM_a.insert(2,4) = 0.3;
SM_a.finalize();

SM_b.insert(0,0) = 0.4;
SM_b.insert(2,1) = -0.5;
SM_b.finalize();
@ -183,7 +184,7 @@ EIGEN_DECLARE_TEST(kronecker_product)
DM_b2.resize(4,8);
DM_ab2 = kroneckerProduct(DM_a2,DM_b2);
CALL_SUBTEST(check_dimension(DM_ab2,10*4,9*8));

for(int i = 0; i < g_repeat; i++)
{
double density = Eigen::internal::random<double>(0.01,0.5);
@ -196,35 +197,35 @@ EIGEN_DECLARE_TEST(kronecker_product)
MatrixXf dA(ra,ca), dB(rb,cb), dC;
initSparse(density, dA, sA);
initSparse(density, dB, sB);

sC = kroneckerProduct(sA,sB);
dC = kroneckerProduct(dA,dB);
VERIFY_IS_APPROX(MatrixXf(sC),dC);

sC = kroneckerProduct(sA.transpose(),sB);
dC = kroneckerProduct(dA.transpose(),dB);
VERIFY_IS_APPROX(MatrixXf(sC),dC);

sC = kroneckerProduct(sA.transpose(),sB.transpose());
dC = kroneckerProduct(dA.transpose(),dB.transpose());
VERIFY_IS_APPROX(MatrixXf(sC),dC);

sC = kroneckerProduct(sA,sB.transpose());
dC = kroneckerProduct(dA,dB.transpose());
VERIFY_IS_APPROX(MatrixXf(sC),dC);

sC2 = kroneckerProduct(sA,sB);
dC = kroneckerProduct(dA,dB);
VERIFY_IS_APPROX(MatrixXf(sC2),dC);

sC2 = kroneckerProduct(dA,sB);
dC = kroneckerProduct(dA,dB);
VERIFY_IS_APPROX(MatrixXf(sC2),dC);

sC2 = kroneckerProduct(sA,dB);
dC = kroneckerProduct(dA,dB);
VERIFY_IS_APPROX(MatrixXf(sC2),dC);

sC2 = kroneckerProduct(2*sA,sB);
dC = kroneckerProduct(2*dA,dB);
VERIFY_IS_APPROX(MatrixXf(sC2),dC);
@ -236,7 +237,6 @@ EIGEN_DECLARE_TEST(kronecker_product)
#ifdef EIGEN_TEST_PART_2

// simply check that for a dense kronecker product, sparse module is not needed

#include "main.h"
#include <Eigen/KroneckerProduct>

@ -23,9 +23,8 @@ inline bool test_isApprox_abs(const Type1& a, const Type2& b)

// Returns a matrix with eigenvalues clustered around 0, 1 and 2.
template<typename MatrixType>
MatrixType randomMatrixWithRealEivals(const typename MatrixType::Index size)
MatrixType randomMatrixWithRealEivals(const Index size)
{
typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
typedef typename MatrixType::RealScalar RealScalar;
MatrixType diag = MatrixType::Zero(size, size);
@ -42,16 +41,15 @@ template <typename MatrixType, int IsComplex = NumTraits<typename internal::trai
struct randomMatrixWithImagEivals
{
// Returns a matrix with eigenvalues clustered around 0 and +/- i.
static MatrixType run(const typename MatrixType::Index size);
static MatrixType run(const Index size);
};

// Partial specialization for real matrices
template<typename MatrixType>
struct randomMatrixWithImagEivals<MatrixType, 0>
{
static MatrixType run(const typename MatrixType::Index size)
static MatrixType run(const Index size)
{
typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
MatrixType diag = MatrixType::Zero(size, size);
Index i = 0;
@ -77,9 +75,8 @@ struct randomMatrixWithImagEivals<MatrixType, 0>
template<typename MatrixType>
struct randomMatrixWithImagEivals<MatrixType, 1>
{
static MatrixType run(const typename MatrixType::Index size)
static MatrixType run(const Index size)
{
typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
typedef typename MatrixType::RealScalar RealScalar;
const Scalar imagUnit(0, 1);
@ -171,7 +168,6 @@ void testMatrixType(const MatrixType& m)
{
// Matrices with clustered eigenvalue lead to different code paths
// in MatrixFunction.h and are thus useful for testing.
typedef typename MatrixType::Index Index;

const Index size = m.rows();
for (int i = 0; i < g_repeat; i++) {

@ -318,10 +318,6 @@ EIGEN_DECLARE_TEST(openglsupport)

GLint prg_id = createShader(vtx,frg);

typedef Vector2d Vector2d;
typedef Vector3d Vector3d;
typedef Vector4d Vector4d;

VERIFY_UNIFORM(dv,v2d, Vector2d);
VERIFY_UNIFORM(dv,v3d, Vector3d);
VERIFY_UNIFORM(dv,v4d, Vector4d);

@ -30,7 +30,6 @@ struct increment_if_fixed_size
template<int Deg, typename POLYNOMIAL, typename SOLVER>
bool aux_evalSolver( const POLYNOMIAL& pols, SOLVER& psolve )
{
typedef typename POLYNOMIAL::Index Index;
typedef typename POLYNOMIAL::Scalar Scalar;
typedef typename POLYNOMIAL::RealScalar RealScalar;
