Merged latest changes from upstream/eigen

Eugene Zhulenev 2018-08-01 11:59:04 -07:00
commit 385b3ff12f
61 changed files with 1319 additions and 357 deletions

View File

@@ -1,3 +1,4 @@
set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_WARNINGS "2000")
set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_ERRORS "2000")
+list(APPEND CTEST_CUSTOM_ERROR_EXCEPTION @EIGEN_CTEST_ERROR_EXCEPTION@)

View File

@@ -179,6 +179,7 @@ using std::ptrdiff_t;
#include "src/Core/arch/NEON/PacketMath.h"
#include "src/Core/arch/NEON/MathFunctions.h"
#include "src/Core/arch/NEON/Complex.h"
+#include "src/Core/arch/NEON/TypeCasting.h"
#elif defined EIGEN_VECTORIZE_ZVECTOR
#include "src/Core/arch/ZVector/PacketMath.h"
#include "src/Core/arch/ZVector/MathFunctions.h"

View File

@@ -258,48 +258,39 @@ pexp<Packet8d>(const Packet8d& _x) {
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
psqrt<Packet16f>(const Packet16f& _x) {
-  _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
-  _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
-  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000);
-  Packet16f neg_half = pmul(_x, p16f_minus_half);
-  // select only the inverse sqrt of positive normal inputs (denormals are
-  // flushed to zero and cause infs as well).
-  __mmask16 non_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_GE_OQ);
-  Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_setzero_ps(), _mm512_rsqrt14_ps(_x));
+  Packet16f neg_half = pmul(_x, pset1<Packet16f>(-.5f));
+  __mmask16 denormal_mask = _mm512_kand(
+      _mm512_cmp_ps_mask(_x, pset1<Packet16f>((std::numeric_limits<float>::min)()),
+                         _CMP_LT_OQ),
+      _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_GE_OQ));
+  Packet16f x = _mm512_rsqrt14_ps(_x);
  // Do a single step of Newton's iteration.
-  x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
-  // Multiply the original _x by it's reciprocal square root to extract the
-  // square root.
-  return pmul(_x, x);
+  x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet16f>(1.5f)));
+  // Flush results for denormals to zero.
+  return _mm512_mask_blend_ps(denormal_mask, pmul(_x,x), _mm512_setzero_ps());
}

template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
psqrt<Packet8d>(const Packet8d& _x) {
-  _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
-  _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
-  _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL);
-  Packet8d neg_half = pmul(_x, p8d_minus_half);
-  // select only the inverse sqrt of positive normal inputs (denormals are
-  // flushed to zero and cause infs as well).
-  __mmask8 non_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_GE_OQ);
-  Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_setzero_pd(), _mm512_rsqrt14_pd(_x));
-  // Do a first step of Newton's iteration.
-  x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
+  Packet8d neg_half = pmul(_x, pset1<Packet8d>(-.5f));
+  __mmask16 denormal_mask = _mm512_kand(
+      _mm512_cmp_pd_mask(_x, pset1<Packet8d>((std::numeric_limits<double>::min)()),
+                         _CMP_LT_OQ),
+      _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_GE_OQ));
+  Packet8d x = _mm512_rsqrt14_pd(_x);
+  // Do a single step of Newton's iteration.
+  x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5f)));
  // Do a second step of Newton's iteration.
-  x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
-  // Multiply the original _x by it's reciprocal square root to extract the
-  // square root.
-  return pmul(_x, x);
+  x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5f)));
+  return _mm512_mask_blend_pd(denormal_mask, pmul(_x,x), _mm512_setzero_pd());
}
#else
template <>
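
For reference, the Newton step used above refines an initial estimate y ≈ rsqrt(a) via y' = y * (1.5 - 0.5 * a * y * y), and sqrt(a) is then recovered as a * rsqrt(a). A minimal scalar sketch of the same scheme (illustrative only, not code from this commit):

#include <cmath>
#include <limits>

float psqrt_scalar(float a) {
  const float flt_min = (std::numeric_limits<float>::min)();
  bool denormal = (a < flt_min) && (a >= 0.0f);  // mirrors denormal_mask above
  float y = 1.0f / std::sqrt(a);                 // stands in for _mm512_rsqrt14_ps
  y = y * (-0.5f * a * (y * y) + 1.5f);          // one Newton step for rsqrt
  return denormal ? 0.0f : a * y;                // flush denormal inputs to zero
}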

View File

@@ -0,0 +1,48 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2018 Rasmus Munk Larsen <rmlarsen@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_TYPE_CASTING_NEON_H
#define EIGEN_TYPE_CASTING_NEON_H
namespace Eigen {
namespace internal {
template <>
struct type_casting_traits<float, int> {
enum {
VectorizedCast = 1,
SrcCoeffRatio = 1,
TgtCoeffRatio = 1
};
};
template <>
struct type_casting_traits<int, float> {
enum {
VectorizedCast = 1,
SrcCoeffRatio = 1,
TgtCoeffRatio = 1
};
};
template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
return vcvtq_s32_f32(a);
}
template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
return vcvtq_f32_s32(a);
}
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_TYPE_CASTING_NEON_H
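
A small usage sketch (assumes a NEON build; the float-to-int path truncates toward zero, as vcvtq_s32_f32 does):

using namespace Eigen::internal;
Packet4f f = pset1<Packet4f>(2.7f);
Packet4i i = pcast<Packet4f, Packet4i>(f); // {2, 2, 2, 2}
Packet4f g = pcast<Packet4i, Packet4f>(i); // {2.f, 2.f, 2.f, 2.f}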

View File

@@ -0,0 +1,104 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/*****************************************************************
* InteropHeaders.h
*
* \brief:
* InteropHeaders
*
*****************************************************************/
#ifndef EIGEN_INTEROP_HEADERS_SYCL_H
#define EIGEN_INTEROP_HEADERS_SYCL_H
#if defined EIGEN_USE_SYCL
namespace Eigen {
namespace internal {
#define SYCL_PACKET_TRAITS(packet_type, val, unpacket_type, lengths)\
template<> struct packet_traits<unpacket_type> : default_packet_traits\
{\
typedef packet_type type;\
typedef packet_type half;\
enum {\
Vectorizable = 1,\
AlignedOnScalar = 1,\
size=lengths,\
HasHalfPacket = 0,\
HasDiv = 1,\
HasLog = 1,\
HasExp = 1,\
HasSqrt = 1,\
HasRsqrt = 1,\
HasSin = 1,\
HasCos = 1,\
HasTan = 1,\
HasASin = 1,\
HasACos = 1,\
HasATan = 1,\
HasSinh = 1,\
HasCosh = 1,\
HasTanh = 1,\
HasLGamma = 0,\
HasDiGamma = 0,\
HasZeta = 0,\
HasPolygamma = 0,\
HasErf = 0,\
HasErfc = 0,\
HasIGamma = 0,\
HasIGammac = 0,\
HasBetaInc = 0,\
HasBlend = val,\
HasMax=1,\
HasMin=1,\
HasMul=1,\
HasAdd=1,\
HasFloor=1,\
HasRound=1,\
HasLog1p=1,\
HasExpm1=1,\
HasCeil=1,\
};\
};
SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, float, 4)
SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, const float, 4)
SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, double, 2)
SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, const double, 2)
#undef SYCL_PACKET_TRAITS
// Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
#define SYCL_ARITHMETIC(packet_type) template<> struct is_arithmetic<packet_type> { enum { value = true }; };
SYCL_ARITHMETIC(cl::sycl::cl_float4)
SYCL_ARITHMETIC(cl::sycl::cl_double2)
#undef SYCL_ARITHMETIC
#define SYCL_UNPACKET_TRAITS(packet_type, unpacket_type, lengths)\
template<> struct unpacket_traits<packet_type> {\
typedef unpacket_type type;\
enum {size=lengths, alignment=Aligned16};\
typedef packet_type half;\
};
SYCL_UNPACKET_TRAITS(cl::sycl::cl_float4, float, 4)
SYCL_UNPACKET_TRAITS(cl::sycl::cl_double2, double, 2)
#undef SYCL_UNPACKET_TRAITS
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_USE_SYCL
#endif // EIGEN_INTEROP_HEADERS_SYCL_H
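
A hedged sketch of what these specializations buy: with EIGEN_USE_SYCL defined, Eigen's generic vectorization machinery resolves the packet type and width for float and double through the usual traits queries.

typedef Eigen::internal::packet_traits<float>::type FloatPacket;               // cl::sycl::cl_float4
enum { FloatPacketSize = Eigen::internal::unpacket_traits<FloatPacket>::size }; // 4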

View File

@@ -0,0 +1,221 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/*****************************************************************
* MathFunctions.h
*
* \brief:
* MathFunctions
*
*****************************************************************/
#ifndef EIGEN_MATH_FUNCTIONS_SYCL_H
#define EIGEN_MATH_FUNCTIONS_SYCL_H
namespace Eigen {
namespace internal {
// Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
//#if defined(__SYCL_DEVICE_ONLY__) && defined(EIGEN_USE_SYCL)
#define SYCL_PLOG(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type plog<packet_type>(const packet_type& a) { return cl::sycl::log(a); }
SYCL_PLOG(cl::sycl::cl_float4)
SYCL_PLOG(cl::sycl::cl_double2)
#undef SYCL_PLOG
#define SYCL_PLOG1P(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type plog1p<packet_type>(const packet_type& a) { return cl::sycl::log1p(a); }
SYCL_PLOG1P(cl::sycl::cl_float4)
SYCL_PLOG1P(cl::sycl::cl_double2)
#undef SYCL_PLOG1P
#define SYCL_PLOG10(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type plog10<packet_type>(const packet_type& a) { return cl::sycl::log10(a); }
SYCL_PLOG10(cl::sycl::cl_float4)
SYCL_PLOG10(cl::sycl::cl_double2)
#undef SYCL_PLOG10
#define SYCL_PEXP(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type pexp<packet_type>(const packet_type& a) { return cl::sycl::exp(a); }
SYCL_PEXP(cl::sycl::cl_float4)
SYCL_PEXP(cl::sycl::cl_double2)
#undef SYCL_PEXP
#define SYCL_PEXPM1(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type pexpm1<packet_type>(const packet_type& a) { return cl::sycl::expm1(a); }
SYCL_PEXPM1(cl::sycl::cl_float4)
SYCL_PEXPM1(cl::sycl::cl_double2)
#undef SYCL_PEXPM1
#define SYCL_PSQRT(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type psqrt<packet_type>(const packet_type& a) { return cl::sycl::sqrt(a); }
SYCL_PSQRT(cl::sycl::cl_float4)
SYCL_PSQRT(cl::sycl::cl_double2)
#undef SYCL_PSQRT
#define SYCL_PRSQRT(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type prsqrt<packet_type>(const packet_type& a) { return cl::sycl::rsqrt(a); }
SYCL_PRSQRT(cl::sycl::cl_float4)
SYCL_PRSQRT(cl::sycl::cl_double2)
#undef SYCL_PRSQRT
/** \internal \returns the sine of \a a (coeff-wise) */
#define SYCL_PSIN(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type psin<packet_type>(const packet_type& a) { return cl::sycl::sin(a); }
SYCL_PSIN(cl::sycl::cl_float4)
SYCL_PSIN(cl::sycl::cl_double2)
#undef SYCL_PSIN
/** \internal \returns the cosine of \a a (coeff-wise) */
#define SYCL_PCOS(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type pcos<packet_type>(const packet_type& a) { return cl::sycl::cos(a); }
SYCL_PCOS(cl::sycl::cl_float4)
SYCL_PCOS(cl::sycl::cl_double2)
#undef SYCL_PCOS
/** \internal \returns the tangent of \a a (coeff-wise) */
#define SYCL_PTAN(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type ptan<packet_type>(const packet_type& a) { return cl::sycl::tan(a); }
SYCL_PTAN(cl::sycl::cl_float4)
SYCL_PTAN(cl::sycl::cl_double2)
#undef SYCL_PTAN
/** \internal \returns the arc sine of \a a (coeff-wise) */
#define SYCL_PASIN(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type pasin<packet_type>(const packet_type& a) { return cl::sycl::asin(a); }
SYCL_PASIN(cl::sycl::cl_float4)
SYCL_PASIN(cl::sycl::cl_double2)
#undef SYCL_PASIN
/** \internal \returns the arc cosine of \a a (coeff-wise) */
#define SYCL_PACOS(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type pacos<packet_type>(const packet_type& a) { return cl::sycl::acos(a); }
SYCL_PACOS(cl::sycl::cl_float4)
SYCL_PACOS(cl::sycl::cl_double2)
#undef SYCL_PACOS
/** \internal \returns the arc tangent of \a a (coeff-wise) */
#define SYCL_PATAN(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type patan<packet_type>(const packet_type& a) { return cl::sycl::atan(a); }
SYCL_PATAN(cl::sycl::cl_float4)
SYCL_PATAN(cl::sycl::cl_double2)
#undef SYCL_PATAN
/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
#define SYCL_PSINH(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type psinh<packet_type>(const packet_type& a) { return cl::sycl::sinh(a); }
SYCL_PSINH(cl::sycl::cl_float4)
SYCL_PSINH(cl::sycl::cl_double2)
#undef SYCL_PSINH
/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
#define SYCL_PCOSH(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type pcosh<packet_type>(const packet_type& a) { return cl::sycl::cosh(a); }
SYCL_PCOSH(cl::sycl::cl_float4)
SYCL_PCOSH(cl::sycl::cl_double2)
#undef SYCL_PCOSH
/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
#define SYCL_PTANH(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type ptanh<packet_type>(const packet_type& a) { return cl::sycl::tanh(a); }
SYCL_PTANH(cl::sycl::cl_float4)
SYCL_PTANH(cl::sycl::cl_double2)
#undef SYCL_PTANH
#define SYCL_PCEIL(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type pceil<packet_type>(const packet_type& a) { return cl::sycl::ceil(a); }
SYCL_PCEIL(cl::sycl::cl_float4)
SYCL_PCEIL(cl::sycl::cl_double2)
#undef SYCL_PCEIL
#define SYCL_PROUND(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type pround<packet_type>(const packet_type& a) { return cl::sycl::round(a); }
SYCL_PROUND(cl::sycl::cl_float4)
SYCL_PROUND(cl::sycl::cl_double2)
#undef SYCL_PROUND
#define SYCL_FLOOR(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type pfloor<packet_type>(const packet_type& a) { return cl::sycl::floor(a); }
SYCL_FLOOR(cl::sycl::cl_float4)
SYCL_FLOOR(cl::sycl::cl_double2)
#undef SYCL_FLOOR
#define SYCL_PMIN(packet_type, expr) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type pmin<packet_type>(const packet_type& a, const packet_type& b) { return expr; }
SYCL_PMIN(cl::sycl::cl_float4, cl::sycl::fmin(a, b))
SYCL_PMIN(cl::sycl::cl_double2, cl::sycl::fmin(a, b))
#undef SYCL_PMIN
#define SYCL_PMAX(packet_type, expr) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type pmax<packet_type>(const packet_type& a, const packet_type& b) { return expr; }
SYCL_PMAX(cl::sycl::cl_float4, cl::sycl::fmax(a, b))
SYCL_PMAX(cl::sycl::cl_double2, cl::sycl::fmax(a, b))
#undef SYCL_PMAX
//#endif
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_MATH_FUNCTIONS_SYCL_H
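
As a usage sketch (hypothetical device-side call; each specialization simply forwards to the corresponding cl::sycl built-in):

cl::sycl::cl_float4 v(1.f, 4.f, 9.f, 16.f);
cl::sycl::cl_float4 r = Eigen::internal::psqrt<cl::sycl::cl_float4>(v); // {1, 2, 3, 4}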

View File

@@ -0,0 +1,458 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/*****************************************************************
* PacketMath.h
*
* \brief:
* PacketMath
*
*****************************************************************/
#ifndef EIGEN_PACKET_MATH_SYCL_H
#define EIGEN_PACKET_MATH_SYCL_H
#include <type_traits>
#if defined EIGEN_USE_SYCL
namespace Eigen {
namespace internal {
#define SYCL_PLOADT_RO(address_space_target)\
template<typename packet_type, int Alignment>\
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type\
ploadt_ro(typename cl::sycl::multi_ptr<const typename unpacket_traits<packet_type>::type,\
cl::sycl::access::address_space::address_space_target>::pointer_t from) {\
typedef typename unpacket_traits<packet_type>::type scalar;\
typedef cl::sycl::multi_ptr<scalar, cl::sycl::access::address_space::address_space_target> multi_ptr;\
auto res=packet_type(static_cast<typename unpacket_traits<packet_type>::type>(0));\
res.load(0, multi_ptr(const_cast<typename multi_ptr::pointer_t>(from)));\
return res;\
}
SYCL_PLOADT_RO(global_space)
SYCL_PLOADT_RO(local_space)
#undef SYCL_PLOADT_RO
#define SYCL_PLOAD(address_space_target, Alignment, AlignedType)\
template<typename packet_type> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type\
pload##AlignedType(typename cl::sycl::multi_ptr<const typename unpacket_traits<packet_type>::type,\
cl::sycl::access::address_space::address_space_target>::pointer_t from) {\
return ploadt_ro<packet_type, Alignment>(from);\
}
// global space
SYCL_PLOAD(global_space, Unaligned, u)
SYCL_PLOAD(global_space, Aligned, )
// local space
SYCL_PLOAD(local_space, Unaligned, u)
SYCL_PLOAD(local_space, Aligned, )
// private space
//SYCL_PLOAD(private_space, Unaligned, u)
//SYCL_PLOAD(private_space, Aligned, )
#undef SYCL_PLOAD
/** \internal \returns a packet version of \a *from.
* The pointer \a from must be aligned on a \a Alignment bytes boundary. */
#define SYCL_PLOADT(address_space_target)\
template<typename packet_type, int Alignment>\
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt(\
typename cl::sycl::multi_ptr<const typename unpacket_traits<packet_type>::type,\
cl::sycl::access::address_space::address_space_target>::pointer_t from)\
{\
if(Alignment >= unpacket_traits<packet_type>::alignment)\
return pload<packet_type>(from);\
else\
return ploadu<packet_type>(from);\
}
// global space
SYCL_PLOADT(global_space)
// local space
SYCL_PLOADT(local_space)
//private_space
// There is no need to specialise it for private space as it can use the GenericPacketMath version
#define SYCL_PLOADT_RO_SPECIAL(packet_type, Alignment)\
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type\
ploadt_ro<packet_type, Alignment>(const typename unpacket_traits<packet_type>::type * from) { \
typedef typename unpacket_traits<packet_type>::type scalar;\
auto res=packet_type(static_cast<scalar>(0));\
res. template load<cl::sycl::access::address_space::private_space>(0, const_cast<scalar*>(from));\
return res;\
}
SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Aligned)
SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Aligned)
SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Unaligned)
SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Unaligned)
#define SYCL_PLOAD_SPECIAL(packet_type, alignment_type)\
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type\
pload##alignment_type(const typename unpacket_traits<packet_type>::type * from) { \
typedef typename unpacket_traits<packet_type>::type scalar;\
auto res=packet_type(static_cast<scalar>(0));\
res. template load<cl::sycl::access::address_space::private_space>(0, const_cast<scalar*>(from));\
return res;\
}
SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4,)
SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2,)
SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, u)
SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, u)
#undef SYCL_PLOAD_SPECIAL
#define SYCL_PSTORE(scalar, packet_type, address_space_target, alignment)\
template<>\
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
typename cl::sycl::multi_ptr<scalar, cl::sycl::access::address_space::address_space_target>::pointer_t to, \
const packet_type& from) {\
typedef cl::sycl::multi_ptr<scalar, cl::sycl::access::address_space::address_space_target> multi_ptr;\
from.store(0, multi_ptr(to));\
}
// global space
SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, )
SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, u)
SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, )
SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, u)
SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, )
SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, u)
SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, )
SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, u)
SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, )
SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, u)
SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, )
SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, u)
#define SYCL_PSTORE_T(scalar, packet_type, Alignment)\
template<>\
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret<scalar, packet_type, Alignment>(\
scalar* to,\
const packet_type& from) {\
if(Alignment)\
pstore(to, from);\
else\
pstoreu(to,from);\
}
SYCL_PSTORE_T(float, cl::sycl::cl_float4, Aligned)
SYCL_PSTORE_T(float, cl::sycl::cl_float4, Unaligned)
SYCL_PSTORE_T(double, cl::sycl::cl_double2, Aligned)
SYCL_PSTORE_T(double, cl::sycl::cl_double2, Unaligned)
#undef SYCL_PSTORE_T
#define SYCL_PSET1(packet_type)\
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pset1<packet_type>(\
const typename unpacket_traits<packet_type>::type& from) {\
return packet_type(from);\
}
// global space
SYCL_PSET1(cl::sycl::cl_float4)
SYCL_PSET1(cl::sycl::cl_double2)
#undef SYCL_PSET1
template <typename packet_type> struct get_base_packet {
template <typename sycl_multi_pointer>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type get_ploaddup(sycl_multi_pointer ) {}
template <typename sycl_multi_pointer>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type get_pgather(sycl_multi_pointer , Index ) {}
};
template <> struct get_base_packet <cl::sycl::cl_float4> {
template <typename sycl_multi_pointer>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_ploaddup(sycl_multi_pointer from) {
return cl::sycl::cl_float4(from[0], from[0], from[1], from[1]);
}
template <typename sycl_multi_pointer>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_pgather(sycl_multi_pointer from, Index stride) {
return cl::sycl::cl_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
}
template <typename sycl_multi_pointer>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(sycl_multi_pointer to , const cl::sycl::cl_float4& from, Index stride) {
auto tmp = stride;
to[0] = from.x();
to[tmp] = from.y();
to[tmp += stride] = from.z();
to[tmp += stride] = from.w();
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 set_plset(const float& a) {
return cl::sycl::cl_float4(static_cast<float>(a), static_cast<float>(a+1), static_cast<float>(a+2), static_cast<float>(a+3));
}
};
template <> struct get_base_packet <cl::sycl::cl_double2> {
template <typename sycl_multi_pointer>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 get_ploaddup(const sycl_multi_pointer from) {
return cl::sycl::cl_double2(from[0], from[0]);
}
template <typename sycl_multi_pointer, typename Index>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 get_pgather(const sycl_multi_pointer from, Index stride) {
return cl::sycl::cl_double2(from[0*stride], from[1*stride]);
}
template <typename sycl_multi_pointer>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(sycl_multi_pointer to , const cl::sycl::cl_double2& from, Index stride) {
to[0] = from.x();
to[stride] = from.y();
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 set_plset(const double& a) {
return cl::sycl::cl_double2(static_cast<double>(a), static_cast<double>(a + 1));
}
};
#define SYCL_PLOAD_DUP(address_space_target)\
template<typename packet_type> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \
ploaddup(typename cl::sycl::multi_ptr<const typename unpacket_traits<packet_type>::type,\
cl::sycl::access::address_space::address_space_target>::pointer_t from)\
{\
return get_base_packet<packet_type>::get_ploaddup(from); \
}
// global space
SYCL_PLOAD_DUP(global_space)
// local_space
SYCL_PLOAD_DUP(local_space)
// private_space
//SYCL_PLOAD_DUP(private_space)
#undef SYCL_PLOAD_DUP
#define SYCL_PLOAD_DUP_SPECILIZE(packet_type)\
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \
ploaddup<packet_type>(const typename unpacket_traits<packet_type>::type * from)\
{ \
return get_base_packet<packet_type>::get_ploaddup(from); \
}
SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_float4)
SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_double2)
#undef SYCL_PLOAD_DUP_SPECILIZE
#define SYCL_PLSET(packet_type)\
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type plset<packet_type>(const typename unpacket_traits<packet_type>::type& a) {\
return get_base_packet<packet_type>::set_plset(a);\
}
SYCL_PLSET(cl::sycl::cl_float4)
SYCL_PLSET(cl::sycl::cl_double2)
#undef SYCL_PLSET
#define SYCL_PGATHER(address_space_target)\
template<typename Scalar, typename packet_type> EIGEN_DEVICE_FUNC inline packet_type pgather(\
typename cl::sycl::multi_ptr<const typename unpacket_traits<packet_type>::type,\
cl::sycl::access::address_space::address_space_target>::pointer_t from, Index stride) {\
return get_base_packet<packet_type>::get_pgather(from, stride); \
}
// global space
SYCL_PGATHER(global_space)
// local space
SYCL_PGATHER(local_space)
// private space
//SYCL_PGATHER(private_space)
#undef SYCL_PGATHER
#define SYCL_PGATHER_SPECILIZE(scalar, packet_type)\
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \
pgather<scalar, packet_type>(const typename unpacket_traits<packet_type>::type * from, Index stride)\
{ \
return get_base_packet<packet_type>::get_pgather(from, stride); \
}
SYCL_PGATHER_SPECILIZE(float, cl::sycl::cl_float4)
SYCL_PGATHER_SPECILIZE(double, cl::sycl::cl_double2)
#undef SYCL_PGATHER_SPECILIZE
#define SYCL_PSCATTER(address_space_target)\
template<typename Scalar, typename packet_type> EIGEN_DEVICE_FUNC inline void pscatter(\
typename cl::sycl::multi_ptr<typename unpacket_traits<packet_type>::type,\
cl::sycl::access::address_space::address_space_target>::pointer_t to,\
const packet_type& from, Index stride) {\
get_base_packet<packet_type>::set_pscatter(to, from, stride);\
}
// global space
SYCL_PSCATTER(global_space)
// local space
SYCL_PSCATTER(local_space)
// private space
//SYCL_PSCATTER(private_space)
#undef SYCL_PSCATTER
#define SYCL_PSCATTER_SPECILIZE(scalar, packet_type)\
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void \
pscatter<scalar, packet_type>(typename unpacket_traits<packet_type>::type * to, const packet_type& from, Index stride)\
{ \
get_base_packet<packet_type>::set_pscatter(to, from, stride);\
}
SYCL_PSCATTER_SPECILIZE(float, cl::sycl::cl_float4)
SYCL_PSCATTER_SPECILIZE(double, cl::sycl::cl_double2)
#undef SYCL_PSCATTER_SPECILIZE
#define SYCL_PMAD(packet_type)\
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pmadd( const packet_type& a,\
const packet_type& b, const packet_type& c){\
return cl::sycl::mad(a,b,c);\
}
SYCL_PMAD(cl::sycl::cl_float4)
SYCL_PMAD(cl::sycl::cl_double2)
#undef SYCL_PMAD
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float pfirst<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
return a.x();
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double pfirst<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
return a.x();
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
return a.x() + a.y() + a.z() + a.w();
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
return a.x() + a.y();
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_max<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
return cl::sycl::fmax(cl::sycl::fmax(a.x(), a.y()), cl::sycl::fmax(a.z(), a.w()));
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_max<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
return cl::sycl::fmax(a.x(), a.y());
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_min<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
return cl::sycl::fmin(cl::sycl::fmin(a.x(), a.y()), cl::sycl::fmin(a.z(), a.w()));
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_min<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
return cl::sycl::fmin(a.x(), a.y());
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_mul<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
return a.x() * a.y() * a.z() * a.w();
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_mul<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
return a.x() * a.y();
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pabs<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
return cl::sycl::cl_float4(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y()), cl::sycl::fabs(a.z()), cl::sycl::fabs(a.w()));
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_double2 pabs<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
return cl::sycl::cl_double2(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y()));
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void
ptranspose(PacketBlock<cl::sycl::cl_float4,4>& kernel) {
float tmp = kernel.packet[0].y();
kernel.packet[0].y() = kernel.packet[1].x();
kernel.packet[1].x() = tmp;
// std::swap(kernel.packet[0].y(), kernel.packet[1].x());
tmp = kernel.packet[0].z();
kernel.packet[0].z() = kernel.packet[2].x();
kernel.packet[2].x() = tmp;
//std::swap(kernel.packet[0].z(), kernel.packet[2].x());
tmp = kernel.packet[0].w();
kernel.packet[0].w() = kernel.packet[3].x();
kernel.packet[3].x() = tmp;
//std::swap(kernel.packet[0].w(), kernel.packet[3].x());
tmp = kernel.packet[1].z();
kernel.packet[1].z() = kernel.packet[2].y();
kernel.packet[2].y() = tmp;
// std::swap(kernel.packet[1].z(), kernel.packet[2].y());
tmp = kernel.packet[1].w();
kernel.packet[1].w() = kernel.packet[3].y();
kernel.packet[3].y() = tmp;
// std::swap(kernel.packet[1].w(), kernel.packet[3].y());
tmp = kernel.packet[2].w();
kernel.packet[2].w() = kernel.packet[3].z();
kernel.packet[3].z() = tmp;
// std::swap(kernel.packet[2].w(), kernel.packet[3].z());
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void
ptranspose(PacketBlock<cl::sycl::cl_double2,2>& kernel) {
double tmp = kernel.packet[0].y();
kernel.packet[0].y() = kernel.packet[1].x();
kernel.packet[1].x() = tmp;
//std::swap(kernel.packet[0].y(), kernel.packet[1].x());
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4
pblend(const Selector<unpacket_traits<cl::sycl::cl_float4>::size>& ifPacket,
const cl::sycl::cl_float4& thenPacket, const cl::sycl::cl_float4& elsePacket) {
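// cl::sycl::select returns elsePacket wherever the condition's MSB is set,
// hence a true selector maps to 0 (keep thenPacket) and false maps to -1.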
cl::sycl::cl_int4 condition(ifPacket.select[0] ? 0 : -1,
ifPacket.select[1] ? 0 : -1,
ifPacket.select[2] ? 0 : -1,
ifPacket.select[3] ? 0 : -1);
return cl::sycl::select(thenPacket, elsePacket, condition);
}
template<> inline cl::sycl::cl_double2
pblend(const Selector<unpacket_traits<cl::sycl::cl_double2>::size>& ifPacket,
const cl::sycl::cl_double2& thenPacket, const cl::sycl::cl_double2& elsePacket) {
cl::sycl::cl_long2 condition(ifPacket.select[0] ? 0 : -1,
ifPacket.select[1] ? 0 : -1);
return cl::sycl::select(thenPacket, elsePacket, condition);
}
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_USE_SYCL
#endif // EIGEN_PACKET_MATH_SYCL_H

View File

@@ -0,0 +1,89 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/*****************************************************************
* TypeCasting.h
*
* \brief:
* TypeCasting
*
*****************************************************************/
#ifndef EIGEN_TYPE_CASTING_SYCL_H
#define EIGEN_TYPE_CASTING_SYCL_H
namespace Eigen {
namespace internal {
#ifdef __SYCL_DEVICE_ONLY__
template <>
struct type_casting_traits<float, int> {
enum {
VectorizedCast = 1,
SrcCoeffRatio = 1,
TgtCoeffRatio = 1
};
};
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_int4 pcast<cl::sycl::cl_float4, cl::sycl::cl_int4>(const cl::sycl::cl_float4& a) {
return a. template convert<cl::sycl::cl_int, cl::sycl::rounding_mode::automatic>();
}
template <>
struct type_casting_traits<int, float> {
enum {
VectorizedCast = 1,
SrcCoeffRatio = 1,
TgtCoeffRatio = 1
};
};
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pcast<cl::sycl::cl_int4, cl::sycl::cl_float4>(const cl::sycl::cl_int4& a) {
return a. template convert<cl::sycl::cl_float, cl::sycl::rounding_mode::automatic>();
}
template <>
struct type_casting_traits<double, float> {
enum {
VectorizedCast = 1,
SrcCoeffRatio = 2,
TgtCoeffRatio = 1
};
};
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pcast<cl::sycl::cl_double2, cl::sycl::cl_float4>(const cl::sycl::cl_double2& a, const cl::sycl::cl_double2& b) {
auto a1=a. template convert<cl::sycl::cl_float, cl::sycl::rounding_mode::automatic>();
auto b1=b. template convert<cl::sycl::cl_float, cl::sycl::rounding_mode::automatic>();
return cl::sycl::float4(a1.x(), a1.y(), b1.x(), b1.y());
}
template <>
struct type_casting_traits<float, double> {
enum {
VectorizedCast = 1,
SrcCoeffRatio = 1,
TgtCoeffRatio = 2
};
};
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_double2 pcast<cl::sycl::cl_float4, cl::sycl::cl_double2>(const cl::sycl::cl_float4& a) {
// Simply discard the second half of the input
return cl::sycl::cl_double2(a.x(), a.y());
}
#endif
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_TYPE_CASTING_SYCL_H

View File

@@ -972,7 +972,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
internal::prefetch(blA+(3*K+16)*LhsProgress); \
-if (EIGEN_ARCH_ARM) { internal::prefetch(blB+(4*K+16)*RhsProgress); } /* Bug 953 */ \
+if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { internal::prefetch(blB+(4*K+16)*RhsProgress); } /* Bug 953 */ \
traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \

View File

@@ -518,6 +518,8 @@
#endif

// Does the compiler support C99?
+// Need to include <cmath> to make sure _GLIBCXX_USE_C99 gets defined
+#include <cmath>
#ifndef EIGEN_HAS_C99_MATH
#if EIGEN_MAX_CPP_VER>=11 && \
((defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901)) \

@@ -1074,4 +1076,17 @@ namespace Eigen {
# endif
#endif

+#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+// The all function is used to enable a variadic version of eigen_assert which can take a parameter pack as its input.
+namespace Eigen {
+namespace internal {
+
+bool all(){ return true; }
+
+template<typename T, typename ...Ts>
+bool all(T t, Ts ... ts){ return t && all(ts...); }
+
+}
+}
+#endif
+
#endif // EIGEN_MACROS_H
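
A hedged sketch of the intended use (hypothetical call site, not part of the diff): the parameter pack folds into a chain of &&, so a single assert can guard several conditions at once.

bool ok(int rows, int cols, const float* data) {
  // equivalent to (rows > 0) && (cols > 0) && (data != 0)
  return Eigen::internal::all(rows > 0, cols > 0, data != 0);
}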

View File

@@ -19,6 +19,7 @@ include(CTest)
set(EIGEN_TEST_BUILD_FLAGS "" CACHE STRING "Options passed to the build command of unit tests")
set(EIGEN_DASHBOARD_BUILD_TARGET "buildtests" CACHE STRING "Target to be built in dashboard mode, default is buildtests")
+set(EIGEN_CTEST_ERROR_EXCEPTION "" CACHE STRING "Regular expression for build error messages to be filtered out")

# Overwrite default DartConfiguration.tcl such that ctest can build our unit tests.
# Recall that our unit tests are not in the "all" target, so we have to explicitly ask ctest to build our custom 'buildtests' target.

View File

@@ -79,7 +79,7 @@ These examples are just intended to give the reader a first impression of how fu
\section TopicUsingRefClass How to write generic, but non-templated function?

-In all the previous examples, the functions had to be template functions. This approach allows to write very generic code, but it is often desirable to write non templated function and still keep some level of genericity to avoid stupid copies of the arguments. The typical example is to write functions accepting both a MatrixXf or a block of a MatrixXf. This exactly the purpose of the Ref class. Here is a simple example:
+In all the previous examples, the functions had to be template functions. This approach allows to write very generic code, but it is often desirable to write non templated functions and still keep some level of genericity to avoid stupid copies of the arguments. The typical example is to write functions accepting both a MatrixXf or a block of a MatrixXf. This is exactly the purpose of the Ref class. Here is a simple example:

<table class="example">
<tr><th>Example:</th><th>Output:</th></tr>
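
The table's snippet is not reproduced here; a minimal sketch of the Ref pattern it illustrates (assumed example, not the one from the docs):

#include <Eigen/Core>
using namespace Eigen;

// One non-template function that accepts a MatrixXf, a block of one, a Map, ...
float sum_of(const Ref<const MatrixXf>& m) { return m.sum(); }

int main() {
  MatrixXf A = MatrixXf::Ones(4, 4);
  float s1 = sum_of(A);                   // whole matrix, no copy
  float s2 = sum_of(A.block(1, 1, 2, 2)); // block, still no temporary
  return (s1 == 16.f && s2 == 4.f) ? 0 : 1;
}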

View File

@@ -3,5 +3,5 @@ m << 1, 0,
1, 1;
cout << "Comparing m with identity matrix:" << endl;
cout << m.cwiseEqual(MatrixXi::Identity(2,2)) << endl;
-int count = m.cwiseEqual(MatrixXi::Identity(2,2)).count();
+Index count = m.cwiseEqual(MatrixXi::Identity(2,2)).count();
cout << "Number of coefficients that are equal: " << count << endl;

View File

@@ -3,5 +3,5 @@ m << 1, 0,
1, 1;
cout << "Comparing m with identity matrix:" << endl;
cout << m.cwiseNotEqual(MatrixXi::Identity(2,2)) << endl;
-int count = m.cwiseNotEqual(MatrixXi::Identity(2,2)).count();
+Index count = m.cwiseNotEqual(MatrixXi::Identity(2,2)).count();
cout << "Number of coefficients that are not equal: " << count << endl;

View File

@@ -33,6 +33,9 @@ class AnnoyingScalar
AnnoyingScalar(float _v) { init(); *v = _v; }
AnnoyingScalar(int _v) { init(); *v = _v; }
AnnoyingScalar(long _v) { init(); *v = _v; }
+#if EIGEN_HAS_CXX11
+    AnnoyingScalar(long long _v) { init(); *v = _v; }
+#endif
AnnoyingScalar(const AnnoyingScalar& other) { init(); *v = *(other.v); }
~AnnoyingScalar() {
if(v!=&data)

View File

@@ -8,13 +8,27 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

-// work around "uninitialized" warnings and give that option some testing
-#define EIGEN_INITIALIZE_MATRICES_BY_ZERO
+#if defined(EIGEN_TEST_PART_7)

#ifndef EIGEN_NO_STATIC_ASSERT
#define EIGEN_NO_STATIC_ASSERT // turn static asserts into runtime asserts in order to check them
#endif

+// ignore double-promotion diagnostic for clang and gcc, if we check for static assertion anyway:
+// TODO do the same for MSVC?
+#if defined(__clang__)
+#  if (__clang_major__ * 100 + __clang_minor__) >= 308
+#    pragma clang diagnostic ignored "-Wdouble-promotion"
+#  endif
+#elif defined(__GNUC__)
+// TODO is there a minimal GCC version for this? At least g++-4.7 seems to be fine with this.
+#  pragma GCC diagnostic ignored "-Wdouble-promotion"
+#endif
+
+#endif

#if defined(EIGEN_TEST_PART_1) || defined(EIGEN_TEST_PART_2) || defined(EIGEN_TEST_PART_3)
#ifndef EIGEN_DONT_VECTORIZE
@@ -35,6 +49,28 @@ using namespace std;
VERIFY_IS_APPROX(XPR,REF); \
VERIFY( g_called && #XPR" not properly optimized");

+template<int SizeAtCompileType>
+void raise_assertion(Index size = SizeAtCompileType)
+{
+  // VERIFY_RAISES_ASSERT(mf+md); // does not even compile
+  Matrix<float, SizeAtCompileType, 1> vf; vf.setRandom(size);
+  Matrix<double, SizeAtCompileType, 1> vd; vd.setRandom(size);
+  VERIFY_RAISES_ASSERT(vf=vd);
+  VERIFY_RAISES_ASSERT(vf+=vd);
+  VERIFY_RAISES_ASSERT(vf-=vd);
+  VERIFY_RAISES_ASSERT(vd=vf);
+  VERIFY_RAISES_ASSERT(vd+=vf);
+  VERIFY_RAISES_ASSERT(vd-=vf);
+
+  // vd.asDiagonal() * mf; // does not even compile
+  // vcd.asDiagonal() * mf; // does not even compile
+
+#if 0 // we get other compilation errors here than just static asserts
+  VERIFY_RAISES_ASSERT(vd.dot(vf));
+#endif
+}
+
template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
{
typedef std::complex<float> CF;
@@ -73,13 +109,6 @@ template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
while(std::abs(scf)<epsf) scf = internal::random<CF>();
while(std::abs(scd)<epsd) scd = internal::random<CD>();

-  // VERIFY_RAISES_ASSERT(mf+md); // does not even compile
-#ifdef EIGEN_DONT_VECTORIZE
-  VERIFY_RAISES_ASSERT(vf=vd);
-  VERIFY_RAISES_ASSERT(vf+=vd);
-#endif

// check scalar products
VERIFY_MIX_SCALAR(vcf * sf , vcf * complex<float>(sf));
VERIFY_MIX_SCALAR(sd * vcd , complex<double>(sd) * vcd);
@@ -119,9 +148,6 @@ template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
// check dot product
vf.dot(vf);
-#if 0 // we get other compilation errors here than just static asserts
-  VERIFY_RAISES_ASSERT(vd.dot(vf));
-#endif
VERIFY_IS_APPROX(vcf.dot(vf), vcf.dot(vf.template cast<complex<float> >()));

// check diagonal product
@@ -130,9 +156,6 @@ template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
VERIFY_IS_APPROX(mcf * vf.asDiagonal(), mcf * vf.template cast<complex<float> >().asDiagonal());
VERIFY_IS_APPROX(md * vcd.asDiagonal(), md.template cast<complex<double> >() * vcd.asDiagonal());
-  // vd.asDiagonal() * mf; // does not even compile
-  // vcd.asDiagonal() * mf; // does not even compile

// check inner product
VERIFY_IS_APPROX((vf.transpose() * vcf).value(), (vf.template cast<complex<float> >().transpose() * vcf).value());
@@ -296,5 +319,10 @@ EIGEN_DECLARE_TEST(mixingtypes)
CALL_SUBTEST_4(mixingtypes<3>());
CALL_SUBTEST_5(mixingtypes<4>());
CALL_SUBTEST_6(mixingtypes<Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE)));
+    CALL_SUBTEST_7(raise_assertion<Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE)));
}
+  CALL_SUBTEST_7(raise_assertion<0>());
+  CALL_SUBTEST_7(raise_assertion<3>());
+  CALL_SUBTEST_7(raise_assertion<4>());
+  CALL_SUBTEST_7(raise_assertion<Dynamic>(0));
}

View File

@@ -112,7 +112,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
-    EIGEN_DEVICE_FUNC inline const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
{
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)

View File

@@ -98,7 +98,7 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions;

-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
static const int NumDims = XprType::NumDims;

enum {

View File

@@ -104,7 +104,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
bool isCopy= false, nByOne = false, oneByN = false;

enum {

@@ -306,7 +306,13 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
if (isCopy) {
+#ifdef EIGEN_GPU_COMPILE_PHASE
+      // See PR 437: on NVIDIA P100 and K20m we observed a x3-4 speed up by enforcing
+      // unaligned loads here. The reason is unclear though.
+      return m_impl.template packet<Unaligned>(index);
+#else
return m_impl.template packet<LoadMode>(index);
+#endif
} else if (oneByN && !nByOne) {
return packetNByOne<LoadMode>(index);
} else if (!oneByN && nByOne) {

@@ -318,7 +324,12 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
}
} else {
if (isCopy) {
+#ifdef EIGEN_GPU_COMPILE_PHASE
+      // See above.
+      return m_impl.template packet<Unaligned>(index);
+#else
return m_impl.template packet<LoadMode>(index);
+#endif
} else if (oneByN && !nByOne) {
return packetOneByN<LoadMode>(index);
} else if (!oneByN && nByOne) {

View File

@@ -138,7 +138,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;

enum {

@@ -417,7 +417,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;

enum {
IsAligned = false,

View File

@@ -251,7 +251,7 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
-    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    const int packetSize = PacketType<CoeffReturnType, Device>::size;
EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index + packetSize - 1 < dimensions().TotalSize());

@@ -354,7 +354,7 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
-    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    const int packetSize = PacketType<CoeffReturnType, Device>::size;
EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index + packetSize - 1 < this->dimensions().TotalSize());

View File

@@ -177,9 +177,9 @@ struct NoOpOutputKernel {
*/
template <typename Index, typename Scalar>
EIGEN_ALWAYS_INLINE void operator()(
-      const OutputKernel::OutputMapper<Index, Scalar>& output_mapper,
-      const TensorContractionParams& params, Index i, Index j, Index num_rows,
-      Index num_cols) const {}
+      const OutputKernel::OutputMapper<Index, Scalar>& /*output_mapper*/,
+      const TensorContractionParams& /*params*/, Index /*i*/,
+      Index /*j*/, Index /*num_rows*/, Index /*num_cols*/) const {}
};
template<typename Indices, typename LhsXprType, typename RhsXprType, typename OutputKernelType = const NoOpOutputKernel> template<typename Indices, typename LhsXprType, typename RhsXprType, typename OutputKernelType = const NoOpOutputKernel>
@@ -239,7 +239,7 @@ struct TensorContractionEvaluatorBase
enum {
IsAligned = true,
-    PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = false,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
CoordAccess = false,  // to be implemented
@@ -468,42 +468,58 @@ struct TensorContractionEvaluatorBase
    }
  }

-  EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const {
-    if (this->m_lhs_inner_dim_contiguous) {
-      if (this->m_rhs_inner_dim_contiguous) {
-        if (this->m_rhs_inner_dim_reordered) {
-          static_cast<const Derived*>(this)->template evalProduct<true, true, true, Unaligned>(buffer);
-        }
-        else {
-          static_cast<const Derived*>(this)->template evalProduct<true, true, false, Unaligned>(buffer);
-        }
-      }
-      else {
-        if (this->m_rhs_inner_dim_reordered) {
-          static_cast<const Derived*>(this)->template evalProduct<true, false, true, Unaligned>(buffer);
-        }
-        else {
-          static_cast<const Derived*>(this)->template evalProduct<true, false, false, Unaligned>(buffer);
-        }
-      }
-    }
-    else {
-      if (this->m_rhs_inner_dim_contiguous) {
-        if (this->m_rhs_inner_dim_reordered) {
-          static_cast<const Derived*>(this)->template evalProduct<false, true, true, Unaligned>(buffer);
-        }
-        else {
-          static_cast<const Derived*>(this)->template evalProduct<false, true, false, Unaligned>(buffer);
-        }
-      }
-      else {
-        if (this->m_rhs_inner_dim_reordered) {
-          static_cast<const Derived*>(this)->template evalProduct<false, false, true, Unaligned>(buffer);
-        }
-        else {
-          static_cast<const Derived*>(this)->template evalProduct<false, false, false, Unaligned>(buffer);
-        }
-      }
-    }
-  }
+#define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS) \
+    if (this->m_lhs_inner_dim_contiguous) { \
+      if (this->m_rhs_inner_dim_contiguous) { \
+        if (this->m_rhs_inner_dim_reordered) { \
+          METHOD<true, true, true, ALIGNMENT>ARGS; \
+        } \
+        else { \
+          METHOD<true, true, false, ALIGNMENT>ARGS; \
+        } \
+      } \
+      else { \
+        if (this->m_rhs_inner_dim_reordered) { \
+          METHOD<true, false, true, ALIGNMENT>ARGS; \
+        } \
+        else { \
+          METHOD<true, false, false, ALIGNMENT>ARGS; \
+        } \
+      } \
+    } \
+    else { \
+      if (this->m_rhs_inner_dim_contiguous) { \
+        if (this->m_rhs_inner_dim_reordered) { \
+          METHOD<false, true, true, ALIGNMENT>ARGS; \
+        } \
+        else { \
+          METHOD<false, true, false, ALIGNMENT>ARGS; \
+        } \
+      } \
+      else { \
+        if (this->m_rhs_inner_dim_reordered) { \
+          METHOD<false, false, true, ALIGNMENT>ARGS; \
+        } \
+        else { \
+          METHOD<false, false, false, ALIGNMENT>ARGS; \
+        } \
+      } \
+    }
+
+  EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const {
+    static_cast<const Derived*>(this)->template evalProduct<Unaligned>(buffer);
+  }
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
+            bool rhs_inner_dim_reordered, int Alignment>
+  void evalProductSequential(Scalar* buffer) const {
+    if (this->m_j_size == 1) {
+      this->template evalGemv<lhs_inner_dim_contiguous,
+                              rhs_inner_dim_contiguous, rhs_inner_dim_reordered,
+                              Alignment>(buffer);
+    } else {
+      this->template evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous,
+                              rhs_inner_dim_reordered, Alignment>(buffer);
+    }
+  }
@@ -624,7 +640,7 @@ struct TensorContractionEvaluatorBase
OutputMapper output(buffer, m);

// Sizes of the blocks to load in cache. See the Goto paper for details.
-    internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, 1);
+    internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index, internal::ShardByCol> blocking(k, m, n, 1);
const Index kc = blocking.kc();
const Index mc = numext::mini(m, blocking.mc());
const Index nc = numext::mini(n, blocking.nc());
@ -977,14 +993,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
Base(op, device) { } Base(op, device) { }
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> template <int Alignment>
EIGEN_DEVICE_FUNC void evalProduct(Scalar* buffer) const { void evalProduct(Scalar* buffer) const {
if (this->m_j_size == 1) { TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential, Alignment, (buffer));
this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
return;
}
this->template evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
} }
}; };
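Aside: the dispatch above is the standard trick for folding runtime layout flags into template parameters, so each of the eight contraction kernels is compiled with compile-time-constant flags. A minimal, self-contained sketch of the same pattern (all names here are hypothetical, not Eigen API):

    #include <cstdio>

    // Inside the kernel the flags are compile-time constants, so dead
    // branches vanish and loads can be specialized per memory layout.
    template <bool LhsContiguous, bool RhsContiguous>
    void kernel() {
      std::printf("lhs=%d rhs=%d\n", int(LhsContiguous), int(RhsContiguous));
    }

    // Exhaustive branching maps the runtime bools onto template parameters.
    void dispatch(bool lhs_contiguous, bool rhs_contiguous) {
      if (lhs_contiguous) {
        if (rhs_contiguous) kernel<true, true>();
        else                kernel<true, false>();
      } else {
        if (rhs_contiguous) kernel<false, true>();
        else                kernel<false, false>();
      }
    }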
@@ -21,13 +21,10 @@ enum {

 // Default Blocking Strategy
-template <typename LhsMapper, typename RhsMapper, typename Index, int ShardingType=ShardByCol>
+template <typename LhsScalar, typename RhsScalar, typename Index, int ShardingType=ShardByCol>
 class TensorContractionBlocking {
  public:
-  typedef typename LhsMapper::Scalar LhsScalar;
-  typedef typename RhsMapper::Scalar RhsScalar;

 /*
   adding EIGEN_DEVICE_FUNC unconditionally to 'TensorContractionBlocking' constructor in `TensorContractionBlocking.h`
   requires adding EIGEN_DEVICE_FUNC to `computeProductBlockingSizes` in `GeneralBlockPanelKernel.h`
@@ -41,7 +38,7 @@ class TensorContractionBlocking {
   ../Eigen/src/Core/products/GeneralBlockPanelKernel.h(57): error #2901:
      dynamic initialization is not supported for function-scope static variables within a __device__/__global__ function
 */
 #if !defined(EIGEN_HIPCC)
   EIGEN_DEVICE_FUNC
 #endif
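Dropping the mapper typedefs works because the blocking heuristic only ever needed the two scalar types; it forwards to the GEBP sizing helper. A hedged usage sketch (the KcFactor of 1 and the exact template arguments are assumptions for illustration):

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Index k = 256, m = 1024, n = 512;
      // Sketch: on return k/m/n hold cache-aware kc/mc/nc tile sizes,
      // derived from the scalar types alone (internal API, may change).
      Eigen::internal::computeProductBlockingSizes<float, float, 1>(
          k, m, n, /*num_threads=*/1);
    }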
@@ -71,8 +71,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
   TensorEvaluator(const XprType& op, const Device& device) :
       Base(op, device) {}

-  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
-            bool rhs_inner_dim_reordered, int Alignment>
+  template <int Alignment>
   void evalProduct(Scalar* buffer) const {
     const Index m = this->m_i_size;
     const Index n = this->m_j_size;
@@ -96,39 +95,6 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     }
 #endif

-    typedef
-        typename internal::remove_const<typename EvalLeftArgType::Scalar>::type
-            LhsScalar;
-    typedef
-        typename internal::remove_const<typename EvalRightArgType::Scalar>::type
-            RhsScalar;
-    typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
-    typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
-    typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
-    typedef internal::TensorContractionInputMapper<
-        LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t,
-        contract_t, internal::packet_traits<LhsScalar>::size,
-        lhs_inner_dim_contiguous, false, Unaligned>
-        LhsMapper;
-    typedef internal::TensorContractionInputMapper<
-        RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t,
-        contract_t, internal::packet_traits<RhsScalar>::size,
-        rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned>
-        RhsMapper;
-    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
-    typedef internal::gemm_pack_lhs<LhsScalar, Index,
-                                    typename LhsMapper::SubMapper, Traits::mr,
-                                    Traits::LhsProgress, ColMajor>
-        LhsPacker;
-    typedef internal::gemm_pack_rhs<
-        RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor>
-        RhsPacker;
-    typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
-                                  Traits::mr, Traits::nr, false, false>
-        GebpKernel;
-
     // Compute a set of algorithm parameters:
     // - kernel block sizes (bm, bn, bk)
     // - task grain sizes (number of kernels executed per task: gm, gn)
@@ -158,14 +124,14 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     // Again, we don't know number of threads yet, so we use 2.
     Index bm, bn, bk;
     if (shard_by_col) {
-      internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
+      internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index,
                                           internal::ShardByCol>
           blocking(k, m, n, 2);
       bm = blocking.mc();
       bn = blocking.nc();
       bk = blocking.kc();
     } else {
-      internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
+      internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index,
                                           internal::ShardByRow>
           blocking(k, m, n, 2);
       bm = blocking.mc();
@@ -187,29 +153,22 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     if (n == 1) num_threads = 1;

     if (num_threads == 1) {
-      // The single-threaded algorithm should be faster in this case.
-      if (n == 1)
-        this->template evalGemv<lhs_inner_dim_contiguous,
-                                rhs_inner_dim_contiguous,
-                                rhs_inner_dim_reordered, Alignment>(buffer);
-      else
-        this->template evalGemm<lhs_inner_dim_contiguous,
-                                rhs_inner_dim_contiguous,
-                                rhs_inner_dim_reordered, Alignment>(buffer);
+      TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential,
+                                  Unaligned, (buffer));
       return;
     }

     // Now that we know number of threads, recalculate sharding and blocking.
     shard_by_col = shardByCol(m, n, num_threads);
     if (shard_by_col) {
-      internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
+      internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index,
                                           internal::ShardByCol>
          blocking(k, m, n, num_threads);
       bm = blocking.mc();
       bn = blocking.nc();
       bk = blocking.kc();
     } else {
-      internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
+      internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index,
                                           internal::ShardByRow>
          blocking(k, m, n, num_threads);
       bm = blocking.mc();
@@ -257,34 +216,55 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     // more important in this case.
     if ((shard_by_col ? nm : nn) == 1) parallel_pack = false;

-    LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides,
-                  this->m_i_strides, this->m_left_contracting_strides,
-                  this->m_k_strides);
-
-    RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides,
-                  this->m_j_strides, this->m_right_contracting_strides,
-                  this->m_k_strides);
-
-    Context<LhsPacker, RhsPacker, GebpKernel, LhsMapper, RhsMapper,
-            OutputMapper>(this, num_threads, lhs, rhs, buffer, m, n,
-                          k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0,
-                          shard_by_col, parallel_pack)
-        .run();
+#define CONTEXT_ARGS                                                        \
+  (this, num_threads, buffer, m, n, k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, \
+   nn0, shard_by_col, parallel_pack)                                        \
+      .run()
+
+    TENSOR_CONTRACTION_DISPATCH(Context, Alignment, CONTEXT_ARGS);
+
+#undef CONTEXT_ARGS
   }

   // Context coordinates a single parallel gemm operation.
-  template <typename LhsPacker, typename RhsPacker, typename GebpKernel,
-            typename LhsMapper, typename RhsMapper, typename OutputMapper>
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
+            bool rhs_inner_dim_reordered, int Alignment>
   class Context {
    public:
-    Context(const Self* self, int num_threads, LhsMapper& lhs,
-            RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk, Index bm,
-            Index bn, Index bk, Index nm, Index nn, Index nk, Index gm,
-            Index gn, Index nm0, Index nn0, bool shard_by_col,
+    typedef internal::TensorContractionInputMapper<
+        LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t,
+        contract_t, internal::packet_traits<LhsScalar>::size,
+        lhs_inner_dim_contiguous, false, Unaligned>
+        LhsMapper;
+    typedef internal::TensorContractionInputMapper<
+        RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t,
+        contract_t, internal::packet_traits<RhsScalar>::size,
+        rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned>
+        RhsMapper;
+    typedef internal::gemm_pack_lhs<LhsScalar, Index,
+                                    typename LhsMapper::SubMapper, Traits::mr,
+                                    Traits::LhsProgress, ColMajor>
+        LhsPacker;
+    typedef internal::gemm_pack_rhs<
+        RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor>
+        RhsPacker;
+    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+    typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
+                                  Traits::mr, Traits::nr, false, false>
+        GebpKernel;
+
+    Context(const Self* self, int num_threads, Scalar* buffer, Index tm, Index tn,
+            Index tk, Index bm, Index bn, Index bk, Index nm, Index nn, Index nk,
+            Index gm, Index gn, Index nm0, Index nn0, bool shard_by_col,
             bool parallel_pack)
         : device_(self->m_device),
-          lhs_(lhs),
-          rhs_(rhs),
+          lhs_(self->m_leftImpl, self->m_left_nocontract_strides,
+               self->m_i_strides, self->m_left_contracting_strides,
+               self->m_k_strides),
+          rhs_(self->m_rightImpl, self->m_right_nocontract_strides,
+               self->m_j_strides, self->m_right_contracting_strides,
+               self->m_k_strides),
          buffer_(buffer),
          output_(buffer, tm),
          output_kernel_(self->m_output_kernel),
@@ -337,7 +317,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
           divup<size_t>(bm_ * bk_ * sizeof(LhsScalar), align) * align;
       size_t rhs_size =
           divup<size_t>(bn_ * bk_ * sizeof(RhsScalar), align) * align;
-      packed_mem_ = static_cast<char*>(internal::aligned_malloc(
+      packed_mem_ = static_cast<char*>(device_.allocate(
          (nm0_ * lhs_size + nn0_ * rhs_size) * std::min<size_t>(nk_, P - 1)));
       char* mem = static_cast<char*>(packed_mem_);
       for (Index x = 0; x < numext::mini<Index>(nk_, P - 1); x++) {
@@ -359,7 +339,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
         for (Index m = 0; m < nm_; m++) delete[] state_kernel_[x][m];
         delete[] state_kernel_[x];
       }
-      internal::aligned_free(packed_mem_);
+      device_.deallocate(packed_mem_);
     }

     void run() {
@@ -376,8 +356,8 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
    private:
     Notification done_;
     const Device& device_;
-    LhsMapper& lhs_;
-    RhsMapper& rhs_;
+    LhsMapper lhs_;
+    RhsMapper rhs_;
     Scalar* const buffer_;
     OutputMapper output_;
     OutputKernelType output_kernel_;
@@ -190,7 +190,7 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
   typedef typename internal::remove_all<typename internal::traits<ArgType>::Scalar>::type SrcType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef typename PacketType<SrcType, Device>::type PacketSourceType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;

   enum {
     IsAligned = false,
@@ -302,7 +302,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;

   enum {
     IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
@@ -87,11 +87,11 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
   typedef typename internal::remove_const<typename ArgType::Scalar>::type Scalar;
   typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;

   enum {
     IsAligned = false,
-    PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
     BlockAccess = false,
     Layout = TensorEvaluator<XprType, Device>::Layout,
     CoordAccess = false,  // to be implemented
@@ -112,7 +112,7 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
       return false;
     } else {
       m_result = static_cast<CoeffReturnType*>(
-          m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
+          m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)));
       evalTo(m_result);
       return true;
     }
@@ -120,7 +120,7 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
     if (m_result != NULL) {
-      m_device.deallocate(m_result);
+      m_device.deallocate_temp(m_result);
       m_result = NULL;
     }
   }
@@ -249,11 +249,11 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
   typedef typename XprType::Scalar Scalar;
   typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;

   enum {
     IsAligned = false,
-    PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
     BlockAccess = false,
     Layout = TensorEvaluator<LhsXprType, Device>::Layout,
     CoordAccess = false,  // to be implemented
@@ -273,7 +273,7 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
       evalTo(data);
       return false;
     } else {
-      m_result = static_cast<Scalar *>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
+      m_result = static_cast<Scalar *>(m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)));
       evalTo(m_result);
       return true;
     }
@@ -281,7 +281,7 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
     if (m_result != NULL) {
-      m_device.deallocate(m_result);
+      m_device.deallocate_temp(m_result);
       m_result = NULL;
     }
   }
@@ -20,6 +20,12 @@ struct DefaultDevice {
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
     internal::aligned_free(buffer);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const {
+    return allocate(num_bytes);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const {
+    deallocate(buffer);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
     ::memcpy(dst, src, n);
@@ -207,6 +207,15 @@ struct GpuDevice {
     stream_->deallocate(buffer);
   }

+  EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const {
+    return stream_->allocate(num_bytes);
+  }
+
+  EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const {
+    stream_->deallocate(buffer);
+  }
+
   EIGEN_STRONG_INLINE void* scratchpad() const {
     return stream_->scratchpad();
   }
@@ -105,6 +105,14 @@ struct ThreadPoolDevice {
     internal::aligned_free(buffer);
   }

+  EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const {
+    return allocate(num_bytes);
+  }
+  EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const {
+    deallocate(buffer);
+  }
+
   EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
     ::memcpy(dst, src, n);
   }
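All three devices now pair allocate_temp/deallocate_temp with their regular allocators, which gives evaluators a single hook for short-lived scratch memory (and leaves room for a device to substitute a pooled allocator later without touching the evaluators). A hypothetical RAII helper, sketched here only to illustrate the intended usage pattern; it is not part of Eigen:

    // Hypothetical helper: scoped ownership of a device temp buffer.
    template <typename Device>
    class ScopedTempBuffer {
     public:
      ScopedTempBuffer(const Device& device, size_t bytes)
          : device_(device), ptr_(device.allocate_temp(bytes)) {}
      ~ScopedTempBuffer() { device_.deallocate_temp(ptr_); }
      void* get() const { return ptr_; }
     private:
      ScopedTempBuffer(const ScopedTempBuffer&);  // non-copyable
      const Device& device_;
      void* const ptr_;
    };

    // Usage: Eigen::DefaultDevice dev;
    //        ScopedTempBuffer<Eigen::DefaultDevice> scratch(dev, 256 * sizeof(float));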
@@ -41,7 +41,7 @@ template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor>
 struct fixed_size_tensor_index_linearization_helper
 {
   template <typename Dimensions> EIGEN_DEVICE_FUNC
-  static inline Index run(array<Index, NumIndices> const& indices,
+  static EIGEN_STRONG_INLINE Index run(array<Index, NumIndices> const& indices,
                           const Dimensions& dimensions)
   {
     return array_get<RowMajor ? n - 1 : (NumIndices - n)>(indices) +
@@ -54,7 +54,7 @@ template<typename Index, std::size_t NumIndices, bool RowMajor>
 struct fixed_size_tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor>
 {
   template <typename Dimensions> EIGEN_DEVICE_FUNC
-  static inline Index run(array<Index, NumIndices> const&, const Dimensions&)
+  static EIGEN_STRONG_INLINE Index run(array<Index, NumIndices> const&, const Dimensions&)
   {
     return 0;
   }
@@ -64,7 +64,7 @@ template<typename Index, std::size_t n>
 struct fixed_size_tensor_index_extraction_helper
 {
   template <typename Dimensions> EIGEN_DEVICE_FUNC
-  static inline Index run(const Index index,
+  static EIGEN_STRONG_INLINE Index run(const Index index,
                           const Dimensions& dimensions)
   {
     const Index mult = (index == n-1) ? 1 : 0;
@@ -77,7 +77,7 @@ template<typename Index>
 struct fixed_size_tensor_index_extraction_helper<Index, 0>
 {
   template <typename Dimensions> EIGEN_DEVICE_FUNC
-  static inline Index run(const Index,
+  static EIGEN_STRONG_INLINE Index run(const Index,
                           const Dimensions&)
   {
     return 0;
@@ -421,20 +421,20 @@ template <std::size_t n, std::size_t V1, std::size_t V2, std::size_t V3, std::si
 template <typename Dims1, typename Dims2, size_t n, size_t m>
 struct sizes_match_below_dim {
-  static EIGEN_DEVICE_FUNC inline bool run(Dims1&, Dims2&) {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) {
     return false;
   }
 };
 template <typename Dims1, typename Dims2, size_t n>
 struct sizes_match_below_dim<Dims1, Dims2, n, n> {
-  static EIGEN_DEVICE_FUNC inline bool run(Dims1& dims1, Dims2& dims2) {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1& dims1, Dims2& dims2) {
     return (array_get<n-1>(dims1) == array_get<n-1>(dims2)) &
            sizes_match_below_dim<Dims1, Dims2, n-1, n-1>::run(dims1, dims2);
   }
 };
 template <typename Dims1, typename Dims2>
 struct sizes_match_below_dim<Dims1, Dims2, 0, 0> {
-  static EIGEN_DEVICE_FUNC inline bool run(Dims1&, Dims2&) {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) {
     return true;
   }
 };
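The inline -> EIGEN_STRONG_INLINE churn in this file (and in several files below) only strengthens the inlining hint; roughly, the macro is defined as follows in Eigen/src/Core/util/Macros.h (paraphrased, so treat the exact condition as an approximation):

    // A stronger hint than `inline`, but only where the compiler offers one;
    // elsewhere it degrades to plain `inline`.
    #if EIGEN_COMP_MSVC || EIGEN_COMP_ICC
    #define EIGEN_STRONG_INLINE __forceinline
    #else
    #define EIGEN_STRONG_INLINE inline
    #endif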
@@ -102,7 +102,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
   typedef typename XprType::Index Index;
   typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;

   enum {
     IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
@@ -33,6 +33,7 @@ struct TensorEvaluator
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef typename Derived::Dimensions Dimensions;
   typedef Derived XprType;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;

   // NumDimensions is -1 for variable dim tensors
   static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
@@ -40,7 +41,7 @@ struct TensorEvaluator
   enum {
     IsAligned = Derived::IsAligned,
-    PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
     BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
     Layout = Derived::Layout,
     CoordAccess = NumCoords > 0,
@@ -121,7 +122,7 @@ struct TensorEvaluator
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
-                        internal::unpacket_traits<PacketReturnType>::size);
+                        PacketType<CoeffReturnType, Device>::size);
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
@@ -188,10 +189,11 @@ struct TensorEvaluator<const Derived, Device>
   // NumDimensions is -1 for variable dim tensors
   static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
                                internal::traits<Derived>::NumDimensions : 0;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;

   enum {
     IsAligned = Derived::IsAligned,
-    PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
     BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
     Layout = Derived::Layout,
     CoordAccess = NumCoords > 0,
@@ -249,7 +251,7 @@ struct TensorEvaluator<const Derived, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
-                        internal::unpacket_traits<PacketReturnType>::size);
+                        PacketType<CoeffReturnType, Device>::size);
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
@@ -300,7 +302,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
   typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;

   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
@@ -322,7 +324,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
   costPerCoeff(bool vectorized) const {
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
-                        internal::unpacket_traits<PacketReturnType>::size);
+                        PacketType<CoeffReturnType, Device>::size);
   }

   EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
@@ -367,7 +369,7 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
   typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;

   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
@@ -445,7 +447,7 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
   typedef typename XprType::Scalar Scalar;
   typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
   typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions;

   static const int NumDims = internal::array_size<
@@ -574,7 +576,7 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
   typedef typename XprType::Scalar Scalar;
   typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
   typedef typename TensorEvaluator<Arg1Type, Device>::Dimensions Dimensions;

   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
@@ -644,7 +646,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
   enum {
     IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned & TensorEvaluator<ElseArgType, Device>::IsAligned,
     PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess &
-                   internal::packet_traits<Scalar>::HasBlend,
+                   PacketType<Scalar, Device>::HasBlend,
     BlockAccess = false,
     Layout = TensorEvaluator<IfArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
@@ -665,7 +667,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
   typedef typename XprType::Index Index;
   typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
   typedef typename TensorEvaluator<IfArgType, Device>::Dimensions Dimensions;

   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
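The repeated substitution of internal::unpacket_traits<PacketReturnType>::size with PacketType<CoeffReturnType, Device>::size in these hunks matters because the packet type, and therefore the packet width, can be chosen per device rather than fixed by the host's SIMD packet. A simplified, self-contained sketch of a device-keyed trait (all types here are hypothetical stand-ins, not Eigen internals):

    struct DefaultDeviceTag {};
    struct GpuDeviceTag {};
    struct Packet4f   { float v[4]; };  // stand-in for an SSE/NEON packet
    struct GpuPacket2f { float v[2]; }; // stand-in for a GPU-specific packet

    // Fallback: scalar "packet" of size 1.
    template <typename Scalar, typename Device>
    struct PacketTypeSketch { typedef Scalar type; static const int size = 1; };

    // The same scalar can map to different packets on different devices.
    template <> struct PacketTypeSketch<float, DefaultDeviceTag> {
      typedef Packet4f type; static const int size = 4;
    };
    template <> struct PacketTypeSketch<float, GpuDeviceTag> {
      typedef GpuPacket2f type; static const int size = 2;
    };

    static_assert(PacketTypeSketch<float, GpuDeviceTag>::size !=
                      PacketTypeSketch<float, DefaultDeviceTag>::size,
                  "packet width is a per-device property");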
@@ -39,7 +39,7 @@ class TensorExecutor {
   using StorageIndex = typename Expression::Index;

   EIGEN_DEVICE_FUNC
-  static inline void run(const Expression& expr,
+  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                          const Device& device = Device()) {
     TensorEvaluator<Expression, Device> evaluator(expr, device);
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
@@ -63,7 +63,7 @@ class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true,
   using StorageIndex = typename Expression::Index;

   EIGEN_DEVICE_FUNC
-  static inline void run(const Expression& expr,
+  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                          const DefaultDevice& device = DefaultDevice()) {
     TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
@@ -111,7 +111,7 @@ class TensorExecutor<Expression, DefaultDevice, Vectorizable,
   static const int NumDims = traits<Expression>::NumDimensions;

   EIGEN_DEVICE_FUNC
-  static inline void run(const Expression& expr,
+  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                          const DefaultDevice& device = DefaultDevice()) {
     using TensorBlock =
         TensorBlock<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout>;
@@ -223,7 +223,7 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> {
  public:
   using StorageIndex = typename Expression::Index;

-  static inline void run(const Expression& expr,
+  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                          const ThreadPoolDevice& device) {
     typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
     typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
@@ -257,7 +257,7 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr
   static const int NumDims = traits<Expression>::NumDimensions;

-  static inline void run(const Expression& expr,
+  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                          const ThreadPoolDevice& device) {
     using TensorBlock =
         TensorBlock<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout>;
@@ -376,7 +376,7 @@ EigenMetaKernel(Evaluator eval, StorageIndex size) {
 /*static*/
 template <typename Expression, bool Vectorizable, bool Tileable>
-inline void TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable>::run(
+EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable>::run(
     const Expression& expr, const GpuDevice& device) {
   TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
   const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
@@ -405,7 +405,7 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable>::run(
 template <typename Expression, bool Vectorizable>
 class TensorExecutor<Expression, SyclDevice, Vectorizable> {
  public:
-  static inline void run(const Expression &expr, const SyclDevice &device) {
+  static EIGEN_STRONG_INLINE void run(const Expression &expr, const SyclDevice &device) {
     // call TensorSYCL module
     TensorSycl::run(expr, device);
   }
@@ -93,11 +93,11 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
   typedef typename XprType::Index Index;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;

   enum {
     IsAligned = true,
-    PacketAccess = (PacketSize > 1),
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
     BlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     RawAccess = true
@@ -115,7 +115,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
 #endif
   EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
     const Index numValues = internal::array_prod(m_impl.dimensions());
-    m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType));
+    m_buffer = (CoeffReturnType*)m_device.allocate_temp(numValues * sizeof(CoeffReturnType));
     // Should initialize the memory in case we're dealing with non POD types.
     if (NumTraits<CoeffReturnType>::RequireInitialization) {
       for (Index i = 0; i < numValues; ++i) {
@@ -129,7 +129,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
     return true;
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
-    m_device.deallocate(m_buffer);
+    m_device.deallocate_temp(m_buffer);
     m_buffer = NULL;
   }
@@ -20,7 +20,7 @@ namespace internal {
 template <typename Scalar>
 struct scalar_mod_op {
   EIGEN_DEVICE_FUNC scalar_mod_op(const Scalar& divisor) : m_divisor(divisor) {}
-  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a % m_divisor; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a % m_divisor; }
   const Scalar m_divisor;
 };
 template <typename Scalar>
@@ -34,7 +34,7 @@ struct functor_traits<scalar_mod_op<Scalar> >
 template <typename Scalar>
 struct scalar_mod2_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op)
-  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; }
 };
 template <typename Scalar>
 struct functor_traits<scalar_mod2_op<Scalar> >
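These functors back the tensor modulo expressions; a quick hedged usage sketch (whether `%` routes through scalar_mod_op exactly as shown is an assumption, but unaryExpr with a custom functor is standard TensorBase API):

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>

    int main() {
      Eigen::Tensor<int, 1> t(4);
      t.setValues({3, 5, 7, 9});
      // Apply the modulo functor element-wise via unaryExpr.
      Eigen::Tensor<int, 1> r =
          t.unaryExpr(Eigen::internal::scalar_mod_op<int>(4));
      std::cout << r << std::endl;  // 3 1 3 1
    }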
@@ -90,7 +90,7 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   enum {
     IsAligned = false,
-    PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
     BlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
@@ -137,7 +137,7 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    const int packetSize = PacketType<CoeffReturnType, Device>::size;
     EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+packetSize-1 < dimensions().TotalSize());
@@ -241,7 +241,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
   typedef TensorEvaluator<ArgType, Device> Impl;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;

   enum {
     IsAligned = false,
@@ -75,10 +75,10 @@ template<DenseIndex n> struct NumTraits<type2index<n> >
     MulCost = 1
   };

-  EIGEN_DEVICE_FUNC static inline Real epsilon() { return 0; }
-  EIGEN_DEVICE_FUNC static inline Real dummy_precision() { return 0; }
-  EIGEN_DEVICE_FUNC static inline Real highest() { return n; }
-  EIGEN_DEVICE_FUNC static inline Real lowest() { return n; }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Real epsilon() { return 0; }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Real dummy_precision() { return 0; }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Real highest() { return n; }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Real lowest() { return n; }
 };

 namespace internal {
@@ -85,7 +85,7 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;

   enum {
     IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
@@ -150,6 +150,7 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
     EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
     {
       EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+      eigen_assert(internal::all((Eigen::NumTraits<Index>::highest() >= otherIndices)...));
       if (PlainObjectType::Options&RowMajor) {
         const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
         return m_data[index];
@@ -237,6 +238,7 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
     EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
     {
       static_assert(sizeof...(otherIndices) + 2 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
+      eigen_assert(internal::all((Eigen::NumTraits<Index>::highest() >= otherIndices)...));
       const std::size_t NumDims = sizeof...(otherIndices) + 2;
       if (PlainObjectType::Options&RowMajor) {
         const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumDims>{{firstIndex, secondIndex, otherIndices...}});
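The new eigen_assert guards expand a parameter pack of per-index range checks through a variadic all(). A self-contained C++11 sketch of such a helper (Eigen ships its own internal::all in its meta utilities; this stand-in only illustrates the mechanism):

    // True iff every argument is true; the pack recursion mirrors how
    // (NumTraits<Index>::highest() >= otherIndices)... is folded above.
    inline bool all_of() { return true; }

    template <typename... Bools>
    inline bool all_of(bool first, Bools... rest) {
      return first && all_of(rest...);
    }

    // e.g. all_of(i >= 0, j >= 0, k >= 0) checks a whole index pack at once.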
@@ -617,7 +617,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    const int packetSize = PacketType<CoeffReturnType, Device>::size;
     EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+packetSize-1 < internal::array_prod(dimensions()));
@@ -814,7 +814,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
       return;
     }

-    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    const int packetSize = PacketType<CoeffReturnType, Device>::size;
     Index inputIndices[] = {0, 0};
     Index indices[] = {index, index + packetSize - 1};
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -91,7 +91,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;

   enum {
     IsAligned = true,
@@ -88,7 +88,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;

   enum {
@@ -472,7 +472,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
   static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;
   typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;

   enum {
     IsAligned = false,
@@ -596,7 +596,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
                  !RunningOnGPU))) {
       bool need_assign = false;
       if (!data) {
-        m_result = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType)));
+        m_result = static_cast<CoeffReturnType*>(m_device.allocate_temp(sizeof(CoeffReturnType)));
         data = m_result;
         need_assign = true;
       }
@@ -608,7 +608,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
       const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
       const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
       if (!data) {
-        data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
+        data = static_cast<CoeffReturnType*>(m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
         m_result = data;
       }
       Op reducer(m_reducer);
@@ -632,7 +632,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
       const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
       if (!data) {
         if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) {
-          data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
+          data = static_cast<CoeffReturnType*>(m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
           m_result = data;
         }
         else {
@@ -642,7 +642,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
       Op reducer(m_reducer);
       if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
         if (m_result) {
-          m_device.deallocate(m_result);
+          m_device.deallocate_temp(m_result);
           m_result = NULL;
         }
         return true;
@@ -665,7 +665,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
       const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
       if (!data) {
         if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) {
-          data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
+          data = static_cast<CoeffReturnType*>(m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
          m_result = data;
         }
         else {
@@ -675,7 +675,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
       Op reducer(m_reducer);
       if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
         if (m_result) {
-          m_device.deallocate(m_result);
+          m_device.deallocate_temp(m_result);
          m_result = NULL;
         }
         return true;
@@ -690,7 +690,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
     if (m_result) {
-      m_device.deallocate(m_result);
+      m_device.deallocate_temp(m_result);
       m_result = NULL;
     }
   }
@@ -108,7 +108,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;

   enum {
     IsAligned = false,
@@ -266,7 +266,7 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   const Dimensions& dimensions() const { return this->m_dimensions; }
@@ -95,7 +95,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
   enum {
     IsAligned = false,
-    PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
     BlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,
@@ -108,11 +108,11 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;

   enum {
     IsAligned = false,
-    PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
     BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
@@ -405,11 +405,11 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;

   enum {
     IsAligned = false,
-    PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
     BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     RawAccess = false


@@ -107,7 +107,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
   enum {
     IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
@@ -287,7 +287,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
   {


@@ -194,7 +194,7 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
   typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
   enum {
     IsAligned = false,
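All of these evaluator hunks converge on one pattern: PacketSize and PacketAccess are now both derived from the device-aware PacketType trait rather than the host-only packet_traits/unpacket_traits, so a device specialization (for instance a GPU half-precision packet) cannot leave the two out of sync. A small probe, assuming PacketType lives in the Eigen namespace as in TensorMeta.h:

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>

    int main() {
      // On the host this matches packet_traits<float>; device specializations
      // may differ, which is exactly why the evaluators now read the size
      // from PacketType instead of unpacket_traits.
      std::cout << Eigen::PacketType<float, Eigen::DefaultDevice>::size << "\n";
      return 0;
    }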


@@ -104,9 +104,9 @@ template<> struct h_skip_helper_type<0>
 template<int n>
 struct h_skip {
   template<typename T, T... ii>
-  constexpr static inline typename h_skip_helper_numeric<T, n, ii...>::type helper(numeric_list<T, ii...>) { return typename h_skip_helper_numeric<T, n, ii...>::type(); }
+  constexpr static EIGEN_STRONG_INLINE typename h_skip_helper_numeric<T, n, ii...>::type helper(numeric_list<T, ii...>) { return typename h_skip_helper_numeric<T, n, ii...>::type(); }
   template<typename... tt>
-  constexpr static inline typename h_skip_helper_type<n, tt...>::type helper(type_list<tt...>) { return typename h_skip_helper_type<n, tt...>::type(); }
+  constexpr static EIGEN_STRONG_INLINE typename h_skip_helper_type<n, tt...>::type helper(type_list<tt...>) { return typename h_skip_helper_type<n, tt...>::type(); }
 };
 template<int n, typename a> struct skip { typedef decltype(h_skip<n>::helper(a())) type; };
@@ -268,7 +268,7 @@ template<
   typename Reducer
 > struct reduce<Reducer>
 {
-  EIGEN_DEVICE_FUNC constexpr static inline int run() { return Reducer::Identity; }
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE int run() { return Reducer::Identity; }
 };
 template<
@@ -276,7 +276,7 @@
   typename A
 > struct reduce<Reducer, A>
 {
-  EIGEN_DEVICE_FUNC constexpr static inline A run(A a) { return a; }
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE A run(A a) { return a; }
 };
 template<
@@ -285,7 +285,7 @@
   typename... Ts
 > struct reduce<Reducer, A, Ts...>
 {
-  EIGEN_DEVICE_FUNC constexpr static inline auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce<Reducer, Ts...>::run(ts...))) {
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce<Reducer, Ts...>::run(ts...))) {
     return Reducer::run(a, reduce<Reducer, Ts...>::run(ts...));
   }
 };
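reduce<Reducer, Ts...> is a compile-time fold: the one-argument specialization returns its argument, and each further argument is combined with Reducer::run, so the whole chain collapses into a single constexpr expression. A usage sketch, assuming these helpers live in Eigen::internal as in CXX11Meta.h:

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      using namespace Eigen::internal;
      // Folds 1, 2, 3 with sum_op::run at compile time.
      constexpr int s = reduce<sum_op, int, int, int>::run(1, 2, 3);
      static_assert(s == 6, "constexpr fold");
      return s == 6 ? 0 : 1;
    }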
@@ -293,29 +293,29 @@ template<
 /* generic binary operations */
 struct sum_op {
-  template<typename A, typename B> EIGEN_DEVICE_FUNC constexpr static inline auto run(A a, B b) -> decltype(a + b) { return a + b; }
+  template<typename A, typename B> EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a + b) { return a + b; }
   static constexpr int Identity = 0;
 };
 struct product_op {
-  template<typename A, typename B> EIGEN_DEVICE_FUNC constexpr static inline auto run(A a, B b) -> decltype(a * b) { return a * b; }
+  template<typename A, typename B> EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a * b) { return a * b; }
   static constexpr int Identity = 1;
 };
-struct logical_and_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a && b) { return a && b; } };
+struct logical_and_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a && b) { return a && b; } };
-struct logical_or_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a || b) { return a || b; } };
+struct logical_or_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a || b) { return a || b; } };
-struct equal_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a == b) { return a == b; } };
+struct equal_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a == b) { return a == b; } };
-struct not_equal_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a != b) { return a != b; } };
+struct not_equal_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a != b) { return a != b; } };
-struct lesser_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a < b) { return a < b; } };
+struct lesser_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a < b) { return a < b; } };
-struct lesser_equal_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a <= b) { return a <= b; } };
+struct lesser_equal_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a <= b) { return a <= b; } };
-struct greater_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a > b) { return a > b; } };
+struct greater_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a > b) { return a > b; } };
-struct greater_equal_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a >= b) { return a >= b; } };
+struct greater_equal_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a >= b) { return a >= b; } };
 /* generic unary operations */
-struct not_op { template<typename A> constexpr static inline auto run(A a) -> decltype(!a) { return !a; } };
+struct not_op { template<typename A> constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(!a) { return !a; } };
-struct negation_op { template<typename A> constexpr static inline auto run(A a) -> decltype(-a) { return -a; } };
+struct negation_op { template<typename A> constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(-a) { return -a; } };
-struct greater_equal_zero_op { template<typename A> constexpr static inline auto run(A a) -> decltype(a >= 0) { return a >= 0; } };
+struct greater_equal_zero_op { template<typename A> constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(a >= 0) { return a >= 0; } };
 /* reductions for lists */
@@ -324,13 +324,13 @@ struct greater_equal_zero_op { template<typename A> constexpr static inline auto
 // together in front... (13.0 doesn't work with array_prod/array_reduce/... anyway, but 13.1
 // does...
 template<typename... Ts>
-EIGEN_DEVICE_FUNC constexpr inline decltype(reduce<product_op, Ts...>::run((*((Ts*)0))...)) arg_prod(Ts... ts)
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE decltype(reduce<product_op, Ts...>::run((*((Ts*)0))...)) arg_prod(Ts... ts)
 {
   return reduce<product_op, Ts...>::run(ts...);
 }
 template<typename... Ts>
-constexpr inline decltype(reduce<sum_op, Ts...>::run((*((Ts*)0))...)) arg_sum(Ts... ts)
+constexpr EIGEN_STRONG_INLINE decltype(reduce<sum_op, Ts...>::run((*((Ts*)0))...)) arg_sum(Ts... ts)
 {
   return reduce<sum_op, Ts...>::run(ts...);
 }
@@ -338,13 +338,13 @@ constexpr inline decltype(reduce<sum_op, Ts...>::run((*((Ts*)0))...)) arg_sum(Ts
 /* reverse arrays */
 template<typename Array, int... n>
-constexpr inline Array h_array_reverse(Array arr, numeric_list<int, n...>)
+constexpr EIGEN_STRONG_INLINE Array h_array_reverse(Array arr, numeric_list<int, n...>)
 {
   return {{array_get<sizeof...(n) - n - 1>(arr)...}};
 }
 template<typename T, std::size_t N>
-constexpr inline array<T, N> array_reverse(array<T, N> arr)
+constexpr EIGEN_STRONG_INLINE array<T, N> array_reverse(array<T, N> arr)
 {
   return h_array_reverse(arr, typename gen_numeric_list<int, N>::type());
 }
@@ -359,7 +359,7 @@ constexpr inline array<T, N> array_reverse(array<T, N> arr)
 // an infinite loop)
 template<typename Reducer, typename T, std::size_t N, std::size_t n = N - 1>
 struct h_array_reduce {
-  EIGEN_DEVICE_FUNC constexpr static inline auto run(array<T, N> arr, T identity) -> decltype(Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr, identity), array_get<n>(arr)))
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(array<T, N> arr, T identity) -> decltype(Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr, identity), array_get<n>(arr)))
   {
     return Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr, identity), array_get<n>(arr));
   }
@@ -368,7 +368,7 @@ struct h_array_reduce {
 template<typename Reducer, typename T, std::size_t N>
 struct h_array_reduce<Reducer, T, N, 0>
 {
-  EIGEN_DEVICE_FUNC constexpr static inline T run(const array<T, N>& arr, T)
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE T run(const array<T, N>& arr, T)
   {
     return array_get<0>(arr);
   }
@@ -377,14 +377,14 @@ struct h_array_reduce<Reducer, T, N, 0>
 template<typename Reducer, typename T>
 struct h_array_reduce<Reducer, T, 0>
 {
-  EIGEN_DEVICE_FUNC constexpr static inline T run(const array<T, 0>&, T identity)
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE T run(const array<T, 0>&, T identity)
   {
     return identity;
   }
 };
 template<typename Reducer, typename T, std::size_t N>
-EIGEN_DEVICE_FUNC constexpr inline auto array_reduce(const array<T, N>& arr, T identity) -> decltype(h_array_reduce<Reducer, T, N>::run(arr, identity))
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_reduce(const array<T, N>& arr, T identity) -> decltype(h_array_reduce<Reducer, T, N>::run(arr, identity))
 {
   return h_array_reduce<Reducer, T, N>::run(arr, identity);
 }
@@ -392,13 +392,13 @@ EIGEN_DEVICE_FUNC constexpr inline auto array_reduce(const array<T, N>& arr, T i
 /* standard array reductions */
 template<typename T, std::size_t N>
-EIGEN_DEVICE_FUNC constexpr inline auto array_sum(const array<T, N>& arr) -> decltype(array_reduce<sum_op, T, N>(arr, static_cast<T>(0)))
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_sum(const array<T, N>& arr) -> decltype(array_reduce<sum_op, T, N>(arr, static_cast<T>(0)))
 {
   return array_reduce<sum_op, T, N>(arr, static_cast<T>(0));
 }
 template<typename T, std::size_t N>
-EIGEN_DEVICE_FUNC constexpr inline auto array_prod(const array<T, N>& arr) -> decltype(array_reduce<product_op, T, N>(arr, static_cast<T>(1)))
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_prod(const array<T, N>& arr) -> decltype(array_reduce<product_op, T, N>(arr, static_cast<T>(1)))
 {
   return array_reduce<product_op, T, N>(arr, static_cast<T>(1));
 }
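array_sum and array_prod are the two stock instantiations of array_reduce; array_prod in particular is what the Tensor module uses to turn a dimension array into an element count. A usage sketch, under the same Eigen::internal assumption as above:

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>

    int main() {
      Eigen::array<int, 3> dims = {{2, 3, 4}};
      std::cout << Eigen::internal::array_prod(dims) << "\n";  // 24 elements
      std::cout << Eigen::internal::array_sum(dims) << "\n";   // 9
      return 0;
    }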
@@ -414,13 +414,13 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector<t>& a) {
 /* zip an array */
 template<typename Op, typename A, typename B, std::size_t N, int... n>
-constexpr inline array<decltype(Op::run(A(), B())),N> h_array_zip(array<A, N> a, array<B, N> b, numeric_list<int, n...>)
+constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A(), B())),N> h_array_zip(array<A, N> a, array<B, N> b, numeric_list<int, n...>)
 {
   return array<decltype(Op::run(A(), B())),N>{{ Op::run(array_get<n>(a), array_get<n>(b))... }};
 }
 template<typename Op, typename A, typename B, std::size_t N>
-constexpr inline array<decltype(Op::run(A(), B())),N> array_zip(array<A, N> a, array<B, N> b)
+constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A(), B())),N> array_zip(array<A, N> a, array<B, N> b)
 {
   return h_array_zip<Op>(a, b, typename gen_numeric_list<int, N>::type());
 }
@@ -428,13 +428,13 @@ constexpr inline array<decltype(Op::run(A(), B())),N> array_zip(array<A, N> a, a
 /* zip an array and reduce the result */
 template<typename Reducer, typename Op, typename A, typename B, std::size_t N, int... n>
-constexpr inline auto h_array_zip_and_reduce(array<A, N> a, array<B, N> b, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...))
+constexpr EIGEN_STRONG_INLINE auto h_array_zip_and_reduce(array<A, N> a, array<B, N> b, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...))
 {
   return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...);
 }
 template<typename Reducer, typename Op, typename A, typename B, std::size_t N>
-constexpr inline auto array_zip_and_reduce(array<A, N> a, array<B, N> b) -> decltype(h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type()))
+constexpr EIGEN_STRONG_INLINE auto array_zip_and_reduce(array<A, N> a, array<B, N> b) -> decltype(h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type()))
 {
   return h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type());
 }
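array_zip_and_reduce composes the two previous primitives: zip the arrays element-wise with Op, then fold the results with Reducer. With product_op as the zipper and sum_op as the reducer this is a compile-time dot product, sketched here under the same namespace assumption:

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>

    int main() {
      using namespace Eigen::internal;
      Eigen::array<int, 3> a = {{1, 2, 3}};
      Eigen::array<int, 3> b = {{4, 5, 6}};
      // 1*4 + 2*5 + 3*6 = 32
      std::cout << array_zip_and_reduce<sum_op, product_op>(a, b) << "\n";
      return 0;
    }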
@@ -442,13 +442,13 @@ constexpr inline auto array_zip_and_reduce(array<A, N> a, array<B, N> b) -> decl
 /* apply stuff to an array */
 template<typename Op, typename A, std::size_t N, int... n>
-constexpr inline array<decltype(Op::run(A())),N> h_array_apply(array<A, N> a, numeric_list<int, n...>)
+constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A())),N> h_array_apply(array<A, N> a, numeric_list<int, n...>)
 {
   return array<decltype(Op::run(A())),N>{{ Op::run(array_get<n>(a))... }};
 }
 template<typename Op, typename A, std::size_t N>
-constexpr inline array<decltype(Op::run(A())),N> array_apply(array<A, N> a)
+constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A())),N> array_apply(array<A, N> a)
 {
   return h_array_apply<Op>(a, typename gen_numeric_list<int, N>::type());
 }
@@ -456,13 +456,13 @@ constexpr inline array<decltype(Op::run(A())),N> array_apply(array<A, N> a)
 /* apply stuff to an array and reduce */
 template<typename Reducer, typename Op, typename A, std::size_t N, int... n>
-constexpr inline auto h_array_apply_and_reduce(array<A, N> arr, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...))
+constexpr EIGEN_STRONG_INLINE auto h_array_apply_and_reduce(array<A, N> arr, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...))
 {
   return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...);
 }
 template<typename Reducer, typename Op, typename A, std::size_t N>
-constexpr inline auto array_apply_and_reduce(array<A, N> a) -> decltype(h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type()))
+constexpr EIGEN_STRONG_INLINE auto array_apply_and_reduce(array<A, N> a) -> decltype(h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type()))
 {
   return h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type());
 }
@@ -476,7 +476,7 @@ template<int n>
 struct h_repeat
 {
   template<typename t, int... ii>
-  constexpr static inline array<t, n> run(t v, numeric_list<int, ii...>)
+  constexpr static EIGEN_STRONG_INLINE array<t, n> run(t v, numeric_list<int, ii...>)
   {
     return {{ typename id_numeric<int, ii, t>::type(v)... }};
   }


@@ -395,7 +395,6 @@ void matrix_exp_compute(const ArgType& arg, ResultType &result, false_type) // d
 template<typename Derived> struct MatrixExponentialReturnValue
 : public ReturnByValue<MatrixExponentialReturnValue<Derived> >
 {
-    typedef typename Derived::Index Index;
   public:
     /** \brief Constructor.
      *


@@ -53,7 +53,7 @@ template <typename MatrixType>
 typename NumTraits<typename MatrixType::Scalar>::Real matrix_function_compute_mu(const MatrixType& A)
 {
   typedef typename plain_col_type<MatrixType>::type VectorType;
-  typename MatrixType::Index rows = A.rows();
+  Index rows = A.rows();
   const MatrixType N = MatrixType::Identity(rows, rows) - A;
   VectorType e = VectorType::Ones(rows);
   N.template triangularView<Upper>().solveInPlace(e);
@@ -65,7 +65,6 @@ MatrixType MatrixFunctionAtomic<MatrixType>::compute(const MatrixType& A)
 {
   // TODO: Use that A is upper triangular
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef typename MatrixType::Index Index;
   Index rows = A.rows();
   Scalar avgEival = A.trace() / Scalar(RealScalar(rows));
   MatrixType Ashifted = A - avgEival * MatrixType::Identity(rows, rows);
@@ -131,7 +130,6 @@ typename ListOfClusters::iterator matrix_function_find_cluster(Index key, ListOf
 template <typename EivalsType, typename Cluster>
 void matrix_function_partition_eigenvalues(const EivalsType& eivals, std::list<Cluster>& clusters)
 {
-  typedef typename EivalsType::Index Index;
   typedef typename EivalsType::RealScalar RealScalar;
   for (Index i=0; i<eivals.rows(); ++i) {
     // Find cluster containing i-th ei'val, adding a new cluster if necessary
@@ -179,7 +177,7 @@ void matrix_function_compute_block_start(const VectorType& clusterSize, VectorTy
 {
   blockStart.resize(clusterSize.rows());
   blockStart(0) = 0;
-  for (typename VectorType::Index i = 1; i < clusterSize.rows(); i++) {
+  for (Index i = 1; i < clusterSize.rows(); i++) {
     blockStart(i) = blockStart(i-1) + clusterSize(i-1);
   }
 }
@@ -188,7 +186,6 @@ void matrix_function_compute_block_start(const VectorType& clusterSize, VectorTy
 template <typename EivalsType, typename ListOfClusters, typename VectorType>
 void matrix_function_compute_map(const EivalsType& eivals, const ListOfClusters& clusters, VectorType& eivalToCluster)
 {
-  typedef typename EivalsType::Index Index;
   eivalToCluster.resize(eivals.rows());
   Index clusterIndex = 0;
   for (typename ListOfClusters::const_iterator cluster = clusters.begin(); cluster != clusters.end(); ++cluster) {
@@ -205,7 +202,6 @@ void matrix_function_compute_map(const EivalsType& eivals, const ListOfClusters&
 template <typename DynVectorType, typename VectorType>
 void matrix_function_compute_permutation(const DynVectorType& blockStart, const DynVectorType& eivalToCluster, VectorType& permutation)
 {
-  typedef typename VectorType::Index Index;
   DynVectorType indexNextEntry = blockStart;
   permutation.resize(eivalToCluster.rows());
   for (Index i = 0; i < eivalToCluster.rows(); i++) {
@@ -219,7 +215,6 @@ void matrix_function_compute_permutation(const DynVectorType& blockStart, const
 template <typename VectorType, typename MatrixType>
 void matrix_function_permute_schur(VectorType& permutation, MatrixType& U, MatrixType& T)
 {
-  typedef typename VectorType::Index Index;
   for (Index i = 0; i < permutation.rows() - 1; i++) {
     Index j;
     for (j = i; j < permutation.rows(); j++) {
@@ -247,7 +242,7 @@ template <typename MatrixType, typename AtomicType, typename VectorType>
 void matrix_function_compute_block_atomic(const MatrixType& T, AtomicType& atomic, const VectorType& blockStart, const VectorType& clusterSize, MatrixType& fT)
 {
   fT.setZero(T.rows(), T.cols());
-  for (typename VectorType::Index i = 0; i < clusterSize.rows(); ++i) {
+  for (Index i = 0; i < clusterSize.rows(); ++i) {
     fT.block(blockStart(i), blockStart(i), clusterSize(i), clusterSize(i))
       = atomic.compute(T.block(blockStart(i), blockStart(i), clusterSize(i), clusterSize(i)));
   }
@@ -285,7 +280,6 @@ MatrixType matrix_function_solve_triangular_sylvester(const MatrixType& A, const
   eigen_assert(C.rows() == A.rows());
   eigen_assert(C.cols() == B.rows());
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
   Index m = A.rows();
@@ -330,11 +324,8 @@ void matrix_function_compute_above_diagonal(const MatrixType& T, const VectorTyp
 {
   typedef internal::traits<MatrixType> Traits;
   typedef typename MatrixType::Scalar Scalar;
-  typedef typename MatrixType::Index Index;
-  static const int RowsAtCompileTime = Traits::RowsAtCompileTime;
-  static const int ColsAtCompileTime = Traits::ColsAtCompileTime;
   static const int Options = MatrixType::Options;
-  typedef Matrix<Scalar, Dynamic, Dynamic, Options, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType;
+  typedef Matrix<Scalar, Dynamic, Dynamic, Options, Traits::RowsAtCompileTime, Traits::ColsAtCompileTime> DynMatrixType;
   for (Index k = 1; k < clusterSize.rows(); k++) {
     for (Index i = 0; i < clusterSize.rows() - k; i++) {
@@ -481,7 +472,6 @@ template<typename Derived> class MatrixFunctionReturnValue
 {
   public:
     typedef typename Derived::Scalar Scalar;
-    typedef typename Derived::Index Index;
    typedef typename internal::stem_function<Scalar>::type StemFunction;
   protected:
@@ -506,10 +496,8 @@ template<typename Derived> class MatrixFunctionReturnValue
     typedef typename internal::nested_eval<Derived, 10>::type NestedEvalType;
     typedef typename internal::remove_all<NestedEvalType>::type NestedEvalTypeClean;
     typedef internal::traits<NestedEvalTypeClean> Traits;
-    static const int RowsAtCompileTime = Traits::RowsAtCompileTime;
-    static const int ColsAtCompileTime = Traits::ColsAtCompileTime;
     typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
-    typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType;
+    typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, Traits::RowsAtCompileTime, Traits::ColsAtCompileTime> DynMatrixType;
     typedef internal::MatrixFunctionAtomic<DynMatrixType> AtomicType;
     AtomicType atomic(m_f);
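MatrixFunctionReturnValue is what MatrixBase::matrixFunction() hands back; the stem function it wraps takes a complex scalar and a derivative order. A hedged usage sketch of that entry point:

    #include <unsupported/Eigen/MatrixFunctions>
    #include <complex>
    #include <iostream>

    // Stem function: value of the n-th derivative of f at x.
    // exp is its own derivative, so the order argument is ignored.
    std::complex<double> expfn(std::complex<double> x, int) {
      return std::exp(x);
    }

    int main() {
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(3, 3);
      std::cout << A.matrixFunction(expfn) << "\n";  // f(A) via Schur-Parlett
      return 0;
    }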


@@ -332,10 +332,8 @@ public:
     typedef typename internal::nested_eval<Derived, 10>::type DerivedEvalType;
     typedef typename internal::remove_all<DerivedEvalType>::type DerivedEvalTypeClean;
     typedef internal::traits<DerivedEvalTypeClean> Traits;
-    static const int RowsAtCompileTime = Traits::RowsAtCompileTime;
-    static const int ColsAtCompileTime = Traits::ColsAtCompileTime;
     typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
-    typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType;
+    typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, Traits::RowsAtCompileTime, Traits::ColsAtCompileTime> DynMatrixType;
     typedef internal::MatrixLogarithmAtomic<DynMatrixType> AtomicType;
     AtomicType atomic;


@@ -40,7 +40,6 @@ class MatrixPowerParenthesesReturnValue : public ReturnByValue< MatrixPowerParen
 {
   public:
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
     /**
      * \brief Constructor.
@@ -94,7 +93,6 @@ class MatrixPowerAtomic : internal::noncopyable
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
     typedef std::complex<RealScalar> ComplexScalar;
-    typedef typename MatrixType::Index Index;
     typedef Block<MatrixType,Dynamic,Dynamic> ResultType;
     const MatrixType& m_A;
@@ -340,7 +338,6 @@ class MatrixPower : internal::noncopyable
   private:
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
   public:
     /**
@@ -600,7 +597,6 @@ class MatrixPowerReturnValue : public ReturnByValue< MatrixPowerReturnValue<Deri
   public:
     typedef typename Derived::PlainObject PlainObject;
     typedef typename Derived::RealScalar RealScalar;
-    typedef typename Derived::Index Index;
     /**
      * \brief Constructor.
@@ -648,7 +644,6 @@ class MatrixComplexPowerReturnValue : public ReturnByValue< MatrixComplexPowerRe
   public:
     typedef typename Derived::PlainObject PlainObject;
     typedef typename std::complex<typename Derived::RealScalar> ComplexScalar;
-    typedef typename Derived::Index Index;
     /**
      * \brief Constructor.
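All five MatrixPower classes now rely on the global Eigen::Index instead of a per-matrix Index typedef; the public entry point is unchanged. Usage sketch:

    #include <unsupported/Eigen/MatrixFunctions>
    #include <iostream>

    int main() {
      // A^p for real p goes through MatrixPowerReturnValue / MatrixPower.
      Eigen::Matrix2d A;
      A << 4, 0,
           0, 9;
      std::cout << A.pow(0.5) << "\n";  // principal square root: diag(2, 3)
      return 0;
    }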


@@ -17,7 +17,7 @@ namespace internal {
 // pre:  T.block(i,i,2,2) has complex conjugate eigenvalues
 // post: sqrtT.block(i,i,2,2) is square root of T.block(i,i,2,2)
 template <typename MatrixType, typename ResultType>
-void matrix_sqrt_quasi_triangular_2x2_diagonal_block(const MatrixType& T, typename MatrixType::Index i, ResultType& sqrtT)
+void matrix_sqrt_quasi_triangular_2x2_diagonal_block(const MatrixType& T, Index i, ResultType& sqrtT)
 {
   // TODO: This case (2-by-2 blocks with complex conjugate eigenvalues) is probably hidden somewhere
   // in EigenSolver. If we expose it, we could call it directly from here.
@@ -32,7 +32,7 @@ void matrix_sqrt_quasi_triangular_2x2_diagonal_block(const MatrixType& T, typena
 //       all blocks of sqrtT to left of and below (i,j) are correct
 // post: sqrtT(i,j) has the correct value
 template <typename MatrixType, typename ResultType>
-void matrix_sqrt_quasi_triangular_1x1_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT)
+void matrix_sqrt_quasi_triangular_1x1_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT)
 {
   typedef typename traits<MatrixType>::Scalar Scalar;
   Scalar tmp = (sqrtT.row(i).segment(i+1,j-i-1) * sqrtT.col(j).segment(i+1,j-i-1)).value();
@@ -41,7 +41,7 @@ void matrix_sqrt_quasi_triangular_1x1_off_diagonal_block(const MatrixType& T, ty
 // similar to compute1x1offDiagonalBlock()
 template <typename MatrixType, typename ResultType>
-void matrix_sqrt_quasi_triangular_1x2_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT)
+void matrix_sqrt_quasi_triangular_1x2_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT)
 {
   typedef typename traits<MatrixType>::Scalar Scalar;
   Matrix<Scalar,1,2> rhs = T.template block<1,2>(i,j);
@@ -54,7 +54,7 @@ void matrix_sqrt_quasi_triangular_1x2_off_diagonal_block(const MatrixType& T, ty
 // similar to compute1x1offDiagonalBlock()
 template <typename MatrixType, typename ResultType>
-void matrix_sqrt_quasi_triangular_2x1_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT)
+void matrix_sqrt_quasi_triangular_2x1_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT)
 {
   typedef typename traits<MatrixType>::Scalar Scalar;
   Matrix<Scalar,2,1> rhs = T.template block<2,1>(i,j);
@@ -101,7 +101,7 @@ void matrix_sqrt_quasi_triangular_solve_auxiliary_equation(MatrixType& X, const
 // similar to compute1x1offDiagonalBlock()
 template <typename MatrixType, typename ResultType>
-void matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT)
+void matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT)
 {
   typedef typename traits<MatrixType>::Scalar Scalar;
   Matrix<Scalar,2,2> A = sqrtT.template block<2,2>(i,i);
@@ -120,7 +120,6 @@ template <typename MatrixType, typename ResultType>
 void matrix_sqrt_quasi_triangular_diagonal(const MatrixType& T, ResultType& sqrtT)
 {
   using std::sqrt;
-  typedef typename MatrixType::Index Index;
   const Index size = T.rows();
   for (Index i = 0; i < size; i++) {
     if (i == size - 1 || T.coeff(i+1, i) == 0) {
@@ -139,7 +138,6 @@ void matrix_sqrt_quasi_triangular_diagonal(const MatrixType& T, ResultType& sqrt
 template <typename MatrixType, typename ResultType>
 void matrix_sqrt_quasi_triangular_off_diagonal(const MatrixType& T, ResultType& sqrtT)
 {
-  typedef typename MatrixType::Index Index;
   const Index size = T.rows();
   for (Index j = 1; j < size; j++) {
     if (T.coeff(j, j-1) != 0)  // if T(j-1:j, j-1:j) is a 2-by-2 block
@@ -206,8 +204,7 @@ template <typename MatrixType, typename ResultType>
 void matrix_sqrt_triangular(const MatrixType &arg, ResultType &result)
 {
   using std::sqrt;
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
   eigen_assert(arg.rows() == arg.cols());
@@ -318,7 +315,6 @@ template<typename Derived> class MatrixSquareRootReturnValue
 : public ReturnByValue<MatrixSquareRootReturnValue<Derived> >
 {
   protected:
-    typedef typename Derived::Index Index;
     typedef typename internal::ref_selector<Derived>::type DerivedNested;
   public:
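Same cleanup on the square-root side; the user-facing call is MatrixBase::sqrt(), which dispatches to the triangular routines above after a Schur decomposition. Usage sketch:

    #include <unsupported/Eigen/MatrixFunctions>
    #include <iostream>

    int main() {
      Eigen::Matrix2d A;
      A << 2, 1,
           0, 2;
      Eigen::Matrix2d S = A.sqrt();             // principal matrix square root
      std::cout << (S * S - A).norm() << "\n";  // ~0 up to rounding
      return 0;
    }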


@@ -89,13 +89,13 @@ class companion
 {
   const Index deg   = m_monic.size();
   const Index deg_1 = deg-1;
-  DenseCompanionMatrixType companion(deg,deg);
-  companion <<
+  DenseCompanionMatrixType companMat(deg,deg);
+  companMat <<
     ( LeftBlock(deg,deg_1)
       << LeftBlockFirstRow::Zero(1,deg_1),
       BottomLeftBlock::Identity(deg-1,deg-1)*m_bl_diag.asDiagonal() ).finished()
     , m_monic;
-  return companion;
+  return companMat;
 }
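The rename avoids shadowing the class name companion with a local variable. The matrix it builds is the companion matrix whose eigenvalues are the polynomial's roots; the PolynomialSolver front end wraps this. Hedged usage sketch:

    #include <unsupported/Eigen/Polynomials>
    #include <iostream>

    int main() {
      // Coefficients in ascending order: 2 - 3x + x^2 = (x - 1)(x - 2).
      Eigen::Vector3d coeffs(2.0, -3.0, 1.0);
      Eigen::PolynomialSolver<double, 2> solver(coeffs);
      std::cout << solver.roots().transpose() << "\n";  // roots 1 and 2
      return 0;
    }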


@@ -24,7 +24,7 @@ namespace Eigen {
   * \sa Eigen::igammac(), Eigen::lgamma()
   */
 template<typename Derived,typename ExponentDerived>
-inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
+EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
 igamma(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x)
 {
   return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>(
@@ -47,7 +47,7 @@ igamma(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerive
   * \sa Eigen::igamma(), Eigen::lgamma()
   */
 template <typename Derived, typename ExponentDerived>
-inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_der_a_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
+EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_der_a_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
 igamma_der_a(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x) {
   return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_der_a_op<typename Derived::Scalar>, const Derived, const ExponentDerived>(
     a.derived(),
@@ -68,7 +68,7 @@ igamma_der_a(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<Exponent
   * \sa Eigen::igamma(), Eigen::lgamma()
   */
 template <typename AlphaDerived, typename SampleDerived>
-inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_gamma_sample_der_alpha_op<typename AlphaDerived::Scalar>, const AlphaDerived, const SampleDerived>
+EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_gamma_sample_der_alpha_op<typename AlphaDerived::Scalar>, const AlphaDerived, const SampleDerived>
 gamma_sample_der_alpha(const Eigen::ArrayBase<AlphaDerived>& alpha, const Eigen::ArrayBase<SampleDerived>& sample) {
   return Eigen::CwiseBinaryOp<Eigen::internal::scalar_gamma_sample_der_alpha_op<typename AlphaDerived::Scalar>, const AlphaDerived, const SampleDerived>(
     alpha.derived(),
@@ -86,7 +86,7 @@ gamma_sample_der_alpha(const Eigen::ArrayBase<AlphaDerived>& alpha, const Eigen:
   * \sa Eigen::igamma(), Eigen::lgamma()
   */
 template<typename Derived,typename ExponentDerived>
-inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
+EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
 igammac(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x)
 {
   return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>(
@@ -108,7 +108,7 @@ igammac(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDeriv
 // * \warning Be careful with the order of the parameters: x.polygamma(n) is equivalent to polygamma(n,x)
 // * \sa ArrayBase::polygamma()
 template<typename DerivedN,typename DerivedX>
-inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>, const DerivedN, const DerivedX>
+EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>, const DerivedN, const DerivedX>
 polygamma(const Eigen::ArrayBase<DerivedN>& n, const Eigen::ArrayBase<DerivedX>& x)
 {
   return Eigen::CwiseBinaryOp<Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>, const DerivedN, const DerivedX>(
@@ -128,7 +128,7 @@ polygamma(const Eigen::ArrayBase<DerivedN>& n, const Eigen::ArrayBase<DerivedX>&
   * \sa Eigen::betainc(), Eigen::lgamma()
   */
 template<typename ArgADerived, typename ArgBDerived, typename ArgXDerived>
-inline const Eigen::CwiseTernaryOp<Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>, const ArgADerived, const ArgBDerived, const ArgXDerived>
+EIGEN_STRONG_INLINE const Eigen::CwiseTernaryOp<Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>, const ArgADerived, const ArgBDerived, const ArgXDerived>
 betainc(const Eigen::ArrayBase<ArgADerived>& a, const Eigen::ArrayBase<ArgBDerived>& b, const Eigen::ArrayBase<ArgXDerived>& x)
 {
   return Eigen::CwiseTernaryOp<Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>, const ArgADerived, const ArgBDerived, const ArgXDerived>(
@@ -152,7 +152,7 @@ betainc(const Eigen::ArrayBase<ArgADerived>& a, const Eigen::ArrayBase<ArgBDeriv
   * \sa ArrayBase::zeta()
   */
 template<typename DerivedX,typename DerivedQ>
-inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>, const DerivedX, const DerivedQ>
+EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>, const DerivedX, const DerivedQ>
 zeta(const Eigen::ArrayBase<DerivedX>& x, const Eigen::ArrayBase<DerivedQ>& q)
 {
   return Eigen::CwiseBinaryOp<Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>, const DerivedX, const DerivedQ>(
@@ -176,7 +176,7 @@ zeta(const Eigen::ArrayBase<DerivedX>& x, const Eigen::ArrayBase<DerivedQ>& q)
   * \sa ArrayBase::i0e()
   */
 template <typename Derived>
-inline const Eigen::CwiseUnaryOp<
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
     Eigen::internal::scalar_i0e_op<typename Derived::Scalar>, const Derived>
 i0e(const Eigen::ArrayBase<Derived>& x) {
   return Eigen::CwiseUnaryOp<
@@ -199,7 +199,7 @@ i0e(const Eigen::ArrayBase<Derived>& x) {
   * \sa ArrayBase::i1e()
   */
 template <typename Derived>
-inline const Eigen::CwiseUnaryOp<
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
     Eigen::internal::scalar_i1e_op<typename Derived::Scalar>, const Derived>
 i1e(const Eigen::ArrayBase<Derived>& x) {
   return Eigen::CwiseUnaryOp<
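Each of these free functions simply wraps its scalar functor in a coefficient-wise expression over the input arrays. Usage sketch for the incomplete-gamma pair:

    #include <unsupported/Eigen/SpecialFunctions>
    #include <iostream>

    int main() {
      Eigen::ArrayXd a = Eigen::ArrayXd::LinSpaced(3, 1.0, 3.0);
      Eigen::ArrayXd x = Eigen::ArrayXd::Constant(3, 2.0);
      // Regularized lower incomplete gamma and its complement; the two
      // results sum to 1 element-wise.
      std::cout << Eigen::igamma(a, x).transpose() << "\n";
      std::cout << Eigen::igammac(a, x).transpose() << "\n";
      return 0;
    }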


@@ -155,11 +155,11 @@ struct functor_traits<scalar_betainc_op<Scalar> > {
  */
 template<typename Scalar> struct scalar_lgamma_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_lgamma_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const {
     using numext::lgamma; return lgamma(a);
   }
   typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plgamma(a); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::plgamma(a); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_lgamma_op<Scalar> >
@@ -177,11 +177,11 @@ struct functor_traits<scalar_lgamma_op<Scalar> >
  */
 template<typename Scalar> struct scalar_digamma_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_digamma_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const {
     using numext::digamma; return digamma(a);
   }
   typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pdigamma(a); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::pdigamma(a); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_digamma_op<Scalar> >
@@ -199,11 +199,11 @@ struct functor_traits<scalar_digamma_op<Scalar> >
  */
 template<typename Scalar> struct scalar_zeta_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_zeta_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& x, const Scalar& q) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& x, const Scalar& q) const {
     using numext::zeta; return zeta(x, q);
   }
   typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x, const Packet& q) const { return internal::pzeta(x, q); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x, const Packet& q) const { return internal::pzeta(x, q); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_zeta_op<Scalar> >
@@ -221,11 +221,11 @@ struct functor_traits<scalar_zeta_op<Scalar> >
  */
 template<typename Scalar> struct scalar_polygamma_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_polygamma_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& n, const Scalar& x) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& n, const Scalar& x) const {
     using numext::polygamma; return polygamma(n, x);
   }
   typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& n, const Packet& x) const { return internal::ppolygamma(n, x); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& n, const Packet& x) const { return internal::ppolygamma(n, x); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_polygamma_op<Scalar> >
@@ -244,11 +244,11 @@ struct functor_traits<scalar_polygamma_op<Scalar> >
  */
 template<typename Scalar> struct scalar_erf_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_erf_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const {
     using numext::erf; return erf(a);
   }
   typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::perf(a); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::perf(a); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_erf_op<Scalar> >
@@ -267,11 +267,11 @@ struct functor_traits<scalar_erf_op<Scalar> >
  */
 template<typename Scalar> struct scalar_erfc_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_erfc_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const {
     using numext::erfc; return erfc(a);
   }
   typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::perfc(a); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::perfc(a); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_erfc_op<Scalar> >
@@ -291,12 +291,12 @@ struct functor_traits<scalar_erfc_op<Scalar> >
 template <typename Scalar>
 struct scalar_i0e_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_i0e_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& x) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
     using numext::i0e;
     return i0e(x);
   }
   typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
     return internal::pi0e(x);
   }
 };
@@ -318,12 +318,12 @@ struct functor_traits<scalar_i0e_op<Scalar> > {
 template <typename Scalar>
 struct scalar_i1e_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_i1e_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& x) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
     using numext::i1e;
     return i1e(x);
  }
   typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
     return internal::pi1e(x);
   }
 };
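The same operator()/packetOp shape works for user-defined functors, and only functors whose traits declare PacketAccess actually take the vectorized path. A hedged sketch with a made-up scalar_cube_op (not part of Eigen):

    #include <Eigen/Core>
    #include <iostream>

    template <typename Scalar>
    struct scalar_cube_op {
      EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar
      operator()(const Scalar& a) const { return a * a * a; }
      // Optional vectorized path, mirroring the packetOp methods above.
      template <typename Packet>
      EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const {
        return Eigen::internal::pmul(a, Eigen::internal::pmul(a, a));
      }
    };

    int main() {
      Eigen::ArrayXf v = Eigen::ArrayXf::LinSpaced(4, 1.f, 4.f);
      std::cout << v.unaryExpr(scalar_cube_op<float>()).transpose() << "\n";
      return 0;
    }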


@@ -61,14 +61,14 @@ template <typename T>
 void RandomFill(std::vector<T> & vec)
 {
   for (size_t k=0;k<vec.size();++k)
-    vec[k] = T( rand() )/T(RAND_MAX) - .5;
+    vec[k] = T( rand() )/T(RAND_MAX) - T(.5);
 }
 template <typename T>
 void RandomFill(std::vector<std::complex<T> > & vec)
 {
   for (size_t k=0;k<vec.size();++k)
-    vec[k] = std::complex<T> ( T( rand() )/T(RAND_MAX) - .5, T( rand() )/T(RAND_MAX) - .5);
+    vec[k] = std::complex<T> ( T( rand() )/T(RAND_MAX) - T(.5), T( rand() )/T(RAND_MAX) - T(.5));
 }
 template <typename T_time,typename T_freq>
@@ -85,7 +85,7 @@ void fwd_inv(size_t nfft)
   vector<T_time> timebuf2;
   fft.inv(timebuf2,freqbuf);
-  long double rmse = mag2(timebuf - timebuf2) / mag2(timebuf);
+  T_time rmse = mag2(timebuf - timebuf2) / mag2(timebuf);
   cout << "roundtrip rmse: " << rmse << endl;
 }
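The rmse change keeps the error metric in the transform's own scalar type instead of promoting to long double, matching the T(.5) fixes above. The roundtrip the test exercises looks like this in user code:

    #include <unsupported/Eigen/FFT>
    #include <complex>
    #include <iostream>
    #include <vector>

    int main() {
      Eigen::FFT<float> fft;
      std::vector<float> timebuf(16, 1.0f);
      std::vector<std::complex<float> > freqbuf;
      fft.fwd(freqbuf, timebuf);   // time -> frequency
      std::vector<float> timebuf2;
      fft.inv(timebuf2, freqbuf);  // frequency -> time, should match timebuf
      std::cout << "roundtrip sample: " << timebuf2[0] << "\n";
      return 0;
    }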


@@ -18,7 +18,7 @@ static void test_create_destroy_empty_pool()
   // Just create and destroy the pool. This will wind up and tear down worker
   // threads. Ensure there are no issues in that logic.
   for (int i = 0; i < 16; ++i) {
-    NonBlockingThreadPool tp(i);
+    ThreadPool tp(i);
   }
 }
@@ -27,7 +27,7 @@ static void test_parallelism(bool allow_spinning)
 {
   // Test we never-ever fail to match available tasks with idle threads.
   const int kThreads = 16;  // code below expects that this is a multiple of 4
-  NonBlockingThreadPool tp(kThreads, allow_spinning);
+  ThreadPool tp(kThreads, allow_spinning);
   VERIFY_IS_EQUAL(tp.NumThreads(), kThreads);
   VERIFY_IS_EQUAL(tp.CurrentThreadId(), -1);
   for (int iter = 0; iter < 100; ++iter) {
@@ -104,7 +104,7 @@ static void test_parallelism(bool allow_spinning)
 static void test_cancel()
 {
-  NonBlockingThreadPool tp(2);
+  ThreadPool tp(2);
   // Schedule a large number of closures that each sleep for one second. This
   // will keep the thread pool busy for much longer than the default test timeout.
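The tests now spell the pool type as ThreadPool, which at this point aliases the non-blocking implementation. A hedged usage sketch of the same API outside the test harness:

    #include <unsupported/Eigen/CXX11/ThreadPool>
    #include <atomic>
    #include <iostream>

    int main() {
      Eigen::ThreadPool tp(4);              // 4 worker threads
      std::atomic<int> counter(0);
      for (int i = 0; i < 100; ++i)
        tp.Schedule([&counter] { counter.fetch_add(1); });
      while (counter.load() < 100) {}       // crude wait; prefer a Barrier in real code
      std::cout << counter.load() << "\n";  // 100
      return 0;
    }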