From 66d073c38e3cd5dad974deea7b3d1d45247ea55b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20P=2E=20L=2E=20de=20Carvalho?= Date: Fri, 9 Aug 2019 15:56:26 -0600 Subject: [PATCH 01/30] bug #1718: Add cast to successfully compile with clang on PowerPC Ignoring -Wc11-extensions warnings thrown by clang at Altivec/PacketMath.h --- Eigen/src/Core/arch/AltiVec/PacketMath.h | 2 +- Eigen/src/Core/util/DisableStupidWarnings.h | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 4b770d036..f3d374a62 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -452,7 +452,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, con template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); } template<> EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) { - return vec_sel(b, a, mask); + return vec_sel(b, a, reinterpret_cast(mask)); } template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) { return vec_round(a); } diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h index 6c7c2d655..4501d3248 100755 --- a/Eigen/src/Core/util/DisableStupidWarnings.h +++ b/Eigen/src/Core/util/DisableStupidWarnings.h @@ -44,6 +44,11 @@ #if __clang_major__ >= 3 && __clang_minor__ >= 5 #pragma clang diagnostic ignored "-Wabsolute-value" #endif + #if ( defined(__ALTIVEC__) || defined(__VSX__) ) && __cplusplus < 201103L + // warning: generic selections are a C11-specific feature + // ignoring warnings thrown at vec_ctf in Altivec/PacketMath.h + #pragma clang diagnostic ignored "-Wc11-extensions" + #endif #elif defined __GNUC__ From 4d29aa0294a0d0aa21c41eef687840a5c59bf692 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20P=2E=20L=2E=20de=20Carvalho?= Date: Fri, 9 Aug 2019 15:59:26 -0600 Subject: [PATCH 02/30] Fix offset argument of ploadu/pstoreu for Altivec If no offset is given, them it should be zero. Also passes full address to vec_vsx_ld/st builtins. Removes userless _EIGEN_ALIGNED_PTR & _EIGEN_MASK_ALIGNMENT. Removes unnecessary casts. 
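For reference, the reason passing offset 0 together with the full address is equivalent to the old aligned-base-plus-offset scheme is plain pointer arithmetic. A minimal standalone sketch, not part of the patch, that only demonstrates the address identity:

    #include <cassert>
    #include <cstdint>

    int main() {
      float data[8] = {0};
      const float* from = data + 1;  // a pointer that need not be 16-byte aligned
      std::uintptr_t p    = reinterpret_cast<std::uintptr_t>(from);
      std::uintptr_t base = p & ~static_cast<std::uintptr_t>(15);  // old _EIGEN_ALIGNED_PTR(from)
      std::uintptr_t off  = p & 15;                                // old "(long)from & 15" offset
      // vec_vsx_ld(off, base) and vec_vsx_ld(0, from) designate the same bytes:
      assert(base + off == p);
      return 0;
    }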
--- Eigen/src/Core/arch/AltiVec/PacketMath.h | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index f3d374a62..1fef285ce 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -83,15 +83,6 @@ static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 }; static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 }; static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 }; -// Mask alignment -#ifdef __PPC64__ -#define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0 -#else -#define _EIGEN_MASK_ALIGNMENT 0xfffffff0 -#endif - -#define _EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT) - // Handle endianness properly while loading constants // Define global static constants: #ifdef _BIG_ENDIAN @@ -487,12 +478,12 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD - return (Packet4i) vec_vsx_ld((long)from & 15, (const int*) _EIGEN_ALIGNED_PTR(from)); + return vec_vsx_ld(0, from); } template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD - return (Packet4f) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from)); + return vec_vsx_ld(0, from); } #endif @@ -553,12 +544,12 @@ template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& f template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE - vec_vsx_st(from, (long)to & 15, (int*) _EIGEN_ALIGNED_PTR(to)); + vec_vsx_st(from, 0, to); } template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE - vec_vsx_st(from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to)); + vec_vsx_st(from, 0, to); } #endif @@ -1045,7 +1036,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { re template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD - return (Packet2d) vec_vsx_ld((long)from & 15, (const double*) _EIGEN_ALIGNED_PTR(from)); + return vec_vsx_ld(0, from); } template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) @@ -1059,7 +1050,7 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE - vec_vsx_st((Packet4f)from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to)); + vec_vsx_st(from, 0, to); } template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { EIGEN_PPC_PREFETCH(addr); } From 787f6ef0254949380cc6955890eeb9c282c2350f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20P=2E=20L=2E=20de=20Carvalho?= Date: Fri, 9 Aug 2019 16:02:55 -0600 Subject: [PATCH 03/30] Fix packed load/store for PowerPC's VSX The vec_vsx_ld/vec_vsx_st builtins were wrongly used for aligned load/store. In fact, they perform unaligned memory access and, even when the address is 16-byte aligned, they are much slower (at least 2x) than their aligned counterparts. For double/Packet2d vec_xl/vec_xst should be prefered over vec_ld/vec_st, although the latter works when casted to float/Packet4f. Silencing some weird warning with throw but some GCC versions. Such warning are not thrown by Clang. 
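In intrinsic terms, the aligned double-precision path this patch settles on looks like the sketch below (assuming a VSX-enabled compiler; the const_cast mirrors the workaround for older Clang vec_xl prototypes used in the hunk further down, and the helper names are illustrative, not Eigen API):

    #include <altivec.h>   // build with -maltivec -mvsx

    typedef __vector double Packet2d;   // same vector type Eigen uses for Packet2d

    // 16-byte load/store of two doubles via vec_xl/vec_xst, instead of the
    // unaligned vec_vsx_ld/vec_vsx_st or a float-cast vec_ld/vec_st.
    static inline Packet2d load2d_aligned(const double* from) {
      return vec_xl(0, const_cast<double*>(from));  // cast needed by some Clang headers
    }
    static inline void store2d_aligned(double* to, Packet2d v) {
      vec_xst(v, 0, to);
    }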
--- Eigen/src/Core/arch/AltiVec/PacketMath.h | 40 +++++++++--------------- 1 file changed, 14 insertions(+), 26 deletions(-) diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 1fef285ce..30694d424 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -240,42 +240,38 @@ inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v) // Need to define them first or we get specialization after instantiation errors template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { + // some versions of GCC throw "unused-but-set-parameter". + // ignoring these warnings for now. + EIGEN_UNUSED_VARIABLE(from); EIGEN_DEBUG_ALIGNED_LOAD -#ifdef __VSX__ - return vec_vsx_ld(0, from); -#else return vec_ld(0, from); -#endif } template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { + // some versions of GCC throw "unused-but-set-parameter". + // ignoring these warnings for now. + EIGEN_UNUSED_VARIABLE(from); EIGEN_DEBUG_ALIGNED_LOAD -#ifdef __VSX__ - return vec_vsx_ld(0, from); -#else return vec_ld(0, from); -#endif } template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) { + // some versions of GCC throw "unused-but-set-parameter" (float *to). + // ignoring these warnings for now. + EIGEN_UNUSED_VARIABLE(to); EIGEN_DEBUG_ALIGNED_STORE -#ifdef __VSX__ - vec_vsx_st(from, 0, to); -#else vec_st(from, 0, to); -#endif } template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& from) { + // some versions of GCC throw "unused-but-set-parameter" (float *to). + // ignoring these warnings for now. + EIGEN_UNUSED_VARIABLE(to); EIGEN_DEBUG_ALIGNED_STORE -#ifdef __VSX__ - vec_vsx_st(from, 0, to); -#else vec_st(from, 0, to); -#endif } template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { @@ -940,21 +936,13 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v) template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD -#ifdef __VSX__ - return vec_vsx_ld(0, from); -#else - return vec_ld(0, from); -#endif + return vec_xl(0, const_cast(from)); // cast needed by Clang } template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE -#ifdef __VSX__ - vec_vsx_st(from, 0, to); -#else - vec_st(from, 0, to); -#endif + vec_xst(from, 0, to); } template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { From a3298b22ecd19a80a2fc03df3d463fdb04907c87 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Mon, 12 Aug 2019 13:53:28 -0700 Subject: [PATCH 04/30] Implement vectorized versions of log1p and expm1 in Eigen using Kahan's formulas, and change the scalar implementations to properly handle infinite arguments. Depending on instruction set, significant speedups are observed for the vectorized path: log1p wall time is reduced 60-93% (2.5x - 15x speedup) expm1 wall time is reduced 0-85% (1x - 7x speedup) The scalar path is slower by 20-30% due to the extra branch needed to handle +infinity correctly. 
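For readers unfamiliar with the trick, here is a scalar sketch of Kahan's formulas as used by the new packet code (illustrative only; the function names are not Eigen API):

    #include <cmath>

    // log1p(x): u = 1+x rounds away the information about tiny x, but the ratio
    // x/(u-1) restores it, so log(u)*x/(u-1) stays accurate for small x.
    double log1p_kahan(double x) {
      double u = 1.0 + x;
      double lu = std::log(u);
      if (u == 1.0 || x == lu) return x;   // tiny x, or x == +inf (log(+inf) == +inf)
      return x * (lu / (u - 1.0));
    }

    // expm1(x): same idea with u = exp(x); the log(u) == u test catches overflow
    // to +inf without having to materialize an infinity constant.
    double expm1_kahan(double x) {
      double u = std::exp(x);
      if (u == 1.0) return x;              // tiny x: exp(x)-1 ~= x
      double um1 = u - 1.0;
      if (um1 == -1.0) return -1.0;        // x == -inf
      double lu = std::log(u);
      if (lu == u) return u;               // x so large that exp(x) == +inf
      return um1 * (x / lu);
    }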
Full benchmarks measured on Intel(R) Xeon(R) Gold 6154 here: https://bitbucket.org/snippets/rmlarsen/MXBkpM --- Eigen/src/Core/MathFunctions.h | 8 +++- Eigen/src/Core/arch/AVX/MathFunctions.h | 10 ++++ Eigen/src/Core/arch/AVX/PacketMath.h | 2 + Eigen/src/Core/arch/AVX512/MathFunctions.h | 12 +++++ Eigen/src/Core/arch/AVX512/PacketMath.h | 2 + .../arch/Default/GenericPacketMathFunctions.h | 46 +++++++++++++++++++ Eigen/src/Core/arch/SSE/MathFunctions.h | 13 +++++- Eigen/src/Core/arch/SSE/PacketMath.h | 2 + test/packetmath.cpp | 6 ++- 9 files changed, 95 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 8bef59354..1eeb2752b 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -501,7 +501,8 @@ namespace std_fallback { } EIGEN_USING_STD_MATH(log); - return (u - RealScalar(1)) * x / log(u); + Scalar logu = log(u); + return numext::equal_strict(u, logu) ? u : (u - RealScalar(1)) * x / logu; } } @@ -548,7 +549,10 @@ namespace std_fallback { typedef typename NumTraits::Real RealScalar; EIGEN_USING_STD_MATH(log); Scalar x1p = RealScalar(1) + x; - return numext::equal_strict(x1p, Scalar(1)) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) ); + Scalar log_1p = log(x1p); + const bool is_inf = numext::equal_strict(x1p, log_1p); + const bool is_small = numext::equal_strict(x1p, Scalar(1)); + return (is_inf || is_small) ? x : x * (log_1p / (x1p - RealScalar(1))); } } diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h index 9f375ed98..c6d3cf6a0 100644 --- a/Eigen/src/Core/arch/AVX/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX/MathFunctions.h @@ -36,6 +36,16 @@ plog(const Packet8f& _x) { return plog_float(_x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet8f plog1p(const Packet8f& _x) { + return generic_plog1p(_x); +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet8f pexpm1(const Packet8f& _x) { + return generic_expm1(_x); +} + // Exponential function. Works by writing "x = m*log(2) + r" where // "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then // "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1). 
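As an aside, the range-reduction scheme described in the comment above can be written in scalar form roughly as follows; this is a sketch of the stated identities only, with std::exp standing in for the short polynomial the SIMD code actually uses:

    #include <cmath>

    double exp_range_reduced(double x) {
      const double ln2 = 0.6931471805599453;
      double m = std::floor(x / ln2 + 0.5);   // m = floor(x/log(2) + 1/2)
      double r = x - m * ln2;                 // remainder, |r| <= log(2)/2
      return std::ldexp(std::exp(r), static_cast<int>(m));  // exp(x) = 2^m * exp(r)
    }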
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 7ee9dee10..5233195f3 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -65,6 +65,8 @@ template<> struct packet_traits : default_packet_traits HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, HasLog = 1, + HasLog1p = 1, + HasExpm1 = 1, HasExp = 1, HasSqrt = 1, HasRsqrt = 1, diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h index c2158c538..9e37a720b 100644 --- a/Eigen/src/Core/arch/AVX512/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h @@ -393,6 +393,18 @@ pcos(const Packet16f& _x) { return pcos_float(_x); } +#if defined(EIGEN_VECTORIZE_AVX512DQ) +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet16f plog1p(const Packet16f& _x) { + return generic_plog1p(_x); +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet16f pexpm1(const Packet16f& _x) { + return generic_expm1(_x); +} +#endif + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 383c49636..0a81fe02d 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -60,6 +60,8 @@ template<> struct packet_traits : default_packet_traits #if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) #ifdef EIGEN_VECTORIZE_AVX512DQ HasLog = 1, + HasLog1p = 1, + HasExpm1 = 1, #endif HasExp = 1, HasSqrt = EIGEN_FAST_MATH, diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 43e827638..640aae05a 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -126,6 +126,52 @@ Packet plog_float(const Packet _x) por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask)); } +/** \internal \returns log(1 + x) computed using W. Kahan's formula. + See: http://www.plunk.org/~hatch/rightway.php + */ +template +Packet generic_plog1p(const Packet& x) +{ + typedef typename unpacket_traits::type ScalarType; + const Packet one = pset1(ScalarType(1)); + Packet xp1 = padd(x, one); + Packet small_mask = pcmp_eq(xp1, one); + Packet log1 = plog(xp1); + // Add a check to handle x == +inf. + Packet pos_inf_mask = pcmp_eq(x, log1); + Packet log_large = pmul(x, pdiv(log1, psub(xp1, one))); + return pselect(por(small_mask, pos_inf_mask), x, log_large); +} + +/** \internal \returns exp(x)-1 computed using W. Kahan's formula. + See: http://www.plunk.org/~hatch/rightway.php + */ +template +Packet generic_expm1(const Packet& x) +{ + typedef typename unpacket_traits::type ScalarType; + const Packet one = pset1(ScalarType(1)); + const Packet neg_one = pset1(ScalarType(-1)); + Packet u = pexp(x); + Packet one_mask = pcmp_eq(u, one); + Packet u_minus_one = psub(u, one); + Packet neg_one_mask = pcmp_eq(u_minus_one, neg_one); + Packet logu = plog(u); + // The following comparison is to catch the case where + // exp(x) = +inf. It is written in this way to avoid having + // to form the constant +inf, which depends on the packet + // type. + Packet pos_inf_mask = pcmp_eq(logu, u); + Packet expm1 = pmul(u_minus_one, pdiv(x, logu)); + expm1 = pselect(pos_inf_mask, u, expm1); + return pselect(one_mask, + x, + pselect(neg_one_mask, + neg_one, + expm1)); +} + + // Exponential function. 
Works by writing "x = m*log(2) + r" where // "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then // "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1). diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index 0d491ab88..02c8f3c2f 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -22,11 +22,20 @@ namespace Eigen { namespace internal { template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f plog(const Packet4f& _x) -{ +Packet4f plog(const Packet4f& _x) { return plog_float(_x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f plog1p(const Packet4f& _x) { + return generic_plog1p(_x); +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f pexpm1(const Packet4f& _x) { + return generic_expm1(_x); +} + template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pexp(const Packet4f& _x) { diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 0d571ce61..94603dd55 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -110,6 +110,8 @@ template<> struct packet_traits : default_packet_traits HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, HasLog = 1, + HasLog1p = 1, + HasExpm1 = 1, HasExp = 1, HasSqrt = 1, HasRsqrt = 1, diff --git a/test/packetmath.cpp b/test/packetmath.cpp index f1448f335..41000a842 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -604,11 +604,13 @@ template void packetmath_real() CHECK_CWISE1_IF(PacketTraits::HasSqrt, Scalar(1)/std::sqrt, internal::prsqrt); CHECK_CWISE1_IF(PacketTraits::HasLog, std::log, internal::plog); #if EIGEN_HAS_C99_MATH && (__cplusplus > 199711L) - CHECK_CWISE1_IF(PacketTraits::HasExpm1, std::expm1, internal::pexpm1); - CHECK_CWISE1_IF(PacketTraits::HasLog1p, std::log1p, internal::plog1p); CHECK_CWISE1_IF(internal::packet_traits::HasLGamma, std::lgamma, internal::plgamma); CHECK_CWISE1_IF(internal::packet_traits::HasErf, std::erf, internal::perf); CHECK_CWISE1_IF(internal::packet_traits::HasErfc, std::erfc, internal::perfc); + data1[0] = std::numeric_limits::infinity(); + data1[1] = std::numeric_limits::denorm_min(); + CHECK_CWISE1_IF(PacketTraits::HasExpm1, std::expm1, internal::pexpm1); + CHECK_CWISE1_IF(PacketTraits::HasLog1p, std::log1p, internal::plog1p); #endif if(PacketSize>=2) From db9147ae40695e43ec694b2e207d0acc5b0570d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20P=2E=20L=2E=20de=20Carvalho?= Date: Wed, 14 Aug 2019 10:37:39 -0600 Subject: [PATCH 05/30] Add missing pcmp_XX methods for double/Packet2d This actually fixes an issue in unit-test packetmath_2 with pcmp_eq when it is compiled with clang. When pcmp_eq(Packet4f,Packet4f) is used instead of pcmp_eq(Packet2d,Packet2d), the unit-test does not pass due to NaN on ref vector. 
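One non-obvious case among the added comparisons is pcmp_lt_or_nan: VSX has no single "less-than or unordered" compare, so the hunk below builds it as the bitwise complement of vec_cmpge, which yields an all-zero lane whenever an operand is NaN. A standalone sketch of the idea, assuming a VSX-enabled compiler (helper name illustrative):

    #include <altivec.h>   // build with -maltivec -mvsx

    typedef __vector double Packet2d;

    // "a < b or unordered": vec_cmpge produces an all-zero lane when either
    // operand is NaN, so its bitwise complement covers both a<b and NaN cases.
    static inline Packet2d cmp_lt_or_nan(Packet2d a, Packet2d b) {
      Packet2d ge = reinterpret_cast<Packet2d>(vec_cmpge(a, b));
      return vec_nor(ge, ge);   // NOR with itself == bitwise NOT
    }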
--- Eigen/src/Core/arch/AltiVec/PacketMath.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 30694d424..521e6076d 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -1009,6 +1009,14 @@ template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const return ret; } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return reinterpret_cast(vec_cmple(a,b)); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return reinterpret_cast(vec_cmplt(a,b)); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return reinterpret_cast(vec_cmpeq(a,b)); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) { + Packet2d c = reinterpret_cast(vec_cmpge(a,b)); + return vec_nor(c,c); +} + template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); } template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); } From 5ac7984ffa2076cc5b26fb220a3b351951251c2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20P=2E=20L=2E=20de=20Carvalho?= Date: Wed, 14 Aug 2019 11:59:12 -0600 Subject: [PATCH 06/30] Fix debug macros in p{load,store}u --- Eigen/src/Core/arch/AltiVec/PacketMath.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 521e6076d..7ee290a29 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -539,12 +539,12 @@ template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& f // We also need to redefine little endian loading of Packet4i/Packet4f using VSX template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { - EIGEN_DEBUG_ALIGNED_STORE + EIGEN_DEBUG_UNALIGNED_STORE vec_vsx_st(from, 0, to); } template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { - EIGEN_DEBUG_ALIGNED_STORE + EIGEN_DEBUG_UNALIGNED_STORE vec_vsx_st(from, 0, to); } #endif @@ -1031,7 +1031,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { re template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { - EIGEN_DEBUG_ALIGNED_LOAD + EIGEN_DEBUG_UNALIGNED_LOAD return vec_vsx_ld(0, from); } @@ -1045,7 +1045,7 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { - EIGEN_DEBUG_ALIGNED_STORE + EIGEN_DEBUG_UNALIGNED_STORE vec_vsx_st(from, 0, to); } From 071311821e509d87bec609d6a3aeea9dc74cfd66 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Mon, 19 Aug 2019 11:44:25 -0700 Subject: [PATCH 07/30] Remove XSMM support from Tensor module --- cmake/FindXsmm.cmake | 25 -- unsupported/Eigen/CXX11/Tensor | 4 - .../CXX11/src/Tensor/TensorContraction.h | 281 ------------------ .../src/Tensor/TensorContractionBlocking.h | 135 --------- .../src/Tensor/TensorContractionThreadPool.h | 201 ------------- unsupported/test/CMakeLists.txt | 11 - unsupported/test/cxx11_tensor_contraction.cpp | 7 - 7 files changed, 664 deletions(-) delete mode 100644 cmake/FindXsmm.cmake diff --git a/cmake/FindXsmm.cmake b/cmake/FindXsmm.cmake deleted file mode 100644 index 809d6f414..000000000 --- a/cmake/FindXsmm.cmake +++ /dev/null @@ -1,25 
+0,0 @@ -# libxsmm support. -# libxsmm provides matrix multiplication kernels optimized for -# the latest Intel architectures. -# Download the library from https://github.com/hfp/libxsmm -# Compile with make BLAS=0 - -if (LIBXSMM) - set(XSMM_FIND_QUIETLY TRUE) - set(XSMM_INCLUDES ${LIBXSMM}/include) - set(XSMM_LIBRARIES ${LIBXSMM}/lib) -endif (LIBXSMM) - -find_path(LIBXSMM - NAMES - libxsmm.h - PATHS - $ENV{XSMMDIR}/include - ${INCLUDE_INSTALL_DIR} -) - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(XSMM DEFAULT_MSG - LIBXSMM) - -mark_as_advanced(LIBXSMM) diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 25b663046..5d18aeb3f 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -73,10 +73,6 @@ typedef unsigned __int64 uint64_t; #include #endif -#if defined(EIGEN_USE_LIBXSMM) -#include "libxsmm.h" -#endif - #ifdef EIGEN_USE_THREADS #include "ThreadPool" #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index de7c2248a..a398b2b3f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -20,70 +20,6 @@ namespace Eigen { * */ namespace internal { -#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM) -template -void pack_simple(Scalar * dst, const Scalar * src, Index cols, Index rows, Index lddst, Index ldsrc) { - size_t psize = packet_traits::size; // Packet size - typedef typename packet_traits::type Packet; // Packet type - size_t alignment = psize*sizeof(Scalar); // Needed alignment - if (rows % psize == 0 && (lddst*sizeof(Scalar)) % alignment == 0 && - (ldsrc*sizeof(Scalar)) % alignment == 0 && - reinterpret_cast(src) % alignment == 0 && - reinterpret_cast(dst) % alignment == 0) { - // Optimized version using packets - size_t num_packets = rows / psize; - for (Index col = 0; col < cols; ++col) { - EIGEN_ASM_COMMENT("begin pack_simple inner copy"); - // Unrolled manually 4 times. 
- for (size_t i=0; i < num_packets/4; ++i) { - internal::pstore(dst, internal::pload(src)); - dst += psize; src += psize; - internal::pstore(dst, internal::pload(src)); - dst += psize; src += psize; - internal::pstore(dst, internal::pload(src)); - dst += psize; src += psize; - internal::pstore(dst, internal::pload(src)); - dst += psize; src += psize; - } - for (size_t i=0; i < num_packets%4; ++i) { - internal::pstore(dst, internal::pload(src)); - dst += psize; src += psize; - } - dst += lddst - num_packets*psize; - src += ldsrc - num_packets*psize; - EIGEN_ASM_COMMENT("end pack_simple inner copy"); - } - } else { - // Naive memcpy calls - for (Index col = 0; col < cols; ++col) { - memcpy(dst + col*lddst, src + col*ldsrc, rows*sizeof(Scalar)); - } - } -} - -template - struct libxsmm_wrapper { - libxsmm_wrapper() {} - libxsmm_wrapper(int, int, int, int, int, int, int, float, float, int) {} - void operator()(const LhsScalar*, const RhsScalar*, Scalar*) {} - void operator()(const LhsScalar*, const RhsScalar*, Scalar*, const LhsScalar*, const RhsScalar*, const Scalar*) {} - }; - - template<> - struct libxsmm_wrapper: public libxsmm_mmfunction { - libxsmm_wrapper(): libxsmm_mmfunction() {} - libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) : - libxsmm_mmfunction(flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch) {} - }; - - template<> - struct libxsmm_wrapper: public libxsmm_mmfunction { - libxsmm_wrapper(): libxsmm_mmfunction() {} - libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) : - libxsmm_mmfunction(flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch) {} - }; -#endif - template struct traits > @@ -640,8 +576,6 @@ struct TensorContractionEvaluatorBase } } - EnableXSMMIfPossible(eval_op_indices); - // If the layout is RowMajor, we need to reverse the m_dimensions if (static_cast(Layout) == static_cast(RowMajor)) { for (int i = 0, j = NumDims - 1; i < j; i++, j--) { @@ -780,13 +714,6 @@ struct TensorContractionEvaluatorBase EIGEN_DEVICE_FUNC #endif void evalGemm(Scalar* buffer) const { - #if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM) - if (m_can_use_xsmm) { - evalGemmXSMM(buffer); - return; - } - #endif - // columns in left side, rows in right side const Index k = this->m_k_size; @@ -942,213 +869,6 @@ struct TensorContractionEvaluatorBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_result; } protected: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void EnableXSMMIfPossible(const array, ContractDims>& eval_op_indices) { - m_can_use_xsmm = false; - -#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM) - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; - if (!std::is_same::value || - !std::is_same::value || - !(std::is_same::value || - std::is_same::value) || - m_leftImpl.data() == NULL || - m_rightImpl.data() == NULL) { - return; - } - - // Check if we can use faster matmul algorithms. For contraction to be - // equivalent to matmul, we need both lhs and rhs contracting dims sequences - // to be either a prefix or suffix of all dims. Also, the order of both - // must be the same, so we don't have to do reordering. 
- // For example: - // * OK: lhs 4D, rhs 4D, contraction: [(0, 2), (1, 3)] - // * BAD: lhs 3D, rhs 3D, contraction: [(1,1)] - // * BAD: lhs 3D, rhs 3D, contraction: [(0, 0), (2, 2)] - // * BAD: lhs 3D, rhs 3D, contraction: [(0, 2), (1, 1)] - // Depending if contraction dims are prefix or suffix of all dims we need to - // pre-transpose matrices in matmul algorithm: - // lhs: prefix -> transpose, suffix -> no transpose - // rhs: prefix -> no transpose, suffix -> transpose - // For example, for lhs 2D, rhs 2D, contraction [(1, 0)] is regular, - // non-transposed matmul. - if (ContractDims == 0) { - // This case is totally uninteresting, filter it out to avoid problems - // with iterations in further tests. - return; - } - - // Check if RHS dims list is increasing. LHS already is, so if not, the - // order is different and we cannot do matmul. - for (int i = 1; i < ContractDims; i++) { - if (eval_op_indices[i].second < eval_op_indices[i-1].second) { - return; - } - } - - // Check if no holes. - int diff; - for (int i = 1; i < ContractDims; i++) { - // LHS contract dims are sorted to form an increasing seq. - diff = eval_op_indices[i].first - eval_op_indices[i-1].first; - if (diff != 1) { - return; - } - // Now we may already assume RHS contract dims seq is increasing too. - diff = eval_op_indices[i].second - eval_op_indices[i-1].second; - if (diff != 1) { - return; - } - } - - // Check if suffix or prefix. - if (eval_op_indices[0].first != 0 && - eval_op_indices[ContractDims-1].first != LDims-1) { - return; - } - if (eval_op_indices[0].second != 0 && - eval_op_indices[ContractDims-1].second != RDims-1) { - return; - } - - m_can_use_xsmm = true; -#else - EIGEN_UNUSED_VARIABLE(eval_op_indices); -#endif - } - -#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM) - EIGEN_DEVICE_FUNC void evalGemmXSMM(Scalar* buffer) const { - // columns in left side, rows in right side - const Index k = this->m_k_size; - - // rows in left side - const Index m = this->m_i_size; - - // columns in right side - const Index n = this->m_j_size; - - const bool transposeA = !m_lhs_inner_dim_contiguous; - const bool transposeB = !m_rhs_inner_dim_contiguous; - - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; - - internal::TensorXsmmContractionBlocking blocking( - k, m, n, 1, transposeA, transposeB); - - // Outer blocks sizes - const Index mc_outer = blocking.outer_m(); - const Index nc_outer = blocking.outer_n(); - const Index kc_outer = blocking.outer_k(); - // Inner blocks sizes - const Index mc = blocking.mc(); - const Index nc = blocking.nc(); - const Index kc = blocking.kc(); - // Decisions whether we should copy parts of matrices - const bool copyA = blocking.copyA(); - const bool copyB = blocking.copyB(); - - const LhsScalar* leftData = m_leftImpl.data(); - const RhsScalar* rightData = m_rightImpl.data(); - - const libxsmm_blasint stride_A = static_cast(transposeA ? k : m); - const libxsmm_blasint stride_B = static_cast(transposeB ? n : k); - const libxsmm_blasint stride_C = static_cast(m); - - const libxsmm_blasint stride_blockA = static_cast(mc); - // Use bigger stride to avoid hitting same cache line too often. - // This consistently gives +~0.5 Gflops. - const libxsmm_blasint stride_panelB = static_cast( - kc % 32 == 0 ? 
kc + 16 : kc - ); - - // Kernel for the general case (not edges) - internal::libxsmm_wrapper kernel; - - LhsScalar* blockA = NULL; - RhsScalar* panelB = NULL; - - if (copyA) { - blockA = static_cast(this->m_device.allocate(mc * kc * sizeof(LhsScalar))); - } - if (copyB) { - panelB = static_cast(this->m_device.allocate(nc_outer * stride_panelB * sizeof(RhsScalar))); - } - - const Index kernel_stride_A = copyA ? stride_blockA : stride_A; - const Index kernel_stride_B = copyB ? stride_panelB : stride_B; - kernel = internal::libxsmm_wrapper(0, mc, nc, kc, kernel_stride_A, kernel_stride_B, stride_C, 1, 1, blocking.prefetch()); - - // Outer blocking - for (Index ki_outer = 0; ki_outer < k; ki_outer += kc_outer) { - for (Index mi_outer = 0; mi_outer < m; mi_outer += mc_outer) { - for (Index ni_outer = 0; ni_outer < n; ni_outer += nc_outer) { - using numext::mini; - - Index actual_nc_outer = mini(ni_outer+nc_outer, n) - ni_outer; - - // Inner blocking - for (Index ki = ki_outer; ki < mini(ki_outer+kc_outer, k); ki += kc) { - const Index actual_kc = mini(ki_outer+kc_outer, mini(ki+kc, k)) - ki; - const float beta = ki == 0 ? 0 : 1; - - if (copyB) { - if (transposeB) { - libxsmm_otrans(panelB, rightData + ki*stride_B + ni_outer, sizeof(RhsScalar), actual_nc_outer, actual_kc, stride_B, stride_panelB); - } else { - internal::pack_simple(panelB, rightData + ni_outer*stride_B + ki, actual_nc_outer, actual_kc, stride_panelB, stride_B); - } - } - - for (Index mi = mi_outer; mi < mini(mi_outer+mc_outer, m); mi += mc) { - const Index actual_mc = mini(mi_outer+mc_outer, mini(mi+mc, m)) - mi; - - const LhsScalar* a = transposeA ? leftData + mi*stride_A + ki : - leftData + ki*stride_A + mi; - - if (copyA) { - if (transposeA) { - libxsmm_otrans(blockA, a, sizeof(LhsScalar), actual_kc, actual_mc, stride_A, stride_blockA); - } else { - internal::pack_simple(blockA, a, actual_kc, actual_mc, stride_blockA, stride_A); - } - } - const LhsScalar* actual_a = copyA ? blockA : a; - - for (Index ni = ni_outer; ni < mini(ni_outer+nc_outer, n); ni += nc) { - const Index actual_nc = mini(ni_outer+nc_outer, mini(ni+nc, n)) - ni; - - const RhsScalar* b = rightData + ni*stride_B + ki; - Scalar* c = buffer + ni*stride_C + mi; - const Scalar* cp = c + nc*stride_C; - - const RhsScalar* actual_b = copyB ? panelB + (ni-ni_outer)*stride_panelB : b; - const RhsScalar* bp = copyB ? panelB + nc*stride_panelB : b + nc*stride_B; - - if (actual_mc == mc && actual_kc == kc && actual_nc == nc && beta == 1) { - // Most used, cached kernel. - kernel(actual_a, actual_b, c, actual_a, bp, cp); - } else { - // Edges - use libxsmm kernel cache. 
- internal::libxsmm_wrapper(0, actual_mc, actual_nc, actual_kc, kernel_stride_A, kernel_stride_B, stride_C, 1, beta, blocking.prefetch())(actual_a, actual_b, c, actual_a, bp, cp); - } - } - } - } - } - } - } - - if (copyA) { - this->m_device.deallocate(blockA); - } - if (copyB) { - this->m_device.deallocate(panelB); - } - } -#endif - // Prevent assignment TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&); Dimensions m_dimensions; @@ -1177,7 +897,6 @@ protected: const Device EIGEN_DEVICE_REF m_device; OutputKernelType m_output_kernel; EvaluatorPointerType m_result; - bool m_can_use_xsmm; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h index c51f3f8dd..974feb0ad 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h @@ -67,141 +67,6 @@ class TensorContractionBlocking { StorageIndex nc_; }; - - -#if defined(EIGEN_USE_LIBXSMM) -template -class TensorXsmmContractionBlocking { - public: - TensorXsmmContractionBlocking(StorageIndex k, StorageIndex m, StorageIndex n, - size_t max_num_threads = 1, bool transposeA = false, - bool transposeB = false): - k_(k), m_(m), n_(n), transposeA_(transposeA), - transposeB_(transposeB), num_threads_(max_num_threads) { -#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES - if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) { - mc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M; - kc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K; - nc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N; - outer_m_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_M; - outer_k_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_K; - outer_n_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_N; - copyA_ = EIGEN_TEST_SPECIFIC_BLOCKING_COPY_A; - copyB_ = EIGEN_TEST_SPECIFIC_BLOCKING_COPY_B; - outer_m_ = outer_m_ != 0 ? outer_m_ : m; - outer_k_ = outer_k_ != 0 ? outer_k_ : k; - outer_n_ = outer_n_ != 0 ? outer_n_ : n; - } -#else - // Defaults, possibly overridden per-platform. - copyA_ = true; - copyB_ = false; - - // If the matrix is small enough, don't do blocking, just call single xsmm - // kernel. - if (static_cast(m)*k*n <= LIBXSMM_THRESHOLD) { - mc_ = m; kc_ = k; nc_ = n; - outer_m_ = m; outer_k_ = k; outer_n_ = n; - copyA_ = false; copyB_ = false; - } else { - int arch = libxsmm_cpuid_x86(); - - if (arch == LIBXSMM_X86_AVX512_CORE) { - // skylake - mc_ = 64; kc_ = 64; nc_ = 24; - outer_m_ = 512; outer_k_ = 512; outer_n_ = 24*22; - // Hack to use this kernel architecture as the other one has performance - // issues (no hardware prefetching). - // TODO(nishantpatil): This should be removed if the issues are fixed, - // or this one becomes the default. - setenv("LIBXSMM_AVX512_CLASSIC_GEMM", "1", 1); - } else if (arch == LIBXSMM_X86_AVX2) { - // haswell - mc_ = 32; kc_ = 192; nc_ = 33; - outer_m_ = 512; outer_k_ = 3*192; outer_n_ = 33*16; - } else if (arch == LIBXSMM_X86_AVX) { - // ivybridge - mc_ = 32; kc_ = 192; nc_ = 48; - outer_m_ = 512; outer_k_ = 3*192; outer_n_ = 48*11; - } else { - // generic kernel size, usually performing well - mc_ = 32; kc_ = 128; nc_ = 32; - outer_m_ = 512; outer_k_ = 512; outer_n_ = 512; - } - - // Only copy if it makes the stride smaller. 
- copyA_ = copyA_ && (m > mc_); - copyB_ = copyB_ && (k > kc_); - } - - // We need to copy anyway if transposing - copyA_ = copyA_ || transposeA; - copyB_ = copyB_ || transposeB; - - // See libxsmm_gemm_prefetch_type definition in libxsmm_typedefs.h - prefetch_ = LIBXSMM_PREFETCH_AL2CL2BL2_VIA_C; - -#endif - - mc_ = mc_ > m ? m : mc_; - nc_ = nc_ > n ? n : nc_; - kc_ = kc_ > k ? k : kc_; - - size_t compute_parallelism = (m / mc_) * (n / nc_); - size_t pack_parallelism = 0; - if (copyA_) { - pack_parallelism += (m / mc_) * (k / kc_); - } - if (copyB_) { - pack_parallelism += (n / nc_) * (k / kc_); - } - size_t parallelism = numext::maxi(compute_parallelism, pack_parallelism); - - num_threads_ = numext::mini(num_threads_, - parallelism / MIN_JOBS_PER_THREAD); - num_threads_ = numext::maxi(num_threads_, 1); - - // For optimal performance outer block sizes should be multiplies of kernel - // sizes, or bigger than matrix size (=no outer blocking). - eigen_assert(outer_m_ % mc_ == 0 || outer_m_ >= m); - eigen_assert(outer_k_ % kc_ == 0 || outer_k_ >= k); - eigen_assert(outer_n_ % nc_ == 0 || outer_n_ >= n); - } - - EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; } - EIGEN_ALWAYS_INLINE StorageIndex mc() const { return mc_; } - EIGEN_ALWAYS_INLINE StorageIndex nc() const { return nc_; } - EIGEN_ALWAYS_INLINE StorageIndex outer_k() const { return outer_k_; } - EIGEN_ALWAYS_INLINE StorageIndex outer_m() const { return outer_m_; } - EIGEN_ALWAYS_INLINE StorageIndex outer_n() const { return outer_n_; } - EIGEN_ALWAYS_INLINE bool copyA() const { return copyA_; } - EIGEN_ALWAYS_INLINE bool copyB() const { return copyB_; } - EIGEN_ALWAYS_INLINE bool transposeA() const { return transposeA_; } - EIGEN_ALWAYS_INLINE bool transposeB() const { return transposeB_; } - EIGEN_ALWAYS_INLINE int num_threads() const { return num_threads_; } - EIGEN_ALWAYS_INLINE StorageIndex blocks_m() const { return divup(m_, mc_); } - EIGEN_ALWAYS_INLINE StorageIndex blocks_k() const { return divup(k_, kc_); } - EIGEN_ALWAYS_INLINE StorageIndex blocks_n() const { return divup(n_, nc_); } - EIGEN_ALWAYS_INLINE libxsmm_gemm_prefetch_type prefetch() const { - return prefetch_; - } - - private: - StorageIndex k_, m_, n_; - StorageIndex kc_, mc_, nc_; - StorageIndex outer_k_, outer_m_, outer_n_; - bool copyA_, copyB_, transposeA_, transposeB_; - size_t num_threads_; - - // Threshold for m*k*n to skip blocking and just call libxsmm - const double LIBXSMM_THRESHOLD = 80*80*80; - // For computing optimal number of threads - so that each thread gets at least - // that many jobs. 
- const double MIN_JOBS_PER_THREAD = 3; - libxsmm_gemm_prefetch_type prefetch_; -}; -#endif // EIGEN_USE_LIBXSMM - } // end namespace internal } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index 22db6f01b..ca20038a4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -78,23 +78,6 @@ struct TensorEvaluatorm_k_size; if (m == 0 || n == 0 || k == 0) return; -#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM) - if (this->m_can_use_xsmm) { - bool transposeA = !this->m_lhs_inner_dim_contiguous; - bool transposeB = !this->m_rhs_inner_dim_contiguous; - internal::TensorXsmmContractionBlocking - blocking(k, m, n, this->m_device.numThreads(), transposeA, - transposeB); - - if (blocking.num_threads() == 1) { - this->evalGemmXSMM(buffer); - } else { - ContextXsmm(this, buffer, m, n, k, blocking).run(); - } - return; - } -#endif - // Compute a set of algorithm parameters: // - kernel block sizes (bm, bn, bk) // - task grain sizes (number of kernels executed per task: gm, gn) @@ -1227,190 +1210,6 @@ struct TensorEvaluator::value, - "XSMM does not support contraction output kernels."); - - template - class ContextXsmm { - public: - ContextXsmm(const Self* self, Scalar* buffer, Index m, Index n, Index k, - const internal::TensorXsmmContractionBlocking& blocking): - device(self->m_device), - m(m), k(k), n(n), - stride_a(blocking.transposeA() ? k : m), - stride_b(blocking.transposeB() ? n : k), - stride_c(m), - bm(blocking.mc()), bk(blocking.kc()), bn(blocking.nc()), - blocks_m(blocking.blocks_m()), blocks_k(blocking.blocks_k()), - blocks_n(blocking.blocks_n()), - copyA(blocking.copyA()), copyB(blocking.copyB()), - transposeA(blocking.transposeA()), transposeB(blocking.transposeB()), - num_threads(blocking.num_threads()), - buffer(buffer), - leftData(self->m_leftImpl.data()), rightData(self->m_rightImpl.data()), - workers_done(blocking.num_threads()), - - packingA_jobs(0), packingB_jobs(0), compute_jobs(0), - packingA_done(blocking.blocks_m()), packingB_done(blocking.blocks_n()) {} - - void worker() { - // Pack - - if (copyA) { - while (true) { - uint32_t mk = packingA_jobs++; - Index mi = mk / blocks_k; - Index ki = mk % blocks_k; - if (mi >= blocks_m) break; - - LhsScalar * blockA = blocksA + (bk*bm) * (mi*blocks_k+ki); - if (transposeA) { - const LhsScalar * current_a = leftData + (bm*mi)*stride_a + (bk*ki); - libxsmm_otrans(blockA, current_a, sizeof(LhsScalar), actual_bk(ki), - actual_bm(mi), stride_a, bm); - } else { - const LhsScalar * current_a = leftData + (bk*ki)*stride_a + (bm*mi); - internal::pack_simple(blockA, current_a, - actual_bk(ki), actual_bm(mi), bm, stride_a); - } - packingA_done.at(mi)++; - } - } - - if (copyB) { - while (true) { - uint32_t nk = packingB_jobs++; - Index ni = nk / blocks_k; - Index ki = nk % blocks_k; - if (ni >= blocks_n) break; - - RhsScalar * blockB = blocksB + (bk*bn) * (ni*blocks_k+ki); - if (transposeB) { - const RhsScalar * current_b = rightData + (ki*bk)*stride_b + - (ni*bn); - libxsmm_otrans(blockB, current_b, sizeof(RhsScalar), actual_bn(ni), - actual_bk(ki), stride_b, bk); - } else { - const RhsScalar * current_b = rightData + (ni*bn)*stride_b + - (ki*bk); - internal::pack_simple(blockB, current_b, - actual_bn(ni), actual_bk(ki), bk, stride_b); - } - packingB_done.at(ni)++; - } - } - - // Compute - - while (true) { - uint32_t mn = 
compute_jobs++; - Index mi = mn / blocks_n; - Index ni = mn % blocks_n; - if (mi >= blocks_m) break; - - // Wait for mi, ni packings to be done. This is more fine-grained than - // waiting for all workers to finish packing. - while ((copyA && (packingA_done.at(mi) < blocks_k)) || - (copyB && (packingB_done.at(ni) < blocks_k))) - {} - - for (Index ki=0; ki < blocks_k; ++ki) { - const LhsScalar * current_a = copyA ? - blocksA + (bk*bm) * (mi*blocks_k+ki) : - leftData + (bk*ki)*stride_a + (bm*mi); - const RhsScalar * current_b = copyB ? - blocksB + (bk*bn) * (ni*blocks_k+ki) : - rightData + (ni*bn)*stride_b + (bk*ki); - - Index current_stride_a = copyA ? bm : stride_a; - Index current_stride_b = copyB ? bk : stride_b; - - // Memory may not be zeroed, overwrite instead of adding in first - // iteration. - float beta = ki == 0 ? 0 : 1; - - Scalar * current_c = buffer + (mi*bm) + (ni*bn)*stride_c; - internal::libxsmm_wrapper( - 0, actual_bm(mi), actual_bn(ni), actual_bk(ki), - current_stride_a, current_stride_b, stride_c, 1, beta, 0) - (current_a, current_b, current_c); - } - } - - workers_done.Notify(); - } - - void run() { - // Parallelization strategy. - // - // First pack A into blocks (sharding by m, k) and B (sharding by n,k), - // then shard by m, n. - // - // Do not use advanced ThreadPool queuing, just run a single long-standing - // function in each thread. - if (copyA) { - blocksA = static_cast(device.allocate( - (blocks_m*bm)*(blocks_k*bk)*sizeof(LhsScalar))); - } - if (copyB) { - blocksB = static_cast(device.allocate( - (blocks_n*bn)*(blocks_k*bk)*sizeof(RhsScalar))); - } - - for (Index i = 0; i < num_threads; ++i) { - device.enqueueNoNotification([=]() { worker(); }); - } - - workers_done.Wait(); - - if (copyA) { - device.deallocate(blocksA); - } - if (copyB) { - device.deallocate(blocksB); - } - } - - private: - // real block size for block index in [0, ..., blocks - 1]. - Index actual_bm(Index mi) const { - return mi != blocks_m - 1 ? bm : m + bm - bm * blocks_m; - } - Index actual_bk(Index ki) const { - return ki != blocks_k - 1 ? bk : k + bk - bk * blocks_k; - } - Index actual_bn(Index ni) const { - return ni != blocks_n - 1 ? bn : n + bn - bn * blocks_n; - } - - const Device& device; - Index m, k, n; - Index stride_a, stride_b, stride_c; - Index bm, bk, bn; // Block sizes. - Index blocks_m, blocks_k, blocks_n; // Number of blocks in each dimension. - bool copyA, copyB, transposeA, transposeB; - Index num_threads; - Scalar *buffer; - const LhsScalar *leftData; - const RhsScalar *rightData; - - LhsScalar *blocksA; - RhsScalar *blocksB; - // barrier for joining all threads after all done. - Barrier workers_done; - // "queues" of (mi,ki), (ki,ni), (mi,ni) jobs packed [0,p)x[0,q) -> [0, p*q) - std::atomic packingA_jobs; - std::atomic packingB_jobs; - std::atomic compute_jobs; - // already packed blocks for each mi-panel in A and ni-panel in B. 
- std::vector> packingA_done; - std::vector> packingB_done; - }; -#endif - }; } // end namespace Eigen diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index e6c757275..42a450a85 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -12,17 +12,6 @@ include_directories(../../test ../../unsupported ../../Eigen find_package (Threads) -find_package(Xsmm) -if(XSMM_FOUND) - add_definitions("-DEIGEN_USE_LIBXSMM") - include_directories(${XSMM_INCLUDES}) - link_directories(${XSMM_LIBRARIES}) - set(EXTERNAL_LIBS ${EXTERNAL_LIBS} xsmm) - ei_add_property(EIGEN_TESTED_BACKENDS "Xsmm, ") -else(XSMM_FOUND) - ei_add_property(EIGEN_MISSING_BACKENDS "Xsmm, ") -endif(XSMM_FOUND) - find_package(GoogleHash) if(GOOGLEHASH_FOUND) add_definitions("-DEIGEN_GOOGLEHASH_SUPPORT") diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp index 75f2e1edf..2fd128121 100644 --- a/unsupported/test/cxx11_tensor_contraction.cpp +++ b/unsupported/test/cxx11_tensor_contraction.cpp @@ -511,8 +511,6 @@ static void test_const_inputs() VERIFY_IS_APPROX(mat3(1,1), mat1(1,0)*mat2(0,1) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(2,1)); } -#if !defined(EIGEN_USE_LIBXSMM) - // Apply Sqrt to all output elements. struct SqrtOutputKernel { template @@ -562,9 +560,6 @@ static void test_large_contraction_with_output_kernel() { } } -#endif // !defined(EIGEN_USE_LIBXSMM) - - EIGEN_DECLARE_TEST(cxx11_tensor_contraction) { CALL_SUBTEST(test_evals()); @@ -597,8 +592,6 @@ EIGEN_DECLARE_TEST(cxx11_tensor_contraction) CALL_SUBTEST(test_tensor_product()); CALL_SUBTEST(test_const_inputs()); CALL_SUBTEST(test_const_inputs()); -#if !defined(EIGEN_USE_LIBXSMM) CALL_SUBTEST(test_large_contraction_with_output_kernel()); CALL_SUBTEST(test_large_contraction_with_output_kernel()); -#endif } From 6901788013b0148d62118b73ea5eca9c7140f0d7 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Thu, 22 Aug 2019 10:50:51 -0700 Subject: [PATCH 08/30] Asynchronous parallelFor in Eigen ThreadPoolDevice --- .../CXX11/src/Tensor/TensorDeviceThreadPool.h | 197 +++++++++++++----- 1 file changed, 144 insertions(+), 53 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index ef22a268a..ca2794cb5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -75,9 +75,9 @@ struct ThreadPoolDevice { EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { deallocate(buffer); } - + template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { return data; } @@ -181,44 +181,173 @@ struct ThreadPoolDevice { return pool_->CurrentThreadId(); } - // parallelFor executes f with [0, n) arguments in parallel and waits for - // completion. F accepts a half-open interval [first, last). - // Block size is chosen based on the iteration cost and resulting parallel + // WARNING: This function is synchronous and will block the calling thread. + // + // Synchronous parallelFor executes f with [0, n) arguments in parallel and + // waits for completion. F accepts a half-open interval [first, last). Block + // size is chosen based on the iteration cost and resulting parallel // efficiency. If block_align is not nullptr, it is called to round up the // block size. 
void parallelFor(Index n, const TensorOpCost& cost, std::function block_align, std::function f) const { - typedef TensorCostModel CostModel; + // Compute small problems directly in the caller thread. if (n <= 1 || numThreads() == 1 || CostModel::numThreads(n, cost, static_cast(numThreads())) == 1) { f(0, n); return; } - // Calculate block size based on (1) the iteration cost and (2) parallel - // efficiency. We want blocks to be not too small to mitigate - // parallelization overheads; not too large to mitigate tail - // effect and potential load imbalance and we also want number - // of blocks to be evenly dividable across threads. + // Compute block size and total count of blocks. + ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align); - double block_size_f = 1.0 / CostModel::taskSize(1, cost); + // Recursively divide size into halves until we reach block_size. + // Division code rounds mid to block_size, so we are guaranteed to get + // block_count leaves that do actual computations. + Barrier barrier(static_cast(block.count)); + std::function handleRange; + handleRange = [=, &handleRange, &barrier, &f](Index firstIdx, + Index lastIdx) { + while (lastIdx - firstIdx > block.size) { + // Split into halves and schedule the second half on a different thread. + const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size; + pool_->Schedule([=, &handleRange]() { handleRange(midIdx, lastIdx); }); + lastIdx = midIdx; + } + // Single block or less, execute directly. + f(firstIdx, lastIdx); + barrier.Notify(); + }; + + if (block.count <= numThreads()) { + // Avoid a thread hop by running the root of the tree and one block on the + // main thread. + handleRange(0, n); + } else { + // Execute the root in the thread pool to avoid running work on more than + // numThreads() threads. + pool_->Schedule([=, &handleRange]() { handleRange(0, n); }); + } + + barrier.Wait(); + } + + // Convenience wrapper for parallelFor that does not align blocks. + void parallelFor(Index n, const TensorOpCost& cost, + std::function f) const { + parallelFor(n, cost, NULL, std::move(f)); + } + + // WARNING: This function is asynchronous and will not block the calling thread. + // + // Asynchronous parallelFor executes f with [0, n) arguments in parallel + // without waiting for completion. When the last block finished, it will call + // 'done' callback. F accepts a half-open interval [first, last). Block size + // is chosen based on the iteration cost and resulting parallel efficiency. If + // block_align is not nullptr, it is called to round up the block size. + void parallelForAsync(Index n, const TensorOpCost& cost, + std::function block_align, + std::function f, + std::function done) const { + // Compute block size and total count of blocks. + ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align); + + ParallelForAsyncContext* const ctx = + new ParallelForAsyncContext(block.count, std::move(f), std::move(done)); + + // Recursively divide size into halves until we reach block_size. + // Division code rounds mid to block_size, so we are guaranteed to get + // block_count leaves that do actual computations. + ctx->handle_range = [this, ctx, block](Index firstIdx, Index lastIdx) { + while (lastIdx - firstIdx > block.size) { + // Split into halves and schedule the second half on a different thread. 
+ const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size; + pool_->Schedule( + [ctx, midIdx, lastIdx]() { ctx->handle_range(midIdx, lastIdx); }); + lastIdx = midIdx; + } + + // Single block or less, execute directly. + ctx->f(firstIdx, lastIdx); + + // Call 'done' callback if it was the last block. + if (ctx->count.fetch_sub(1) == 1) { + (ctx->done)(); + // We can't delete ctx right now, because it will deallocate the closure + // we are currently in. + pool_->Schedule([ctx]() { delete ctx; }); + } + }; + + // Execute the root in the thread pool. + pool_->Schedule([ctx, n]() { ctx->handle_range(0, n); }); + } + + // Convenience wrapper for parallelForAsync that does not align blocks. + void parallelForAsync(Index n, const TensorOpCost& cost, + std::function f, + std::function done) const { + parallelForAsync(n, cost, NULL, std::move(f), std::move(done)); + } + + // Thread pool accessor. + ThreadPoolInterface* getPool() const { return pool_; } + + // Allocator accessor. + Allocator* allocator() const { return allocator_; } + + private: + typedef TensorCostModel CostModel; + + // For parallelForAsync we must keep passed in closures on the heap, and + // delete them only after `done` callback finished. + struct ParallelForAsyncContext { + ParallelForAsyncContext(Index count, std::function f, + std::function done) + : count(count), f(std::move(f)), done(std::move(done)) {} + + std::atomic count; + std::function f; + std::function done; + + std::function handle_range; + }; + + struct ParallelForBlock { + Index size; // block size + Index count; // number of blocks + }; + + // Calculates block size based on (1) the iteration cost and (2) parallel + // efficiency. We want blocks to be not too small to mitigate parallelization + // overheads; not too large to mitigate tail effect and potential load + // imbalance and we also want number of blocks to be evenly dividable across + // threads. + ParallelForBlock CalculateParallelForBlock( + const Index n, const TensorOpCost& cost, + std::function block_align) const { + const double block_size_f = 1.0 / CostModel::taskSize(1, cost); const Index max_oversharding_factor = 4; Index block_size = numext::mini( - n, numext::maxi(divup(n, max_oversharding_factor * numThreads()), - block_size_f)); + n, numext::maxi( + divup(n, max_oversharding_factor * numThreads()), + block_size_f)); const Index max_block_size = numext::mini(n, 2 * block_size); + if (block_align) { Index new_block_size = block_align(block_size); eigen_assert(new_block_size >= block_size); block_size = numext::mini(n, new_block_size); } + Index block_count = divup(n, block_size); + // Calculate parallel efficiency as fraction of total CPU time used for // computations: double max_efficiency = static_cast(block_count) / (divup(block_count, numThreads()) * numThreads()); + // Now try to increase block size up to max_block_size as long as it // doesn't decrease parallel efficiency. for (Index prev_block_count = block_count; @@ -251,47 +380,9 @@ struct ThreadPoolDevice { } } - // Recursively divide size into halves until we reach block_size. - // Division code rounds mid to block_size, so we are guaranteed to get - // block_count leaves that do actual computations. - Barrier barrier(static_cast(block_count)); - std::function handleRange; - handleRange = [=, &handleRange, &barrier, &f](Index firstIdx, Index lastIdx) { - while (lastIdx - firstIdx > block_size) { - // Split into halves and schedule the second half on a different thread. 
- const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block_size) * block_size; - pool_->Schedule([=, &handleRange]() { handleRange(midIdx, lastIdx); }); - lastIdx = midIdx; - } - // Single block or less, execute directly. - f(firstIdx, lastIdx); - barrier.Notify(); - }; - if (block_count <= numThreads()) { - // Avoid a thread hop by running the root of the tree and one block on the - // main thread. - handleRange(0, n); - } else { - // Execute the root in the thread pool to avoid running work on more than - // numThreads() threads. - pool_->Schedule([=, &handleRange]() { handleRange(0, n); }); - } - barrier.Wait(); + return {block_size, block_count}; } - // Convenience wrapper for parallelFor that does not align blocks. - void parallelFor(Index n, const TensorOpCost& cost, - std::function f) const { - parallelFor(n, cost, NULL, std::move(f)); - } - - // Thread pool accessor. - ThreadPoolInterface* getPool() const { return pool_; } - - // Allocator accessor. - Allocator* allocator() const { return allocator_; } - - private: ThreadPoolInterface* pool_; int num_threads_; Allocator* allocator_; From 8b5ab0e4dd70f449db52503f89cbb3767ccec38c Mon Sep 17 00:00:00 2001 From: maratek Date: Fri, 23 Aug 2019 15:25:56 -0700 Subject: [PATCH 09/30] Fix get_random_seed on Native Client Newlib in Native Client SDK does not provide ::random function. Implement get_random_seed for NaCl using ::rand, similarly to Windows version. --- unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h index 2be4f9cc5..445248163 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h @@ -45,6 +45,14 @@ EIGEN_DEVICE_FUNC uint64_t get_random_seed() { uint64_t rnd = ::random() ^ mach_absolute_time(); return rnd; +#elif defined __native_client__ + // Same approach as for win32, except using clock_gettime + timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + int rnd1 = ::rand(); + int rnd2 = ::rand(); + uint64_t rnd = (rnd1 | rnd2 << 16) ^ ts.tv_nsec; + return rnd; #else // Augment the current time with pseudo random number generation From b021cdea6dd84b0f51dd7aea691d47dd3eab8e9d Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Tue, 27 Aug 2019 11:30:31 -0700 Subject: [PATCH 10/30] Clean up float16 a.k.a. Eigen::half support in Eigen. Move the definition of half to Core/arch/Default and move arch-specific packet ops to their respective sub-directories. 
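One detail worth noting in the AVX hunks below: when the compiler exposes F16C (EIGEN_HAS_FP16_C), eight halves are converted to or from floats with a single instruction pair, otherwise the code falls back to lane-by-lane scalar conversion. A minimal standalone sketch of the fast path (requires -mf16c; helper names are illustrative):

    #include <immintrin.h>

    // 8 x fp16 packed in a __m128i -> 8 x fp32
    static inline __m256 half8_to_float8(__m128i h) {
      return _mm256_cvtph_ps(h);
    }

    // 8 x fp32 -> 8 x fp16, rounding to nearest even
    static inline __m128i float8_to_half8(__m256 f) {
      return _mm256_cvtps_ph(f, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    }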
--- Eigen/Core | 9 +- Eigen/src/Core/arch/AVX/PacketMath.h | 362 ++++ Eigen/src/Core/arch/AVX/TypeCasting.h | 27 + Eigen/src/Core/arch/AVX512/PacketMath.h | 507 +++++- Eigen/src/Core/arch/AVX512/TypeCasting.h | 25 + Eigen/src/Core/arch/{GPU => Default}/Half.h | 18 +- Eigen/src/Core/arch/Default/TypeCasting.h | 77 + Eigen/src/Core/arch/GPU/PacketMath.h | 540 ++++++ Eigen/src/Core/arch/GPU/PacketMathHalf.h | 1630 ------------------- Eigen/src/Core/arch/GPU/TypeCasting.h | 161 -- Eigen/src/Core/arch/SSE/PacketMath.h | 208 +++ Eigen/src/Core/arch/SSE/TypeCasting.h | 51 + test/half_float.cpp | 2 +- 13 files changed, 1805 insertions(+), 1812 deletions(-) create mode 100644 Eigen/src/Core/arch/AVX512/TypeCasting.h rename Eigen/src/Core/arch/{GPU => Default}/Half.h (98%) create mode 100644 Eigen/src/Core/arch/Default/TypeCasting.h delete mode 100644 Eigen/src/Core/arch/GPU/PacketMathHalf.h diff --git a/Eigen/Core b/Eigen/Core index af741a241..e6e31caee 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -160,6 +160,9 @@ using std::ptrdiff_t; #include "src/Core/GenericPacketMath.h" #include "src/Core/MathFunctionsImpl.h" #include "src/Core/arch/Default/ConjHelper.h" +// Generic half float support +#include "src/Core/arch/Default/Half.h" +#include "src/Core/arch/Default/TypeCasting.h" #if defined EIGEN_VECTORIZE_AVX512 #include "src/Core/arch/SSE/PacketMath.h" @@ -207,14 +210,10 @@ using std::ptrdiff_t; #include "src/Core/arch/MSA/Complex.h" #endif -// Half float support -#include "src/Core/arch/GPU/Half.h" -#include "src/Core/arch/GPU/PacketMathHalf.h" -#include "src/Core/arch/GPU/TypeCasting.h" - #if defined EIGEN_VECTORIZE_GPU #include "src/Core/arch/GPU/PacketMath.h" #include "src/Core/arch/GPU/MathFunctions.h" + #include "src/Core/arch/GPU/TypeCasting.h" #endif #if defined(EIGEN_USE_SYCL) diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 5233195f3..020f6c276 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -31,10 +31,14 @@ namespace internal { typedef __m256 Packet8f; typedef __m256i Packet8i; typedef __m256d Packet4d; +typedef struct { + __m128i x; +} Packet8h; template<> struct is_arithmetic<__m256> { enum { value = true }; }; template<> struct is_arithmetic<__m256i> { enum { value = true }; }; template<> struct is_arithmetic<__m256d> { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; #define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \ const Packet8f p8f_##NAME = pset1(X) @@ -97,6 +101,35 @@ template<> struct packet_traits : default_packet_traits HasCeil = 1 }; }; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet8h type; + // There is no half-size packet for Packet8h. 
+ typedef Packet8h half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + HasHalfPacket = 0, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0, + HasSqrt = 0, + HasRsqrt = 0, + HasExp = 0, + HasLog = 0, + HasBlend = 0 + }; +}; #endif template<> struct scalar_div_cost { enum { value = 14 }; }; @@ -847,6 +880,335 @@ template<> EIGEN_STRONG_INLINE Packet4d pinsertlast(const Packet4d& a, double b) return _mm256_blend_pd(a,pset1(b),(1<<3)); } + +// Packet math for Eigen::half +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet8h half; }; + +template<> EIGEN_STRONG_INLINE Packet8h pset1(const Eigen::half& from) { + Packet8h result; + result.x = _mm_set1_epi16(from.x); + return result; +} + +template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet8h& from) { + return half_impl::raw_uint16_to_half(static_cast(_mm_extract_epi16(from.x, 0))); +} + +template<> EIGEN_STRONG_INLINE Packet8h pload(const Eigen::half* from) { + Packet8h result; + result.x = _mm_load_si128(reinterpret_cast(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet8h ploadu(const Eigen::half* from) { + Packet8h result; + result.x = _mm_loadu_si128(reinterpret_cast(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet8h& from) { + _mm_store_si128(reinterpret_cast<__m128i*>(to), from.x); +} + +template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet8h& from) { + _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from.x); +} + +template<> EIGEN_STRONG_INLINE Packet8h +ploaddup(const Eigen::half* from) { + Packet8h result; + unsigned short a = from[0].x; + unsigned short b = from[1].x; + unsigned short c = from[2].x; + unsigned short d = from[3].x; + result.x = _mm_set_epi16(d, d, c, c, b, b, a, a); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet8h +ploadquad(const Eigen::half* from) { + Packet8h result; + unsigned short a = from[0].x; + unsigned short b = from[1].x; + result.x = _mm_set_epi16(b, b, b, b, a, a, a, a); + return result; +} + +EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) { +#ifdef EIGEN_HAS_FP16_C + return _mm256_cvtph_ps(a.x); +#else + EIGEN_ALIGN32 Eigen::half aux[8]; + pstore(aux, a); + float f0(aux[0]); + float f1(aux[1]); + float f2(aux[2]); + float f3(aux[3]); + float f4(aux[4]); + float f5(aux[5]); + float f6(aux[6]); + float f7(aux[7]); + + return _mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0); +#endif +} + +EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { +#ifdef EIGEN_HAS_FP16_C + Packet8h result; + result.x = _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); + return result; +#else + EIGEN_ALIGN32 float aux[8]; + pstore(aux, a); + Eigen::half h0(aux[0]); + Eigen::half h1(aux[1]); + Eigen::half h2(aux[2]); + Eigen::half h3(aux[3]); + Eigen::half h4(aux[4]); + Eigen::half h5(aux[5]); + Eigen::half h6(aux[6]); + Eigen::half h7(aux[7]); + + Packet8h result; + result.x = _mm_set_epi16(h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); + return result; +#endif +} + +template<> EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h& a) { + Packet8h r; r.x = _mm_cmpeq_epi32(a.x, a.x); return r; +} + +template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) { + // in some cases Packet4i is a 
wrapper around __m128i, so we either need to + // cast to Packet4i to directly call the intrinsics as below: + Packet8h r; r.x = _mm_or_si128(a.x,b.x); return r; +} +template<> EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a,const Packet8h& b) { + Packet8h r; r.x = _mm_xor_si128(a.x,b.x); return r; +} +template<> EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a,const Packet8h& b) { + Packet8h r; r.x = _mm_and_si128(a.x,b.x); return r; +} +template<> EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a,const Packet8h& b) { + Packet8h r; r.x = _mm_andnot_si128(b.x,a.x); return r; +} + +template<> EIGEN_STRONG_INLINE Packet8h pselect(const Packet8h& mask, const Packet8h& a, const Packet8h& b) { + Packet8h r; r.x = _mm_blendv_epi8(b.x, a.x, mask.x); return r; +} + +template<> EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a,const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = pcmp_eq(af, bf); + // Pack the 32-bit flags into 16-bits flags. + Packet8h result; result.x = _mm_packs_epi32(_mm256_extractf128_si256(_mm256_castps_si256(rf), 0), + _mm256_extractf128_si256(_mm256_castps_si256(rf), 1)); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) { + Packet8h sign_mask; sign_mask.x = _mm_set1_epi16(static_cast(0x8000)); + Packet8h result; result.x = _mm_xor_si128(a.x, sign_mask.x); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet8h padd(const Packet8h& a, const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = padd(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet8h psub(const Packet8h& a, const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = psub(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet8h pmul(const Packet8h& a, const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = pmul(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet8h pdiv(const Packet8h& a, const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = pdiv(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet8h pgather(const Eigen::half* from, Index stride) +{ + Packet8h result; + result.x = _mm_set_epi16(from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); + return result; +} + +template<> EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const Packet8h& from, Index stride) +{ + EIGEN_ALIGN32 Eigen::half aux[8]; + pstore(aux, from); + to[stride*0].x = aux[0].x; + to[stride*1].x = aux[1].x; + to[stride*2].x = aux[2].x; + to[stride*3].x = aux[3].x; + to[stride*4].x = aux[4].x; + to[stride*5].x = aux[5].x; + to[stride*6].x = aux[6].x; + to[stride*7].x = aux[7].x; +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux(af); + return Eigen::half(reduced); +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux_max(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux_max(af); + return Eigen::half(reduced); +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux_min(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux_min(af); + return Eigen::half(reduced); +} + 
+template<> EIGEN_STRONG_INLINE Eigen::half predux_mul(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux_mul(af); + return Eigen::half(reduced); +} + +template<> EIGEN_STRONG_INLINE Packet8h preduxp(const Packet8h* p) { + Packet8f pf[8]; + pf[0] = half2float(p[0]); + pf[1] = half2float(p[1]); + pf[2] = half2float(p[2]); + pf[3] = half2float(p[3]); + pf[4] = half2float(p[4]); + pf[5] = half2float(p[5]); + pf[6] = half2float(p[6]); + pf[7] = half2float(p[7]); + Packet8f reduced = preduxp(pf); + return float2half(reduced); +} + +template<> EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a) +{ + __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1); + Packet8h res; + res.x = _mm_shuffle_epi8(a.x,m); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet8h pinsertfirst(const Packet8h& a, Eigen::half b) +{ + Packet8h res; + res.x = _mm_insert_epi16(a.x,int(b.x),0); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet8h pinsertlast(const Packet8h& a, Eigen::half b) +{ + Packet8h res; + res.x = _mm_insert_epi16(a.x,int(b.x),7); + return res; +} + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet8h& first, const Packet8h& second) + { + if (Offset!=0) + first.x = _mm_alignr_epi8(second.x,first.x, Offset*2); + } +}; + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __m128i a = kernel.packet[0].x; + __m128i b = kernel.packet[1].x; + __m128i c = kernel.packet[2].x; + __m128i d = kernel.packet[3].x; + __m128i e = kernel.packet[4].x; + __m128i f = kernel.packet[5].x; + __m128i g = kernel.packet[6].x; + __m128i h = kernel.packet[7].x; + + __m128i a03b03 = _mm_unpacklo_epi16(a, b); + __m128i c03d03 = _mm_unpacklo_epi16(c, d); + __m128i e03f03 = _mm_unpacklo_epi16(e, f); + __m128i g03h03 = _mm_unpacklo_epi16(g, h); + __m128i a47b47 = _mm_unpackhi_epi16(a, b); + __m128i c47d47 = _mm_unpackhi_epi16(c, d); + __m128i e47f47 = _mm_unpackhi_epi16(e, f); + __m128i g47h47 = _mm_unpackhi_epi16(g, h); + + __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03); + __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03); + __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03); + __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03); + __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47); + __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47); + __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47); + __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47); + + __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01); + __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01); + __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23); + __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23); + __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45); + __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45); + __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67); + __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67); + + kernel.packet[0].x = a0b0c0d0e0f0g0h0; + kernel.packet[1].x = a1b1c1d1e1f1g1h1; + kernel.packet[2].x = a2b2c2d2e2f2g2h2; + kernel.packet[3].x = a3b3c3d3e3f3g3h3; + kernel.packet[4].x = a4b4c4d4e4f4g4h4; + kernel.packet[5].x = a5b5c5d5e5f5g5h5; + kernel.packet[6].x = a6b6c6d6e6f6g6h6; + kernel.packet[7].x = a7b7c7d7e7f7g7h7; +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + EIGEN_ALIGN32 Eigen::half in[4][8]; + 
pstore(in[0], kernel.packet[0]); + pstore(in[1], kernel.packet[1]); + pstore(in[2], kernel.packet[2]); + pstore(in[3], kernel.packet[3]); + + EIGEN_ALIGN32 Eigen::half out[4][8]; + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + out[i][j] = in[j][2*i]; + } + for (int j = 0; j < 4; ++j) { + out[i][j+4] = in[j][2*i+1]; + } + } + + kernel.packet[0] = pload(out[0]); + kernel.packet[1] = pload(out[1]); + kernel.packet[2] = pload(out[2]); + kernel.packet[3] = pload(out[3]); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/AVX/TypeCasting.h b/Eigen/src/Core/arch/AVX/TypeCasting.h index 7d2e1e67f..910fc06ca 100644 --- a/Eigen/src/Core/arch/AVX/TypeCasting.h +++ b/Eigen/src/Core/arch/AVX/TypeCasting.h @@ -52,6 +52,33 @@ template<> EIGEN_STRONG_INLINE Packet8f preinterpret(const Pa return _mm256_castsi256_ps(a); } + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet8f pcast(const Packet8h& a) { + return half2float(a); +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet8h pcast(const Packet8f& a) { + return float2half(a); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 0a81fe02d..e37855693 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -44,6 +44,41 @@ template <> struct is_arithmetic<__m512d> { enum { value = true }; }; +typedef struct { + __m256i x; +} Packet16h; + + +template<> struct is_arithmetic { enum { value = true }; }; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet16h type; + // There is no half-size packet for Packet16h. 
+ typedef Packet16h half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 16, + HasHalfPacket = 0, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0, + HasSqrt = 0, + HasRsqrt = 0, + HasExp = 0, + HasLog = 0, + HasBlend = 0 + }; +}; template<> struct packet_traits : default_packet_traits { @@ -60,8 +95,8 @@ template<> struct packet_traits : default_packet_traits #if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) #ifdef EIGEN_VECTORIZE_AVX512DQ HasLog = 1, - HasLog1p = 1, - HasExpm1 = 1, + HasLog1p = 1, + HasExpm1 = 1, #endif HasExp = 1, HasSqrt = EIGEN_FAST_MATH, @@ -120,6 +155,13 @@ struct unpacket_traits { enum { size = 16, alignment=Aligned64, vectorizable=false, masked_load_available=false, masked_store_available=false }; }; +template<> +struct unpacket_traits { + typedef Eigen::half type; + typedef Packet16h half; + enum {size=16, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false}; +}; + template <> EIGEN_STRONG_INLINE Packet16f pset1(const float& from) { return _mm512_set1_ps(from); @@ -1397,6 +1439,467 @@ template<> EIGEN_STRONG_INLINE Packet16f preinterpret(const return _mm512_castsi512_ps(a); } + +// Packet math for Eigen::half +template<> EIGEN_STRONG_INLINE Packet16h pset1(const Eigen::half& from) { + Packet16h result; + result.x = _mm256_set1_epi16(from.x); + return result; +} + +template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet16h& from) { + return half_impl::raw_uint16_to_half(static_cast(_mm256_extract_epi16(from.x, 0))); +} + +template<> EIGEN_STRONG_INLINE Packet16h pload(const Eigen::half* from) { + Packet16h result; + result.x = _mm256_load_si256(reinterpret_cast(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet16h ploadu(const Eigen::half* from) { + Packet16h result; + result.x = _mm256_loadu_si256(reinterpret_cast(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet16h& from) { + // (void*) -> workaround clang warning: + // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32 + _mm256_store_si256((__m256i*)(void*)to, from.x); +} + +template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet16h& from) { + // (void*) -> workaround clang warning: + // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32 + _mm256_storeu_si256((__m256i*)(void*)to, from.x); +} + +template<> EIGEN_STRONG_INLINE Packet16h +ploaddup(const Eigen::half* from) { + Packet16h result; + unsigned short a = from[0].x; + unsigned short b = from[1].x; + unsigned short c = from[2].x; + unsigned short d = from[3].x; + unsigned short e = from[4].x; + unsigned short f = from[5].x; + unsigned short g = from[6].x; + unsigned short h = from[7].x; + result.x = _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet16h +ploadquad(const Eigen::half* from) { + Packet16h result; + unsigned short a = from[0].x; + unsigned short b = from[1].x; + unsigned short c = from[2].x; + unsigned short d = from[3].x; + result.x = _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a); + return result; +} + +EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) { +#ifdef EIGEN_HAS_FP16_C + return _mm512_cvtph_ps(a.x); +#else + EIGEN_ALIGN64 half aux[16]; + pstore(aux, a); + float f0(aux[0]); + float 
f1(aux[1]); + float f2(aux[2]); + float f3(aux[3]); + float f4(aux[4]); + float f5(aux[5]); + float f6(aux[6]); + float f7(aux[7]); + float f8(aux[8]); + float f9(aux[9]); + float fa(aux[10]); + float fb(aux[11]); + float fc(aux[12]); + float fd(aux[13]); + float fe(aux[14]); + float ff(aux[15]); + + return _mm512_set_ps( + ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0); +#endif +} + +EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { +#ifdef EIGEN_HAS_FP16_C + Packet16h result; + result.x = _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); + return result; +#else + EIGEN_ALIGN64 float aux[16]; + pstore(aux, a); + half h0(aux[0]); + half h1(aux[1]); + half h2(aux[2]); + half h3(aux[3]); + half h4(aux[4]); + half h5(aux[5]); + half h6(aux[6]); + half h7(aux[7]); + half h8(aux[8]); + half h9(aux[9]); + half ha(aux[10]); + half hb(aux[11]); + half hc(aux[12]); + half hd(aux[13]); + half he(aux[14]); + half hf(aux[15]); + + Packet16h result; + result.x = _mm256_set_epi16( + hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x, + h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); + return result; +#endif +} + +template<> EIGEN_STRONG_INLINE Packet16h pnot(const Packet16h& a) { + Packet16h r; r.x = _mm256_xor_si256(a.x, pcmp_eq(a.x, a.x)); return r; +} + +template<> EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) { + Packet16h r; r.x = Packet8i(ptrue(a.x)); return r; +} + +template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) { + // in some cases Packet8i is a wrapper around __m256i, so we need to + // cast to Packet8i to call the correct overload. + Packet16h r; r.x = por(Packet8i(a.x),Packet8i(b.x)); return r; +} +template<> EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a,const Packet16h& b) { + Packet16h r; r.x = pxor(Packet8i(a.x),Packet8i(b.x)); return r; +} +template<> EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a,const Packet16h& b) { + Packet16h r; r.x = pand(Packet8i(a.x),Packet8i(b.x)); return r; +} +template<> EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a,const Packet16h& b) { + Packet16h r; r.x = pandnot(Packet8i(a.x),Packet8i(b.x)); return r; +} + +template<> EIGEN_STRONG_INLINE Packet16h pselect(const Packet16h& mask, const Packet16h& a, const Packet16h& b) { + Packet16h r; r.x = _mm256_blendv_epi8(b.x, a.x, mask.x); return r; +} + +template<> EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a,const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + Packet16f rf = pcmp_eq(af, bf); + // Pack the 32-bit flags into 16-bits flags. 
+ __m256i lo = _mm256_castps_si256(extract256<0>(rf)); + __m256i hi = _mm256_castps_si256(extract256<1>(rf)); + __m128i result_lo = _mm_packs_epi32(_mm256_extractf128_si256(lo, 0), + _mm256_extractf128_si256(lo, 1)); + __m128i result_hi = _mm_packs_epi32(_mm256_extractf128_si256(hi, 0), + _mm256_extractf128_si256(hi, 1)); + Packet16h result; result.x = _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) { + Packet16h sign_mask; sign_mask.x = _mm256_set1_epi16(static_cast(0x8000)); + Packet16h result; result.x = _mm256_xor_si256(a.x, sign_mask.x); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet16h padd(const Packet16h& a, const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + Packet16f rf = padd(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet16h psub(const Packet16h& a, const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + Packet16f rf = psub(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet16h pmul(const Packet16h& a, const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + Packet16f rf = pmul(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet16h pdiv(const Packet16h& a, const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + Packet16f rf = pdiv(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE half predux(const Packet16h& from) { + Packet16f from_float = half2float(from); + return half(predux(from_float)); +} + +template<> EIGEN_STRONG_INLINE half predux_mul(const Packet16h& from) { + Packet16f from_float = half2float(from); + return half(predux_mul(from_float)); +} + +template<> EIGEN_STRONG_INLINE Packet16h preduxp(const Packet16h* p) { + Packet16f pf[16]; + pf[0] = half2float(p[0]); + pf[1] = half2float(p[1]); + pf[2] = half2float(p[2]); + pf[3] = half2float(p[3]); + pf[4] = half2float(p[4]); + pf[5] = half2float(p[5]); + pf[6] = half2float(p[6]); + pf[7] = half2float(p[7]); + pf[8] = half2float(p[8]); + pf[9] = half2float(p[9]); + pf[10] = half2float(p[10]); + pf[11] = half2float(p[11]); + pf[12] = half2float(p[12]); + pf[13] = half2float(p[13]); + pf[14] = half2float(p[14]); + pf[15] = half2float(p[15]); + Packet16f reduced = preduxp(pf); + return float2half(reduced); +} + +template<> EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a) +{ + __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1); + Packet16h res; + res.x = _mm256_insertf128_si256( + _mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(a.x,1),m)), + _mm_shuffle_epi8(_mm256_extractf128_si256(a.x,0),m), 1); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet16h pinsertfirst(const Packet16h& a, Eigen::half b) +{ + Packet16h res; + res.x = _mm256_insert_epi16(a.x,b.x,0); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet16h pinsertlast(const Packet16h& a, Eigen::half b) +{ + Packet16h res; + res.x = _mm256_insert_epi16(a.x,b.x,15); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet16h pgather(const Eigen::half* from, Index stride) +{ + Packet16h result; + result.x = _mm256_set_epi16( + from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x, + from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x, + from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, + 
from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); + return result; +} + +template<> EIGEN_STRONG_INLINE void pscatter(half* to, const Packet16h& from, Index stride) +{ + EIGEN_ALIGN64 half aux[16]; + pstore(aux, from); + to[stride*0].x = aux[0].x; + to[stride*1].x = aux[1].x; + to[stride*2].x = aux[2].x; + to[stride*3].x = aux[3].x; + to[stride*4].x = aux[4].x; + to[stride*5].x = aux[5].x; + to[stride*6].x = aux[6].x; + to[stride*7].x = aux[7].x; + to[stride*8].x = aux[8].x; + to[stride*9].x = aux[9].x; + to[stride*10].x = aux[10].x; + to[stride*11].x = aux[11].x; + to[stride*12].x = aux[12].x; + to[stride*13].x = aux[13].x; + to[stride*14].x = aux[14].x; + to[stride*15].x = aux[15].x; +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __m256i a = kernel.packet[0].x; + __m256i b = kernel.packet[1].x; + __m256i c = kernel.packet[2].x; + __m256i d = kernel.packet[3].x; + __m256i e = kernel.packet[4].x; + __m256i f = kernel.packet[5].x; + __m256i g = kernel.packet[6].x; + __m256i h = kernel.packet[7].x; + __m256i i = kernel.packet[8].x; + __m256i j = kernel.packet[9].x; + __m256i k = kernel.packet[10].x; + __m256i l = kernel.packet[11].x; + __m256i m = kernel.packet[12].x; + __m256i n = kernel.packet[13].x; + __m256i o = kernel.packet[14].x; + __m256i p = kernel.packet[15].x; + + __m256i ab_07 = _mm256_unpacklo_epi16(a, b); + __m256i cd_07 = _mm256_unpacklo_epi16(c, d); + __m256i ef_07 = _mm256_unpacklo_epi16(e, f); + __m256i gh_07 = _mm256_unpacklo_epi16(g, h); + __m256i ij_07 = _mm256_unpacklo_epi16(i, j); + __m256i kl_07 = _mm256_unpacklo_epi16(k, l); + __m256i mn_07 = _mm256_unpacklo_epi16(m, n); + __m256i op_07 = _mm256_unpacklo_epi16(o, p); + + __m256i ab_8f = _mm256_unpackhi_epi16(a, b); + __m256i cd_8f = _mm256_unpackhi_epi16(c, d); + __m256i ef_8f = _mm256_unpackhi_epi16(e, f); + __m256i gh_8f = _mm256_unpackhi_epi16(g, h); + __m256i ij_8f = _mm256_unpackhi_epi16(i, j); + __m256i kl_8f = _mm256_unpackhi_epi16(k, l); + __m256i mn_8f = _mm256_unpackhi_epi16(m, n); + __m256i op_8f = _mm256_unpackhi_epi16(o, p); + + __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07); + __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07); + __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07); + __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07); + __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07); + __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07); + __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07); + __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07); + + __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f); + __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f); + __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f); + __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f); + __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f); + __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f); + __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f); + __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f); + + __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03); + __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03); + __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03); + __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03); + __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47); + __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47); + __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47); + __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47); + __m256i abcdefgh_89 = 
_mm256_unpacklo_epi64(abcd_8b, efgh_8b); + __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b); + __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b); + __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b); + __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf); + __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf); + __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf); + __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf); + + // NOTE: no unpacklo/hi instr in this case, so using permute instr. + __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20); + __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20); + __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20); + __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20); + __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20); + __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20); + __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20); + __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20); + __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31); + __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31); + __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31); + __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31); + __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31); + __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31); + __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31); + __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31); + + kernel.packet[0].x = a_p_0; + kernel.packet[1].x = a_p_1; + kernel.packet[2].x = a_p_2; + kernel.packet[3].x = a_p_3; + kernel.packet[4].x = a_p_4; + kernel.packet[5].x = a_p_5; + kernel.packet[6].x = a_p_6; + kernel.packet[7].x = a_p_7; + kernel.packet[8].x = a_p_8; + kernel.packet[9].x = a_p_9; + kernel.packet[10].x = a_p_a; + kernel.packet[11].x = a_p_b; + kernel.packet[12].x = a_p_c; + kernel.packet[13].x = a_p_d; + kernel.packet[14].x = a_p_e; + kernel.packet[15].x = a_p_f; +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + EIGEN_ALIGN64 half in[8][16]; + pstore(in[0], kernel.packet[0]); + pstore(in[1], kernel.packet[1]); + pstore(in[2], kernel.packet[2]); + pstore(in[3], kernel.packet[3]); + pstore(in[4], kernel.packet[4]); + pstore(in[5], kernel.packet[5]); + pstore(in[6], kernel.packet[6]); + pstore(in[7], kernel.packet[7]); + + EIGEN_ALIGN64 half out[8][16]; + + for (int i = 0; i < 8; ++i) { + for (int j = 0; j < 8; ++j) { + out[i][j] = in[j][2*i]; + } + for (int j = 0; j < 8; ++j) { + out[i][j+8] = in[j][2*i+1]; + } + } + + kernel.packet[0] = pload(out[0]); + kernel.packet[1] = pload(out[1]); + kernel.packet[2] = pload(out[2]); + kernel.packet[3] = pload(out[3]); + kernel.packet[4] = pload(out[4]); + kernel.packet[5] = pload(out[5]); + kernel.packet[6] = pload(out[6]); + kernel.packet[7] = pload(out[7]); +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + EIGEN_ALIGN64 half in[4][16]; + pstore(in[0], kernel.packet[0]); + pstore(in[1], kernel.packet[1]); + pstore(in[2], kernel.packet[2]); + pstore(in[3], kernel.packet[3]); + + EIGEN_ALIGN64 half out[4][16]; + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + out[i][j] = in[j][4*i]; + } + for (int j = 0; j < 4; 
++j) { + out[i][j+4] = in[j][4*i+1]; + } + for (int j = 0; j < 4; ++j) { + out[i][j+8] = in[j][4*i+2]; + } + for (int j = 0; j < 4; ++j) { + out[i][j+12] = in[j][4*i+3]; + } + } + + kernel.packet[0] = pload(out[0]); + kernel.packet[1] = pload(out[1]); + kernel.packet[2] = pload(out[2]); + kernel.packet[3] = pload(out[3]); +} + + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/AVX512/TypeCasting.h b/Eigen/src/Core/arch/AVX512/TypeCasting.h new file mode 100644 index 000000000..777a26ae4 --- /dev/null +++ b/Eigen/src/Core/arch/AVX512/TypeCasting.h @@ -0,0 +1,25 @@ +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet16f pcast(const Packet16h& a) { + return half2float(a); +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet16h pcast(const Packet16f& a) { + return float2half(a); +} diff --git a/Eigen/src/Core/arch/GPU/Half.h b/Eigen/src/Core/arch/Default/Half.h similarity index 98% rename from Eigen/src/Core/arch/GPU/Half.h rename to Eigen/src/Core/arch/Default/Half.h index 655dc20d5..56782b340 100644 --- a/Eigen/src/Core/arch/GPU/Half.h +++ b/Eigen/src/Core/arch/Default/Half.h @@ -33,8 +33,8 @@ // to disk and the likes), but fast on GPUs. -#ifndef EIGEN_HALF_GPU_H -#define EIGEN_HALF_GPU_H +#ifndef EIGEN_HALF_H +#define EIGEN_HALF_H #if __cplusplus > 199711L #define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type() @@ -76,7 +76,6 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h); struct half_base : public __half_raw { EIGEN_DEVICE_FUNC half_base() {} - EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {} EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {} #if defined(EIGEN_HAS_GPU_FP16) @@ -114,8 +113,7 @@ struct half : public half_impl::half_base { EIGEN_DEVICE_FUNC half() {} EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {} - EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {} - + #if defined(EIGEN_HAS_GPU_FP16) #if defined(EIGEN_HAS_HIP_FP16) EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} @@ -125,7 +123,7 @@ struct half : public half_impl::half_base { #endif #endif #endif - + explicit EIGEN_DEVICE_FUNC half(bool b) : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {} @@ -175,12 +173,6 @@ struct half : public half_impl::half_base { EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const { return static_cast(half_impl::half_to_float(*this)); } - - EIGEN_DEVICE_FUNC half& operator=(const half& other) { - x = other.x; - return *this; - } - }; } // end namespace Eigen @@ -761,4 +753,4 @@ bool (isfinite)(const Eigen::half& h) { } // namespace numext #endif -#endif // EIGEN_HALF_GPU_H +#endif // EIGEN_HALF_H diff --git a/Eigen/src/Core/arch/Default/TypeCasting.h b/Eigen/src/Core/arch/Default/TypeCasting.h new file mode 100644 index 000000000..b6df98468 --- /dev/null +++ b/Eigen/src/Core/arch/Default/TypeCasting.h @@ -0,0 +1,77 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// Copyright (C) 2019 Rasmus Munk Larsen +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_GENERIC_TYPE_CASTING_H +#define EIGEN_GENERIC_TYPE_CASTING_H + +namespace Eigen { + +namespace internal { + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef Eigen::half result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const { + #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + return __float2half(a); + #else + return Eigen::half(a); + #endif + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef Eigen::half result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const { + #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + return __float2half(static_cast(a)); + #else + return Eigen::half(static_cast(a)); + #endif + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef float result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const { + #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + return __half2float(a); + #else + return static_cast(a); + #endif + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + +} +} + +#endif // EIGEN_GENERIC_TYPE_CASTING_H diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h index 5084fc786..d1df26e57 100644 --- a/Eigen/src/Core/arch/GPU/PacketMath.h +++ b/Eigen/src/Core/arch/GPU/PacketMath.h @@ -458,6 +458,546 @@ ptranspose(PacketBlock& kernel) { #endif +// Packet math for Eigen::half +// Most of the following operations require arch >= 3.0 +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIPCC) && defined(EIGEN_HIP_DEVICE_COMPILE)) || \ + (defined(EIGEN_HAS_CUDA_FP16) && defined(__clang__) && defined(__CUDA__)) + +template<> struct is_arithmetic { enum { value = true }; }; + +template<> struct packet_traits : default_packet_traits +{ + typedef half2 type; + typedef half2 half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=2, + HasHalfPacket = 0, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasExp = 1, + HasExpm1 = 1, + HasLog = 1, + HasLog1p = 1 + }; +}; + +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef half2 half; }; + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1(const Eigen::half& from) { +#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE) + half2 r; + r.x = from; + r.y = from; + return r; +#else + return __half2half2(from); +#endif +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const 
Eigen::half* from) { + return *reinterpret_cast(from); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) { + return __halves2half2(from[0], from[1]); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) { + return __halves2half2(from[0], from[0]); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const half2& from) { + *reinterpret_cast(to) = from; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2& from) { +#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE) + to[0] = from.x; + to[1] = from.y; +#else + to[0] = __low2half(from); + to[1] = __high2half(from); +#endif +} + +template<> + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { + +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __ldg((const half2*)from); + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 350 + return __ldg((const half2*)from); +#else + return __halves2half2(*(from+0), *(from+1)); +#endif + +#endif +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { + +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __halves2half2(__ldg(from+0), __ldg(from+1)); + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 350 + return __halves2half2(__ldg(from+0), __ldg(from+1)); +#else + return __halves2half2(*(from+0), *(from+1)); +#endif + +#endif +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, Index stride) { + return __halves2half2(from[0*stride], from[1*stride]); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const half2& from, Index stride) { + to[stride*0] = __low2half(from); + to[stride*1] = __high2half(from); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) { + return __low2half(a); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) { + half a1 = __low2half(a); + half a2 = __high2half(a); + half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF); + half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF); + return __halves2half2(result1, result2); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& a) { + half true_half = half_impl::raw_uint16_to_half(0xffffu); + return pset1(true_half); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& a) { + half false_half = half_impl::raw_uint16_to_half(0x0000u); + return pset1(false_half); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __half a1 = __low2half(kernel.packet[0]); + __half a2 = __high2half(kernel.packet[0]); + __half b1 = __low2half(kernel.packet[1]); + __half b2 = __high2half(kernel.packet[1]); + kernel.packet[0] = __halves2half2(a1, b1); + kernel.packet[1] = __halves2half2(a2, b2); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __halves2half2(a, __hadd(a, __float2half(1.0f))); + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 530 + return __halves2half2(a, __hadd(a, __float2half(1.0f))); +#else + float f = __half2float(a) + 1.0f; + return __halves2half2(a, __float2half(f)); +#endif + +#endif +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask, + const half2& a, + const half2& b) { + half mask_low = __low2half(mask); + half mask_high 
= __high2half(mask); + half result_low = mask_low == half(0) ? __low2half(b) : __low2half(a); + half result_high = mask_high == half(0) ? __high2half(b) : __high2half(a); + return __halves2half2(result_low, result_high); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a, + const half2& b) { + half true_half = half_impl::raw_uint16_to_half(0xffffu); + half false_half = half_impl::raw_uint16_to_half(0x0000u); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); + half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half; + half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half; + return __halves2half2(eq1, eq2); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a, + const half2& b) { + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); + half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x); + half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x); + return __halves2half2(result1, result2); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a, + const half2& b) { + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); + half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x); + half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x); + return __halves2half2(result1, result2); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a, + const half2& b) { + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); + half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x); + half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x); + return __halves2half2(result1, result2); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a, + const half2& b) { + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); + half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x); + half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x); + return __halves2half2(result1, result2); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hadd2(a, b); + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 530 + return __hadd2(a, b); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 + b1; + float r2 = a2 + b2; + return __floats2half2_rn(r1, r2); +#endif + +#endif +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hsub2(a, b); + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 530 + return __hsub2(a, b); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 - b1; + float r2 = a2 - b2; + return __floats2half2_rn(r1, r2); +#endif + +#endif +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hneg2(a); + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 530 + return __hneg2(a); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return __floats2half2_rn(-a1, -a2); +#endif + +#endif 
+} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hmul2(a, b); + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 530 + return __hmul2(a, b); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 * b1; + float r2 = a2 * b2; + return __floats2half2_rn(r1, r2); +#endif + +#endif +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hfma2(a, b, c); + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 530 + return __hfma2(a, b, c); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float c1 = __low2float(c); + float c2 = __high2float(c); + float r1 = a1 * b1 + c1; + float r2 = a2 * b2 + c2; + return __floats2half2_rn(r1, r2); +#endif + +#endif +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __h2div(a, b); + +#else // EIGEN_CUDA_ARCH + + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 / b1; + float r2 = a2 / b2; + return __floats2half2_rn(r1, r2); + +#endif +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + __half r1 = a1 < b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 < b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, const half2& b) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + __half r1 = a1 > b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 > b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hadd(__low2half(a), __high2half(a)); + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 530 + return __hadd(__low2half(a), __high2half(a)); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return Eigen::half(__float2half(a1 + a2)); +#endif + +#endif +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + __half first = __low2half(a); + __half second = __high2half(a); + return __hgt(first, second) ? first : second; + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 530 + __half first = __low2half(a); + __half second = __high2half(a); + return __hgt(first, second) ? first : second; +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return a1 > a2 ? __low2half(a) : __high2half(a); +#endif + +#endif +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + __half first = __low2half(a); + __half second = __high2half(a); + return __hlt(first, second) ? 
first : second; + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 530 + __half first = __low2half(a); + __half second = __high2half(a); + return __hlt(first, second) ? first : second; +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return a1 < a2 ? __low2half(a) : __high2half(a); +#endif + +#endif +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hmul(__low2half(a), __high2half(a)); + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 530 + return __hmul(__low2half(a), __high2half(a)); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return Eigen::half(__float2half(a1 * a2)); +#endif + +#endif +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = log1pf(a1); + float r2 = log1pf(a2); + return __floats2half2_rn(r1, r2); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = expm1f(a1); + float r2 = expm1f(a2); + return __floats2half2_rn(r1, r2); +} + +#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +half2 plog(const half2& a) { + return h2log(a); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +half2 pexp(const half2& a) { + return h2exp(a); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +half2 psqrt(const half2& a) { + return h2sqrt(a); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +half2 prsqrt(const half2& a) { + return h2rsqrt(a); +} + +#else + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = logf(a1); + float r2 = logf(a2); + return __floats2half2_rn(r1, r2); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = expf(a1); + float r2 = expf(a2); + return __floats2half2_rn(r1, r2); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = sqrtf(a1); + float r2 = sqrtf(a2); + return __floats2half2_rn(r1, r2); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = rsqrtf(a1); + float r2 = rsqrtf(a2); + return __floats2half2_rn(r1, r2); +} +#endif + +#endif + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h deleted file mode 100644 index 5e143e65a..000000000 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ /dev/null @@ -1,1630 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_PACKET_MATH_HALF_GPU_H -#define EIGEN_PACKET_MATH_HALF_GPU_H - - -namespace Eigen { -namespace internal { - -// Most of the following operations require arch >= 3.0 -#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIPCC) && defined(EIGEN_HIP_DEVICE_COMPILE)) || \ - (defined(EIGEN_HAS_CUDA_FP16) && defined(__clang__) && defined(__CUDA__)) - -template<> struct is_arithmetic { enum { value = true }; }; - -template<> struct packet_traits : default_packet_traits -{ - typedef half2 type; - typedef half2 half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size=2, - HasHalfPacket = 0, - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasSqrt = 1, - HasRsqrt = 1, - HasExp = 1, - HasExpm1 = 1, - HasLog = 1, - HasLog1p = 1 - }; -}; - -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef half2 half; }; - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1(const Eigen::half& from) { -#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE) - half2 r; - r.x = from; - r.y = from; - return r; -#else - return __half2half2(from); -#endif -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { - return *reinterpret_cast(from); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) { - return __halves2half2(from[0], from[1]); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) { - return __halves2half2(from[0], from[0]); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const half2& from) { - *reinterpret_cast(to) = from; -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2& from) { -#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE) - to[0] = from.x; - to[1] = from.y; -#else - to[0] = __low2half(from); - to[1] = __high2half(from); -#endif -} - -template<> - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { - -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - return __ldg((const half2*)from); - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 350 - return __ldg((const half2*)from); -#else - return __halves2half2(*(from+0), *(from+1)); -#endif - -#endif -} - -template<> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { - -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - return __halves2half2(__ldg(from+0), __ldg(from+1)); - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 350 - return __halves2half2(__ldg(from+0), __ldg(from+1)); -#else - return __halves2half2(*(from+0), *(from+1)); -#endif - -#endif -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, Index stride) { - return __halves2half2(from[0*stride], from[1*stride]); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const half2& from, Index stride) { - to[stride*0] = __low2half(from); - to[stride*1] = __high2half(from); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) { - return __low2half(a); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) { - half a1 = __low2half(a); - half a2 = __high2half(a); - half result1 = 
half_impl::raw_uint16_to_half(a1.x & 0x7FFF); - half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF); - return __halves2half2(result1, result2); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& a) { - half true_half = half_impl::raw_uint16_to_half(0xffffu); - return pset1(true_half); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& a) { - half false_half = half_impl::raw_uint16_to_half(0x0000u); - return pset1(false_half); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - __half a1 = __low2half(kernel.packet[0]); - __half a2 = __high2half(kernel.packet[0]); - __half b1 = __low2half(kernel.packet[1]); - __half b2 = __high2half(kernel.packet[1]); - kernel.packet[0] = __halves2half2(a1, b1); - kernel.packet[1] = __halves2half2(a2, b2); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - return __halves2half2(a, __hadd(a, __float2half(1.0f))); - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 530 - return __halves2half2(a, __hadd(a, __float2half(1.0f))); -#else - float f = __half2float(a) + 1.0f; - return __halves2half2(a, __float2half(f)); -#endif - -#endif -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask, - const half2& a, - const half2& b) { - half mask_low = __low2half(mask); - half mask_high = __high2half(mask); - half result_low = mask_low == half(0) ? __low2half(b) : __low2half(a); - half result_high = mask_high == half(0) ? __high2half(b) : __high2half(a); - return __halves2half2(result_low, result_high); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a, - const half2& b) { - half true_half = half_impl::raw_uint16_to_half(0xffffu); - half false_half = half_impl::raw_uint16_to_half(0x0000u); - half a1 = __low2half(a); - half a2 = __high2half(a); - half b1 = __low2half(b); - half b2 = __high2half(b); - half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half; - half eq2 = __half2float(a2) == __half2float(b2) ? 
true_half : false_half; - return __halves2half2(eq1, eq2); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a, - const half2& b) { - half a1 = __low2half(a); - half a2 = __high2half(a); - half b1 = __low2half(b); - half b2 = __high2half(b); - half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x); - half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x); - return __halves2half2(result1, result2); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a, - const half2& b) { - half a1 = __low2half(a); - half a2 = __high2half(a); - half b1 = __low2half(b); - half b2 = __high2half(b); - half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x); - half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x); - return __halves2half2(result1, result2); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a, - const half2& b) { - half a1 = __low2half(a); - half a2 = __high2half(a); - half b1 = __low2half(b); - half b2 = __high2half(b); - half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x); - half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x); - return __halves2half2(result1, result2); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a, - const half2& b) { - half a1 = __low2half(a); - half a2 = __high2half(a); - half b1 = __low2half(b); - half b2 = __high2half(b); - half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x); - half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x); - return __halves2half2(result1, result2); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - return __hadd2(a, b); - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 530 - return __hadd2(a, b); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float r1 = a1 + b1; - float r2 = a2 + b2; - return __floats2half2_rn(r1, r2); -#endif - -#endif -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - return __hsub2(a, b); - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 530 - return __hsub2(a, b); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float r1 = a1 - b1; - float r2 = a2 - b2; - return __floats2half2_rn(r1, r2); -#endif - -#endif -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - return __hneg2(a); - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 530 - return __hneg2(a); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - return __floats2half2_rn(-a1, -a2); -#endif - -#endif -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - return __hmul2(a, b); - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 530 - return __hmul2(a, b); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float r1 = a1 * b1; - float r2 = a2 * b2; - return __floats2half2_rn(r1, r2); -#endif - -#endif -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) { -#if 
defined(EIGEN_HIP_DEVICE_COMPILE) - - return __hfma2(a, b, c); - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 530 - return __hfma2(a, b, c); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float c1 = __low2float(c); - float c2 = __high2float(c); - float r1 = a1 * b1 + c1; - float r2 = a2 * b2 + c2; - return __floats2half2_rn(r1, r2); -#endif - -#endif -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - return __h2div(a, b); - -#else // EIGEN_CUDA_ARCH - - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float r1 = a1 / b1; - float r2 = a2 / b2; - return __floats2half2_rn(r1, r2); - -#endif -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - __half r1 = a1 < b1 ? __low2half(a) : __low2half(b); - __half r2 = a2 < b2 ? __high2half(a) : __high2half(b); - return __halves2half2(r1, r2); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, const half2& b) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - __half r1 = a1 > b1 ? __low2half(a) : __low2half(b); - __half r2 = a2 > b2 ? __high2half(a) : __high2half(b); - return __halves2half2(r1, r2); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - return __hadd(__low2half(a), __high2half(a)); - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 530 - return __hadd(__low2half(a), __high2half(a)); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - return Eigen::half(__float2half(a1 + a2)); -#endif - -#endif -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - __half first = __low2half(a); - __half second = __high2half(a); - return __hgt(first, second) ? first : second; - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 530 - __half first = __low2half(a); - __half second = __high2half(a); - return __hgt(first, second) ? first : second; -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - return a1 > a2 ? __low2half(a) : __high2half(a); -#endif - -#endif -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - __half first = __low2half(a); - __half second = __high2half(a); - return __hlt(first, second) ? first : second; - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 530 - __half first = __low2half(a); - __half second = __high2half(a); - return __hlt(first, second) ? first : second; -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - return a1 < a2 ? 
__low2half(a) : __high2half(a); -#endif - -#endif -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - return __hmul(__low2half(a), __high2half(a)); - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 530 - return __hmul(__low2half(a), __high2half(a)); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - return Eigen::half(__float2half(a1 * a2)); -#endif - -#endif -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = log1pf(a1); - float r2 = log1pf(a2); - return __floats2half2_rn(r1, r2); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = expm1f(a1); - float r2 = expm1f(a2); - return __floats2half2_rn(r1, r2); -} - -#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ - defined(EIGEN_HIP_DEVICE_COMPILE) - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -half2 plog(const half2& a) { - return h2log(a); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -half2 pexp(const half2& a) { - return h2exp(a); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -half2 psqrt(const half2& a) { - return h2sqrt(a); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -half2 prsqrt(const half2& a) { - return h2rsqrt(a); -} - -#else - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = logf(a1); - float r2 = logf(a2); - return __floats2half2_rn(r1, r2); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = expf(a1); - float r2 = expf(a2); - return __floats2half2_rn(r1, r2); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = sqrtf(a1); - float r2 = sqrtf(a2); - return __floats2half2_rn(r1, r2); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = rsqrtf(a1); - float r2 = rsqrtf(a2); - return __floats2half2_rn(r1, r2); -} - -#endif - -#elif defined EIGEN_VECTORIZE_AVX512 - -typedef struct { - __m256i x; -} Packet16h; - - -template<> struct is_arithmetic { enum { value = true }; }; - -template <> -struct packet_traits : default_packet_traits { - typedef Packet16h type; - // There is no half-size packet for Packet16h. 
- typedef Packet16h half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 16, - HasHalfPacket = 0, - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasNegate = 1, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, - HasConj = 0, - HasSetLinear = 0, - HasSqrt = 0, - HasRsqrt = 0, - HasExp = 0, - HasLog = 0, - HasBlend = 0 - }; -}; - - -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=16, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet16h half; }; - -template<> EIGEN_STRONG_INLINE Packet16h pset1(const Eigen::half& from) { - Packet16h result; - result.x = _mm256_set1_epi16(from.x); - return result; -} - -template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet16h& from) { - return half_impl::raw_uint16_to_half(static_cast(_mm256_extract_epi16(from.x, 0))); -} - -template<> EIGEN_STRONG_INLINE Packet16h pload(const Eigen::half* from) { - Packet16h result; - result.x = _mm256_load_si256(reinterpret_cast(from)); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet16h ploadu(const Eigen::half* from) { - Packet16h result; - result.x = _mm256_loadu_si256(reinterpret_cast(from)); - return result; -} - -template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet16h& from) { - // (void*) -> workaround clang warning: - // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32 - _mm256_store_si256((__m256i*)(void*)to, from.x); -} - -template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet16h& from) { - // (void*) -> workaround clang warning: - // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32 - _mm256_storeu_si256((__m256i*)(void*)to, from.x); -} - -template<> EIGEN_STRONG_INLINE Packet16h -ploaddup(const Eigen::half* from) { - Packet16h result; - unsigned short a = from[0].x; - unsigned short b = from[1].x; - unsigned short c = from[2].x; - unsigned short d = from[3].x; - unsigned short e = from[4].x; - unsigned short f = from[5].x; - unsigned short g = from[6].x; - unsigned short h = from[7].x; - result.x = _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet16h -ploadquad(const Eigen::half* from) { - Packet16h result; - unsigned short a = from[0].x; - unsigned short b = from[1].x; - unsigned short c = from[2].x; - unsigned short d = from[3].x; - result.x = _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a); - return result; -} - -EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) { -#ifdef EIGEN_HAS_FP16_C - return _mm512_cvtph_ps(a.x); -#else - EIGEN_ALIGN64 half aux[16]; - pstore(aux, a); - float f0(aux[0]); - float f1(aux[1]); - float f2(aux[2]); - float f3(aux[3]); - float f4(aux[4]); - float f5(aux[5]); - float f6(aux[6]); - float f7(aux[7]); - float f8(aux[8]); - float f9(aux[9]); - float fa(aux[10]); - float fb(aux[11]); - float fc(aux[12]); - float fd(aux[13]); - float fe(aux[14]); - float ff(aux[15]); - - return _mm512_set_ps( - ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0); -#endif -} - -EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { -#ifdef EIGEN_HAS_FP16_C - Packet16h result; - result.x = _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); - return result; -#else - EIGEN_ALIGN64 float aux[16]; - pstore(aux, a); - half h0(aux[0]); - half h1(aux[1]); - half h2(aux[2]); - half h3(aux[3]); - half h4(aux[4]); - half h5(aux[5]); 
- half h6(aux[6]); - half h7(aux[7]); - half h8(aux[8]); - half h9(aux[9]); - half ha(aux[10]); - half hb(aux[11]); - half hc(aux[12]); - half hd(aux[13]); - half he(aux[14]); - half hf(aux[15]); - - Packet16h result; - result.x = _mm256_set_epi16( - hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x, - h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); - return result; -#endif -} - -template<> EIGEN_STRONG_INLINE Packet16h pnot(const Packet16h& a) { - Packet16h r; r.x = _mm256_xor_si256(a.x, pcmp_eq(a.x, a.x)); return r; -} - -template<> EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) { - Packet16h r; r.x = Packet8i(ptrue(a.x)); return r; -} - -template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) { - // in some cases Packet8i is a wrapper around __m256i, so we need to - // cast to Packet8i to call the correct overload. - Packet16h r; r.x = por(Packet8i(a.x),Packet8i(b.x)); return r; -} -template<> EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a,const Packet16h& b) { - Packet16h r; r.x = pxor(Packet8i(a.x),Packet8i(b.x)); return r; -} -template<> EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a,const Packet16h& b) { - Packet16h r; r.x = pand(Packet8i(a.x),Packet8i(b.x)); return r; -} -template<> EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a,const Packet16h& b) { - Packet16h r; r.x = pandnot(Packet8i(a.x),Packet8i(b.x)); return r; -} - -template<> EIGEN_STRONG_INLINE Packet16h pselect(const Packet16h& mask, const Packet16h& a, const Packet16h& b) { - Packet16h r; r.x = _mm256_blendv_epi8(b.x, a.x, mask.x); return r; -} - -template<> EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a,const Packet16h& b) { - Packet16f af = half2float(a); - Packet16f bf = half2float(b); - Packet16f rf = pcmp_eq(af, bf); - // Pack the 32-bit flags into 16-bits flags. 
- __m256i lo = _mm256_castps_si256(extract256<0>(rf)); - __m256i hi = _mm256_castps_si256(extract256<1>(rf)); - __m128i result_lo = _mm_packs_epi32(_mm256_extractf128_si256(lo, 0), - _mm256_extractf128_si256(lo, 1)); - __m128i result_hi = _mm_packs_epi32(_mm256_extractf128_si256(hi, 0), - _mm256_extractf128_si256(hi, 1)); - Packet16h result; result.x = _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) { - Packet16h sign_mask; sign_mask.x = _mm256_set1_epi16(static_cast(0x8000)); - Packet16h result; result.x = _mm256_xor_si256(a.x, sign_mask.x); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet16h padd(const Packet16h& a, const Packet16h& b) { - Packet16f af = half2float(a); - Packet16f bf = half2float(b); - Packet16f rf = padd(af, bf); - return float2half(rf); -} - -template<> EIGEN_STRONG_INLINE Packet16h psub(const Packet16h& a, const Packet16h& b) { - Packet16f af = half2float(a); - Packet16f bf = half2float(b); - Packet16f rf = psub(af, bf); - return float2half(rf); -} - -template<> EIGEN_STRONG_INLINE Packet16h pmul(const Packet16h& a, const Packet16h& b) { - Packet16f af = half2float(a); - Packet16f bf = half2float(b); - Packet16f rf = pmul(af, bf); - return float2half(rf); -} - -template<> EIGEN_STRONG_INLINE Packet16h pdiv(const Packet16h& a, const Packet16h& b) { - Packet16f af = half2float(a); - Packet16f bf = half2float(b); - Packet16f rf = pdiv(af, bf); - return float2half(rf); -} - -template<> EIGEN_STRONG_INLINE half predux(const Packet16h& from) { - Packet16f from_float = half2float(from); - return half(predux(from_float)); -} - -template<> EIGEN_STRONG_INLINE half predux_mul(const Packet16h& from) { - Packet16f from_float = half2float(from); - return half(predux_mul(from_float)); -} - -template<> EIGEN_STRONG_INLINE Packet16h preduxp(const Packet16h* p) { - Packet16f pf[16]; - pf[0] = half2float(p[0]); - pf[1] = half2float(p[1]); - pf[2] = half2float(p[2]); - pf[3] = half2float(p[3]); - pf[4] = half2float(p[4]); - pf[5] = half2float(p[5]); - pf[6] = half2float(p[6]); - pf[7] = half2float(p[7]); - pf[8] = half2float(p[8]); - pf[9] = half2float(p[9]); - pf[10] = half2float(p[10]); - pf[11] = half2float(p[11]); - pf[12] = half2float(p[12]); - pf[13] = half2float(p[13]); - pf[14] = half2float(p[14]); - pf[15] = half2float(p[15]); - Packet16f reduced = preduxp(pf); - return float2half(reduced); -} - -template<> EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a) -{ - __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1); - Packet16h res; - res.x = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(a.x,1),m)), - _mm_shuffle_epi8(_mm256_extractf128_si256(a.x,0),m), 1); - return res; -} - -template<> EIGEN_STRONG_INLINE Packet16h pinsertfirst(const Packet16h& a, Eigen::half b) -{ - Packet16h res; - res.x = _mm256_insert_epi16(a.x,b.x,0); - return res; -} - -template<> EIGEN_STRONG_INLINE Packet16h pinsertlast(const Packet16h& a, Eigen::half b) -{ - Packet16h res; - res.x = _mm256_insert_epi16(a.x,b.x,15); - return res; -} - -template<> EIGEN_STRONG_INLINE Packet16h pgather(const Eigen::half* from, Index stride) -{ - Packet16h result; - result.x = _mm256_set_epi16( - from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x, - from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x, - from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, - 
from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); - return result; -} - -template<> EIGEN_STRONG_INLINE void pscatter(half* to, const Packet16h& from, Index stride) -{ - EIGEN_ALIGN64 half aux[16]; - pstore(aux, from); - to[stride*0].x = aux[0].x; - to[stride*1].x = aux[1].x; - to[stride*2].x = aux[2].x; - to[stride*3].x = aux[3].x; - to[stride*4].x = aux[4].x; - to[stride*5].x = aux[5].x; - to[stride*6].x = aux[6].x; - to[stride*7].x = aux[7].x; - to[stride*8].x = aux[8].x; - to[stride*9].x = aux[9].x; - to[stride*10].x = aux[10].x; - to[stride*11].x = aux[11].x; - to[stride*12].x = aux[12].x; - to[stride*13].x = aux[13].x; - to[stride*14].x = aux[14].x; - to[stride*15].x = aux[15].x; -} - -EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - __m256i a = kernel.packet[0].x; - __m256i b = kernel.packet[1].x; - __m256i c = kernel.packet[2].x; - __m256i d = kernel.packet[3].x; - __m256i e = kernel.packet[4].x; - __m256i f = kernel.packet[5].x; - __m256i g = kernel.packet[6].x; - __m256i h = kernel.packet[7].x; - __m256i i = kernel.packet[8].x; - __m256i j = kernel.packet[9].x; - __m256i k = kernel.packet[10].x; - __m256i l = kernel.packet[11].x; - __m256i m = kernel.packet[12].x; - __m256i n = kernel.packet[13].x; - __m256i o = kernel.packet[14].x; - __m256i p = kernel.packet[15].x; - - __m256i ab_07 = _mm256_unpacklo_epi16(a, b); - __m256i cd_07 = _mm256_unpacklo_epi16(c, d); - __m256i ef_07 = _mm256_unpacklo_epi16(e, f); - __m256i gh_07 = _mm256_unpacklo_epi16(g, h); - __m256i ij_07 = _mm256_unpacklo_epi16(i, j); - __m256i kl_07 = _mm256_unpacklo_epi16(k, l); - __m256i mn_07 = _mm256_unpacklo_epi16(m, n); - __m256i op_07 = _mm256_unpacklo_epi16(o, p); - - __m256i ab_8f = _mm256_unpackhi_epi16(a, b); - __m256i cd_8f = _mm256_unpackhi_epi16(c, d); - __m256i ef_8f = _mm256_unpackhi_epi16(e, f); - __m256i gh_8f = _mm256_unpackhi_epi16(g, h); - __m256i ij_8f = _mm256_unpackhi_epi16(i, j); - __m256i kl_8f = _mm256_unpackhi_epi16(k, l); - __m256i mn_8f = _mm256_unpackhi_epi16(m, n); - __m256i op_8f = _mm256_unpackhi_epi16(o, p); - - __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07); - __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07); - __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07); - __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07); - __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07); - __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07); - __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07); - __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07); - - __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f); - __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f); - __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f); - __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f); - __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f); - __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f); - __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f); - __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f); - - __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03); - __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03); - __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03); - __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03); - __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47); - __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47); - __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47); - __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47); - __m256i abcdefgh_89 = 
_mm256_unpacklo_epi64(abcd_8b, efgh_8b); - __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b); - __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b); - __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b); - __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf); - __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf); - __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf); - __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf); - - // NOTE: no unpacklo/hi instr in this case, so using permute instr. - __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20); - __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20); - __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20); - __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20); - __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20); - __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20); - __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20); - __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20); - __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31); - __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31); - __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31); - __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31); - __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31); - __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31); - __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31); - __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31); - - kernel.packet[0].x = a_p_0; - kernel.packet[1].x = a_p_1; - kernel.packet[2].x = a_p_2; - kernel.packet[3].x = a_p_3; - kernel.packet[4].x = a_p_4; - kernel.packet[5].x = a_p_5; - kernel.packet[6].x = a_p_6; - kernel.packet[7].x = a_p_7; - kernel.packet[8].x = a_p_8; - kernel.packet[9].x = a_p_9; - kernel.packet[10].x = a_p_a; - kernel.packet[11].x = a_p_b; - kernel.packet[12].x = a_p_c; - kernel.packet[13].x = a_p_d; - kernel.packet[14].x = a_p_e; - kernel.packet[15].x = a_p_f; -} - -EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - EIGEN_ALIGN64 half in[8][16]; - pstore(in[0], kernel.packet[0]); - pstore(in[1], kernel.packet[1]); - pstore(in[2], kernel.packet[2]); - pstore(in[3], kernel.packet[3]); - pstore(in[4], kernel.packet[4]); - pstore(in[5], kernel.packet[5]); - pstore(in[6], kernel.packet[6]); - pstore(in[7], kernel.packet[7]); - - EIGEN_ALIGN64 half out[8][16]; - - for (int i = 0; i < 8; ++i) { - for (int j = 0; j < 8; ++j) { - out[i][j] = in[j][2*i]; - } - for (int j = 0; j < 8; ++j) { - out[i][j+8] = in[j][2*i+1]; - } - } - - kernel.packet[0] = pload(out[0]); - kernel.packet[1] = pload(out[1]); - kernel.packet[2] = pload(out[2]); - kernel.packet[3] = pload(out[3]); - kernel.packet[4] = pload(out[4]); - kernel.packet[5] = pload(out[5]); - kernel.packet[6] = pload(out[6]); - kernel.packet[7] = pload(out[7]); -} - -EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - EIGEN_ALIGN64 half in[4][16]; - pstore(in[0], kernel.packet[0]); - pstore(in[1], kernel.packet[1]); - pstore(in[2], kernel.packet[2]); - pstore(in[3], kernel.packet[3]); - - EIGEN_ALIGN64 half out[4][16]; - - for (int i = 0; i < 4; ++i) { - for (int j = 0; j < 4; ++j) { - out[i][j] = in[j][4*i]; - } - for (int j = 0; j < 4; 
++j) { - out[i][j+4] = in[j][4*i+1]; - } - for (int j = 0; j < 4; ++j) { - out[i][j+8] = in[j][4*i+2]; - } - for (int j = 0; j < 4; ++j) { - out[i][j+12] = in[j][4*i+3]; - } - } - - kernel.packet[0] = pload(out[0]); - kernel.packet[1] = pload(out[1]); - kernel.packet[2] = pload(out[2]); - kernel.packet[3] = pload(out[3]); -} - - -#elif defined EIGEN_VECTORIZE_AVX - -typedef struct { - __m128i x; -} Packet8h; - - -template<> struct is_arithmetic { enum { value = true }; }; - -template <> -struct packet_traits : default_packet_traits { - typedef Packet8h type; - // There is no half-size packet for Packet8h. - typedef Packet8h half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 8, - HasHalfPacket = 0, - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasNegate = 1, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, - HasConj = 0, - HasSetLinear = 0, - HasSqrt = 0, - HasRsqrt = 0, - HasExp = 0, - HasLog = 0, - HasBlend = 0 - }; -}; - - -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet8h half; }; - -template<> EIGEN_STRONG_INLINE Packet8h pset1(const Eigen::half& from) { - Packet8h result; - result.x = _mm_set1_epi16(from.x); - return result; -} - -template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet8h& from) { - return half_impl::raw_uint16_to_half(static_cast(_mm_extract_epi16(from.x, 0))); -} - -template<> EIGEN_STRONG_INLINE Packet8h pload(const Eigen::half* from) { - Packet8h result; - result.x = _mm_load_si128(reinterpret_cast(from)); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet8h ploadu(const Eigen::half* from) { - Packet8h result; - result.x = _mm_loadu_si128(reinterpret_cast(from)); - return result; -} - -template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet8h& from) { - _mm_store_si128(reinterpret_cast<__m128i*>(to), from.x); -} - -template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet8h& from) { - _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from.x); -} - -template<> EIGEN_STRONG_INLINE Packet8h -ploaddup(const Eigen::half* from) { - Packet8h result; - unsigned short a = from[0].x; - unsigned short b = from[1].x; - unsigned short c = from[2].x; - unsigned short d = from[3].x; - result.x = _mm_set_epi16(d, d, c, c, b, b, a, a); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet8h -ploadquad(const Eigen::half* from) { - Packet8h result; - unsigned short a = from[0].x; - unsigned short b = from[1].x; - result.x = _mm_set_epi16(b, b, b, b, a, a, a, a); - return result; -} - -EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) { -#ifdef EIGEN_HAS_FP16_C - return _mm256_cvtph_ps(a.x); -#else - EIGEN_ALIGN32 Eigen::half aux[8]; - pstore(aux, a); - float f0(aux[0]); - float f1(aux[1]); - float f2(aux[2]); - float f3(aux[3]); - float f4(aux[4]); - float f5(aux[5]); - float f6(aux[6]); - float f7(aux[7]); - - return _mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0); -#endif -} - -EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { -#ifdef EIGEN_HAS_FP16_C - Packet8h result; - result.x = _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); - return result; -#else - EIGEN_ALIGN32 float aux[8]; - pstore(aux, a); - Eigen::half h0(aux[0]); - Eigen::half h1(aux[1]); - Eigen::half h2(aux[2]); - Eigen::half h3(aux[3]); - Eigen::half h4(aux[4]); - Eigen::half h5(aux[5]); - Eigen::half h6(aux[6]); - Eigen::half h7(aux[7]); - - Packet8h 
result; - result.x = _mm_set_epi16(h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); - return result; -#endif -} - -template<> EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h& a) { - Packet8h r; r.x = _mm_cmpeq_epi32(a.x, a.x); return r; -} - -template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) { - // in some cases Packet4i is a wrapper around __m128i, so we either need to - // cast to Packet4i to directly call the intrinsics as below: - Packet8h r; r.x = _mm_or_si128(a.x,b.x); return r; -} -template<> EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a,const Packet8h& b) { - Packet8h r; r.x = _mm_xor_si128(a.x,b.x); return r; -} -template<> EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a,const Packet8h& b) { - Packet8h r; r.x = _mm_and_si128(a.x,b.x); return r; -} -template<> EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a,const Packet8h& b) { - Packet8h r; r.x = _mm_andnot_si128(b.x,a.x); return r; -} - -template<> EIGEN_STRONG_INLINE Packet8h pselect(const Packet8h& mask, const Packet8h& a, const Packet8h& b) { - Packet8h r; r.x = _mm_blendv_epi8(b.x, a.x, mask.x); return r; -} - -template<> EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a,const Packet8h& b) { - Packet8f af = half2float(a); - Packet8f bf = half2float(b); - Packet8f rf = pcmp_eq(af, bf); - // Pack the 32-bit flags into 16-bits flags. - Packet8h result; result.x = _mm_packs_epi32(_mm256_extractf128_si256(_mm256_castps_si256(rf), 0), - _mm256_extractf128_si256(_mm256_castps_si256(rf), 1)); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; } - -template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) { - Packet8h sign_mask; sign_mask.x = _mm_set1_epi16(static_cast(0x8000)); - Packet8h result; result.x = _mm_xor_si128(a.x, sign_mask.x); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet8h padd(const Packet8h& a, const Packet8h& b) { - Packet8f af = half2float(a); - Packet8f bf = half2float(b); - Packet8f rf = padd(af, bf); - return float2half(rf); -} - -template<> EIGEN_STRONG_INLINE Packet8h psub(const Packet8h& a, const Packet8h& b) { - Packet8f af = half2float(a); - Packet8f bf = half2float(b); - Packet8f rf = psub(af, bf); - return float2half(rf); -} - -template<> EIGEN_STRONG_INLINE Packet8h pmul(const Packet8h& a, const Packet8h& b) { - Packet8f af = half2float(a); - Packet8f bf = half2float(b); - Packet8f rf = pmul(af, bf); - return float2half(rf); -} - -template<> EIGEN_STRONG_INLINE Packet8h pdiv(const Packet8h& a, const Packet8h& b) { - Packet8f af = half2float(a); - Packet8f bf = half2float(b); - Packet8f rf = pdiv(af, bf); - return float2half(rf); -} - -template<> EIGEN_STRONG_INLINE Packet8h pgather(const Eigen::half* from, Index stride) -{ - Packet8h result; - result.x = _mm_set_epi16(from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); - return result; -} - -template<> EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const Packet8h& from, Index stride) -{ - EIGEN_ALIGN32 Eigen::half aux[8]; - pstore(aux, from); - to[stride*0].x = aux[0].x; - to[stride*1].x = aux[1].x; - to[stride*2].x = aux[2].x; - to[stride*3].x = aux[3].x; - to[stride*4].x = aux[4].x; - to[stride*5].x = aux[5].x; - to[stride*6].x = aux[6].x; - to[stride*7].x = aux[7].x; -} - -template<> EIGEN_STRONG_INLINE Eigen::half predux(const Packet8h& a) { - Packet8f af = half2float(a); - float reduced = predux(af); - return 
Eigen::half(reduced); -} - -template<> EIGEN_STRONG_INLINE Eigen::half predux_max(const Packet8h& a) { - Packet8f af = half2float(a); - float reduced = predux_max(af); - return Eigen::half(reduced); -} - -template<> EIGEN_STRONG_INLINE Eigen::half predux_min(const Packet8h& a) { - Packet8f af = half2float(a); - float reduced = predux_min(af); - return Eigen::half(reduced); -} - -template<> EIGEN_STRONG_INLINE Eigen::half predux_mul(const Packet8h& a) { - Packet8f af = half2float(a); - float reduced = predux_mul(af); - return Eigen::half(reduced); -} - -template<> EIGEN_STRONG_INLINE Packet8h preduxp(const Packet8h* p) { - Packet8f pf[8]; - pf[0] = half2float(p[0]); - pf[1] = half2float(p[1]); - pf[2] = half2float(p[2]); - pf[3] = half2float(p[3]); - pf[4] = half2float(p[4]); - pf[5] = half2float(p[5]); - pf[6] = half2float(p[6]); - pf[7] = half2float(p[7]); - Packet8f reduced = preduxp(pf); - return float2half(reduced); -} - -template<> EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a) -{ - __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1); - Packet8h res; - res.x = _mm_shuffle_epi8(a.x,m); - return res; -} - -template<> EIGEN_STRONG_INLINE Packet8h pinsertfirst(const Packet8h& a, Eigen::half b) -{ - Packet8h res; - res.x = _mm_insert_epi16(a.x,int(b.x),0); - return res; -} - -template<> EIGEN_STRONG_INLINE Packet8h pinsertlast(const Packet8h& a, Eigen::half b) -{ - Packet8h res; - res.x = _mm_insert_epi16(a.x,int(b.x),7); - return res; -} - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet8h& first, const Packet8h& second) - { - if (Offset!=0) - first.x = _mm_alignr_epi8(second.x,first.x, Offset*2); - } -}; - -EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - __m128i a = kernel.packet[0].x; - __m128i b = kernel.packet[1].x; - __m128i c = kernel.packet[2].x; - __m128i d = kernel.packet[3].x; - __m128i e = kernel.packet[4].x; - __m128i f = kernel.packet[5].x; - __m128i g = kernel.packet[6].x; - __m128i h = kernel.packet[7].x; - - __m128i a03b03 = _mm_unpacklo_epi16(a, b); - __m128i c03d03 = _mm_unpacklo_epi16(c, d); - __m128i e03f03 = _mm_unpacklo_epi16(e, f); - __m128i g03h03 = _mm_unpacklo_epi16(g, h); - __m128i a47b47 = _mm_unpackhi_epi16(a, b); - __m128i c47d47 = _mm_unpackhi_epi16(c, d); - __m128i e47f47 = _mm_unpackhi_epi16(e, f); - __m128i g47h47 = _mm_unpackhi_epi16(g, h); - - __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03); - __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03); - __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03); - __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03); - __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47); - __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47); - __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47); - __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47); - - __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01); - __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01); - __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23); - __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23); - __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45); - __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45); - __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67); - __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67); - - kernel.packet[0].x = a0b0c0d0e0f0g0h0; - kernel.packet[1].x 
= a1b1c1d1e1f1g1h1; - kernel.packet[2].x = a2b2c2d2e2f2g2h2; - kernel.packet[3].x = a3b3c3d3e3f3g3h3; - kernel.packet[4].x = a4b4c4d4e4f4g4h4; - kernel.packet[5].x = a5b5c5d5e5f5g5h5; - kernel.packet[6].x = a6b6c6d6e6f6g6h6; - kernel.packet[7].x = a7b7c7d7e7f7g7h7; -} - -EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - EIGEN_ALIGN32 Eigen::half in[4][8]; - pstore(in[0], kernel.packet[0]); - pstore(in[1], kernel.packet[1]); - pstore(in[2], kernel.packet[2]); - pstore(in[3], kernel.packet[3]); - - EIGEN_ALIGN32 Eigen::half out[4][8]; - - for (int i = 0; i < 4; ++i) { - for (int j = 0; j < 4; ++j) { - out[i][j] = in[j][2*i]; - } - for (int j = 0; j < 4; ++j) { - out[i][j+4] = in[j][2*i+1]; - } - } - - kernel.packet[0] = pload(out[0]); - kernel.packet[1] = pload(out[1]); - kernel.packet[2] = pload(out[2]); - kernel.packet[3] = pload(out[3]); -} - - -// Disable the following code since it's broken on too many platforms / compilers. -//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) -#elif 0 - -typedef struct { - __m64 x; -} Packet4h; - - -template<> struct is_arithmetic { enum { value = true }; }; - -template <> -struct packet_traits : default_packet_traits { - typedef Packet4h type; - // There is no half-size packet for Packet4h. - typedef Packet4h half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 4, - HasHalfPacket = 0, - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasNegate = 0, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, - HasConj = 0, - HasSetLinear = 0, - HasSqrt = 0, - HasRsqrt = 0, - HasExp = 0, - HasLog = 0, - HasBlend = 0 - }; -}; - - -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h half; }; - -template<> EIGEN_STRONG_INLINE Packet4h pset1(const Eigen::half& from) { - Packet4h result; - result.x = _mm_set1_pi16(from.x); - return result; -} - -template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet4h& from) { - return half_impl::raw_uint16_to_half(static_cast(_mm_cvtsi64_si32(from.x))); -} - -template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; } - -template<> EIGEN_STRONG_INLINE Packet4h padd(const Packet4h& a, const Packet4h& b) { - __int64_t a64 = _mm_cvtm64_si64(a.x); - __int64_t b64 = _mm_cvtm64_si64(b.x); - - Eigen::half h[4]; - - Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); - Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); - h[0] = ha + hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); - h[1] = ha + hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); - h[2] = ha + hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); - h[3] = ha + hb; - Packet4h result; - result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet4h psub(const Packet4h& a, const Packet4h& b) { - __int64_t a64 = _mm_cvtm64_si64(a.x); - __int64_t b64 = _mm_cvtm64_si64(b.x); - - Eigen::half h[4]; - - Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); - Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); - h[0] = ha - hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); - hb = 
half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); - h[1] = ha - hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); - h[2] = ha - hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); - h[3] = ha - hb; - Packet4h result; - result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet4h pmul(const Packet4h& a, const Packet4h& b) { - __int64_t a64 = _mm_cvtm64_si64(a.x); - __int64_t b64 = _mm_cvtm64_si64(b.x); - - Eigen::half h[4]; - - Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); - Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); - h[0] = ha * hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); - h[1] = ha * hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); - h[2] = ha * hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); - h[3] = ha * hb; - Packet4h result; - result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet4h pdiv(const Packet4h& a, const Packet4h& b) { - __int64_t a64 = _mm_cvtm64_si64(a.x); - __int64_t b64 = _mm_cvtm64_si64(b.x); - - Eigen::half h[4]; - - Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); - Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); - h[0] = ha / hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); - h[1] = ha / hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); - h[2] = ha / hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); - h[3] = ha / hb; - Packet4h result; - result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet4h pload(const Eigen::half* from) { - Packet4h result; - result.x = _mm_cvtsi64_m64(*reinterpret_cast(from)); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet4h ploadu(const Eigen::half* from) { - Packet4h result; - result.x = _mm_cvtsi64_m64(*reinterpret_cast(from)); - return result; -} - -template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet4h& from) { - __int64_t r = _mm_cvtm64_si64(from.x); - *(reinterpret_cast<__int64_t*>(to)) = r; -} - -template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet4h& from) { - __int64_t r = _mm_cvtm64_si64(from.x); - *(reinterpret_cast<__int64_t*>(to)) = r; -} - -template<> EIGEN_STRONG_INLINE Packet4h -ploadquad(const Eigen::half* from) { - return pset1(*from); -} - -template<> EIGEN_STRONG_INLINE Packet4h pgather(const Eigen::half* from, Index stride) -{ - Packet4h result; - result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); - return result; -} - -template<> EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const Packet4h& from, Index stride) -{ - __int64_t a = _mm_cvtm64_si64(from.x); - to[stride*0].x = static_cast(a); - to[stride*1].x = static_cast(a >> 16); - to[stride*2].x = static_cast(a >> 32); - to[stride*3].x = static_cast(a >> 48); -} - -EIGEN_STRONG_INLINE 
void -ptranspose(PacketBlock& kernel) { - __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x); - __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x); - __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x); - __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x); - - kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1); - kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1); - kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3); - kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3); -} - -#endif - -} -} - -#endif // EIGEN_PACKET_MATH_HALF_GPU_H diff --git a/Eigen/src/Core/arch/GPU/TypeCasting.h b/Eigen/src/Core/arch/GPU/TypeCasting.h index 57a55d08b..c278f3fe8 100644 --- a/Eigen/src/Core/arch/GPU/TypeCasting.h +++ b/Eigen/src/Core/arch/GPU/TypeCasting.h @@ -14,64 +14,6 @@ namespace Eigen { namespace internal { -template<> -struct scalar_cast_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) - typedef Eigen::half result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const { - #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) - return __float2half(a); - #else - return Eigen::half(a); - #endif - } -}; - -template<> -struct functor_traits > -{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; - - -template<> -struct scalar_cast_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) - typedef Eigen::half result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const { - #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) - return __float2half(static_cast(a)); - #else - return Eigen::half(static_cast(a)); - #endif - } -}; - -template<> -struct functor_traits > -{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; - - -template<> -struct scalar_cast_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) - typedef float result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const { - #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) - return __half2float(a); - #else - return static_cast(a); - #endif - } -}; - -template<> -struct functor_traits > -{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; - - - #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) @@ -104,109 +46,6 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast(cons return __floats2half2_rn(a.x, a.y); } -#elif defined EIGEN_VECTORIZE_AVX512 -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet16f pcast(const Packet16h& a) { - return half2float(a); -} - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet16h pcast(const Packet16f& a) { - return float2half(a); -} - -#elif defined EIGEN_VECTORIZE_AVX - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet8f pcast(const Packet8h& a) { - return 
half2float(a); -} - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet8h pcast(const Packet8f& a) { - return float2half(a); -} - -// Disable the following code since it's broken on too many platforms / compilers. -//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) -#elif 0 - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4h& a) { - __int64_t a64 = _mm_cvtm64_si64(a.x); - Eigen::half h = raw_uint16_to_half(static_cast(a64)); - float f1 = static_cast(h); - h = raw_uint16_to_half(static_cast(a64 >> 16)); - float f2 = static_cast(h); - h = raw_uint16_to_half(static_cast(a64 >> 32)); - float f3 = static_cast(h); - h = raw_uint16_to_half(static_cast(a64 >> 48)); - float f4 = static_cast(h); - return _mm_set_ps(f4, f3, f2, f1); -} - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet4h pcast(const Packet4f& a) { - EIGEN_ALIGN16 float aux[4]; - pstore(aux, a); - Eigen::half h0(aux[0]); - Eigen::half h1(aux[1]); - Eigen::half h2(aux[2]); - Eigen::half h3(aux[3]); - - Packet4h result; - result.x = _mm_set_pi16(h3.x, h2.x, h1.x, h0.x); - return result; -} - #endif } // end namespace internal diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 94603dd55..b59e2c602 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -1055,6 +1055,214 @@ template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, co } #endif + +// Packet math for Eigen::half +// Disable the following code since it's broken on too many platforms / compilers. +//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) +#if 0 + +typedef struct { + __m64 x; +} Packet4h; + + +template<> struct is_arithmetic { enum { value = true }; }; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet4h type; + // There is no half-size packet for Packet4h. 
+ typedef Packet4h half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 0, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0, + HasSqrt = 0, + HasRsqrt = 0, + HasExp = 0, + HasLog = 0, + HasBlend = 0 + }; +}; + + +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h half; }; + +template<> EIGEN_STRONG_INLINE Packet4h pset1(const Eigen::half& from) { + Packet4h result; + result.x = _mm_set1_pi16(from.x); + return result; +} + +template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet4h& from) { + return half_impl::raw_uint16_to_half(static_cast(_mm_cvtsi64_si32(from.x))); +} + +template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet4h padd(const Packet4h& a, const Packet4h& b) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + __int64_t b64 = _mm_cvtm64_si64(b.x); + + Eigen::half h[4]; + + Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); + Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); + h[0] = ha + hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); + h[1] = ha + hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); + h[2] = ha + hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); + h[3] = ha + hb; + Packet4h result; + result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h psub(const Packet4h& a, const Packet4h& b) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + __int64_t b64 = _mm_cvtm64_si64(b.x); + + Eigen::half h[4]; + + Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); + Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); + h[0] = ha - hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); + h[1] = ha - hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); + h[2] = ha - hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); + h[3] = ha - hb; + Packet4h result; + result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h pmul(const Packet4h& a, const Packet4h& b) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + __int64_t b64 = _mm_cvtm64_si64(b.x); + + Eigen::half h[4]; + + Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); + Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); + h[0] = ha * hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); + h[1] = ha * hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); + h[2] = ha * hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); + h[3] = ha * hb; + Packet4h result; + result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); + 
return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h pdiv(const Packet4h& a, const Packet4h& b) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + __int64_t b64 = _mm_cvtm64_si64(b.x); + + Eigen::half h[4]; + + Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); + Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); + h[0] = ha / hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); + h[1] = ha / hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); + h[2] = ha / hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); + h[3] = ha / hb; + Packet4h result; + result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h pload(const Eigen::half* from) { + Packet4h result; + result.x = _mm_cvtsi64_m64(*reinterpret_cast(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h ploadu(const Eigen::half* from) { + Packet4h result; + result.x = _mm_cvtsi64_m64(*reinterpret_cast(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet4h& from) { + __int64_t r = _mm_cvtm64_si64(from.x); + *(reinterpret_cast<__int64_t*>(to)) = r; +} + +template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet4h& from) { + __int64_t r = _mm_cvtm64_si64(from.x); + *(reinterpret_cast<__int64_t*>(to)) = r; +} + +template<> EIGEN_STRONG_INLINE Packet4h +ploadquad(const Eigen::half* from) { + return pset1(*from); +} + +template<> EIGEN_STRONG_INLINE Packet4h pgather(const Eigen::half* from, Index stride) +{ + Packet4h result; + result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); + return result; +} + +template<> EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const Packet4h& from, Index stride) +{ + __int64_t a = _mm_cvtm64_si64(from.x); + to[stride*0].x = static_cast(a); + to[stride*1].x = static_cast(a >> 16); + to[stride*2].x = static_cast(a >> 32); + to[stride*3].x = static_cast(a >> 48); +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x); + __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x); + __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x); + __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x); + + kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1); + kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1); + kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3); + kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3); +} + +#endif + + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/SSE/TypeCasting.h b/Eigen/src/Core/arch/SSE/TypeCasting.h index f607366f0..1b8e9a550 100644 --- a/Eigen/src/Core/arch/SSE/TypeCasting.h +++ b/Eigen/src/Core/arch/SSE/TypeCasting.h @@ -77,6 +77,57 @@ template<> EIGEN_STRONG_INLINE Packet4f preinterpret(const Pa return _mm_castsi128_ps(a); } + +// Disable the following code since it's broken on too many platforms / compilers. 
+//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) +#if 0 + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4h& a) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + Eigen::half h = raw_uint16_to_half(static_cast(a64)); + float f1 = static_cast(h); + h = raw_uint16_to_half(static_cast(a64 >> 16)); + float f2 = static_cast(h); + h = raw_uint16_to_half(static_cast(a64 >> 32)); + float f3 = static_cast(h); + h = raw_uint16_to_half(static_cast(a64 >> 48)); + float f4 = static_cast(h); + return _mm_set_ps(f4, f3, f2, f1); +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet4h pcast(const Packet4f& a) { + EIGEN_ALIGN16 float aux[4]; + pstore(aux, a); + Eigen::half h0(aux[0]); + Eigen::half h1(aux[1]); + Eigen::half h2(aux[2]); + Eigen::half h3(aux[3]); + + Packet4h result; + result.x = _mm_set_pi16(h3.x, h2.x, h1.x, h0.x); + return result; +} + +#endif + } // end namespace internal } // end namespace Eigen diff --git a/test/half_float.cpp b/test/half_float.cpp index 2a7f9b497..48afdb21b 100644 --- a/test/half_float.cpp +++ b/test/half_float.cpp @@ -9,7 +9,7 @@ #include "main.h" -#include +#include // Make sure it's possible to forward declare Eigen::half namespace Eigen { From 9aba527405b40132a308f5f782dacadf6ef50acd Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Tue, 27 Aug 2019 15:35:29 -0700 Subject: [PATCH 11/30] Revert changes to std_falback::log1p that broke handling of arguments less than -1. Fix packet op accordingly. --- Eigen/src/Core/MathFunctions.h | 3 +-- Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h | 4 +--- test/packetmath.cpp | 2 -- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 1eeb2752b..fcf62011e 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -550,9 +550,8 @@ namespace std_fallback { EIGEN_USING_STD_MATH(log); Scalar x1p = RealScalar(1) + x; Scalar log_1p = log(x1p); - const bool is_inf = numext::equal_strict(x1p, log_1p); const bool is_small = numext::equal_strict(x1p, Scalar(1)); - return (is_inf || is_small) ? x : x * (log_1p / (x1p - RealScalar(1))); + return is_small ? x : x * (log_1p / (x1p - RealScalar(1))); } } diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 640aae05a..505a0eec8 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -137,10 +137,8 @@ Packet generic_plog1p(const Packet& x) Packet xp1 = padd(x, one); Packet small_mask = pcmp_eq(xp1, one); Packet log1 = plog(xp1); - // Add a check to handle x == +inf. - Packet pos_inf_mask = pcmp_eq(x, log1); Packet log_large = pmul(x, pdiv(log1, psub(xp1, one))); - return pselect(por(small_mask, pos_inf_mask), x, log_large); + return pselect(small_mask, x, log_large); } /** \internal \returns exp(x)-1 computed using W. Kahan's formula. 
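For reference, the corner cases at stake in this revert, checked against std::log1p from <cmath>. This is a standalone sketch for illustration only; it is not part of the patch and assumes an IEEE-754 platform. The std_fallback and generic_plog1p code touched above are expected to agree with these reference results:

#include <cmath>
#include <cstdio>
#include <limits>

int main() {
  const double inf = std::numeric_limits<double>::infinity();
  // x < -1: outside the domain, the result should be NaN (the case that the
  // reverted change had broken, per the commit message above).
  std::printf("log1p(-2)    = %g\n", std::log1p(-2.0));   // nan
  // x == -1: pole of the logarithm, -inf.
  std::printf("log1p(-1)    = %g\n", std::log1p(-1.0));   // -inf
  // |x| tiny: log1p(x) ~= x, the `is_small` branch in the code above.
  std::printf("log1p(1e-20) = %g\n", std::log1p(1e-20));  // ~1e-20
  // x == +inf: should remain +inf; this case is re-introduced with an explicit
  // inf mask in PATCH 13/30 below without disturbing the x < -1 behavior.
  std::printf("log1p(inf)   = %g\n", std::log1p(inf));    // inf
  return 0;
}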
diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 41000a842..28768b18d 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -607,8 +607,6 @@ template void packetmath_real() CHECK_CWISE1_IF(internal::packet_traits::HasLGamma, std::lgamma, internal::plgamma); CHECK_CWISE1_IF(internal::packet_traits::HasErf, std::erf, internal::perf); CHECK_CWISE1_IF(internal::packet_traits::HasErfc, std::erfc, internal::perfc); - data1[0] = std::numeric_limits::infinity(); - data1[1] = std::numeric_limits::denorm_min(); CHECK_CWISE1_IF(PacketTraits::HasExpm1, std::expm1, internal::pexpm1); CHECK_CWISE1_IF(PacketTraits::HasLog1p, std::log1p, internal::plog1p); #endif From 6e77f9bef35012f160b307bdeae73194fde91e51 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Wed, 28 Aug 2019 10:32:19 -0700 Subject: [PATCH 12/30] Remove shadow warnings in TensorDeviceThreadPool --- .../Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index ca2794cb5..edb0b3e25 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -90,7 +90,6 @@ struct ThreadPoolDevice { // CPU cycles due to the threads competing for memory bandwidth, so we // statically schedule at most 4 block copies here. const size_t kMinBlockSize = 32768; - typedef TensorCostModel CostModel; const size_t num_threads = CostModel::numThreads(n, TensorOpCost(1.0, 1.0, 0), 4); if (n <= kMinBlockSize || num_threads < 2) { ::memcpy(dst, src, n); @@ -302,9 +301,12 @@ struct ThreadPoolDevice { // For parallelForAsync we must keep passed in closures on the heap, and // delete them only after `done` callback finished. struct ParallelForAsyncContext { - ParallelForAsyncContext(Index count, std::function f, - std::function done) - : count(count), f(std::move(f)), done(std::move(done)) {} + ParallelForAsyncContext(Index block_count, + std::function block_f, + std::function done_callback) + : count(block_count), + f(std::move(block_f)), + done(std::move(done_callback)) {} std::atomic count; std::function f; From 1187bb65ad196161a07f4e0125e478d022ea1b08 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 28 Aug 2019 12:20:21 -0700 Subject: [PATCH 13/30] Add more tests for corner cases of log1p and expm1. Add handling of infinite arguments to log1p such that log1p(inf) = inf. --- Eigen/src/Core/MathFunctions.h | 3 ++- .../Core/arch/Default/GenericPacketMathFunctions.h | 3 ++- test/packetmath.cpp | 14 +++++++++++++- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index fcf62011e..fbec39d83 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -551,7 +551,8 @@ namespace std_fallback { Scalar x1p = RealScalar(1) + x; Scalar log_1p = log(x1p); const bool is_small = numext::equal_strict(x1p, Scalar(1)); - return is_small ? x : x * (log_1p / (x1p - RealScalar(1))); + const bool is_inf = numext::equal_strict(x1p, log_1p); + return (is_small || is_inf) ? 
x : x * (log_1p / (x1p - RealScalar(1))); } } diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 505a0eec8..0fc673e12 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -137,8 +137,9 @@ Packet generic_plog1p(const Packet& x) Packet xp1 = padd(x, one); Packet small_mask = pcmp_eq(xp1, one); Packet log1 = plog(xp1); + Packet inf_mask = pcmp_eq(xp1, log1); Packet log_large = pmul(x, pdiv(log1, psub(xp1, one))); - return pselect(small_mask, x, log_large); + return pselect(por(small_mask, inf_mask), x, log_large); } /** \internal \returns exp(x)-1 computed using W. Kahan's formula. diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 28768b18d..67ff6dc5b 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -607,8 +607,12 @@ template void packetmath_real() CHECK_CWISE1_IF(internal::packet_traits::HasLGamma, std::lgamma, internal::plgamma); CHECK_CWISE1_IF(internal::packet_traits::HasErf, std::erf, internal::perf); CHECK_CWISE1_IF(internal::packet_traits::HasErfc, std::erfc, internal::perfc); - CHECK_CWISE1_IF(PacketTraits::HasExpm1, std::expm1, internal::pexpm1); + data1[0] = std::numeric_limits::infinity(); + data1[1] = Scalar(-1); CHECK_CWISE1_IF(PacketTraits::HasLog1p, std::log1p, internal::plog1p); + data1[0] = std::numeric_limits::infinity(); + data1[1] = -std::numeric_limits::infinity(); + CHECK_CWISE1_IF(PacketTraits::HasExpm1, std::expm1, internal::pexpm1); #endif if(PacketSize>=2) @@ -648,6 +652,14 @@ template void packetmath_real() h.store(data2, internal::plog(h.load(data1))); VERIFY((numext::isinf)(data2[0])); } + if(PacketTraits::HasLog1p) { + packet_helper h; + data1[0] = Scalar(-2); + data1[1] = -std::numeric_limits::infinity(); + h.store(data2, internal::plog1p(h.load(data1))); + VERIFY((numext::isnan)(data2[0])); + VERIFY((numext::isnan)(data2[1])); + } if(PacketTraits::HasSqrt) { packet_helper h; From bc40d4522c56fdf861fcdab28f4b7db609d8065e Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Wed, 28 Aug 2019 17:46:05 -0700 Subject: [PATCH 14/30] Const correctness in TensorMap> expressions --- .../src/Tensor/TensorForwardDeclarations.h | 1 + .../Eigen/CXX11/src/Tensor/TensorMap.h | 66 +++++++++++-------- unsupported/test/cxx11_tensor_map.cpp | 41 +++++++++--- 3 files changed, 74 insertions(+), 34 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index a95f22631..3cca0c7e9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -20,6 +20,7 @@ namespace Eigen { // map_allocator. 
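// Editor's note: the ConstType alias added below is consumed by TensorMap in the next
// hunk, where it forms PointerConstType/StoragePointerType so that a TensorMap built
// over a const (rvalue) expression only ever hands out const pointers and references.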
template struct MakePointer { typedef T* Type; + typedef const T* ConstType; }; template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 395cdf9c8..b28cd822f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -42,13 +42,27 @@ template class MakePoin typedef typename NumTraits::Real RealScalar; typedef typename Base::CoeffReturnType CoeffReturnType; - /* typedef typename internal::conditional< - bool(internal::is_lvalue::value), - Scalar *, - const Scalar *>::type - PointerType;*/ typedef typename MakePointer_::Type PointerType; - typedef PointerType PointerArgType; + typedef typename MakePointer_::ConstType PointerConstType; + + // WARN: PointerType still can be a pointer to const (const Scalar*), for + // example in TensorMap> expression. This type of + // expression should be illegal, but adding this restriction is not possible + // in practice (see https://bitbucket.org/eigen/eigen/pull-requests/488). + typedef typename internal::conditional< + bool(internal::is_lvalue::value), + PointerType, // use simple pointer in lvalue expressions + PointerConstType // use const pointer in rvalue expressions + >::type StoragePointerType; + + // If TensorMap was constructed over rvalue expression (e.g. const Tensor), + // we should return a reference to const from operator() (and others), even + // if TensorMap itself is not const. + typedef typename internal::conditional< + bool(internal::is_lvalue::value), + Scalar&, + const Scalar& + >::type StorageRefType; static const int Options = Options_; @@ -63,47 +77,47 @@ template class MakePoin }; EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr) : m_data(dataPtr), m_dimensions() { + EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr) : m_data(dataPtr), m_dimensions() { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. EIGEN_STATIC_ASSERT((0 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) } #if EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) { + EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) } #else EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) { + EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
EIGEN_STATIC_ASSERT((1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) { + EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) { EIGEN_STATIC_ASSERT(2 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) { + EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) { EIGEN_STATIC_ASSERT(3 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) { + EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) { EIGEN_STATIC_ASSERT(4 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) { + EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) { EIGEN_STATIC_ASSERT(5 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) } #endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const array& dimensions) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, const array& dimensions) : m_data(dataPtr), m_dimensions(dimensions) { } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const Dimensions& dimensions) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, const Dimensions& dimensions) : m_data(dataPtr), m_dimensions(dimensions) { } @@ -120,9 +134,9 @@ template class MakePoin EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE PointerType data() { return m_data; } + EIGEN_STRONG_INLINE StoragePointerType data() { return m_data; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const PointerType data() const { return m_data; } + EIGEN_STRONG_INLINE PointerConstType data() const { return m_data; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const @@ -213,7 +227,7 @@ template class MakePoin #endif EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(const array& indices) + EIGEN_STRONG_INLINE StorageRefType operator()(const array& indices) { // eigen_assert(checkIndexRange(indices)); if (PlainObjectType::Options&RowMajor) { @@ -226,14 +240,14 @@ template class MakePoin } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()() + EIGEN_STRONG_INLINE StorageRefType operator()() { EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE) return m_data[0]; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index index) + EIGEN_STRONG_INLINE StorageRefType operator()(Index index) { 
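    // Editor's note: StorageRefType resolves to Scalar& for lvalue expressions and to
    // const Scalar& when the map was constructed over an rvalue/const expression, so
    // writing through a TensorMap constructed over a const Tensor is rejected at
    // compile time.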
eigen_internal_assert(index >= 0 && index < size()); return m_data[index]; @@ -241,7 +255,7 @@ template class MakePoin #if EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) + EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) { static_assert(sizeof...(otherIndices) + 2 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); eigen_assert(internal::all((Eigen::NumTraits::highest() >= otherIndices)...)); @@ -256,7 +270,7 @@ template class MakePoin } #else EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1) { if (PlainObjectType::Options&RowMajor) { const Index index = i1 + i0 * m_dimensions[1]; @@ -267,7 +281,7 @@ template class MakePoin } } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2) + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2) { if (PlainObjectType::Options&RowMajor) { const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0); @@ -278,7 +292,7 @@ template class MakePoin } } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3) { if (PlainObjectType::Options&RowMajor) { const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)); @@ -289,7 +303,7 @@ template class MakePoin } } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3, Index i4) { if (PlainObjectType::Options&RowMajor) { const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0))); @@ -320,7 +334,7 @@ template class MakePoin } private: - typename MakePointer_::Type m_data; + StoragePointerType m_data; Dimensions m_dimensions; }; diff --git a/unsupported/test/cxx11_tensor_map.cpp b/unsupported/test/cxx11_tensor_map.cpp index ce608aca7..dc8532f5c 100644 --- a/unsupported/test/cxx11_tensor_map.cpp +++ b/unsupported/test/cxx11_tensor_map.cpp @@ -19,8 +19,8 @@ static void test_0d() Tensor scalar1; Tensor scalar2; - TensorMap > scalar3(scalar1.data()); - TensorMap > scalar4(scalar2.data()); + TensorMap > scalar3(scalar1.data()); + TensorMap > scalar4(scalar2.data()); scalar1() = 7; scalar2() = 13; @@ -37,8 +37,8 @@ static void test_1d() Tensor vec1(6); Tensor vec2(6); - TensorMap > vec3(vec1.data(), 6); - TensorMap > vec4(vec2.data(), 6); + TensorMap > vec3(vec1.data(), 6); + TensorMap > vec4(vec2.data(), 6); vec1(0) = 4; vec2(0) = 0; vec1(1) = 8; vec2(1) = 1; @@ -85,8 +85,8 @@ static void test_2d() mat2(1,1) = 4; mat2(1,2) = 5; - TensorMap > mat3(mat1.data(), 2, 3); - TensorMap > mat4(mat2.data(), 2, 3); + TensorMap > mat3(mat1.data(), 2, 3); + TensorMap > mat4(mat2.data(), 2, 3); VERIFY_IS_EQUAL(mat3.rank(), 2); VERIFY_IS_EQUAL(mat3.size(), 6); @@ -129,8 +129,8 @@ static void test_3d() } } - TensorMap > mat3(mat1.data(), 2, 3, 7); - TensorMap > mat4(mat2.data(), 2, 3, 7); + TensorMap > mat3(mat1.data(), 2, 3, 7); + TensorMap > mat4(mat2.data(), 2, 3, 7); VERIFY_IS_EQUAL(mat3.rank(), 3); VERIFY_IS_EQUAL(mat3.size(), 2*3*7); @@ -265,6 
+265,29 @@ static void test_casting() VERIFY_IS_EQUAL(sum1, 861); } +template +static const T& add_const(T& value) { + return value; +} + +static void test_0d_const_tensor() +{ + Tensor scalar1; + Tensor scalar2; + + TensorMap > scalar3(add_const(scalar1).data()); + TensorMap > scalar4(add_const(scalar2).data()); + + scalar1() = 7; + scalar2() = 13; + + VERIFY_IS_EQUAL(scalar1.rank(), 0); + VERIFY_IS_EQUAL(scalar1.size(), 1); + + VERIFY_IS_EQUAL(scalar3(), 7); + VERIFY_IS_EQUAL(scalar4(), 13); +} + EIGEN_DECLARE_TEST(cxx11_tensor_map) { CALL_SUBTEST(test_0d()); @@ -274,4 +297,6 @@ EIGEN_DECLARE_TEST(cxx11_tensor_map) CALL_SUBTEST(test_from_tensor()); CALL_SUBTEST(test_casting()); + + CALL_SUBTEST(test_0d_const_tensor()); } From f6c51d9209ccc04d28c39f4c8059e7d3e74d6e07 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 30 Aug 2019 14:03:29 -0700 Subject: [PATCH 15/30] Fix missing header inclusion and colliding definitions for half type casting, which broke build with -march=native on Haswell/Skylake. --- Eigen/Core | 1 + Eigen/src/Core/arch/AVX/TypeCasting.h | 3 +++ Eigen/src/Core/arch/AVX512/TypeCasting.h | 22 ++++++++++++++++++++++ 3 files changed, 26 insertions(+) diff --git a/Eigen/Core b/Eigen/Core index e6e31caee..9d8f8fce8 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -172,6 +172,7 @@ using std::ptrdiff_t; #include "src/Core/arch/AVX/TypeCasting.h" #include "src/Core/arch/AVX/Complex.h" #include "src/Core/arch/AVX512/PacketMath.h" + #include "src/Core/arch/AVX512/TypeCasting.h" #include "src/Core/arch/AVX512/Complex.h" #include "src/Core/arch/SSE/MathFunctions.h" #include "src/Core/arch/AVX/MathFunctions.h" diff --git a/Eigen/src/Core/arch/AVX/TypeCasting.h b/Eigen/src/Core/arch/AVX/TypeCasting.h index 910fc06ca..181043588 100644 --- a/Eigen/src/Core/arch/AVX/TypeCasting.h +++ b/Eigen/src/Core/arch/AVX/TypeCasting.h @@ -52,6 +52,7 @@ template<> EIGEN_STRONG_INLINE Packet8f preinterpret(const Pa return _mm256_castsi256_ps(a); } +#ifndef EIGEN_VECTORIZE_AVX512 template <> struct type_casting_traits { @@ -75,6 +76,8 @@ struct type_casting_traits { }; }; +#endif // EIGEN_VECTORIZE_AVX512 + template<> EIGEN_STRONG_INLINE Packet8h pcast(const Packet8f& a) { return float2half(a); } diff --git a/Eigen/src/Core/arch/AVX512/TypeCasting.h b/Eigen/src/Core/arch/AVX512/TypeCasting.h index 777a26ae4..a82176941 100644 --- a/Eigen/src/Core/arch/AVX512/TypeCasting.h +++ b/Eigen/src/Core/arch/AVX512/TypeCasting.h @@ -1,3 +1,19 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2019 Rasmus Munk Larsen +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_TYPE_CASTING_AVX512_H +#define EIGEN_TYPE_CASTING_AVX512_H + +namespace Eigen { + +namespace internal { + template <> struct type_casting_traits { enum { @@ -23,3 +39,9 @@ struct type_casting_traits { template<> EIGEN_STRONG_INLINE Packet16h pcast(const Packet16f& a) { return float2half(a); } + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TYPE_CASTING_AVX512_H From 66665e7e76d2ad5aa37775b3777e9a53c6d1c18c Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Fri, 30 Aug 2019 14:49:40 -0700 Subject: [PATCH 16/30] Asynchronous expression evaluation with TensorAsyncDevice --- unsupported/Eigen/CXX11/ThreadPool | 2 +- .../Eigen/CXX11/src/Tensor/TensorBase.h | 11 + .../Eigen/CXX11/src/Tensor/TensorBlock.h | 1 + .../Eigen/CXX11/src/Tensor/TensorDevice.h | 41 +++ .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 271 +++++++++++++++--- .../src/Tensor/TensorForwardDeclarations.h | 6 + unsupported/test/cxx11_tensor_executor.cpp | 107 ++++++- unsupported/test/cxx11_tensor_thread_pool.cpp | 39 ++- 8 files changed, 414 insertions(+), 64 deletions(-) diff --git a/unsupported/Eigen/CXX11/ThreadPool b/unsupported/Eigen/CXX11/ThreadPool index d046af9b2..613fdb57a 100644 --- a/unsupported/Eigen/CXX11/ThreadPool +++ b/unsupported/Eigen/CXX11/ThreadPool @@ -43,7 +43,7 @@ #include #include #include -#include +#include #include "src/util/CXX11Meta.h" #include "src/util/MaxSizeVector.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index dbacf494e..095c85dc4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -1063,6 +1063,17 @@ class TensorBase : public TensorBase { return TensorDevice(dev, derived()); } +#ifdef EIGEN_USE_THREADS + // Select the async device on which to evaluate the expression. + template + typename internal::enable_if< + internal::is_same::value, + TensorAsyncDevice>::type + device(const DeviceType& dev, std::function done) { + return TensorAsyncDevice(dev, derived(), std::move(done)); + } +#endif // EIGEN_USE_THREADS + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& derived() { return *static_cast(this); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h index 49fb21dc8..c8a8b16db 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -932,6 +932,7 @@ class TensorBlockMapper { typedef TensorBlock Block; typedef DSizes Dimensions; + TensorBlockMapper() {} TensorBlockMapper(const Dimensions& dims, const TensorBlockShapeType block_shape, Index min_target_size) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h index 29e50a3b2..5122b3623 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h @@ -63,6 +63,47 @@ template class TensorDevice { ExpressionType& m_expression; }; +#ifdef EIGEN_USE_THREADS + +/** \class TensorAsyncDevice + * \ingroup CXX11_Tensor_Module + * + * \brief Pseudo expression providing an operator = that will evaluate its + * argument asynchronously on the specified device (currently supports only + * ThreadPoolDevice). 
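 *
 * (Editorial addition) The assignment does not block the caller; the supplied `done`
 * callback is invoked once evaluation has completed (typically from a pool thread),
 * so callers that need to wait for the result usually pair it with an Eigen::Barrier,
 * as the unit tests added later in this series do.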
+ * + * Example: + * std::function done = []() {}; + * C.device(EIGEN_THREAD_POOL, std::move(done)) = A + B; + */ + +template +class TensorAsyncDevice { + public: + TensorAsyncDevice(const DeviceType& device, ExpressionType& expression, + std::function done) + : m_device(device), m_expression(expression), m_done(std::move(done)) {} + + template + EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) { + typedef TensorAssignOp Assign; + typedef internal::TensorAsyncExecutor Executor; + + // WARNING: After assignment 'm_done' callback will be in undefined state. + Assign assign(m_expression, other); + Executor::runAsync(assign, m_device, std::move(m_done)); + + return *this; + } + + protected: + const DeviceType& m_device; + ExpressionType& m_expression; + std::function m_done; +}; + +#endif // EIGEN_USE_THREADS + } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index fddb90d77..18d9de9e6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -84,7 +84,7 @@ class TensorExecutor { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(const Expression& expr, - const Device& device = Device()) { + const Device& device = Device()) { TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { @@ -97,6 +97,14 @@ class TensorExecutor { } }; +/** + * Default async execution strategy is not implemented. Currently it's only + * available for ThreadPoolDevice (see definition below). + */ +template +class TensorAsyncExecutor {}; + /** * Process all the data with a single cpu thread, using vectorized instructions. */ @@ -107,8 +115,8 @@ class TensorExecutor evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { @@ -206,8 +214,81 @@ class TensorExecutor +struct TensorExecutorTilingContext { + typedef typename TensorBlockMapper::Block TensorBlock; + + TensorExecutorTilingContext() : buffer(nullptr) {} + TensorExecutorTilingContext(const TensorBlockMapper& b_mapper, + const TensorOpCost& b_cost, void* b_buffer, + size_t b_aligned_size) + : block_mapper(b_mapper), + cost(b_cost), + buffer(b_buffer), + aligned_blocksize(b_aligned_size) {} + + template + Scalar* GetCurrentThreadBuffer(const ThreadPoolDevice& device) const { + // ThreadPoolDevice::currentThreadId() returns -1 if called from a thread + // not in the thread pool, such as the main thread dispatching Eigen + // expressions. + const int thread_idx = device.currentThreadId(); + eigen_assert(thread_idx >= -1 && thread_idx < device.numThreads()); + + const Index offset = aligned_blocksize * (thread_idx + 1); + return reinterpret_cast(static_cast(buffer) + offset); + } + + TensorBlockMapper block_mapper; // navigate through blocks + TensorOpCost cost; // cost of computing a single block + void* buffer; // temporary buffer for blocks + size_t aligned_blocksize; // block size after memory alignment +}; + +// Computes a block evaluation parameters, and allocates temporary memory buffer +// for blocks. See TensorExecutor/TensorAsyncExecutor (Tileable=true) below. +template +TensorExecutorTilingContext GetTensorExecutorTilingContext( + const ThreadPoolDevice& device, const Evaluator& evaluator) { + // Prefer blocks skewed toward inner dimension. 
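  // Editor's note: this helper merges the block shape/size hints reported by the
  // expression tree via getResourceRequirements(), refines the block size from the
  // cost model so that one block is roughly one unit-cost task, and allocates one
  // aligned scratch block per pool thread plus an extra slot for the calling thread
  // (currentThreadId() == -1 maps to slot 0 in GetCurrentThreadBuffer above).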
+ TensorBlockShapeType block_shape = kSkewedInnerDims; + Index block_total_size = 0; + + // Query expression tree for desired block size/shape. + std::vector resources; + evaluator.getResourceRequirements(&resources); + MergeResourceRequirements(resources, &block_shape, &block_total_size); + int num_threads = device.numThreads(); + + // Estimate minimum block size based on cost. + TensorOpCost cost = evaluator.costPerCoeff(Vectorizable); + double taskSize = TensorCostModel::taskSize(1, cost); + size_t block_size = static_cast(1.0 / taskSize); + + TensorBlockMapper block_mapper( + typename TensorBlockMapper::Dimensions(evaluator.dimensions()), + block_shape, block_size); + + block_size = block_mapper.block_dims_total_size(); + const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1); + const size_t aligned_blocksize = + align * + divup(block_size * sizeof(typename Evaluator::Scalar), align); + void* buf = device.allocate((num_threads + 1) * aligned_blocksize); + + return {block_mapper, cost * block_size, buf, aligned_blocksize}; +} + template struct EvalRange { static void run(Evaluator* evaluator_in, const StorageIndex firstIdx, @@ -274,7 +355,7 @@ class TensorExecutor { typedef EvalRange EvalRange; Evaluator evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); if (needs_assign) { const StorageIndex size = array_prod(evaluator.dimensions()); device.parallelFor(size, evaluator.costPerCoeff(Vectorizable), @@ -290,18 +371,18 @@ class TensorExecutor { template class TensorExecutor { public: + typedef typename traits::Index StorageIndex; typedef typename traits::Scalar Scalar; typedef typename remove_const::type ScalarNoConst; - typedef TensorEvaluator Evaluator; - typedef typename traits::Index StorageIndex; - static const int NumDims = traits::NumDimensions; + typedef TensorEvaluator Evaluator; + typedef TensorBlockMapper BlockMapper; + typedef TensorExecutorTilingContext TilingContext; + static EIGEN_STRONG_INLINE void run(const Expression& expr, const ThreadPoolDevice& device) { - typedef TensorBlockMapper TensorBlockMapper; - Evaluator evaluator(expr, device); Index total_size = array_prod(evaluator.dimensions()); Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar); @@ -315,50 +396,152 @@ class TensorExecutor resources; - evaluator.getResourceRequirements(&resources); - MergeResourceRequirements(resources, &block_shape, &block_total_size); - int num_threads = device.numThreads(); + const TilingContext tiling = + internal::GetTensorExecutorTilingContext(device, evaluator); - // Estimate minimum block size based on cost. 
- TensorOpCost cost = evaluator.costPerCoeff(Vectorizable); - double taskSize = TensorCostModel::taskSize(1, cost); - size_t block_size = static_cast(1.0 / taskSize); - TensorBlockMapper block_mapper( - typename TensorBlockMapper::Dimensions(evaluator.dimensions()), - block_shape, block_size); - block_size = block_mapper.block_dims_total_size(); - const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1); - const size_t aligned_blocksize = - align * divup(block_size * sizeof(Scalar), align); - void* buf = device.allocate((num_threads + 1) * aligned_blocksize); device.parallelFor( - block_mapper.total_block_count(), cost * block_size, - [=, &device, &evaluator, &block_mapper](StorageIndex firstIdx, - StorageIndex lastIdx) { - // currentThreadId() returns -1 if called from a thread not in the - // thread pool, such as the main thread dispatching Eigen - // expressions. - const int thread_idx = device.currentThreadId(); - eigen_assert(thread_idx >= -1 && thread_idx < num_threads); - ScalarNoConst* thread_buf = reinterpret_cast( - static_cast(buf) + aligned_blocksize * (thread_idx + 1)); + tiling.block_mapper.total_block_count(), tiling.cost, + [=, &device, &evaluator, &tiling](StorageIndex firstIdx, + StorageIndex lastIdx) { + ScalarNoConst* thread_buf = + tiling.template GetCurrentThreadBuffer(device); for (StorageIndex i = firstIdx; i < lastIdx; ++i) { - auto block = block_mapper.GetBlockForIndex(i, thread_buf); + auto block = tiling.block_mapper.GetBlockForIndex(i, thread_buf); evaluator.evalBlock(&block); } }); - device.deallocate(buf); + device.deallocate(tiling.buffer); } evaluator.cleanup(); } }; +template +class TensorAsyncExecutor { + public: + typedef typename Expression::Index StorageIndex; + typedef TensorEvaluator Evaluator; + + static EIGEN_STRONG_INLINE void runAsync(const Expression& expr, + const ThreadPoolDevice& device, + std::function done) { + TensorAsyncExecutorContext* const ctx = + new TensorAsyncExecutorContext(expr, device, std::move(done)); + // TODO(ezhulenev): This is a potentially blocking operation. Make it async! 
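  // Editor's note: the heap-allocated context owns the evaluator for the lifetime of
  // the asynchronous evaluation; the final `[ctx]() { delete ctx; }` callback handed
  // to parallelForAsync destroys it, and the destructor then invokes the user's
  // `done` callback followed by evaluator.cleanup().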
+ const bool needs_assign = ctx->evaluator.evalSubExprsIfNeeded(nullptr); + + typedef EvalRange EvalRange; + + if (needs_assign) { + const StorageIndex size = array_prod(ctx->evaluator.dimensions()); + device.parallelForAsync( + size, ctx->evaluator.costPerCoeff(Vectorizable), + EvalRange::alignBlockSize, + [ctx](StorageIndex firstIdx, StorageIndex lastIdx) { + EvalRange::run(&ctx->evaluator, firstIdx, lastIdx); + }, + [ctx]() { delete ctx; }); + } + } + + private: + struct TensorAsyncExecutorContext { + TensorAsyncExecutorContext(const Expression& expr, + const ThreadPoolDevice& thread_pool, + std::function done) + : evaluator(expr, thread_pool), on_done(std::move(done)) {} + + ~TensorAsyncExecutorContext() { + on_done(); + evaluator.cleanup(); + } + + Evaluator evaluator; + + private: + std::function on_done; + }; +}; + +template +class TensorAsyncExecutor { + public: + typedef typename traits::Index StorageIndex; + typedef typename traits::Scalar Scalar; + typedef typename remove_const::type ScalarNoConst; + + static const int NumDims = traits::NumDimensions; + + typedef TensorEvaluator Evaluator; + typedef TensorBlockMapper BlockMapper; + typedef TensorExecutorTilingContext TilingContext; + + static EIGEN_STRONG_INLINE void runAsync(const Expression& expr, + const ThreadPoolDevice& device, + std::function done) { + TensorAsyncExecutorContext* const ctx = + new TensorAsyncExecutorContext(expr, device, std::move(done)); + + Index total_size = array_prod(ctx->evaluator.dimensions()); + Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar); + + if (total_size < cache_size && + !ExpressionHasTensorBroadcastingOp::value) { + internal::TensorAsyncExecutor::runAsync( + expr, device, [ctx]() { delete ctx; }); + return; + } + + // TODO(ezhulenev): This is a potentially blocking operation. Make it async! 
+ const bool needs_assign = ctx->evaluator.evalSubExprsIfNeeded(nullptr); + + if (needs_assign) { + ctx->tiling = + internal::GetTensorExecutorTilingContext(device, ctx->evaluator); + + device.parallelForAsync( + ctx->tiling.block_mapper.total_block_count(), ctx->tiling.cost, + [ctx](StorageIndex firstIdx, StorageIndex lastIdx) { + ScalarNoConst* thread_buf = + ctx->tiling.template GetCurrentThreadBuffer(ctx->device); + for (StorageIndex i = firstIdx; i < lastIdx; ++i) { + auto block = ctx->tiling.block_mapper.GetBlockForIndex(i, thread_buf); + ctx->evaluator.evalBlock(&block); + } + }, + [ctx]() { delete ctx; }); + } + } + + private: + struct TensorAsyncExecutorContext { + TensorAsyncExecutorContext(const Expression& expr, + const ThreadPoolDevice& thread_pool, + std::function done) + : device(thread_pool), + evaluator(expr, thread_pool), + on_done(std::move(done)) {} + + ~TensorAsyncExecutorContext() { + on_done(); + device.deallocate(tiling.buffer); + evaluator.cleanup(); + } + + const ThreadPoolDevice& device; + Evaluator evaluator; + TilingContext tiling; + + private: + std::function on_done; + }; +}; + #endif // EIGEN_USE_THREADS @@ -419,7 +602,7 @@ template EIGEN_STRONG_INLINE void TensorExecutor::run( const Expression& expr, const GpuDevice& device) { TensorEvaluator evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); if (needs_assign) { const int block_size = device.maxGpuThreadsPerBlock(); @@ -517,10 +700,10 @@ struct ExecExprFunctorKernel template class TensorExecutor { public: - typedef typename Expression::Index Index; + typedef typename Expression::Index Index; static EIGEN_STRONG_INLINE void run(const Expression &expr, const Eigen::SyclDevice &dev) { Eigen::TensorEvaluator evaluator(expr, dev); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); if (needs_assign) { Index range, GRange, tileSize; Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions()); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 3cca0c7e9..e823bd932 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -94,6 +94,7 @@ template class MakePointer_ = MakePointer> cl template class TensorForcedEvalOp; template class TensorDevice; +template class TensorAsyncDevice; template struct TensorEvaluator; struct NoOpOutputKernel; @@ -167,6 +168,11 @@ template ::value> class TensorExecutor; +template ::value, + bool Tileable = IsTileable::value> +class TensorAsyncExecutor; + } // end namespace internal } // end namespace Eigen diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp index e9922a48d..f4d0401da 100644 --- a/unsupported/test/cxx11_tensor_executor.cpp +++ b/unsupported/test/cxx11_tensor_executor.cpp @@ -562,37 +562,112 @@ static void test_execute_reverse_rvalue(Device d) } } +template +static void test_async_execute_unary_expr(Device d) +{ + static constexpr int Options = 0 | Layout; + + // Pick a large enough tensor size to bypass small tensor block evaluation + // optimization. 
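  // Editor's note: expressions whose total size falls below the
  // firstLevelCacheSize()-derived threshold in the ThreadPool executors are routed
  // to the non-tiled path, so the dimensions here must be large enough for the test
  // to actually exercise the tiled (async) executor.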
+ auto dims = RandomDims(50 / NumDims, 100 / NumDims); + + Tensor src(dims); + Tensor dst(dims); + + src.setRandom(); + const auto expr = src.square(); + + using Assign = TensorAssignOp; + using Executor = internal::TensorAsyncExecutor; + Eigen::Barrier done(1); + Executor::runAsync(Assign(dst, expr), d, [&done]() { done.Notify(); }); + done.Wait(); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + T square = src.coeff(i) * src.coeff(i); + VERIFY_IS_EQUAL(square, dst.coeff(i)); + } +} + +template +static void test_async_execute_binary_expr(Device d) +{ + static constexpr int Options = 0 | Layout; + + // Pick a large enough tensor size to bypass small tensor block evaluation + // optimization. + auto dims = RandomDims(50 / NumDims, 100 / NumDims); + + Tensor lhs(dims); + Tensor rhs(dims); + Tensor dst(dims); + + lhs.setRandom(); + rhs.setRandom(); + + const auto expr = lhs + rhs; + + using Assign = TensorAssignOp; + using Executor = internal::TensorAsyncExecutor; + + Eigen::Barrier done(1); + Executor::runAsync(Assign(dst, expr), d, [&done]() { done.Notify(); }); + done.Wait(); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + T sum = lhs.coeff(i) + rhs.coeff(i); + VERIFY_IS_EQUAL(sum, dst.coeff(i)); + } +} + #ifdef EIGEN_DONT_VECTORIZE #define VECTORIZABLE(VAL) !EIGEN_DONT_VECTORIZE && VAL -#else +#else #define VECTORIZABLE(VAL) VAL #endif #define CALL_SUBTEST_PART(PART) \ CALL_SUBTEST_##PART -#define CALL_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \ - CALL_SUBTEST_PART(PART)((NAME(default_device))); \ - CALL_SUBTEST_PART(PART)((NAME(default_device))); \ +#define CALL_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \ + CALL_SUBTEST_PART(PART)((NAME(default_device))); \ + CALL_SUBTEST_PART(PART)((NAME(default_device))); \ CALL_SUBTEST_PART(PART)((NAME(default_device))); \ CALL_SUBTEST_PART(PART)((NAME(default_device))); \ - CALL_SUBTEST_PART(PART)((NAME(default_device))); \ - CALL_SUBTEST_PART(PART)((NAME(default_device))); \ + CALL_SUBTEST_PART(PART)((NAME(default_device))); \ + CALL_SUBTEST_PART(PART)((NAME(default_device))); \ CALL_SUBTEST_PART(PART)((NAME(default_device))); \ CALL_SUBTEST_PART(PART)((NAME(default_device))); \ - CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ - CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ - CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ - CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ CALL_SUBTEST_PART(PART)((NAME(tp_device))) +// NOTE: Currently only ThreadPoolDevice supports async expression evaluation. +#define CALL_ASYNC_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \ + CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME(tp_device))) + EIGEN_DECLARE_TEST(cxx11_tensor_executor) { Eigen::DefaultDevice default_device; + // Default device is unused in ASYNC tests. 
+ EIGEN_UNUSED_VARIABLE(default_device); - const auto num_threads = internal::random(1, 24); + const auto num_threads = internal::random(20, 24); Eigen::ThreadPool tp(num_threads); Eigen::ThreadPoolDevice tp_device(&tp, num_threads); @@ -660,8 +735,16 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) { CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 4); CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 5); + CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 3); + CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 4); + CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 5); + + CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 3); + CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 4); + CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 5); + // Force CMake to split this test. - // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12;13;14 + // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16 } #undef CALL_SUBTEST_COMBINATIONS diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index f8a7b3662..53b50d1ed 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -38,9 +38,9 @@ class TestAllocator : public Allocator { void test_multithread_elementwise() { - Tensor in1(2,3,7); - Tensor in2(2,3,7); - Tensor out(2,3,7); + Tensor in1(200, 30, 70); + Tensor in2(200, 30, 70); + Tensor out(200, 30, 70); in1.setRandom(); in2.setRandom(); @@ -49,15 +49,39 @@ void test_multithread_elementwise() Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random(3, 11)); out.device(thread_pool_device) = in1 + in2 * 3.14f; - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 7; ++k) { - VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f); + for (int i = 0; i < 200; ++i) { + for (int j = 0; j < 30; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i, j, k), in1(i, j, k) + in2(i, j, k) * 3.14f); } } } } +void test_async_multithread_elementwise() +{ + Tensor in1(200, 30, 70); + Tensor in2(200, 30, 70); + Tensor out(200, 30, 70); + + in1.setRandom(); + in2.setRandom(); + + Eigen::ThreadPool tp(internal::random(3, 11)); + Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random(3, 11)); + + Eigen::Barrier b(1); + out.device(thread_pool_device, [&b]() { b.Notify(); }) = in1 + in2 * 3.14f; + b.Wait(); + + for (int i = 0; i < 200; ++i) { + for (int j = 0; j < 30; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i, j, k), in1(i, j, k) + in2(i, j, k) * 3.14f); + } + } + } +} void test_multithread_compound_assignment() { @@ -516,6 +540,7 @@ void test_threadpool_allocate(TestAllocator* allocator) EIGEN_DECLARE_TEST(cxx11_tensor_thread_pool) { CALL_SUBTEST_1(test_multithread_elementwise()); + CALL_SUBTEST_1(test_async_multithread_elementwise()); CALL_SUBTEST_1(test_multithread_compound_assignment()); CALL_SUBTEST_2(test_multithread_contraction()); From 619cea94916e7531a839ee0ff657714857921db8 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Fri, 30 Aug 2019 14:51:17 -0700 Subject: [PATCH 17/30] Revert accidentally removed header from ThreadPool --- unsupported/Eigen/CXX11/ThreadPool | 1 + 1 file changed, 1 insertion(+) diff --git a/unsupported/Eigen/CXX11/ThreadPool b/unsupported/Eigen/CXX11/ThreadPool index 613fdb57a..7a795da3d 100644 --- 
a/unsupported/Eigen/CXX11/ThreadPool +++ b/unsupported/Eigen/CXX11/ThreadPool @@ -43,6 +43,7 @@ #include #include #include +#include #include #include "src/util/CXX11Meta.h" #include "src/util/MaxSizeVector.h" From f0b36fb9a405400e82b73ea70097b8ae3cd1095a Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Fri, 30 Aug 2019 15:13:38 -0700 Subject: [PATCH 18/30] evalSubExprsIfNeededAsync + async TensorContractionThreadPool --- .../Eigen/CXX11/src/Tensor/TensorAssign.h | 12 + .../CXX11/src/Tensor/TensorBroadcasting.h | 8 + .../CXX11/src/Tensor/TensorContraction.h | 139 ++-- .../src/Tensor/TensorContractionThreadPool.h | 711 ++++++++++++------ .../CXX11/src/Tensor/TensorDeviceThreadPool.h | 35 +- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 51 +- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 36 +- .../Eigen/CXX11/src/ThreadPool/Barrier.h | 3 + unsupported/test/cxx11_tensor_thread_pool.cpp | 140 ++++ 9 files changed, 833 insertions(+), 302 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index d6e51bc6c..270ad974e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -147,6 +147,18 @@ struct TensorEvaluator, Device> // by the rhs to the lhs. return m_rightImpl.evalSubExprsIfNeeded(m_leftImpl.data()); } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) { + m_rightImpl.evalSubExprsIfNeededAsync( + m_leftImpl.data(), [done](bool need_assign) { done(need_assign); }); + }); + } +#endif // EIGEN_USE_THREADS + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_leftImpl.cleanup(); m_rightImpl.cleanup(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 10bdbc6a0..b290de311 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -214,6 +214,14 @@ struct TensorEvaluator, Device> return true; } +#ifdef EIGEN_USE_THREADS + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index a398b2b3f..2f8656fbb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -373,13 +373,13 @@ struct TensorContractionEvaluatorBase typedef typename Storage::Type EvaluatorPointerType; enum { - IsAligned = true, - PacketAccess = (PacketType::size > 1), - BlockAccess = false, + IsAligned = true, + PacketAccess = (PacketType::size > 1), + BlockAccess = false, PreferBlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = true + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = true }; // Most of the code is assuming that both input tensors are ColMajor. 
If the @@ -390,7 +390,7 @@ struct TensorContractionEvaluatorBase static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; typedef typename internal::conditional< static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; - + typedef TensorEvaluator LeftEvaluatorType; typedef TensorEvaluator RightEvaluatorType; @@ -605,48 +605,99 @@ struct TensorContractionEvaluatorBase } } -#define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS) \ - if (this->m_lhs_inner_dim_contiguous) { \ - if (this->m_rhs_inner_dim_contiguous) { \ - if (this->m_rhs_inner_dim_reordered) { \ - METHODARGS; \ - } \ - else { \ - METHODARGS; \ - } \ - } \ - else { \ - if (this->m_rhs_inner_dim_reordered) { \ - METHODARGS; \ - } \ - else { \ - METHODARGS; \ - } \ - } \ - } \ - else { \ - if (this->m_rhs_inner_dim_contiguous) { \ - if (this->m_rhs_inner_dim_reordered) { \ - METHODARGS; \ - } \ - else { \ - METHODARGS; \ - } \ - } \ - else { \ - if (this->m_rhs_inner_dim_reordered) { \ - METHODARGS; \ - } \ - else { \ - METHODARGS; \ - } \ - } \ - } +#ifdef EIGEN_USE_THREADS + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType dest, EvalSubExprsCallback done) { + m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) { + m_rightImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) { + if (dest) { + evalToAsync(dest, [done]() { done(false); }); + } else { + m_result = static_cast( + m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + evalToAsync(m_result, [done]() { done(true); }); + } + }); + }); + } +#endif // EIGEN_USE_THREADS + +#define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS) \ + if (this->m_lhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_reordered) { \ + METHOD ARGS; \ + } else { \ + METHOD ARGS; \ + } \ + } else { \ + if (this->m_rhs_inner_dim_reordered) { \ + METHOD ARGS; \ + } else { \ + METHOD ARGS; \ + } \ + } \ + } else { \ + if (this->m_rhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_reordered) { \ + METHOD ARGS; \ + } else { \ + METHOD ARGS; \ + } \ + } else { \ + if (this->m_rhs_inner_dim_reordered) { \ + METHOD ARGS; \ + } else { \ + METHOD ARGS; \ + } \ + } \ + } + +#define TENSOR_CONTRACTION_ASYNC_DISPATCH(METHOD, DONE, ALIGNMENT, ARGS, FN) \ + if (this->m_lhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_reordered) { \ + (new METHOD ARGS)->FN; \ + } else { \ + (new METHOD ARGS)->FN; \ + } \ + } else { \ + if (this->m_rhs_inner_dim_reordered) { \ + (new METHOD ARGS)->FN; \ + } else { \ + (new METHOD ARGS)->FN; \ + } \ + } \ + } else { \ + if (this->m_rhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_reordered) { \ + (new METHOD ARGS)->FN; \ + } else { \ + (new METHOD ARGS)->FN; \ + } \ + } else { \ + if (this->m_rhs_inner_dim_reordered) { \ + (new METHOD ARGS)->FN; \ + } else { \ + (new METHOD ARGS)->FN; \ + } \ + } \ + } EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const { static_cast(this)->template evalProduct(buffer); } +#ifdef EIGEN_USE_THREADS + template + void evalToAsync(Scalar* buffer, EvalToCallback done) const { + static_cast(this) + ->template evalProductAsync(buffer, + std::move(done)); + } +#endif // EIGEN_USE_THREADS + template void evalProductSequential(Scalar* buffer) const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h 
b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index ca20038a4..f9d9d6d31 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -73,6 +73,34 @@ struct TensorEvaluator void evalProduct(Scalar* buffer) const { + evalProductImpl(buffer, NoCallback()); + } + + template + void evalProductAsync(Scalar* buffer, EvalToCallback done) const { + evalProductImpl(buffer, std::move(done)); + } + + template + void evalProductImpl(Scalar* buffer, DoneCallback done) const { + // This function computes a lot of heuristics in multiple steps, and it + // also has multiple exit points. To keep it sane, readable and all in one + // place, sync/async execution decision is made at runtime at the very end. + // + // (1) In sync mode we allocate Context on the stack, submit computations + // to the device thread pool, and block on a barrier until it is + // completed. + // + // (2) In async mode we allocate Context on the heap, and after all tasks + // are finished, we call provided the done callback, and delete a + // context from the heap. + // + // (*) EvalParallelContext & EvalShardedByInnerDimContext owns all the state + // and temporary buffers, requried for executing the tensor contraction. + // They are responsible for cleaning it up after contraction is done. + static const bool IsEvalInSyncMode = + std::is_same::value; + const Index m = this->m_i_size; const Index n = this->m_j_size; const Index k = this->m_k_size; @@ -134,8 +162,16 @@ struct TensorEvaluatortemplate evalShardedByInnerDim(num_threads_by_k, - buffer); + if (IsEvalInSyncMode) { + EvalShardedByInnerDimContext ctx( + this, num_threads_by_k, buffer, m, n, k, std::move(done)); + ctx.template run(); + } else { + auto* ctx = new EvalShardedByInnerDimContext( + this, num_threads_by_k, buffer, m, n, k, std::move(done)); + ctx->template runAsync(); + } + return; } @@ -146,6 +182,7 @@ struct TensorEvaluatortemplate evalProductSequential, Unaligned, (buffer)); + if (!IsEvalInSyncMode) done(); return; } @@ -230,21 +267,89 @@ struct TensorEvaluator - class Context { + // ------------------------------------------------------------------------ // + + // Dummy struct to represent an empty DoneCallback. + + struct NoCallback { + void operator()() { + eigen_assert(false && "NoCallback should never be called"); + } + }; + + // ------------------------------------------------------------------------ // + + template + class EvalParallelNotification; + + // Synchronous evaluation notification that blocks caller thread in Wait(). + template + class EvalParallelNotification { + public: + EvalParallelNotification(Context*, NoCallback) {} + void Notify() { done_.Notify(); } + void Wait() { done_.Wait(); } + private: + Eigen::Notification done_; + }; + + // Asynchronous evaluation notification that does not block in Wait(). + template + class EvalParallelNotification { + public: + EvalParallelNotification(Context* ctx, DoneCallback done) + : ctx_(ctx), done_(std::move(done)) {} + + void Notify() { + // Make a copy of done callback, because it will be destructed when we + // will delete context in the next line (EvalParallelNotification is a + // data member of EvalParallelContext class). + DoneCallback done_copy = std::move(done_); + + // Delete parallel evaluation context. + delete ctx_; + + // Now safely call the done callback. 
+ done_copy(); + } + + void Wait() {} + + private: + Context* ctx_; + DoneCallback done_; + }; + + // Context orchestrates sync/async parallel contraction evaluation. When it is + // executed in asynchronous mode, it owns all the shared state that might be + // accessible by block packing and kernel tasks. + + template + class EvalParallelContext { public: typedef internal::TensorContractionInputMapper< LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t, @@ -267,11 +372,15 @@ struct TensorEvaluatorm_device), + EvalParallelContext(const Self* self, int num_threads, Scalar* buffer, + Index tm, Index tn, Index tk, Index bm, Index bn, + Index bk, Index nm, Index nn, Index nk, Index gm, + Index gn, Index nm0, Index nn0, bool shard_by_col, + bool parallel_pack, + bool parallelize_by_sharding_dim_only, + DoneCallback done) + : done_(this, std::move(done)), + device_(self->m_device), lhs_(self->m_leftImpl, self->m_left_nocontract_strides, self->m_i_strides, self->m_left_contracting_strides, self->m_k_strides), @@ -299,8 +408,7 @@ struct TensorEvaluator done_; + const Device& device_; LhsMapper lhs_; RhsMapper rhs_; @@ -780,10 +900,344 @@ struct TensorEvaluator + using SyncEvalParallelContext = + EvalParallelContext; + + // ------------------------------------------------------------------------ // + + // EvalShardedByInnerDimContext orchestrates sync/async contraction + // evaluation, when we shard by inner dimension. When it is executed in + // asynchronous mode, it owns all the shared state that might be accessible by + // block processing tasks. + + template + struct EvalShardedByInnerDimContext { + EvalShardedByInnerDimContext(const Self* evaluator, int num_threads, + Scalar* result, Index m, Index n, Index k, + DoneCallback done) + : evaluator(evaluator), + m_lhs_inner_dim_contiguous(evaluator->m_lhs_inner_dim_contiguous), + m_rhs_inner_dim_contiguous(evaluator->m_rhs_inner_dim_contiguous), + m_rhs_inner_dim_reordered(evaluator->m_rhs_inner_dim_reordered), + num_threads(num_threads), + result(result), + m(m), + n(n), + k(k), + done(std::move(done)), + buffer_size_bytes(m * n * sizeof(Scalar)), + block_size(blockSize(k, num_threads)), + num_blocks(divup(k, block_size)), + num_pending_blocks(internal::convert_index(num_blocks)), + l0_ranges(divup(num_blocks, l0_size)), + l0_state(l0_ranges), + block_buffers(num_blocks) { + // Keep count of pending gemm tasks for each l0 range. + for (int i = 0; i < l0_ranges; ++i) { + const Index num_pending_tasks = actualRangeSize(l0_ranges, l0_size, i); + l0_state.emplace_back(internal::convert_index(num_pending_tasks)); + } + + // Allocate temporary buffers for each block. + for (Index block_idx = 0; block_idx < num_blocks; ++block_idx) { + Scalar* buf = block_idx == 0 + ? result + : static_cast(evaluator->m_device.allocate( + buffer_size_bytes)); + block_buffers.emplace_back(buf); + } + } + + ~EvalShardedByInnerDimContext() { + for (Index i = 1; i < num_blocks; ++i) { + evaluator->m_device.deallocate(block_buffers[i]); + } + } + + template + void run() { + Barrier barrier(internal::convert_index(num_blocks)); + for (Index block_idx = 0; block_idx < num_blocks; ++block_idx) { + evaluator->m_device.enqueueNoNotification( + [this, block_idx, &barrier]() { + Index block_start = block_idx * block_size; + Index block_end = block_start + actualBlockSize(block_idx); + + processBlock(block_idx, block_start, block_end); + barrier.Notify(); + }); + } + barrier.Wait(); + + // Aggregate partial sums from l0 ranges. 
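      // Editor's note, worked example: with num_blocks = 10 and l0_size = 4 there are
      // l0_ranges = 3 ranges covering blocks {0..3}, {4..7} and {8,9}. The last task
      // to finish within each range has already folded that range's partial results
      // into the range's first block, so this final pass only needs to add
      // block_buffers[4] and block_buffers[8] into block_buffers[0].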
+ aggregateL0Blocks(); + + // Apply output kernel. + applyOutputKernel(); + } + + template + void runAsync() { + for (Index block_idx = 0; block_idx < num_blocks; ++block_idx) { + evaluator->m_device.enqueueNoNotification([this, block_idx]() { + Index block_start = block_idx * block_size; + Index block_end = block_start + actualBlockSize(block_idx); + + processBlock(block_idx, block_start, block_end); + + int v = num_pending_blocks.fetch_sub(1); + eigen_assert(v >= 1); + + if (v == 1) { + // Aggregate partial sums from l0 ranges. + aggregateL0Blocks(); + + // Apply output kernel. + applyOutputKernel(); + + // NOTE: If we call `done` callback before deleting this (context), + // it might deallocate Self* pointer captured by context, and we'll + // fail in destructor trying to deallocate temporary buffers. + + // Move done call back from context before it will be destructed. + DoneCallback done_copy = std::move(done); + + // We are confident that we are the last one who touches context. + delete this; + + // Now safely call the done callback. + done_copy(); + } + }); + } + } + + private: + // The underlying GEMM kernel assumes that k is a multiple of + // the packet size and subtle breakage occurs if this is violated. + static const Index packet_size = internal::packet_traits::size; + + const Self* evaluator; // TensorContraction evaluator + + // These fields required fromTENSOR_CONTRACTION_DISPATCH macro. + bool m_lhs_inner_dim_contiguous; + bool m_rhs_inner_dim_contiguous; + bool m_rhs_inner_dim_reordered; + + int num_threads; + Scalar* result; + + Index m; + Index n; + Index k; + + DoneCallback done; + + // ----------------------------------------------------------------------// + // Algorithm parameters. + + // We will compute partial results into the buffers of this size. + Index buffer_size_bytes; + + Index block_size; + Index num_blocks; + + // Keep track of pending tasks when evaluate in async mode. + std::atomic num_pending_blocks; + + // We compute partial gemm results in parallel, and to get the final result + // we need to add them all together. For the large number of threads (>= 48) + // this adds a very expensive sequential step at the end. + // + // We split the [0, num_blocks) into small ranges, and when a task for the + // block finishes its partial gemm computation, it checks if it was the last + // gemm in the range, and if so, it will add all blocks of the range. + // + // After all tasks done, we need to add only these pre-aggregated blocks. + + // For now we use just a single level of ranges to compute pre-aggregated + // partial sums, but in general we can use more layers to compute tree + // aggregation in parallel and reduce the size of the sequential step. + // + // TODO(ezhulenev): Add multilevel tree aggregation? Probably will make + // sense only if number of threads >= ~128? + static const Index l0_size = 4; + Index l0_ranges; + + // Keep count of pending gemm tasks for each l0 range. + MaxSizeVector> l0_state; // [0, l0_ranges) + + // Buffers allocated for each temporary block computation. + MaxSizeVector block_buffers; // [0, num_blocks) + + template + void processBlock(Index block_idx, Index begin, Index end) { + Scalar* buf = block_buffers[block_idx]; + ::memset(buf, 0, buffer_size_bytes); + + TENSOR_CONTRACTION_DISPATCH( + evaluator->template evalGemmPartialWithoutOutputKernel, Alignment, + (buf, begin, end, + /*num_threads=*/internal::convert_index(num_blocks))); + + // Check if it was the last task in l0 range. 
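      // Editor's note: fetch_sub returns the counter value *before* the decrement, so
      // v == 1 below means this task retired the last pending block of its l0 range
      // and is therefore the one responsible for folding the range's partial sums
      // into the range's first block.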
+ const Index l0_index = block_idx / l0_size; + const int v = l0_state[l0_index].fetch_sub(1); + eigen_assert(v >= 1); + + // If we processed the last block of the range, we can aggregate all + // partial results into the first block of the range. + if (v == 1) { + const Index rng_size = actualRangeSize(l0_ranges, l0_size, l0_index); + const Index dst_block_idx = l0_index * l0_size; + + if (rng_size == l0_size) { + addAllToBuffer( + m * n, + /*src_buf0=*/block_buffers[dst_block_idx + 1], + /*src_buf1=*/block_buffers[dst_block_idx + 2], + /*src_buf2=*/block_buffers[dst_block_idx + 3], + /*dst_buf= */ block_buffers[dst_block_idx]); + } else { + // Aggregate blocks of potentially incomplete last range. + for (int i = 1; i < rng_size; ++i) { + addToBuffer(m * n, + /*src_buf=*/block_buffers[dst_block_idx + i], + /*dst_buf=*/block_buffers[dst_block_idx]); + } + } + } + } + + // Aggregate partial sums from l0 ranges. + template + void aggregateL0Blocks() const { + Index l0_index = 1; + + for (; l0_index + 2 < l0_ranges; l0_index += 3) { + addAllToBuffer( + m * n, + /*src_buf0=*/block_buffers[(l0_index + 0) * l0_size], + /*src_buf1=*/block_buffers[(l0_index + 1) * l0_size], + /*src_buf2=*/block_buffers[(l0_index + 2) * l0_size], + /*dst_buf= */ block_buffers[0]); + } + + for (; l0_index < l0_ranges; ++l0_index) { + addToBuffer(m * n, block_buffers[l0_index * l0_size], + block_buffers[0]); + } + } + + void applyOutputKernel() const { + typedef internal::blas_data_mapper OutputMapper; + evaluator->m_output_kernel( + OutputMapper(result, m), evaluator->m_tensor_contraction_params, + static_cast(0), static_cast(0), m, n); + } + + // Compute block size with accounting for potentially incomplete last block. + Index actualBlockSize(Index block_idx) const { + return block_idx + 1 < num_blocks + ? block_size + : k + block_size - block_size * num_blocks; + }; + + // Compute range size with accounting for potentially incomplete last range. + Index actualRangeSize(Index num_ranges, Index range_size, + Index range_idx) const { + eigen_assert(range_idx < num_ranges); + return range_idx + 1 < num_ranges + ? 
range_size + : num_blocks + range_size - range_size * num_ranges; + }; + + template + EIGEN_STRONG_INLINE static void addToBuffer(size_t n, const Scalar* src_buf, + Scalar* tgt_buf) { + const int output_packet_size = + internal::unpacket_traits::size; + size_t i = 0; + const size_t num_packets = n / output_packet_size; + for (; i < output_packet_size * num_packets; i += output_packet_size) { + const PacketReturnType src_val = + internal::pload(src_buf + i); + const PacketReturnType tgt_val = + internal::ploadt(tgt_buf + i); + const PacketReturnType sum = internal::padd(src_val, tgt_val); + internal::pstoret(tgt_buf + i, + sum); + } + for (; i < n; ++i) { + tgt_buf[i] += src_buf[i]; + } + } + + template + EIGEN_STRONG_INLINE static void addAllToBuffer(size_t n, + const Scalar* src_buf0, + const Scalar* src_buf1, + const Scalar* src_buf2, + Scalar* dst_buf) { + using ::Eigen::internal::padd; + using ::Eigen::internal::pload; + using ::Eigen::internal::ploadt; + using ::Eigen::internal::pstoret; + + const int output_packet_size = + internal::unpacket_traits::size; + + size_t i = 0; + const size_t num_packets = n / output_packet_size; + for (; i < output_packet_size * num_packets; i += output_packet_size) { + const auto src_val0 = pload(src_buf0 + i); + const auto src_val1 = pload(src_buf1 + i); + const auto src_val2 = pload(src_buf2 + i); + + const auto dst_val = ploadt(dst_buf + i); + const auto sum = + padd(padd(dst_val, src_val0), padd(src_val1, src_val2)); + + pstoret(dst_buf + i, sum); + } + for (; i < n; ++i) { + dst_buf[i] += src_buf0[i] + src_buf1[i] + src_buf2[i]; + } + } + + // Cost model doesn't capture well the cost associated with constructing + // tensor contraction mappers and computing loop bounds in gemm_pack_lhs + // and gemm_pack_rhs, so we specify minimum desired block size. + static Index blockSize(Index k, int num_threads) { + const auto round_up = [=](Index index) -> Index { + const Index kmultiple = packet_size <= 8 ? 8 : packet_size; + return divup(index, kmultiple) * kmultiple; + }; + + const Index target_block_size = round_up(divup(k, num_threads)); + const Index desired_min_block_size = 12 * packet_size; + + return numext::mini( + k, numext::maxi(desired_min_block_size, target_block_size)); + } + + EvalShardedByInnerDimContext(const EvalShardedByInnerDimContext&) = delete; + void operator=(const EvalShardedByInnerDimContext&) = delete; + }; + + // ------------------------------------------------------------------------ // + + // Below are the function used by evalProductImpl heuristics, trying to select + // optimcal parameters for parallelization algorithm. + // Decide whether we want to shard m x n contraction by columns or by rows. 
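  // As a concrete illustration of the blockSize() helper in
  // EvalShardedByInnerDimContext above (all numbers assumed): for float with
  // packet_size = 4, k = 1000 and num_threads = 8, divup(1000, 8) = 125 is
  // rounded up to 128 (a multiple of 8), the desired minimum is 12 * 4 = 48,
  // so block_size = mini(1000, maxi(48, 128)) = 128 and the contraction
  // dimension is split into divup(1000, 128) = 8 blocks.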
static bool shardByCol(Index m, Index n, Index num_threads) { // Note: we are comparing both n and m against Traits::nr, it is not @@ -916,55 +1370,6 @@ struct TensorEvaluator - EIGEN_STRONG_INLINE void addToBuffer(size_t n, const Scalar* src_buf, - Scalar* tgt_buf) const { - const int output_packet_size = internal::unpacket_traits::size; - size_t i = 0; - const size_t num_packets = n / output_packet_size; - for (; i < output_packet_size * num_packets; i += output_packet_size) { - const PacketReturnType src_val = - internal::pload(src_buf + i); - const PacketReturnType tgt_val = - internal::ploadt(tgt_buf + i); - const PacketReturnType sum = internal::padd(src_val, tgt_val); - internal::pstoret(tgt_buf + i, sum); - } - for (; i < n; ++i) { - tgt_buf[i] += src_buf[i]; - } - } - - template - EIGEN_STRONG_INLINE void addAllToBuffer(size_t n, const Scalar* src_buf0, - const Scalar* src_buf1, - const Scalar* src_buf2, - Scalar* dst_buf) const { - using ::Eigen::internal::padd; - using ::Eigen::internal::pload; - using ::Eigen::internal::ploadt; - using ::Eigen::internal::pstoret; - - const int output_packet_size = - internal::unpacket_traits::size; - - size_t i = 0; - const size_t num_packets = n / output_packet_size; - for (; i < output_packet_size * num_packets; i += output_packet_size) { - const auto src_val0 = pload(src_buf0 + i); - const auto src_val1 = pload(src_buf1 + i); - const auto src_val2 = pload(src_buf2 + i); - - const auto dst_val = ploadt(dst_buf + i); - const auto sum = padd(padd(dst_val, src_val0), padd(src_val1, src_val2)); - - pstoret(dst_buf + i, sum); - } - for (; i < n; ++i) { - dst_buf[i] += src_buf0[i] + src_buf1[i] + src_buf2[i]; - } - } - // Decide whether we want to shard m x k x n contraction over the inner // (contraction) dimension (k). static bool shardByInnerDim(Index m, Index n, Index k, int num_threads, @@ -992,163 +1397,6 @@ struct TensorEvaluator - void evalShardedByInnerDim(int num_threads, Scalar* result) const { - const Index m = this->m_i_size; - const Index n = this->m_j_size; - const Index k = this->m_k_size; - - // We will compute partial results into the buffers of this size. - const Index buffer_size_bytes = m * n * sizeof(Scalar); - - // The underlying GEMM kernel assumes that k is a multiple of - // the packet size and subtle breakage occurs if this is violated. - const Index packet_size = internal::packet_traits::size; - - const auto round_up = [=](Index index) -> Index { - const Index kmultiple = packet_size <= 8 ? 8 : packet_size; - return divup(index, kmultiple) * kmultiple; - }; - - // Cost model doesn't capture well the cost associated with constructing - // tensor contraction mappers and computing loop bounds in gemm_pack_lhs and - // gemm_pack_rhs, so we specify minimum desired block size. - const Index target_block_size = round_up(divup(k, num_threads)); - const Index desired_min_block_size = 12 * packet_size; - - const Index block_size = numext::mini( - k, numext::maxi(desired_min_block_size, target_block_size)); - const Index num_blocks = divup(k, block_size); - - // Compute block size with accounting for potentially incomplete last block. - const auto actual_block_size = [=](Index block_idx) -> Index { - return block_idx + 1 < num_blocks - ? block_size - : k + block_size - block_size * num_blocks; - }; - - // We compute partial gemm results in parallel, and to get the final result - // we need to add them all together. For the large number of threads (>= 48) - // this adds a very expensive sequential step at the end. 
- // - // We split the [0, num_blocks) into small ranges, and when a task for the - // block finishes its partial gemm computation, it checks if it was the last - // gemm in the range, and if so, it will add all blocks of the range. - // - // After all tasks finihes, we need to add only these pre-aggregated blocks. - - // Compute range size with accounting for potentially incomplete last range. - const auto actual_range_size = [=](Index num_ranges, Index range_size, - Index range_idx) -> Index { - eigen_assert(range_idx < num_ranges); - return range_idx + 1 < num_ranges - ? range_size - : num_blocks + range_size - range_size * num_ranges; - }; - - // For now we use just a single level of ranges to compute pre-aggregated - // partial sums, but in general we can use more layers to compute tree - // aggregation in parallel and reduce the size of the sequential step. - // - // TODO(ezhulenev): Add multilevel tree aggregation? Probably will make - // sense only if number of threads >= ~128? - static const Index l0_size = 4; - const Index l0_ranges = divup(num_blocks, l0_size); - - // Keep count of pending gemm tasks for each l0 range. - MaxSizeVector> l0_state(l0_ranges); - for (int i = 0; i < l0_ranges; ++i) { - const Index num_pending_tasks = actual_range_size(l0_ranges, l0_size, i); - l0_state.emplace_back(internal::convert_index(num_pending_tasks)); - } - - MaxSizeVector block_buffers(num_blocks); - - auto process_block = [&, this](Index block_idx, Index begin, Index end) { - Scalar* buf = block_buffers[block_idx]; - ::memset(buf, 0, buffer_size_bytes); - - TENSOR_CONTRACTION_DISPATCH( - this->template evalGemmPartialWithoutOutputKernel, Alignment, - (buf, begin, end, - /*num_threads=*/internal::convert_index(num_blocks))); - - // Check if it was the last task in l0 range. - const Index l0_index = block_idx / l0_size; - const int v = l0_state[l0_index].fetch_sub(1); - eigen_assert(v >= 1); - - // If we processed the last block of the range, we can aggregate all - // partial results into the first block of the range. - if (v == 1) { - const Index rng_size = actual_range_size(l0_ranges, l0_size, l0_index); - const Index dst_block_idx = l0_index * l0_size; - - if (rng_size == l0_size) { - addAllToBuffer( - m * n, - /*src_buf0=*/block_buffers[dst_block_idx + 1], - /*src_buf1=*/block_buffers[dst_block_idx + 2], - /*src_buf2=*/block_buffers[dst_block_idx + 3], - /*dst_buf= */ block_buffers[dst_block_idx]); - } else { - // Aggregate blocks of potentially incomplete last range. - for (int i = 1; i < rng_size; ++i) { - addToBuffer(m * n, - /*src_buf=*/block_buffers[dst_block_idx + i], - /*dst_buf=*/block_buffers[dst_block_idx]); - } - } - } - }; - - Barrier barrier(internal::convert_index(num_blocks)); - for (Index block_idx = 0; block_idx < num_blocks; ++block_idx) { - Scalar* buf = block_idx == 0 - ? result - : static_cast( - this->m_device.allocate(buffer_size_bytes)); - block_buffers.push_back(buf); - - Index block_start = block_idx * block_size; - Index block_end = block_start + actual_block_size(block_idx); - - this->m_device.enqueueNoNotification([=, &barrier, &process_block]() { - process_block(block_idx, block_start, block_end); - barrier.Notify(); - }); - } - barrier.Wait(); - - // Aggregate partial sums from l0 ranges. 
- Index l0_index = 1; - for (; l0_index + 2 < l0_ranges; l0_index += 3) { - addAllToBuffer( - m * n, - /*src_buf0=*/block_buffers[(l0_index + 0) * l0_size], - /*src_buf1=*/block_buffers[(l0_index + 1) * l0_size], - /*src_buf2=*/block_buffers[(l0_index + 2) * l0_size], - /*dst_buf= */block_buffers[0]); - } - for (; l0_index < l0_ranges; ++l0_index) { - addToBuffer(m * n, block_buffers[l0_index * l0_size], - block_buffers[0]); - } - - // Don't forget to deallocate ALL temporary buffers. - for (Index i = 1; i < num_blocks; ++i) { - this->m_device.deallocate(block_buffers[i]); - } - - // Finally call output kernel with finalized output buffer. - typedef internal::blas_data_mapper OutputMapper; - this->m_output_kernel(OutputMapper(result, m), - this->m_tensor_contraction_params, - static_cast(0), - static_cast(0), - m, n); - } - TensorOpCost contractionCostPerInnerDim(Index m, Index n, Index k) const { // Compute cost. const int output_packet_size = internal::unpacket_traits::size; @@ -1188,7 +1436,6 @@ struct TensorEvaluator f) const { - parallelFor(n, cost, NULL, std::move(f)); + parallelFor(n, cost, nullptr, std::move(f)); } // WARNING: This function is asynchronous and will not block the calling thread. @@ -248,6 +248,14 @@ struct ThreadPoolDevice { std::function block_align, std::function f, std::function done) const { + // Compute small problems directly in the caller thread. + if (n <= 1 || numThreads() == 1 || + CostModel::numThreads(n, cost, static_cast(numThreads())) == 1) { + f(0, n); + done(); + return; + } + // Compute block size and total count of blocks. ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align); @@ -269,24 +277,26 @@ struct ThreadPoolDevice { // Single block or less, execute directly. ctx->f(firstIdx, lastIdx); - // Call 'done' callback if it was the last block. - if (ctx->count.fetch_sub(1) == 1) { - (ctx->done)(); - // We can't delete ctx right now, because it will deallocate the closure - // we are currently in. - pool_->Schedule([ctx]() { delete ctx; }); - } + // Delete async context if it was the last block. + if (ctx->count.fetch_sub(1) == 1) delete ctx; }; - // Execute the root in the thread pool. - pool_->Schedule([ctx, n]() { ctx->handle_range(0, n); }); + if (block.count <= numThreads()) { + // Avoid a thread hop by running the root of the tree and one block on the + // main thread. + ctx->handle_range(0, n); + } else { + // Execute the root in the thread pool to avoid running work on more than + // numThreads() threads. + pool_->Schedule([ctx, n]() { ctx->handle_range(0, n); }); + } } // Convenience wrapper for parallelForAsync that does not align blocks. void parallelForAsync(Index n, const TensorOpCost& cost, std::function f, std::function done) const { - parallelForAsync(n, cost, NULL, std::move(f), std::move(done)); + parallelForAsync(n, cost, nullptr, std::move(f), std::move(done)); } // Thread pool accessor. 
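As a usage sketch of the asynchronous API above (the pool size, cost constants
and the src/dst buffers below are invented for illustration; only the
parallelForAsync signature comes from this file), a caller provides a range
functor plus a 'done' callback and must keep the data alive until that
callback has run:

    #define EIGEN_USE_THREADS
    #include <unsupported/Eigen/CXX11/Tensor>
    #include <vector>

    int main() {
      std::vector<float> src(1024, 1.0f), dst(1024, 0.0f);

      Eigen::ThreadPool pool(4);
      Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);

      Eigen::Barrier barrier(1);
      device.parallelForAsync(
          /*n=*/static_cast<Eigen::Index>(dst.size()),
          Eigen::TensorOpCost(/*bytes_loaded=*/sizeof(float),
                              /*bytes_stored=*/sizeof(float),
                              /*compute_cycles=*/1),
          [&](Eigen::Index first, Eigen::Index last) {
            // Per-block body: runs on the pool (or inline for tiny problems).
            for (Eigen::Index i = first; i < last; ++i) dst[i] = 2.0f * src[i];
          },
          [&barrier]() { barrier.Notify(); });  // fires after the last block

      barrier.Wait();  // dst is fully written only after the done callback
      return 0;
    }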
@@ -307,6 +317,7 @@ struct ThreadPoolDevice { : count(block_count), f(std::move(block_f)), done(std::move(done_callback)) {} + ~ParallelForAsyncContext() { done(); } std::atomic count; std::function f; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index a3a79d4e9..fec735868 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -79,7 +79,16 @@ struct TensorEvaluator return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } +#ifdef EIGEN_USE_THREADS + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType dest, EvalSubExprsCallback done) { + // TODO(ezhulenev): ThreadPoolDevice memcpy is blockign operation. + done(evalSubExprsIfNeeded(dest)); + } +#endif // EIGEN_USE_THREADS + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { eigen_assert(m_data != NULL); @@ -247,6 +256,15 @@ struct TensorEvaluator return true; } +#ifdef EIGEN_USE_THREADS + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType dest, EvalSubExprsCallback done) { + // TODO(ezhulenev): ThreadPoolDevice memcpy is a blockign operation. + done(evalSubExprsIfNeeded(dest)); + } +#endif // EIGEN_USE_THREADS + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { @@ -346,6 +364,15 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + done(true); + } +#endif // EIGEN_USE_THREADS + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const @@ -425,6 +452,15 @@ struct TensorEvaluator, Device> m_argImpl.evalSubExprsIfNeeded(NULL); return true; } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + m_argImpl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_argImpl.cleanup(); } @@ -546,6 +582,19 @@ struct TensorEvaluator + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + // TODO(ezhulenev): Evaluate two expression in parallel? 
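+    // Note on ordering: the nested callbacks below serialize the two
+    // evaluations. The left sub-expression is evaluated first, its completion
+    // callback launches the right one, and the innermost callback finally
+    // reports completion through done(true).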
+ m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) { + m_rightImpl.evalSubExprsIfNeededAsync(nullptr, + [done](bool) { done(true); }); + }); + } +#endif // EIGEN_USE_THREADS + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_leftImpl.cleanup(); m_rightImpl.cleanup(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 18d9de9e6..ce2337b63 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -430,12 +430,14 @@ class TensorAsyncExecutor std::function done) { TensorAsyncExecutorContext* const ctx = new TensorAsyncExecutorContext(expr, device, std::move(done)); - // TODO(ezhulenev): This is a potentially blocking operation. Make it async! - const bool needs_assign = ctx->evaluator.evalSubExprsIfNeeded(nullptr); - typedef EvalRange EvalRange; + const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void { + if (!need_assign) { + delete ctx; + return; + } - if (needs_assign) { + typedef EvalRange EvalRange; const StorageIndex size = array_prod(ctx->evaluator.dimensions()); device.parallelForAsync( size, ctx->evaluator.costPerCoeff(Vectorizable), @@ -444,7 +446,9 @@ class TensorAsyncExecutor EvalRange::run(&ctx->evaluator, firstIdx, lastIdx); }, [ctx]() { delete ctx; }); - } + }; + + ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs); } private: @@ -496,26 +500,32 @@ class TensorAsyncExecutorevaluator.evalSubExprsIfNeeded(nullptr); + const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void { + if (!need_assign) { + delete ctx; + return; + } - if (needs_assign) { ctx->tiling = - internal::GetTensorExecutorTilingContext(device, ctx->evaluator); + GetTensorExecutorTilingContext(device, ctx->evaluator); device.parallelForAsync( ctx->tiling.block_mapper.total_block_count(), ctx->tiling.cost, [ctx](StorageIndex firstIdx, StorageIndex lastIdx) { ScalarNoConst* thread_buf = - ctx->tiling.template GetCurrentThreadBuffer(ctx->device); + ctx->tiling.template GetCurrentThreadBuffer( + ctx->device); for (StorageIndex i = firstIdx; i < lastIdx; ++i) { - auto block = ctx->tiling.block_mapper.GetBlockForIndex(i, thread_buf); + auto block = + ctx->tiling.block_mapper.GetBlockForIndex(i, thread_buf); ctx->evaluator.evalBlock(&block); } }, [ctx]() { delete ctx; }); - } + }; + + ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs); } private: diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/Barrier.h b/unsupported/Eigen/CXX11/src/ThreadPool/Barrier.h index bae68e1fb..e4c59dc3d 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/Barrier.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/Barrier.h @@ -25,6 +25,9 @@ class Barrier { void Notify() { unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2; if (v != 1) { + // Clear the lowest bit (waiter flag) and check that the original state + // value was not zero. If it was zero, it means that notify was called + // more times than the original count. 
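+      // For example, a Barrier constructed with count == 2 stores the count
+      // shifted left by one, so state_ starts at 4. Two Notify() calls see
+      // v == 2 and then v == 0, and both pass this assert; a third, erroneous
+      // Notify() wraps v around so that ((v + 2) & ~1) == 0 and the assert
+      // fires.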
eigen_plain_assert(((v + 2) & ~1) != 0); return; // either count has not dropped to 0, or waiter is not waiting } diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index 53b50d1ed..62973cd08 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -330,6 +330,52 @@ static void test_multithread_contraction_with_output_kernel() { } } +template +void test_async_multithread_contraction_agrees_with_singlethread() +{ + int contract_size = internal::random(100, 500); + + Tensor left(internal::random(10, 40), + contract_size, + internal::random(10, 40)); + + Tensor right( + internal::random(1, 20), internal::random(1, 20), contract_size, + internal::random(1, 20)); + + left.setRandom(); + right.setRandom(); + + // add constants to shift values away from 0 for more precision + left += left.constant(1.5f); + right += right.constant(1.5f); + + typedef Tensor::DimensionPair DimPair; + Eigen::array dims({{DimPair(1, 2)}}); + + Eigen::ThreadPool tp(internal::random(2, 11)); + Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random(8, 32)); + + Tensor st_result; + st_result = left.contract(right, dims); + + Tensor tp_result(st_result.dimensions()); + + Eigen::Barrier barrier(1); + tp_result.device(thread_pool_device, [&barrier]() { barrier.Notify(); }) = + left.contract(right, dims); + barrier.Wait(); + + VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions())); + for (ptrdiff_t i = 0; i < st_result.size(); i++) { + // if both of the values are very small, then do nothing (because the test + // will fail due to numerical precision issues when values are small) + if (numext::abs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4f) { + VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]); + } + } +} + // We are triggering 'evalShardedByInnerDim' optimization. template static void test_sharded_by_inner_dim_contraction() @@ -410,6 +456,93 @@ static void test_sharded_by_inner_dim_contraction_with_output_kernel() } } +// We are triggering 'evalShardedByInnerDim' optimization. +template +static void test_async_sharded_by_inner_dim_contraction() +{ + typedef Tensor::DimensionPair DimPair; + + const int num_threads = internal::random(4, 16); + ThreadPool threads(num_threads); + Eigen::ThreadPoolDevice device(&threads, num_threads); + + Tensor t_left(2, 10000); + Tensor t_right(10000, 10); + Tensor t_result(2, 10); + + t_left.setRandom(); + t_right.setRandom(); + // Put trash in t_result to verify contraction clears output memory. + t_result.setRandom(); + + // Add a little offset so that the results won't be close to zero. + t_left += t_left.constant(1.0f); + t_right += t_right.constant(1.0f); + + typedef Map> MapXf; + MapXf m_left(t_left.data(), 2, 10000); + MapXf m_right(t_right.data(), 10000, 10); + Eigen::Matrix m_result(2, 10); + + // this contraction should be equivalent to a single matrix multiplication + Eigen::array dims({{DimPair(1, 0)}}); + + // compute results by separate methods + Eigen::Barrier barrier(1); + t_result.device(device, [&barrier]() { barrier.Notify(); }) = + t_left.contract(t_right, dims); + barrier.Wait(); + + m_result = m_left * m_right; + + for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]); + } +} + +// We are triggering 'evalShardedByInnerDim' optimization with output kernel. 
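// For reference, an output kernel is just a functor that the contraction
// evaluator calls on every finalized block of the result (see the call in
// applyOutputKernel() above). A minimal hypothetical kernel matching that
// calling convention (the ScaleOutputKernel name and the 0.5 factor are
// invented; the SqrtOutputKernel used in these tests is defined earlier in
// this file and follows the same shape):
struct ScaleOutputKernel {
  template <typename Index, typename Scalar>
  EIGEN_ALWAYS_INLINE void operator()(
      const Eigen::internal::blas_data_mapper<Scalar, Index, Eigen::ColMajor>&
          output_mapper,
      const Eigen::TensorContractionParams& /*params*/, Index /*i*/,
      Index /*j*/, Index num_rows, Index num_cols) const {
    for (Index c = 0; c < num_cols; ++c)
      for (Index r = 0; r < num_rows; ++r)
        output_mapper(r, c) *= Scalar(0.5);  // post-process each entry in place
  }
};
// It would be plugged in exactly like SqrtOutputKernel below:
//   left.contract(right, dims, ScaleOutputKernel());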
+template +static void test_async_sharded_by_inner_dim_contraction_with_output_kernel() +{ + typedef Tensor::DimensionPair DimPair; + + const int num_threads = internal::random(4, 16); + ThreadPool threads(num_threads); + Eigen::ThreadPoolDevice device(&threads, num_threads); + + Tensor t_left(2, 10000); + Tensor t_right(10000, 10); + Tensor t_result(2, 10); + + t_left.setRandom(); + t_right.setRandom(); + // Put trash in t_result to verify contraction clears output memory. + t_result.setRandom(); + + // Add a little offset so that the results won't be close to zero. + t_left += t_left.constant(1.0f); + t_right += t_right.constant(1.0f); + + typedef Map> MapXf; + MapXf m_left(t_left.data(), 2, 10000); + MapXf m_right(t_right.data(), 10000, 10); + Eigen::Matrix m_result(2, 10); + + // this contraction should be equivalent to a single matrix multiplication + Eigen::array dims({{DimPair(1, 0)}}); + + // compute results by separate methods + Eigen::Barrier barrier(1); + t_result.device(device, [&barrier]() { barrier.Notify(); }) = + t_left.contract(t_right, dims, SqrtOutputKernel()); + barrier.Wait(); + m_result = m_left * m_right; + + for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i])); + } +} + template void test_full_contraction() { int contract_size1 = internal::random(1, 500); @@ -550,11 +683,18 @@ EIGEN_DECLARE_TEST(cxx11_tensor_thread_pool) CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread()); CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel()); CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel()); + CALL_SUBTEST_3(test_async_multithread_contraction_agrees_with_singlethread()); + CALL_SUBTEST_3(test_async_multithread_contraction_agrees_with_singlethread()); + // Test EvalShardedByInnerDimContext parallelization strategy. CALL_SUBTEST_4(test_sharded_by_inner_dim_contraction()); CALL_SUBTEST_4(test_sharded_by_inner_dim_contraction()); CALL_SUBTEST_4(test_sharded_by_inner_dim_contraction_with_output_kernel()); CALL_SUBTEST_4(test_sharded_by_inner_dim_contraction_with_output_kernel()); + CALL_SUBTEST_4(test_async_sharded_by_inner_dim_contraction()); + CALL_SUBTEST_4(test_async_sharded_by_inner_dim_contraction()); + CALL_SUBTEST_4(test_async_sharded_by_inner_dim_contraction_with_output_kernel()); + CALL_SUBTEST_4(test_async_sharded_by_inner_dim_contraction_with_output_kernel()); // Exercise various cases that have been problematic in the past. 
CALL_SUBTEST_5(test_contraction_corner_cases()); From edf2ec28d864f1cc1c7d93e34e13333571f91565 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Fri, 30 Aug 2019 15:29:25 -0700 Subject: [PATCH 19/30] Fix block mapper type name in TensorExecutor --- unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index ce2337b63..10339e5e7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -507,7 +507,7 @@ class TensorAsyncExecutortiling = - GetTensorExecutorTilingContext(device, ctx->evaluator); device.parallelForAsync( From 79c402e40e80b670eb6bbaae631d0a5694d720b8 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Fri, 30 Aug 2019 15:38:31 -0700 Subject: [PATCH 20/30] Fix shadow warnings in TensorContractionThreadPool --- .../src/Tensor/TensorContractionThreadPool.h | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index f9d9d6d31..4adfeb560 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -920,19 +920,19 @@ struct TensorEvaluator struct EvalShardedByInnerDimContext { - EvalShardedByInnerDimContext(const Self* evaluator, int num_threads, - Scalar* result, Index m, Index n, Index k, - DoneCallback done) - : evaluator(evaluator), + EvalShardedByInnerDimContext(const Self* self, int num_threads, + Scalar* result_buffer, + Index m_size, Index n_size, Index k_size, + DoneCallback done_callback) + : evaluator(self), m_lhs_inner_dim_contiguous(evaluator->m_lhs_inner_dim_contiguous), m_rhs_inner_dim_contiguous(evaluator->m_rhs_inner_dim_contiguous), m_rhs_inner_dim_reordered(evaluator->m_rhs_inner_dim_reordered), - num_threads(num_threads), - result(result), - m(m), - n(n), - k(k), - done(std::move(done)), + result(result_buffer), + m(m_size), + n(n_size), + k(k_size), + done(std::move(done_callback)), buffer_size_bytes(m * n * sizeof(Scalar)), block_size(blockSize(k, num_threads)), num_blocks(divup(k, block_size)), @@ -1032,7 +1032,6 @@ struct TensorEvaluator Date: Sat, 8 Jun 2019 21:09:06 +0000 Subject: [PATCH 21/30] Updated Eigen_Colamd.h, namespacing macros ALIVE & DEAD as COLAMD_ALIVE & COLAMD_DEAD to prevent conflicts with other libraries / code. --- Eigen/src/OrderingMethods/Eigen_Colamd.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Eigen/src/OrderingMethods/Eigen_Colamd.h b/Eigen/src/OrderingMethods/Eigen_Colamd.h index 67fcad3f7..2a4338393 100644 --- a/Eigen/src/OrderingMethods/Eigen_Colamd.h +++ b/Eigen/src/OrderingMethods/Eigen_Colamd.h @@ -105,8 +105,8 @@ namespace internal { #define COLAMD_EMPTY (-1) /* Row and column status */ -#define ALIVE (0) -#define DEAD (-1) +#define COLAMD_ALIVE (0) +#define COLAMD_DEAD (-1) /* Column status */ #define DEAD_PRINCIPAL (-1) @@ -114,12 +114,12 @@ namespace internal { /* Macros for row and column status update and checking. 
*/ #define ROW_IS_DEAD(r) ROW_IS_MARKED_DEAD (Row[r].shared2.mark) -#define ROW_IS_MARKED_DEAD(row_mark) (row_mark < ALIVE) -#define ROW_IS_ALIVE(r) (Row [r].shared2.mark >= ALIVE) -#define COL_IS_DEAD(c) (Col [c].start < ALIVE) -#define COL_IS_ALIVE(c) (Col [c].start >= ALIVE) +#define ROW_IS_MARKED_DEAD(row_mark) (row_mark < COLAMD_ALIVE) +#define ROW_IS_ALIVE(r) (Row [r].shared2.mark >= COLAMD_ALIVE) +#define COL_IS_DEAD(c) (Col [c].start < COLAMD_ALIVE) +#define COL_IS_ALIVE(c) (Col [c].start >= COLAMD_ALIVE) #define COL_IS_DEAD_PRINCIPAL(c) (Col [c].start == DEAD_PRINCIPAL) -#define KILL_ROW(r) { Row [r].shared2.mark = DEAD ; } +#define KILL_ROW(r) { Row [r].shared2.mark = COLAMD_DEAD ; } #define KILL_PRINCIPAL_COL(c) { Col [c].start = DEAD_PRINCIPAL ; } #define KILL_NON_PRINCIPAL_COL(c) { Col [c].start = DEAD_NON_PRINCIPAL ; } From 0a6b553ecf0716c735e19e829b5d1fb177ef36d6 Mon Sep 17 00:00:00 2001 From: Anshul Jaiswal Date: Sun, 21 Jul 2019 04:53:31 +0000 Subject: [PATCH 22/30] Eigen_Colamd.h edited online with Bitbucket replacing constant #defines with const definitions --- Eigen/src/OrderingMethods/Eigen_Colamd.h | 363 +++++++++++------------ 1 file changed, 181 insertions(+), 182 deletions(-) diff --git a/Eigen/src/OrderingMethods/Eigen_Colamd.h b/Eigen/src/OrderingMethods/Eigen_Colamd.h index 2a4338393..bba7c67ac 100644 --- a/Eigen/src/OrderingMethods/Eigen_Colamd.h +++ b/Eigen/src/OrderingMethods/Eigen_Colamd.h @@ -13,37 +13,37 @@ // Davis (davis@cise.ufl.edu), University of Florida. The algorithm was // developed in collaboration with John Gilbert, Xerox PARC, and Esmond // Ng, Oak Ridge National Laboratory. -// +// // Date: -// +// // September 8, 2003. Version 2.3. -// +// // Acknowledgements: -// +// // This work was supported by the National Science Foundation, under // grants DMS-9504974 and DMS-9803599. -// +// // Notice: -// +// // Copyright (c) 1998-2003 by the University of Florida. // All Rights Reserved. -// +// // THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY // EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. -// +// // Permission is hereby granted to use, copy, modify, and/or distribute // this program, provided that the Copyright, this License, and the // Availability of the original version is retained on all copies and made // accessible to the end-user of any code or package that includes COLAMD -// or any modified version of COLAMD. -// +// or any modified version of COLAMD. +// // Availability: -// +// // The colamd/symamd library is available at -// +// // http://www.suitesparse.com - + #ifndef EIGEN_COLAMD_H #define EIGEN_COLAMD_H @@ -57,42 +57,42 @@ namespace internal { /* ========================================================================== */ /* size of the knobs [ ] array. Only knobs [0..1] are currently used. */ -#define COLAMD_KNOBS 20 +const size_t ColamdKnobs = 20; /* number of output statistics. Only stats [0..6] are currently used. */ -#define COLAMD_STATS 20 +const size_t ColamdStats = 20; /* knobs [0] and stats [0]: dense row knob and output statistic. */ -#define COLAMD_DENSE_ROW 0 +const size_t ColamdDenseRow = 0; /* knobs [1] and stats [1]: dense column knob and output statistic. 
*/ -#define COLAMD_DENSE_COL 1 +const size_t ColamdDenseCol = 1; /* stats [2]: memory defragmentation count output statistic */ -#define COLAMD_DEFRAG_COUNT 2 +const size_t ColamdDefragCount = 2; /* stats [3]: colamd status: zero OK, > 0 warning or notice, < 0 error */ -#define COLAMD_STATUS 3 +const size_t ColamdStatus = 3; -/* stats [4..6]: error info, or info on jumbled columns */ -#define COLAMD_INFO1 4 -#define COLAMD_INFO2 5 -#define COLAMD_INFO3 6 +/* stats [4..6]: error info, or info on jumbled columns */ +const size_t ColamdInfo1 = 4; +const size_t ColamdInfo2 = 5; +const size_t ColamdInfo3 = 6; /* error codes returned in stats [3]: */ -#define COLAMD_OK (0) -#define COLAMD_OK_BUT_JUMBLED (1) -#define COLAMD_ERROR_A_not_present (-1) -#define COLAMD_ERROR_p_not_present (-2) -#define COLAMD_ERROR_nrow_negative (-3) -#define COLAMD_ERROR_ncol_negative (-4) -#define COLAMD_ERROR_nnz_negative (-5) -#define COLAMD_ERROR_p0_nonzero (-6) -#define COLAMD_ERROR_A_too_small (-7) -#define COLAMD_ERROR_col_length_negative (-8) -#define COLAMD_ERROR_row_index_out_of_bounds (-9) -#define COLAMD_ERROR_out_of_memory (-10) -#define COLAMD_ERROR_internal_error (-999) +const int ColamdOk = 0; +const int ColamdOkButJumbled = 1; +const int ColamdErrorANotPresent = -1; +const int ColamdErrorPNotPresent = -2; +const int ColamdErrorNrowNegative = -3; +const int ColamdErrorNcolNegative = -4; +const int ColamdErrorNnzNegative = -5; +const int ColamdErrorP0Nonzero = -6; +const int ColamdErrorATooSmall = -7; +const int ColamdErrorColLengthNegative = -8; +const int ColamdErrorRowIndexOutOfBounds = -9; +const int ColamdErrorOutOfMemory = -10; +const int ColamdErrorInternalError = -999; /* ========================================================================== */ /* === Definitions ========================================================== */ @@ -101,27 +101,26 @@ namespace internal { #define ONES_COMPLEMENT(r) (-(r)-1) /* -------------------------------------------------------------------------- */ - -#define COLAMD_EMPTY (-1) +const int ColamdEmpty = -1; /* Row and column status */ -#define COLAMD_ALIVE (0) -#define COLAMD_DEAD (-1) +const int ColamdAlive = 0; +const int ColamdDead = -1; /* Column status */ -#define DEAD_PRINCIPAL (-1) -#define DEAD_NON_PRINCIPAL (-2) +const int ColamdDeadPrincipal = -1; +const int ColamdDeadNonPrincipal = -2; /* Macros for row and column status update and checking. 
*/ #define ROW_IS_DEAD(r) ROW_IS_MARKED_DEAD (Row[r].shared2.mark) -#define ROW_IS_MARKED_DEAD(row_mark) (row_mark < COLAMD_ALIVE) -#define ROW_IS_ALIVE(r) (Row [r].shared2.mark >= COLAMD_ALIVE) -#define COL_IS_DEAD(c) (Col [c].start < COLAMD_ALIVE) -#define COL_IS_ALIVE(c) (Col [c].start >= COLAMD_ALIVE) -#define COL_IS_DEAD_PRINCIPAL(c) (Col [c].start == DEAD_PRINCIPAL) -#define KILL_ROW(r) { Row [r].shared2.mark = COLAMD_DEAD ; } -#define KILL_PRINCIPAL_COL(c) { Col [c].start = DEAD_PRINCIPAL ; } -#define KILL_NON_PRINCIPAL_COL(c) { Col [c].start = DEAD_NON_PRINCIPAL ; } +#define ROW_IS_MARKED_DEAD(row_mark) (row_mark < ColamdAlive) +#define ROW_IS_ALIVE(r) (Row [r].shared2.mark >= ColamdAlive) +#define COL_IS_DEAD(c) (Col [c].start < ColamdAlive) +#define COL_IS_ALIVE(c) (Col [c].start >= ColamdAlive) +#define COL_IS_DEAD_PRINCIPAL(c) (Col [c].start == ColamdDeadPrincipal) +#define KILL_ROW(r) { Row [r].shared2.mark = ColamdDead ; } +#define KILL_PRINCIPAL_COL(c) { Col [c].start = ColamdDeadPrincipal ; } +#define KILL_NON_PRINCIPAL_COL(c) { Col [c].start = ColamdDeadNonPrincipal ; } /* ========================================================================== */ /* === Colamd reporting mechanism =========================================== */ @@ -131,7 +130,7 @@ namespace internal { template struct colamd_col { - IndexType start ; /* index for A of first row in this column, or DEAD */ + IndexType start ; /* index for A of first row in this column, or ColamdDead */ /* if column is dead */ IndexType length ; /* number of rows in this column */ union @@ -159,9 +158,9 @@ struct colamd_col IndexType degree_next ; /* next column, if col is in a degree list */ IndexType hash_next ; /* next column, if col is in a hash list */ } shared4 ; - + }; - + template struct Colamd_Row { @@ -177,13 +176,13 @@ struct Colamd_Row IndexType mark ; /* for computing set differences and marking dead rows*/ IndexType first_column ;/* first column in row (used in garbage collection) */ } shared2 ; - + }; - + /* ========================================================================== */ /* === Colamd recommended memory size ======================================= */ /* ========================================================================== */ - + /* The recommended length Alen of the array A passed to colamd is given by the COLAMD_RECOMMENDED (nnz, n_row, n_col) macro. It returns -1 if any @@ -192,14 +191,14 @@ struct Colamd_Row required for the Col and Row arrays, respectively, which are internal to colamd. An additional n_col space is the minimal amount of "elbow room", and nnz/5 more space is recommended for run time efficiency. - + This macro is not needed when using symamd. - + Explicit typecast to IndexType added Sept. 23, 2002, COLAMD version 2.2, to avoid gcc -pedantic warning messages. 
*/ template -inline IndexType colamd_c(IndexType n_col) +inline IndexType colamd_c(IndexType n_col) { return IndexType( ((n_col) + 1) * sizeof (colamd_col) / sizeof (IndexType) ) ; } template @@ -208,10 +207,10 @@ inline IndexType colamd_r(IndexType n_row) // Prototypes of non-user callable routines template -static IndexType init_rows_cols (IndexType n_row, IndexType n_col, Colamd_Row Row [], colamd_col col [], IndexType A [], IndexType p [], IndexType stats[COLAMD_STATS] ); +static IndexType init_rows_cols (IndexType n_row, IndexType n_col, Colamd_Row Row [], colamd_col col [], IndexType A [], IndexType p [], IndexType stats[ColamdStats] ); template -static void init_scoring (IndexType n_row, IndexType n_col, Colamd_Row Row [], colamd_col Col [], IndexType A [], IndexType head [], double knobs[COLAMD_KNOBS], IndexType *p_n_row2, IndexType *p_n_col2, IndexType *p_max_deg); +static void init_scoring (IndexType n_row, IndexType n_col, Colamd_Row Row [], colamd_col Col [], IndexType A [], IndexType head [], double knobs[ColamdKnobs], IndexType *p_n_row2, IndexType *p_n_col2, IndexType *p_max_deg); template static IndexType find_ordering (IndexType n_row, IndexType n_col, IndexType Alen, Colamd_Row Row [], colamd_col Col [], IndexType A [], IndexType head [], IndexType n_col2, IndexType max_deg, IndexType pfree); @@ -240,14 +239,14 @@ static inline IndexType clear_mark (IndexType n_row, Colamd_Row Row /** - * \brief Returns the recommended value of Alen - * - * Returns recommended value of Alen for use by colamd. - * Returns -1 if any input argument is negative. - * The use of this routine or macro is optional. - * Note that the macro uses its arguments more than once, - * so be careful for side effects, if you pass expressions as arguments to COLAMD_RECOMMENDED. - * + * \brief Returns the recommended value of Alen + * + * Returns recommended value of Alen for use by colamd. + * Returns -1 if any input argument is negative. + * The use of this routine or macro is optional. + * Note that the macro uses its arguments more than once, + * so be careful for side effects, if you pass expressions as arguments to COLAMD_RECOMMENDED. + * * \param nnz nonzeros in A * \param n_row number of rows in A * \param n_col number of columns in A @@ -259,18 +258,18 @@ inline IndexType colamd_recommended ( IndexType nnz, IndexType n_row, IndexType if ((nnz) < 0 || (n_row) < 0 || (n_col) < 0) return (-1); else - return (2 * (nnz) + colamd_c (n_col) + colamd_r (n_row) + (n_col) + ((nnz) / 5)); + return (2 * (nnz) + colamd_c (n_col) + colamd_r (n_row) + (n_col) + ((nnz) / 5)); } /** * \brief set default parameters The use of this routine is optional. - * - * Colamd: rows with more than (knobs [COLAMD_DENSE_ROW] * n_col) - * entries are removed prior to ordering. Columns with more than - * (knobs [COLAMD_DENSE_COL] * n_row) entries are removed prior to - * ordering, and placed last in the output column ordering. * - * COLAMD_DENSE_ROW and COLAMD_DENSE_COL are defined as 0 and 1, + * Colamd: rows with more than (knobs [ColamdDenseRow] * n_col) + * entries are removed prior to ordering. Columns with more than + * (knobs [ColamdDenseCol] * n_row) entries are removed prior to + * ordering, and placed last in the output column ordering. + * + * ColamdDenseRow and ColamdDenseCol are defined as 0 and 1, * respectively, in colamd.h. Default values of these two knobs * are both 0.5. Currently, only knobs [0] and knobs [1] are * used, but future versions may use more knobs. 
If so, they will @@ -279,37 +278,37 @@ inline IndexType colamd_recommended ( IndexType nnz, IndexType n_row, IndexType * not need to change, assuming that you either use * colamd_set_defaults, or pass a (double *) NULL pointer as the * knobs array to colamd or symamd. - * + * * \param knobs parameter settings for colamd */ -static inline void colamd_set_defaults(double knobs[COLAMD_KNOBS]) +static inline void colamd_set_defaults(double knobs[ColamdKnobs]) { /* === Local variables ================================================== */ - + int i ; if (!knobs) { return ; /* no knobs to initialize */ } - for (i = 0 ; i < COLAMD_KNOBS ; i++) + for (i = 0 ; i < ColamdKnobs ; i++) { knobs [i] = 0 ; } - knobs [COLAMD_DENSE_ROW] = 0.5 ; /* ignore rows over 50% dense */ - knobs [COLAMD_DENSE_COL] = 0.5 ; /* ignore columns over 50% dense */ + knobs [ColamdDenseRow] = 0.5 ; /* ignore rows over 50% dense */ + knobs [ColamdDenseCol] = 0.5 ; /* ignore columns over 50% dense */ } -/** +/** * \brief Computes a column ordering using the column approximate minimum degree ordering - * + * * Computes a column ordering (Q) of A such that P(AQ)=LU or * (AQ)'AQ=LL' have less fill-in and require fewer floating point * operations than factorizing the unpermuted matrix A or A'A, * respectively. - * - * + * + * * \param n_row number of rows in A * \param n_col number of columns in A * \param Alen, size of the array A @@ -319,10 +318,10 @@ static inline void colamd_set_defaults(double knobs[COLAMD_KNOBS]) * \param stats colamd output statistics and error codes */ template -static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType *A, IndexType *p, double knobs[COLAMD_KNOBS], IndexType stats[COLAMD_STATS]) +static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType *A, IndexType *p, double knobs[ColamdKnobs], IndexType stats[ColamdStats]) { /* === Local variables ================================================== */ - + IndexType i ; /* loop index */ IndexType nnz ; /* nonzeros in A */ IndexType Row_size ; /* size of Row [], in integers */ @@ -334,128 +333,128 @@ static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType * IndexType n_row2 ; /* number of non-dense, non-empty rows */ IndexType ngarbage ; /* number of garbage collections performed */ IndexType max_deg ; /* maximum row degree */ - double default_knobs [COLAMD_KNOBS] ; /* default knobs array */ - - + double default_knobs [ColamdKnobs] ; /* default knobs array */ + + /* === Check the input arguments ======================================== */ - + if (!stats) { COLAMD_DEBUG0 (("colamd: stats not present\n")) ; return (false) ; } - for (i = 0 ; i < COLAMD_STATS ; i++) + for (i = 0 ; i < ColamdStats ; i++) { stats [i] = 0 ; } - stats [COLAMD_STATUS] = COLAMD_OK ; - stats [COLAMD_INFO1] = -1 ; - stats [COLAMD_INFO2] = -1 ; - + stats [ColamdStatus] = ColamdOk ; + stats [ColamdInfo1] = -1 ; + stats [ColamdInfo2] = -1 ; + if (!A) /* A is not present */ { - stats [COLAMD_STATUS] = COLAMD_ERROR_A_not_present ; + stats [ColamdStatus] = ColamdErrorANotPresent ; COLAMD_DEBUG0 (("colamd: A not present\n")) ; return (false) ; } - + if (!p) /* p is not present */ { - stats [COLAMD_STATUS] = COLAMD_ERROR_p_not_present ; + stats [ColamdStatus] = ColamdErrorPNotPresent ; COLAMD_DEBUG0 (("colamd: p not present\n")) ; return (false) ; } - + if (n_row < 0) /* n_row must be >= 0 */ { - stats [COLAMD_STATUS] = COLAMD_ERROR_nrow_negative ; - stats [COLAMD_INFO1] = n_row ; + stats [ColamdStatus] = ColamdErrorNrowNegative ; 
+ stats [ColamdInfo1] = n_row ; COLAMD_DEBUG0 (("colamd: nrow negative %d\n", n_row)) ; return (false) ; } - + if (n_col < 0) /* n_col must be >= 0 */ { - stats [COLAMD_STATUS] = COLAMD_ERROR_ncol_negative ; - stats [COLAMD_INFO1] = n_col ; + stats [ColamdStatus] = ColamdErrorNcolNegative ; + stats [ColamdInfo1] = n_col ; COLAMD_DEBUG0 (("colamd: ncol negative %d\n", n_col)) ; return (false) ; } - + nnz = p [n_col] ; if (nnz < 0) /* nnz must be >= 0 */ { - stats [COLAMD_STATUS] = COLAMD_ERROR_nnz_negative ; - stats [COLAMD_INFO1] = nnz ; + stats [ColamdStatus] = ColamdErrorNnzNegative ; + stats [ColamdInfo1] = nnz ; COLAMD_DEBUG0 (("colamd: number of entries negative %d\n", nnz)) ; return (false) ; } - + if (p [0] != 0) { - stats [COLAMD_STATUS] = COLAMD_ERROR_p0_nonzero ; - stats [COLAMD_INFO1] = p [0] ; + stats [ColamdStatus] = ColamdErrorP0Nonzero ; + stats [ColamdInfo1] = p [0] ; COLAMD_DEBUG0 (("colamd: p[0] not zero %d\n", p [0])) ; return (false) ; } - + /* === If no knobs, set default knobs =================================== */ - + if (!knobs) { colamd_set_defaults (default_knobs) ; knobs = default_knobs ; } - + /* === Allocate the Row and Col arrays from array A ===================== */ - + Col_size = colamd_c (n_col) ; Row_size = colamd_r (n_row) ; need = 2*nnz + n_col + Col_size + Row_size ; - + if (need > Alen) { /* not enough space in array A to perform the ordering */ - stats [COLAMD_STATUS] = COLAMD_ERROR_A_too_small ; - stats [COLAMD_INFO1] = need ; - stats [COLAMD_INFO2] = Alen ; + stats [ColamdStatus] = ColamdErrorATooSmall ; + stats [ColamdInfo1] = need ; + stats [ColamdInfo2] = Alen ; COLAMD_DEBUG0 (("colamd: Need Alen >= %d, given only Alen = %d\n", need,Alen)); return (false) ; } - + Alen -= Col_size + Row_size ; Col = (colamd_col *) &A [Alen] ; Row = (Colamd_Row *) &A [Alen + Col_size] ; /* === Construct the row and column data structures ===================== */ - + if (!Eigen::internal::init_rows_cols (n_row, n_col, Row, Col, A, p, stats)) { /* input matrix is invalid */ COLAMD_DEBUG0 (("colamd: Matrix invalid\n")) ; return (false) ; } - + /* === Initialize scores, kill dense rows/columns ======================= */ Eigen::internal::init_scoring (n_row, n_col, Row, Col, A, p, knobs, &n_row2, &n_col2, &max_deg) ; - + /* === Order the supercolumns =========================================== */ - + ngarbage = Eigen::internal::find_ordering (n_row, n_col, Alen, Row, Col, A, p, n_col2, max_deg, 2*nnz) ; - + /* === Order the non-principal columns ================================== */ - + Eigen::internal::order_children (n_col, Col, p) ; - + /* === Return statistics in stats ======================================= */ - - stats [COLAMD_DENSE_ROW] = n_row - n_row2 ; - stats [COLAMD_DENSE_COL] = n_col - n_col2 ; - stats [COLAMD_DEFRAG_COUNT] = ngarbage ; - COLAMD_DEBUG0 (("colamd: done.\n")) ; + + stats [ColamdDenseRow] = n_row - n_row2 ; + stats [ColamdDenseCol] = n_col - n_col2 ; + stats [ColamdDefragCount] = ngarbage ; + COLAMD_DEBUG0 (("colamd: done.\n")) ; return (true) ; } @@ -489,7 +488,7 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ colamd_col Col [], /* of size n_col+1 */ IndexType A [], /* row indices of A, of size Alen */ IndexType p [], /* pointers to columns in A, of size n_col+1 */ - IndexType stats [COLAMD_STATS] /* colamd statistics */ + IndexType stats [ColamdStats] /* colamd statistics */ ) { /* === Local variables ================================================== */ @@ -512,24 +511,24 @@ static IndexType init_rows_cols /* 
returns true if OK, or false otherwise */ if ((Col [col].length) < 0) // extra parentheses to work-around gcc bug 10200 { /* column pointers must be non-decreasing */ - stats [COLAMD_STATUS] = COLAMD_ERROR_col_length_negative ; - stats [COLAMD_INFO1] = col ; - stats [COLAMD_INFO2] = Col [col].length ; + stats [ColamdStatus] = ColamdErrorColLengthNegative ; + stats [ColamdInfo1] = col ; + stats [ColamdInfo2] = Col [col].length ; COLAMD_DEBUG0 (("colamd: col %d length %d < 0\n", col, Col [col].length)) ; return (false) ; } Col [col].shared1.thickness = 1 ; Col [col].shared2.score = 0 ; - Col [col].shared3.prev = COLAMD_EMPTY ; - Col [col].shared4.degree_next = COLAMD_EMPTY ; + Col [col].shared3.prev = ColamdEmpty ; + Col [col].shared4.degree_next = ColamdEmpty ; } /* p [0..n_col] no longer needed, used as "head" in subsequent routines */ /* === Scan columns, compute row degrees, and check row indices ========= */ - stats [COLAMD_INFO3] = 0 ; /* number of duplicate or unsorted row indices*/ + stats [ColamdInfo3] = 0 ; /* number of duplicate or unsorted row indices*/ for (row = 0 ; row < n_row ; row++) { @@ -551,10 +550,10 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ /* make sure row indices within range */ if (row < 0 || row >= n_row) { - stats [COLAMD_STATUS] = COLAMD_ERROR_row_index_out_of_bounds ; - stats [COLAMD_INFO1] = col ; - stats [COLAMD_INFO2] = row ; - stats [COLAMD_INFO3] = n_row ; + stats [ColamdStatus] = ColamdErrorRowIndexOutOfBounds ; + stats [ColamdInfo1] = col ; + stats [ColamdInfo2] = row ; + stats [ColamdInfo3] = n_row ; COLAMD_DEBUG0 (("colamd: row %d col %d out of bounds\n", row, col)) ; return (false) ; } @@ -563,10 +562,10 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ { /* row index are unsorted or repeated (or both), thus col */ /* is jumbled. This is a notice, not an error condition. 
*/ - stats [COLAMD_STATUS] = COLAMD_OK_BUT_JUMBLED ; - stats [COLAMD_INFO1] = col ; - stats [COLAMD_INFO2] = row ; - (stats [COLAMD_INFO3]) ++ ; + stats [ColamdStatus] = ColamdOkButJumbled ; + stats [ColamdInfo1] = col ; + stats [ColamdInfo2] = row ; + (stats [ColamdInfo3]) ++ ; COLAMD_DEBUG1 (("colamd: row %d col %d unsorted/duplicate\n",row,col)); } @@ -604,7 +603,7 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ /* === Create row form ================================================== */ - if (stats [COLAMD_STATUS] == COLAMD_OK_BUT_JUMBLED) + if (stats [ColamdStatus] == ColamdOkButJumbled) { /* if cols jumbled, watch for repeated row indices */ for (col = 0 ; col < n_col ; col++) @@ -646,7 +645,7 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ /* === See if we need to re-create columns ============================== */ - if (stats [COLAMD_STATUS] == COLAMD_OK_BUT_JUMBLED) + if (stats [ColamdStatus] == ColamdOkButJumbled) { COLAMD_DEBUG0 (("colamd: reconstructing column form, matrix jumbled\n")) ; @@ -705,7 +704,7 @@ static void init_scoring colamd_col Col [], /* of size n_col+1 */ IndexType A [], /* column form and row form of A */ IndexType head [], /* of size n_col+1 */ - double knobs [COLAMD_KNOBS],/* parameters */ + double knobs [ColamdKnobs],/* parameters */ IndexType *p_n_row2, /* number of non-dense, non-empty rows */ IndexType *p_n_col2, /* number of non-dense, non-empty columns */ IndexType *p_max_deg /* maximum row degree */ @@ -732,8 +731,8 @@ static void init_scoring /* === Extract knobs ==================================================== */ - dense_row_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [COLAMD_DENSE_ROW] * n_col), n_col)) ; - dense_col_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [COLAMD_DENSE_COL] * n_row), n_row)) ; + dense_row_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [ColamdDenseRow] * n_col), n_col)) ; + dense_col_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [ColamdDenseCol] * n_row), n_row)) ; COLAMD_DEBUG1 (("colamd: densecount: %d %d\n", dense_row_count, dense_col_count)) ; max_deg = 0 ; n_col2 = n_col ; @@ -870,7 +869,7 @@ static void init_scoring /* clear the hash buckets */ for (c = 0 ; c <= n_col ; c++) { - head [c] = COLAMD_EMPTY ; + head [c] = ColamdEmpty ; } min_score = n_col ; /* place in reverse order, so low column indices are at the front */ @@ -891,16 +890,16 @@ static void init_scoring COLAMD_ASSERT (min_score <= n_col) ; COLAMD_ASSERT (score >= 0) ; COLAMD_ASSERT (score <= n_col) ; - COLAMD_ASSERT (head [score] >= COLAMD_EMPTY) ; + COLAMD_ASSERT (head [score] >= ColamdEmpty) ; /* now add this column to dList at proper score location */ next_col = head [score] ; - Col [c].shared3.prev = COLAMD_EMPTY ; + Col [c].shared3.prev = ColamdEmpty ; Col [c].shared4.degree_next = next_col ; /* if there already was a column with the same score, set its */ /* previous pointer to this new column */ - if (next_col != COLAMD_EMPTY) + if (next_col != ColamdEmpty) { Col [next_col].shared3.prev = c ; } @@ -1001,10 +1000,10 @@ static IndexType find_ordering /* return the number of garbage collections */ /* make sure degree list isn't empty */ COLAMD_ASSERT (min_score >= 0) ; COLAMD_ASSERT (min_score <= n_col) ; - COLAMD_ASSERT (head [min_score] >= COLAMD_EMPTY) ; + COLAMD_ASSERT (head [min_score] >= ColamdEmpty) ; /* get pivot column from head of minimum degree list */ - while (min_score < n_col && head [min_score] == 
COLAMD_EMPTY) + while (min_score < n_col && head [min_score] == ColamdEmpty) { min_score++ ; } @@ -1012,9 +1011,9 @@ static IndexType find_ordering /* return the number of garbage collections */ COLAMD_ASSERT (pivot_col >= 0 && pivot_col <= n_col) ; next_col = Col [pivot_col].shared4.degree_next ; head [min_score] = next_col ; - if (next_col != COLAMD_EMPTY) + if (next_col != ColamdEmpty) { - Col [next_col].shared3.prev = COLAMD_EMPTY ; + Col [next_col].shared3.prev = ColamdEmpty ; } COLAMD_ASSERT (COL_IS_ALIVE (pivot_col)) ; @@ -1120,7 +1119,7 @@ static IndexType find_ordering /* return the number of garbage collections */ else { /* there is no pivot row, since it is of zero length */ - pivot_row = COLAMD_EMPTY ; + pivot_row = ColamdEmpty ; COLAMD_ASSERT (pivot_row_length == 0) ; } COLAMD_ASSERT (Col [pivot_col].length > 0 || pivot_row_length == 0) ; @@ -1172,8 +1171,8 @@ static IndexType find_ordering /* return the number of garbage collections */ next_col = Col [col].shared4.degree_next ; COLAMD_ASSERT (cur_score >= 0) ; COLAMD_ASSERT (cur_score <= n_col) ; - COLAMD_ASSERT (cur_score >= COLAMD_EMPTY) ; - if (prev_col == COLAMD_EMPTY) + COLAMD_ASSERT (cur_score >= ColamdEmpty) ; + if (prev_col == ColamdEmpty) { head [cur_score] = next_col ; } @@ -1181,7 +1180,7 @@ static IndexType find_ordering /* return the number of garbage collections */ { Col [prev_col].shared4.degree_next = next_col ; } - if (next_col != COLAMD_EMPTY) + if (next_col != ColamdEmpty) { Col [next_col].shared3.prev = prev_col ; } @@ -1302,7 +1301,7 @@ static IndexType find_ordering /* return the number of garbage collections */ COLAMD_ASSERT (hash <= n_col) ; head_column = head [hash] ; - if (head_column > COLAMD_EMPTY) + if (head_column > ColamdEmpty) { /* degree list "hash" is non-empty, use prev (shared3) of */ /* first column in degree list as head of hash bucket */ @@ -1391,11 +1390,11 @@ static IndexType find_ordering /* return the number of garbage collections */ COLAMD_ASSERT (min_score <= n_col) ; COLAMD_ASSERT (cur_score >= 0) ; COLAMD_ASSERT (cur_score <= n_col) ; - COLAMD_ASSERT (head [cur_score] >= COLAMD_EMPTY) ; + COLAMD_ASSERT (head [cur_score] >= ColamdEmpty) ; next_col = head [cur_score] ; Col [col].shared4.degree_next = next_col ; - Col [col].shared3.prev = COLAMD_EMPTY ; - if (next_col != COLAMD_EMPTY) + Col [col].shared3.prev = ColamdEmpty ; + if (next_col != ColamdEmpty) { Col [next_col].shared3.prev = col ; } @@ -1465,7 +1464,7 @@ static inline void order_children { /* find an un-ordered non-principal column */ COLAMD_ASSERT (COL_IS_DEAD (i)) ; - if (!COL_IS_DEAD_PRINCIPAL (i) && Col [i].shared2.order == COLAMD_EMPTY) + if (!COL_IS_DEAD_PRINCIPAL (i) && Col [i].shared2.order == ColamdEmpty) { parent = i ; /* once found, find its principal parent */ @@ -1482,7 +1481,7 @@ static inline void order_children do { - COLAMD_ASSERT (Col [c].shared2.order == COLAMD_EMPTY) ; + COLAMD_ASSERT (Col [c].shared2.order == ColamdEmpty) ; /* order this column */ Col [c].shared2.order = order++ ; @@ -1495,7 +1494,7 @@ static inline void order_children /* continue until we hit an ordered column. 
There are */ /* guaranteed not to be anymore unordered columns */ /* above an ordered column */ - } while (Col [c].shared2.order == COLAMD_EMPTY) ; + } while (Col [c].shared2.order == ColamdEmpty) ; /* re-order the super_col parent to largest order for this group */ Col [parent].shared2.order = order ; @@ -1547,7 +1546,7 @@ template static void detect_super_cols ( /* === Parameters ======================================================= */ - + colamd_col Col [], /* of size n_col+1 */ IndexType A [], /* row indices of A */ IndexType head [], /* head of degree lists and hash buckets */ @@ -1590,7 +1589,7 @@ static void detect_super_cols /* === Get the first column in this hash bucket ===================== */ head_column = head [hash] ; - if (head_column > COLAMD_EMPTY) + if (head_column > ColamdEmpty) { first_col = Col [head_column].shared3.headhash ; } @@ -1601,7 +1600,7 @@ static void detect_super_cols /* === Consider each column in the hash bucket ====================== */ - for (super_c = first_col ; super_c != COLAMD_EMPTY ; + for (super_c = first_col ; super_c != ColamdEmpty ; super_c = Col [super_c].shared4.hash_next) { COLAMD_ASSERT (COL_IS_ALIVE (super_c)) ; @@ -1614,7 +1613,7 @@ static void detect_super_cols /* === Compare super_c with all columns after it ================ */ for (c = Col [super_c].shared4.hash_next ; - c != COLAMD_EMPTY ; c = Col [c].shared4.hash_next) + c != ColamdEmpty ; c = Col [c].shared4.hash_next) { COLAMD_ASSERT (c != super_c) ; COLAMD_ASSERT (COL_IS_ALIVE (c)) ; @@ -1660,7 +1659,7 @@ static void detect_super_cols Col [c].shared1.parent = super_c ; KILL_NON_PRINCIPAL_COL (c) ; /* order c later, in order_children() */ - Col [c].shared2.order = COLAMD_EMPTY ; + Col [c].shared2.order = ColamdEmpty ; /* remove c from hash bucket */ Col [prev_c].shared4.hash_next = Col [c].shared4.hash_next ; } @@ -1668,15 +1667,15 @@ static void detect_super_cols /* === Empty this hash bucket ======================================= */ - if (head_column > COLAMD_EMPTY) + if (head_column > ColamdEmpty) { /* corresponding degree list "hash" is not empty */ - Col [head_column].shared3.headhash = COLAMD_EMPTY ; + Col [head_column].shared3.headhash = ColamdEmpty ; } else { /* corresponding degree list "hash" is empty */ - head [hash] = COLAMD_EMPTY ; + head [hash] = ColamdEmpty ; } } } @@ -1698,7 +1697,7 @@ template static IndexType garbage_collection /* returns the new value of pfree */ ( /* === Parameters ======================================================= */ - + IndexType n_row, /* number of rows */ IndexType n_col, /* number of columns */ Colamd_Row Row [], /* row info */ @@ -1839,5 +1838,5 @@ static inline IndexType clear_mark /* return the new value for tag_mark */ } -} // namespace internal +} // namespace internal #endif From 39f30923c29c77c3a17c77b9f59dbc73291cf02a Mon Sep 17 00:00:00 2001 From: Anshul Jaiswal Date: Thu, 15 Aug 2019 20:15:19 +0000 Subject: [PATCH 23/30] Eigen_Colamd.h edited replacing macros with constexprs and functions. 
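The refactoring pattern in this patch is uniform: each function-like macro becomes a small template function and each #define'd constant a typed constant, so the names are type-checked and properly scoped instead of living in the preprocessor. Below is a minimal standalone sketch of that pattern, using two of the definitions introduced in the diff (the main() and asserts are illustrative only and are not part of the patch):

    #include <cassert>

    // was a plain #define'd value; now a typed, scoped constant
    constexpr int ColamdAlive = 0;

    // was: #define ONES_COMPLEMENT(r) (-(r)-1)
    template <typename IndexType>
    IndexType ones_complement(const IndexType r) {
      return (-(r)-1);                       // same arithmetic as the old macro
    }

    // was: #define ROW_IS_MARKED_DEAD(row_mark) (row_mark < ColamdAlive)
    template <typename IndexType>
    bool row_is_marked_dead(const IndexType row_mark) {
      return row_mark < ColamdAlive;
    }

    int main() {
      assert(ones_complement(5) == ~5);      // -(r)-1 is exactly one's complement
      assert(row_is_marked_dead(-1) && !row_is_marked_dead(0));
      return 0;
    }

Behaviour is unchanged; only the spelling moves from the preprocessor to the compiler. (Note that patch 25 below swaps these constexpr constants back to const and enums.)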
--- Eigen/src/OrderingMethods/Eigen_Colamd.h | 208 ++++++++++++++--------- 1 file changed, 126 insertions(+), 82 deletions(-) diff --git a/Eigen/src/OrderingMethods/Eigen_Colamd.h b/Eigen/src/OrderingMethods/Eigen_Colamd.h index bba7c67ac..aec383abf 100644 --- a/Eigen/src/OrderingMethods/Eigen_Colamd.h +++ b/Eigen/src/OrderingMethods/Eigen_Colamd.h @@ -47,6 +47,16 @@ #ifndef EIGEN_COLAMD_H #define EIGEN_COLAMD_H +/* ========================================================================== */ +/* === Knob and statistics definitions used elsewhere ======================= */ +/* ========================================================================== */ + +/* size of the knobs [ ] array. Only knobs [0..1] are currently used. */ +constexpr size_t ColamdKnobs = 20; + +/* number of output statistics. Only stats [0..6] are currently used. */ +constexpr size_t ColamdStats = 20; + namespace internal { /* Ensure that debugging is turned off: */ #ifndef COLAMD_NDEBUG @@ -56,71 +66,59 @@ namespace internal { /* === Knob and statistics definitions ====================================== */ /* ========================================================================== */ -/* size of the knobs [ ] array. Only knobs [0..1] are currently used. */ -const size_t ColamdKnobs = 20; - -/* number of output statistics. Only stats [0..6] are currently used. */ -const size_t ColamdStats = 20; - /* knobs [0] and stats [0]: dense row knob and output statistic. */ -const size_t ColamdDenseRow = 0; +constexpr size_t ColamdDenseRow = 0; /* knobs [1] and stats [1]: dense column knob and output statistic. */ -const size_t ColamdDenseCol = 1; +constexpr size_t ColamdDenseCol = 1; /* stats [2]: memory defragmentation count output statistic */ -const size_t ColamdDefragCount = 2; +constexpr size_t ColamdDefragCount = 2; /* stats [3]: colamd status: zero OK, > 0 warning or notice, < 0 error */ -const size_t ColamdStatus = 3; +constexpr size_t ColamdStatus = 3; /* stats [4..6]: error info, or info on jumbled columns */ -const size_t ColamdInfo1 = 4; -const size_t ColamdInfo2 = 5; -const size_t ColamdInfo3 = 6; +constexpr size_t ColamdInfo1 = 4; +constexpr size_t ColamdInfo2 = 5; +constexpr size_t ColamdInfo3 = 6; /* error codes returned in stats [3]: */ -const int ColamdOk = 0; -const int ColamdOkButJumbled = 1; -const int ColamdErrorANotPresent = -1; -const int ColamdErrorPNotPresent = -2; -const int ColamdErrorNrowNegative = -3; -const int ColamdErrorNcolNegative = -4; -const int ColamdErrorNnzNegative = -5; -const int ColamdErrorP0Nonzero = -6; -const int ColamdErrorATooSmall = -7; -const int ColamdErrorColLengthNegative = -8; -const int ColamdErrorRowIndexOutOfBounds = -9; -const int ColamdErrorOutOfMemory = -10; -const int ColamdErrorInternalError = -999; +constexpr int ColamdOk = 0; +constexpr int ColamdOkButJumbled = 1; +constexpr int ColamdErrorANotPresent = -1; +constexpr int ColamdErrorPNotPresent = -2; +constexpr int ColamdErrorNrowNegative = -3; +constexpr int ColamdErrorNcolNegative = -4; +constexpr int ColamdErrorNnzNegative = -5; +constexpr int ColamdErrorP0Nonzero = -6; +constexpr int ColamdErrorATooSmall = -7; +constexpr int ColamdErrorColLengthNegative = -8; +constexpr int ColamdErrorRowIndexOutOfBounds = -9; +constexpr int ColamdErrorOutOfMemory = -10; +constexpr int ColamdErrorInternalError = -999; /* ========================================================================== */ /* === Definitions ========================================================== */ /* 
========================================================================== */ -#define ONES_COMPLEMENT(r) (-(r)-1) +// #define ONES_COMPLEMENT(r) (-(r)-1) + +template +IndexType ones_complement(const IndexType r) { + return (-(r)-1); +} /* -------------------------------------------------------------------------- */ -const int ColamdEmpty = -1; +constexpr int ColamdEmpty = -1; /* Row and column status */ -const int ColamdAlive = 0; -const int ColamdDead = -1; +constexpr int ColamdAlive = 0; +constexpr int ColamdDead = -1; /* Column status */ -const int ColamdDeadPrincipal = -1; -const int ColamdDeadNonPrincipal = -2; - -/* Macros for row and column status update and checking. */ -#define ROW_IS_DEAD(r) ROW_IS_MARKED_DEAD (Row[r].shared2.mark) -#define ROW_IS_MARKED_DEAD(row_mark) (row_mark < ColamdAlive) -#define ROW_IS_ALIVE(r) (Row [r].shared2.mark >= ColamdAlive) -#define COL_IS_DEAD(c) (Col [c].start < ColamdAlive) -#define COL_IS_ALIVE(c) (Col [c].start >= ColamdAlive) -#define COL_IS_DEAD_PRINCIPAL(c) (Col [c].start == ColamdDeadPrincipal) -#define KILL_ROW(r) { Row [r].shared2.mark = ColamdDead ; } -#define KILL_PRINCIPAL_COL(c) { Col [c].start = ColamdDeadPrincipal ; } -#define KILL_NON_PRINCIPAL_COL(c) { Col [c].start = ColamdDeadNonPrincipal ; } +constexpr int ColamdDeadPrincipal = -1; +constexpr int ColamdDeadNonPrincipal = -2; /* ========================================================================== */ /* === Colamd reporting mechanism =========================================== */ @@ -179,6 +177,52 @@ struct Colamd_Row }; +/* Methods for row and column status update and checking. */ +template +bool row_is_marked_dead(const IndexType row_mark) { + return row_mark < ColamdAlive; +} + +template +bool row_is_dead(const Colamd_Row* row, const IndexType r) { + return row_is_marked_dead(row[r].shared2.mark); +} + +template +bool row_is_alive(const Colamd_Row* row, const IndexType r) { + return row[r].shared2.mark >= ColamdAlive; +} + +template +void kill_row(Colamd_Row* row, const IndexType r) { + row[r].shared2.mark = ColamdDead; +} + +template +bool col_is_dead(const colamd_col* col, const IndexType c) { + return col[c].start < ColamdAlive; +} + +template +bool col_is_alive(const colamd_col* col, const IndexType c) { + return col[c].start >= ColamdAlive; +} + +template +bool col_is_dead_principal(const colamd_col* col, const IndexType c) { + return col[c].start == ColamdDeadPrincipal; +} + +template +void kill_principal_col(colamd_col* col, const IndexType c) { + col[c].start = ColamdDeadPrincipal; +} + +template +void kill_non_principal_col(colamd_col* col, const IndexType c) { + col[c].start = ColamdDeadNonPrincipal; +} + /* ========================================================================== */ /* === Colamd recommended memory size ======================================= */ /* ========================================================================== */ @@ -749,7 +793,7 @@ static void init_scoring { /* this is a empty column, kill and order it last */ Col [c].shared2.order = --n_col2 ; - KILL_PRINCIPAL_COL (c) ; + kill_principal_col(Col, c) ; } } COLAMD_DEBUG1 (("colamd: null columns killed: %d\n", n_col - n_col2)) ; @@ -760,7 +804,7 @@ static void init_scoring for (c = n_col-1 ; c >= 0 ; c--) { /* skip any dead columns */ - if (COL_IS_DEAD (c)) + if (col_is_dead(Col, c)) { continue ; } @@ -776,7 +820,7 @@ static void init_scoring { Row [*cp++].shared1.degree-- ; } - KILL_PRINCIPAL_COL (c) ; + kill_principal_col(Col, c) ; } } COLAMD_DEBUG1 (("colamd: Dense and null 
columns killed: %d\n", n_col - n_col2)) ; @@ -790,7 +834,7 @@ static void init_scoring if (deg > dense_row_count || deg == 0) { /* kill a dense or empty row */ - KILL_ROW (r) ; + kill_row(Row, r) ; --n_row2 ; } else @@ -812,7 +856,7 @@ static void init_scoring for (c = n_col-1 ; c >= 0 ; c--) { /* skip dead column */ - if (COL_IS_DEAD (c)) + if (col_is_dead(Col, c)) { continue ; } @@ -825,7 +869,7 @@ static void init_scoring /* get a row */ row = *cp++ ; /* skip if dead */ - if (ROW_IS_DEAD (row)) + if (row_is_dead(Row, row)) { continue ; } @@ -844,7 +888,7 @@ static void init_scoring /* and have already been killed) */ COLAMD_DEBUG2 (("Newly null killed: %d\n", c)) ; Col [c].shared2.order = --n_col2 ; - KILL_PRINCIPAL_COL (c) ; + kill_principal_col(Col, c) ; } else { @@ -877,7 +921,7 @@ static void init_scoring for (c = n_col-1 ; c >= 0 ; c--) { /* only add principal columns to degree lists */ - if (COL_IS_ALIVE (c)) + if (col_is_alive(Col, c)) { COLAMD_DEBUG4 (("place %d score %d minscore %d ncol %d\n", c, Col [c].shared2.score, min_score, n_col)) ; @@ -1016,7 +1060,7 @@ static IndexType find_ordering /* return the number of garbage collections */ Col [next_col].shared3.prev = ColamdEmpty ; } - COLAMD_ASSERT (COL_IS_ALIVE (pivot_col)) ; + COLAMD_ASSERT (col_is_alive(Col, pivot_col)) ; COLAMD_DEBUG3 (("Pivot col: %d\n", pivot_col)) ; /* remember score for defrag check */ @@ -1063,9 +1107,9 @@ static IndexType find_ordering /* return the number of garbage collections */ { /* get a row */ row = *cp++ ; - COLAMD_DEBUG4 (("Pivot col pattern %d %d\n", ROW_IS_ALIVE (row), row)) ; + COLAMD_DEBUG4 (("Pivot col pattern %d %d\n", row_is_alive(Row, row), row)) ; /* skip if row is dead */ - if (ROW_IS_DEAD (row)) + if (row_is_dead(Row, row)) { continue ; } @@ -1077,7 +1121,7 @@ static IndexType find_ordering /* return the number of garbage collections */ col = *rp++ ; /* add the column, if alive and untagged */ col_thickness = Col [col].shared1.thickness ; - if (col_thickness > 0 && COL_IS_ALIVE (col)) + if (col_thickness > 0 && col_is_alive(Col, col)) { /* tag column in pivot row */ Col [col].shared1.thickness = -col_thickness ; @@ -1104,7 +1148,7 @@ static IndexType find_ordering /* return the number of garbage collections */ /* may be killing an already dead row */ row = *cp++ ; COLAMD_DEBUG3 (("Kill row in pivot col: %d\n", row)) ; - KILL_ROW (row) ; + kill_row(Row, row) ; } /* === Select a row index to use as the new pivot row =============== */ @@ -1156,7 +1200,7 @@ static IndexType find_ordering /* return the number of garbage collections */ while (rp < rp_end) { col = *rp++ ; - COLAMD_ASSERT (COL_IS_ALIVE (col) && col != pivot_col) ; + COLAMD_ASSERT (col_is_alive(Col, col) && col != pivot_col) ; COLAMD_DEBUG3 (("Col: %d\n", col)) ; /* clear tags used to construct pivot row pattern */ @@ -1195,7 +1239,7 @@ static IndexType find_ordering /* return the number of garbage collections */ row = *cp++ ; row_mark = Row [row].shared2.mark ; /* skip if dead */ - if (ROW_IS_MARKED_DEAD (row_mark)) + if (row_is_marked_dead (row_mark)) { continue ; } @@ -1214,7 +1258,7 @@ static IndexType find_ordering /* return the number of garbage collections */ if (set_difference == 0) { COLAMD_DEBUG3 (("aggressive absorption. 
Row: %d\n", row)) ; - KILL_ROW (row) ; + kill_row(Row, row) ; } else { @@ -1236,7 +1280,7 @@ static IndexType find_ordering /* return the number of garbage collections */ { /* get a column */ col = *rp++ ; - COLAMD_ASSERT (COL_IS_ALIVE (col) && col != pivot_col) ; + COLAMD_ASSERT (col_is_alive(Col, col) && col != pivot_col) ; hash = 0 ; cur_score = 0 ; cp = &A [Col [col].start] ; @@ -1253,7 +1297,7 @@ static IndexType find_ordering /* return the number of garbage collections */ COLAMD_ASSERT(row >= 0 && row < n_row) ; row_mark = Row [row].shared2.mark ; /* skip if dead */ - if (ROW_IS_MARKED_DEAD (row_mark)) + if (row_is_marked_dead (row_mark)) { continue ; } @@ -1277,7 +1321,7 @@ static IndexType find_ordering /* return the number of garbage collections */ { COLAMD_DEBUG4 (("further mass elimination. Col: %d\n", col)) ; /* nothing left but the pivot row in this column */ - KILL_PRINCIPAL_COL (col) ; + kill_principal_col(Col, col) ; pivot_row_degree -= Col [col].shared1.thickness ; COLAMD_ASSERT (pivot_row_degree >= 0) ; /* order it */ @@ -1318,7 +1362,7 @@ static IndexType find_ordering /* return the number of garbage collections */ /* save hash function in Col [col].shared3.hash */ Col [col].shared3.hash = (IndexType) hash ; - COLAMD_ASSERT (COL_IS_ALIVE (col)) ; + COLAMD_ASSERT (col_is_alive(Col, col)) ; } } @@ -1332,7 +1376,7 @@ static IndexType find_ordering /* return the number of garbage collections */ /* === Kill the pivotal column ====================================== */ - KILL_PRINCIPAL_COL (pivot_col) ; + kill_principal_col(Col, pivot_col) ; /* === Clear mark =================================================== */ @@ -1356,7 +1400,7 @@ static IndexType find_ordering /* return the number of garbage collections */ { col = *rp++ ; /* skip dead columns */ - if (COL_IS_DEAD (col)) + if (col_is_dead(Col, col)) { continue ; } @@ -1463,15 +1507,15 @@ static inline void order_children for (i = 0 ; i < n_col ; i++) { /* find an un-ordered non-principal column */ - COLAMD_ASSERT (COL_IS_DEAD (i)) ; - if (!COL_IS_DEAD_PRINCIPAL (i) && Col [i].shared2.order == ColamdEmpty) + COLAMD_ASSERT (col_is_dead(Col, i)) ; + if (!col_is_dead_principal(Col, i) && Col [i].shared2.order == ColamdEmpty) { parent = i ; /* once found, find its principal parent */ do { parent = Col [parent].shared1.parent ; - } while (!COL_IS_DEAD_PRINCIPAL (parent)) ; + } while (!col_is_dead_principal(Col, parent)) ; /* now, order all un-ordered non-principal columns along path */ /* to this parent. 
collapse tree at the same time */ @@ -1577,7 +1621,7 @@ static void detect_super_cols while (rp < rp_end) { col = *rp++ ; - if (COL_IS_DEAD (col)) + if (col_is_dead(Col, col)) { continue ; } @@ -1603,7 +1647,7 @@ static void detect_super_cols for (super_c = first_col ; super_c != ColamdEmpty ; super_c = Col [super_c].shared4.hash_next) { - COLAMD_ASSERT (COL_IS_ALIVE (super_c)) ; + COLAMD_ASSERT (col_is_alive(Col, super_c)) ; COLAMD_ASSERT (Col [super_c].shared3.hash == hash) ; length = Col [super_c].length ; @@ -1616,7 +1660,7 @@ static void detect_super_cols c != ColamdEmpty ; c = Col [c].shared4.hash_next) { COLAMD_ASSERT (c != super_c) ; - COLAMD_ASSERT (COL_IS_ALIVE (c)) ; + COLAMD_ASSERT (col_is_alive(Col, c)) ; COLAMD_ASSERT (Col [c].shared3.hash == hash) ; /* not identical if lengths or scores are different */ @@ -1657,7 +1701,7 @@ static void detect_super_cols Col [super_c].shared1.thickness += Col [c].shared1.thickness ; Col [c].shared1.parent = super_c ; - KILL_NON_PRINCIPAL_COL (c) ; + kill_non_principal_col(Col, c) ; /* order c later, in order_children() */ Col [c].shared2.order = ColamdEmpty ; /* remove c from hash bucket */ @@ -1720,7 +1764,7 @@ static IndexType garbage_collection /* returns the new value of pfree */ pdest = &A[0] ; for (c = 0 ; c < n_col ; c++) { - if (COL_IS_ALIVE (c)) + if (col_is_alive(Col, c)) { psrc = &A [Col [c].start] ; @@ -1731,7 +1775,7 @@ static IndexType garbage_collection /* returns the new value of pfree */ for (j = 0 ; j < length ; j++) { r = *psrc++ ; - if (ROW_IS_ALIVE (r)) + if (row_is_alive(Row, r)) { *pdest++ = r ; } @@ -1744,22 +1788,22 @@ static IndexType garbage_collection /* returns the new value of pfree */ for (r = 0 ; r < n_row ; r++) { - if (ROW_IS_ALIVE (r)) + if (row_is_alive(Row, r)) { if (Row [r].length == 0) { /* this row is of zero length. 
cannot compact it, so kill it */ COLAMD_DEBUG3 (("Defrag row kill\n")) ; - KILL_ROW (r) ; + kill_row(Row, r) ; } else { /* save first column index in Row [r].shared2.first_column */ psrc = &A [Row [r].start] ; Row [r].shared2.first_column = *psrc ; - COLAMD_ASSERT (ROW_IS_ALIVE (r)) ; + COLAMD_ASSERT (row_is_alive(Row, r)) ; /* flag the start of the row with the one's complement of row */ - *psrc = ONES_COMPLEMENT (r) ; + *psrc = ones_complement(r) ; } } @@ -1775,11 +1819,11 @@ static IndexType garbage_collection /* returns the new value of pfree */ { psrc-- ; /* get the row index */ - r = ONES_COMPLEMENT (*psrc) ; + r = ones_complement(*psrc) ; COLAMD_ASSERT (r >= 0 && r < n_row) ; /* restore first column index */ *psrc = Row [r].shared2.first_column ; - COLAMD_ASSERT (ROW_IS_ALIVE (r)) ; + COLAMD_ASSERT (row_is_alive(Row, r)) ; /* move and compact the row */ COLAMD_ASSERT (pdest <= psrc) ; @@ -1788,7 +1832,7 @@ static IndexType garbage_collection /* returns the new value of pfree */ for (j = 0 ; j < length ; j++) { c = *psrc++ ; - if (COL_IS_ALIVE (c)) + if (col_is_alive(Col, c)) { *pdest++ = c ; } @@ -1829,7 +1873,7 @@ static inline IndexType clear_mark /* return the new value for tag_mark */ for (r = 0 ; r < n_row ; r++) { - if (ROW_IS_ALIVE (r)) + if (row_is_alive(Row, r)) { Row [r].shared2.mark = 0 ; } From 283558face1688a69683b1124142325a3ac4855a Mon Sep 17 00:00:00 2001 From: Anshul Jaiswal Date: Thu, 15 Aug 2019 20:21:56 +0000 Subject: [PATCH 24/30] Ordering.h edited to fix dependencies on Eigen_Colamd.h --- Eigen/src/OrderingMethods/Ordering.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/OrderingMethods/Ordering.h b/Eigen/src/OrderingMethods/Ordering.h index 8791158be..10ba6b464 100644 --- a/Eigen/src/OrderingMethods/Ordering.h +++ b/Eigen/src/OrderingMethods/Ordering.h @@ -131,8 +131,8 @@ class COLAMDOrdering // Get the recommended value of Alen to be used by colamd StorageIndex Alen = internal::colamd_recommended(nnz, m, n); // Set the default parameters - double knobs [COLAMD_KNOBS]; - StorageIndex stats [COLAMD_STATS]; + double knobs [ColamdKnobs]; + StorageIndex stats [ColamdStats]; internal::colamd_set_defaults(knobs); IndexVector p(n+1), A(Alen); From a4d1a6cd7de5112e1b2aca1eaf76b06ed1619c81 Mon Sep 17 00:00:00 2001 From: Anshul Jaiswal Date: Sat, 17 Aug 2019 05:29:23 +0000 Subject: [PATCH 25/30] Eigen_Colamd.h updated to replace constexpr with consts and enums. --- Eigen/src/OrderingMethods/Eigen_Colamd.h | 230 ++++++++++++----------- 1 file changed, 118 insertions(+), 112 deletions(-) diff --git a/Eigen/src/OrderingMethods/Eigen_Colamd.h b/Eigen/src/OrderingMethods/Eigen_Colamd.h index aec383abf..3c9e85aa8 100644 --- a/Eigen/src/OrderingMethods/Eigen_Colamd.h +++ b/Eigen/src/OrderingMethods/Eigen_Colamd.h @@ -52,10 +52,10 @@ /* ========================================================================== */ /* size of the knobs [ ] array. Only knobs [0..1] are currently used. */ -constexpr size_t ColamdKnobs = 20; +const size_t ColamdKnobs = 20; /* number of output statistics. Only stats [0..6] are currently used. */ -constexpr size_t ColamdStats = 20; +const size_t ColamdStats = 20; namespace internal { /* Ensure that debugging is turned off: */ @@ -66,59 +66,65 @@ namespace internal { /* === Knob and statistics definitions ====================================== */ /* ========================================================================== */ -/* knobs [0] and stats [0]: dense row knob and output statistic. 
*/ -constexpr size_t ColamdDenseRow = 0; +/* Indices into knobs and stats array. */ +enum KnobsStatsIndex { + /* knobs [0] and stats [0]: dense row knob and output statistic. */ + DenseRow = 0, -/* knobs [1] and stats [1]: dense column knob and output statistic. */ -constexpr size_t ColamdDenseCol = 1; + /* knobs [1] and stats [1]: dense column knob and output statistic. */ + DenseCol = 1, -/* stats [2]: memory defragmentation count output statistic */ -constexpr size_t ColamdDefragCount = 2; + /* stats [2]: memory defragmentation count output statistic */ + DefragCount = 2, -/* stats [3]: colamd status: zero OK, > 0 warning or notice, < 0 error */ -constexpr size_t ColamdStatus = 3; + /* stats [3]: colamd status: zero OK, > 0 warning or notice, < 0 error */ + Status = 3, -/* stats [4..6]: error info, or info on jumbled columns */ -constexpr size_t ColamdInfo1 = 4; -constexpr size_t ColamdInfo2 = 5; -constexpr size_t ColamdInfo3 = 6; + /* stats [4..6]: error info, or info on jumbled columns */ + Info1 = 4, + Info2 = 5, + Info3 = 6 +}; /* error codes returned in stats [3]: */ -constexpr int ColamdOk = 0; -constexpr int ColamdOkButJumbled = 1; -constexpr int ColamdErrorANotPresent = -1; -constexpr int ColamdErrorPNotPresent = -2; -constexpr int ColamdErrorNrowNegative = -3; -constexpr int ColamdErrorNcolNegative = -4; -constexpr int ColamdErrorNnzNegative = -5; -constexpr int ColamdErrorP0Nonzero = -6; -constexpr int ColamdErrorATooSmall = -7; -constexpr int ColamdErrorColLengthNegative = -8; -constexpr int ColamdErrorRowIndexOutOfBounds = -9; -constexpr int ColamdErrorOutOfMemory = -10; -constexpr int ColamdErrorInternalError = -999; - +enum Status { + Ok = 0, + OkButJumbled = 1, + ErrorANotPresent = -1, + ErrorPNotPresent = -2, + ErrorNrowNegative = -3, + ErrorNcolNegative = -4, + ErrorNnzNegative = -5, + ErrorP0Nonzero = -6, + ErrorATooSmall = -7, + ErrorColLengthNegative = -8, + ErrorRowIndexOutOfBounds = -9, + ErrorOutOfMemory = -10, + ErrorInternalError = -999 +}; /* ========================================================================== */ /* === Definitions ========================================================== */ /* ========================================================================== */ -// #define ONES_COMPLEMENT(r) (-(r)-1) - template IndexType ones_complement(const IndexType r) { return (-(r)-1); } /* -------------------------------------------------------------------------- */ -constexpr int ColamdEmpty = -1; +const int Empty = -1; /* Row and column status */ -constexpr int ColamdAlive = 0; -constexpr int ColamdDead = -1; +enum RowColumnStatus { + Alive = 0, + Dead = -1 +}; /* Column status */ -constexpr int ColamdDeadPrincipal = -1; -constexpr int ColamdDeadNonPrincipal = -2; +enum ColumnStatus { + DeadPrincipal = -1, + DeadNonPrincipal = -2 +}; /* ========================================================================== */ /* === Colamd reporting mechanism =========================================== */ @@ -128,7 +134,7 @@ constexpr int ColamdDeadNonPrincipal = -2; template struct colamd_col { - IndexType start ; /* index for A of first row in this column, or ColamdDead */ + IndexType start ; /* index for A of first row in this column, or Dead */ /* if column is dead */ IndexType length ; /* number of rows in this column */ union @@ -180,7 +186,7 @@ struct Colamd_Row /* Methods for row and column status update and checking. 
*/ template bool row_is_marked_dead(const IndexType row_mark) { - return row_mark < ColamdAlive; + return row_mark < Alive; } template @@ -190,37 +196,37 @@ bool row_is_dead(const Colamd_Row* row, const IndexType r) { template bool row_is_alive(const Colamd_Row* row, const IndexType r) { - return row[r].shared2.mark >= ColamdAlive; + return row[r].shared2.mark >= Alive; } template void kill_row(Colamd_Row* row, const IndexType r) { - row[r].shared2.mark = ColamdDead; + row[r].shared2.mark = Dead; } template bool col_is_dead(const colamd_col* col, const IndexType c) { - return col[c].start < ColamdAlive; + return col[c].start < Alive; } template bool col_is_alive(const colamd_col* col, const IndexType c) { - return col[c].start >= ColamdAlive; + return col[c].start >= Alive; } template bool col_is_dead_principal(const colamd_col* col, const IndexType c) { - return col[c].start == ColamdDeadPrincipal; + return col[c].start == DeadPrincipal; } template void kill_principal_col(colamd_col* col, const IndexType c) { - col[c].start = ColamdDeadPrincipal; + col[c].start = DeadPrincipal; } template void kill_non_principal_col(colamd_col* col, const IndexType c) { - col[c].start = ColamdDeadNonPrincipal; + col[c].start = DeadNonPrincipal; } /* ========================================================================== */ @@ -308,12 +314,12 @@ inline IndexType colamd_recommended ( IndexType nnz, IndexType n_row, IndexType /** * \brief set default parameters The use of this routine is optional. * - * Colamd: rows with more than (knobs [ColamdDenseRow] * n_col) + * Colamd: rows with more than (knobs [DenseRow] * n_col) * entries are removed prior to ordering. Columns with more than - * (knobs [ColamdDenseCol] * n_row) entries are removed prior to + * (knobs [DenseCol] * n_row) entries are removed prior to * ordering, and placed last in the output column ordering. * - * ColamdDenseRow and ColamdDenseCol are defined as 0 and 1, + * DenseRow and DenseCol are defined as 0 and 1, * respectively, in colamd.h. Default values of these two knobs * are both 0.5. Currently, only knobs [0] and knobs [1] are * used, but future versions may use more knobs. 
If so, they will @@ -340,8 +346,8 @@ static inline void colamd_set_defaults(double knobs[ColamdKnobs]) { knobs [i] = 0 ; } - knobs [ColamdDenseRow] = 0.5 ; /* ignore rows over 50% dense */ - knobs [ColamdDenseCol] = 0.5 ; /* ignore columns over 50% dense */ + knobs [DenseRow] = 0.5 ; /* ignore rows over 50% dense */ + knobs [DenseCol] = 0.5 ; /* ignore columns over 50% dense */ } /** @@ -391,36 +397,36 @@ static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType * { stats [i] = 0 ; } - stats [ColamdStatus] = ColamdOk ; - stats [ColamdInfo1] = -1 ; - stats [ColamdInfo2] = -1 ; + stats [Status] = Ok ; + stats [Info1] = -1 ; + stats [Info2] = -1 ; if (!A) /* A is not present */ { - stats [ColamdStatus] = ColamdErrorANotPresent ; + stats [Status] = ErrorANotPresent ; COLAMD_DEBUG0 (("colamd: A not present\n")) ; return (false) ; } if (!p) /* p is not present */ { - stats [ColamdStatus] = ColamdErrorPNotPresent ; + stats [Status] = ErrorPNotPresent ; COLAMD_DEBUG0 (("colamd: p not present\n")) ; return (false) ; } if (n_row < 0) /* n_row must be >= 0 */ { - stats [ColamdStatus] = ColamdErrorNrowNegative ; - stats [ColamdInfo1] = n_row ; + stats [Status] = ErrorNrowNegative ; + stats [Info1] = n_row ; COLAMD_DEBUG0 (("colamd: nrow negative %d\n", n_row)) ; return (false) ; } if (n_col < 0) /* n_col must be >= 0 */ { - stats [ColamdStatus] = ColamdErrorNcolNegative ; - stats [ColamdInfo1] = n_col ; + stats [Status] = ErrorNcolNegative ; + stats [Info1] = n_col ; COLAMD_DEBUG0 (("colamd: ncol negative %d\n", n_col)) ; return (false) ; } @@ -428,16 +434,16 @@ static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType * nnz = p [n_col] ; if (nnz < 0) /* nnz must be >= 0 */ { - stats [ColamdStatus] = ColamdErrorNnzNegative ; - stats [ColamdInfo1] = nnz ; + stats [Status] = ErrorNnzNegative ; + stats [Info1] = nnz ; COLAMD_DEBUG0 (("colamd: number of entries negative %d\n", nnz)) ; return (false) ; } if (p [0] != 0) { - stats [ColamdStatus] = ColamdErrorP0Nonzero ; - stats [ColamdInfo1] = p [0] ; + stats [Status] = ErrorP0Nonzero ; + stats [Info1] = p [0] ; COLAMD_DEBUG0 (("colamd: p[0] not zero %d\n", p [0])) ; return (false) ; } @@ -459,9 +465,9 @@ static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType * if (need > Alen) { /* not enough space in array A to perform the ordering */ - stats [ColamdStatus] = ColamdErrorATooSmall ; - stats [ColamdInfo1] = need ; - stats [ColamdInfo2] = Alen ; + stats [Status] = ErrorATooSmall ; + stats [Info1] = need ; + stats [Info2] = Alen ; COLAMD_DEBUG0 (("colamd: Need Alen >= %d, given only Alen = %d\n", need,Alen)); return (false) ; } @@ -495,9 +501,9 @@ static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType * /* === Return statistics in stats ======================================= */ - stats [ColamdDenseRow] = n_row - n_row2 ; - stats [ColamdDenseCol] = n_col - n_col2 ; - stats [ColamdDefragCount] = ngarbage ; + stats [DenseRow] = n_row - n_row2 ; + stats [DenseCol] = n_col - n_col2 ; + stats [DefragCount] = ngarbage ; COLAMD_DEBUG0 (("colamd: done.\n")) ; return (true) ; } @@ -555,24 +561,24 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ if ((Col [col].length) < 0) // extra parentheses to work-around gcc bug 10200 { /* column pointers must be non-decreasing */ - stats [ColamdStatus] = ColamdErrorColLengthNegative ; - stats [ColamdInfo1] = col ; - stats [ColamdInfo2] = Col [col].length ; + stats [Status] = ErrorColLengthNegative ; + stats 
[Info1] = col ; + stats [Info2] = Col [col].length ; COLAMD_DEBUG0 (("colamd: col %d length %d < 0\n", col, Col [col].length)) ; return (false) ; } Col [col].shared1.thickness = 1 ; Col [col].shared2.score = 0 ; - Col [col].shared3.prev = ColamdEmpty ; - Col [col].shared4.degree_next = ColamdEmpty ; + Col [col].shared3.prev = Empty ; + Col [col].shared4.degree_next = Empty ; } /* p [0..n_col] no longer needed, used as "head" in subsequent routines */ /* === Scan columns, compute row degrees, and check row indices ========= */ - stats [ColamdInfo3] = 0 ; /* number of duplicate or unsorted row indices*/ + stats [Info3] = 0 ; /* number of duplicate or unsorted row indices*/ for (row = 0 ; row < n_row ; row++) { @@ -594,10 +600,10 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ /* make sure row indices within range */ if (row < 0 || row >= n_row) { - stats [ColamdStatus] = ColamdErrorRowIndexOutOfBounds ; - stats [ColamdInfo1] = col ; - stats [ColamdInfo2] = row ; - stats [ColamdInfo3] = n_row ; + stats [Status] = ErrorRowIndexOutOfBounds ; + stats [Info1] = col ; + stats [Info2] = row ; + stats [Info3] = n_row ; COLAMD_DEBUG0 (("colamd: row %d col %d out of bounds\n", row, col)) ; return (false) ; } @@ -606,10 +612,10 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ { /* row index are unsorted or repeated (or both), thus col */ /* is jumbled. This is a notice, not an error condition. */ - stats [ColamdStatus] = ColamdOkButJumbled ; - stats [ColamdInfo1] = col ; - stats [ColamdInfo2] = row ; - (stats [ColamdInfo3]) ++ ; + stats [Status] = OkButJumbled ; + stats [Info1] = col ; + stats [Info2] = row ; + (stats [Info3]) ++ ; COLAMD_DEBUG1 (("colamd: row %d col %d unsorted/duplicate\n",row,col)); } @@ -647,7 +653,7 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ /* === Create row form ================================================== */ - if (stats [ColamdStatus] == ColamdOkButJumbled) + if (stats [Status] == OkButJumbled) { /* if cols jumbled, watch for repeated row indices */ for (col = 0 ; col < n_col ; col++) @@ -689,7 +695,7 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ /* === See if we need to re-create columns ============================== */ - if (stats [ColamdStatus] == ColamdOkButJumbled) + if (stats [Status] == OkButJumbled) { COLAMD_DEBUG0 (("colamd: reconstructing column form, matrix jumbled\n")) ; @@ -775,8 +781,8 @@ static void init_scoring /* === Extract knobs ==================================================== */ - dense_row_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [ColamdDenseRow] * n_col), n_col)) ; - dense_col_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [ColamdDenseCol] * n_row), n_row)) ; + dense_row_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [DenseRow] * n_col), n_col)) ; + dense_col_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [DenseCol] * n_row), n_row)) ; COLAMD_DEBUG1 (("colamd: densecount: %d %d\n", dense_row_count, dense_col_count)) ; max_deg = 0 ; n_col2 = n_col ; @@ -913,7 +919,7 @@ static void init_scoring /* clear the hash buckets */ for (c = 0 ; c <= n_col ; c++) { - head [c] = ColamdEmpty ; + head [c] = Empty ; } min_score = n_col ; /* place in reverse order, so low column indices are at the front */ @@ -934,16 +940,16 @@ static void init_scoring COLAMD_ASSERT (min_score <= n_col) ; COLAMD_ASSERT (score >= 0) ; COLAMD_ASSERT (score <= n_col) ; - 
COLAMD_ASSERT (head [score] >= ColamdEmpty) ; + COLAMD_ASSERT (head [score] >= Empty) ; /* now add this column to dList at proper score location */ next_col = head [score] ; - Col [c].shared3.prev = ColamdEmpty ; + Col [c].shared3.prev = Empty ; Col [c].shared4.degree_next = next_col ; /* if there already was a column with the same score, set its */ /* previous pointer to this new column */ - if (next_col != ColamdEmpty) + if (next_col != Empty) { Col [next_col].shared3.prev = c ; } @@ -1044,10 +1050,10 @@ static IndexType find_ordering /* return the number of garbage collections */ /* make sure degree list isn't empty */ COLAMD_ASSERT (min_score >= 0) ; COLAMD_ASSERT (min_score <= n_col) ; - COLAMD_ASSERT (head [min_score] >= ColamdEmpty) ; + COLAMD_ASSERT (head [min_score] >= Empty) ; /* get pivot column from head of minimum degree list */ - while (min_score < n_col && head [min_score] == ColamdEmpty) + while (min_score < n_col && head [min_score] == Empty) { min_score++ ; } @@ -1055,9 +1061,9 @@ static IndexType find_ordering /* return the number of garbage collections */ COLAMD_ASSERT (pivot_col >= 0 && pivot_col <= n_col) ; next_col = Col [pivot_col].shared4.degree_next ; head [min_score] = next_col ; - if (next_col != ColamdEmpty) + if (next_col != Empty) { - Col [next_col].shared3.prev = ColamdEmpty ; + Col [next_col].shared3.prev = Empty ; } COLAMD_ASSERT (col_is_alive(Col, pivot_col)) ; @@ -1163,7 +1169,7 @@ static IndexType find_ordering /* return the number of garbage collections */ else { /* there is no pivot row, since it is of zero length */ - pivot_row = ColamdEmpty ; + pivot_row = Empty ; COLAMD_ASSERT (pivot_row_length == 0) ; } COLAMD_ASSERT (Col [pivot_col].length > 0 || pivot_row_length == 0) ; @@ -1215,8 +1221,8 @@ static IndexType find_ordering /* return the number of garbage collections */ next_col = Col [col].shared4.degree_next ; COLAMD_ASSERT (cur_score >= 0) ; COLAMD_ASSERT (cur_score <= n_col) ; - COLAMD_ASSERT (cur_score >= ColamdEmpty) ; - if (prev_col == ColamdEmpty) + COLAMD_ASSERT (cur_score >= Empty) ; + if (prev_col == Empty) { head [cur_score] = next_col ; } @@ -1224,7 +1230,7 @@ static IndexType find_ordering /* return the number of garbage collections */ { Col [prev_col].shared4.degree_next = next_col ; } - if (next_col != ColamdEmpty) + if (next_col != Empty) { Col [next_col].shared3.prev = prev_col ; } @@ -1345,7 +1351,7 @@ static IndexType find_ordering /* return the number of garbage collections */ COLAMD_ASSERT (hash <= n_col) ; head_column = head [hash] ; - if (head_column > ColamdEmpty) + if (head_column > Empty) { /* degree list "hash" is non-empty, use prev (shared3) of */ /* first column in degree list as head of hash bucket */ @@ -1434,11 +1440,11 @@ static IndexType find_ordering /* return the number of garbage collections */ COLAMD_ASSERT (min_score <= n_col) ; COLAMD_ASSERT (cur_score >= 0) ; COLAMD_ASSERT (cur_score <= n_col) ; - COLAMD_ASSERT (head [cur_score] >= ColamdEmpty) ; + COLAMD_ASSERT (head [cur_score] >= Empty) ; next_col = head [cur_score] ; Col [col].shared4.degree_next = next_col ; - Col [col].shared3.prev = ColamdEmpty ; - if (next_col != ColamdEmpty) + Col [col].shared3.prev = Empty ; + if (next_col != Empty) { Col [next_col].shared3.prev = col ; } @@ -1508,7 +1514,7 @@ static inline void order_children { /* find an un-ordered non-principal column */ COLAMD_ASSERT (col_is_dead(Col, i)) ; - if (!col_is_dead_principal(Col, i) && Col [i].shared2.order == ColamdEmpty) + if (!col_is_dead_principal(Col, i) && Col 
[i].shared2.order == Empty) { parent = i ; /* once found, find its principal parent */ @@ -1525,7 +1531,7 @@ static inline void order_children do { - COLAMD_ASSERT (Col [c].shared2.order == ColamdEmpty) ; + COLAMD_ASSERT (Col [c].shared2.order == Empty) ; /* order this column */ Col [c].shared2.order = order++ ; @@ -1538,7 +1544,7 @@ static inline void order_children /* continue until we hit an ordered column. There are */ /* guaranteed not to be anymore unordered columns */ /* above an ordered column */ - } while (Col [c].shared2.order == ColamdEmpty) ; + } while (Col [c].shared2.order == Empty) ; /* re-order the super_col parent to largest order for this group */ Col [parent].shared2.order = order ; @@ -1633,7 +1639,7 @@ static void detect_super_cols /* === Get the first column in this hash bucket ===================== */ head_column = head [hash] ; - if (head_column > ColamdEmpty) + if (head_column > Empty) { first_col = Col [head_column].shared3.headhash ; } @@ -1644,7 +1650,7 @@ static void detect_super_cols /* === Consider each column in the hash bucket ====================== */ - for (super_c = first_col ; super_c != ColamdEmpty ; + for (super_c = first_col ; super_c != Empty ; super_c = Col [super_c].shared4.hash_next) { COLAMD_ASSERT (col_is_alive(Col, super_c)) ; @@ -1657,7 +1663,7 @@ static void detect_super_cols /* === Compare super_c with all columns after it ================ */ for (c = Col [super_c].shared4.hash_next ; - c != ColamdEmpty ; c = Col [c].shared4.hash_next) + c != Empty ; c = Col [c].shared4.hash_next) { COLAMD_ASSERT (c != super_c) ; COLAMD_ASSERT (col_is_alive(Col, c)) ; @@ -1703,7 +1709,7 @@ static void detect_super_cols Col [c].shared1.parent = super_c ; kill_non_principal_col(Col, c) ; /* order c later, in order_children() */ - Col [c].shared2.order = ColamdEmpty ; + Col [c].shared2.order = Empty ; /* remove c from hash bucket */ Col [prev_c].shared4.hash_next = Col [c].shared4.hash_next ; } @@ -1711,15 +1717,15 @@ static void detect_super_cols /* === Empty this hash bucket ======================================= */ - if (head_column > ColamdEmpty) + if (head_column > Empty) { /* corresponding degree list "hash" is not empty */ - Col [head_column].shared3.headhash = ColamdEmpty ; + Col [head_column].shared3.headhash = Empty ; } else { /* corresponding degree list "hash" is empty */ - head [hash] = ColamdEmpty ; + head [hash] = Empty ; } } } From 15f3d9d2720f060100b2559058c9383b6ffa4d3e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 3 Sep 2019 00:50:51 +0200 Subject: [PATCH 26/30] More colamd cleanup: - Move colamd implementation in its own namespace to avoid polluting the internal namespace with Ok, Status, etc. - Fix signed/unsigned warning - move some ugly free functions as member functions --- Eigen/src/OrderingMethods/Eigen_Colamd.h | 335 +++++++++++------------ Eigen/src/OrderingMethods/Ordering.h | 10 +- 2 files changed, 158 insertions(+), 187 deletions(-) diff --git a/Eigen/src/OrderingMethods/Eigen_Colamd.h b/Eigen/src/OrderingMethods/Eigen_Colamd.h index 3c9e85aa8..8e339a704 100644 --- a/Eigen/src/OrderingMethods/Eigen_Colamd.h +++ b/Eigen/src/OrderingMethods/Eigen_Colamd.h @@ -47,25 +47,26 @@ #ifndef EIGEN_COLAMD_H #define EIGEN_COLAMD_H -/* ========================================================================== */ -/* === Knob and statistics definitions used elsewhere ======================= */ -/* ========================================================================== */ - -/* size of the knobs [ ] array. 
Only knobs [0..1] are currently used. */ -const size_t ColamdKnobs = 20; - -/* number of output statistics. Only stats [0..6] are currently used. */ -const size_t ColamdStats = 20; - namespace internal { + +namespace Colamd { + /* Ensure that debugging is turned off: */ #ifndef COLAMD_NDEBUG #define COLAMD_NDEBUG #endif /* NDEBUG */ + + /* ========================================================================== */ /* === Knob and statistics definitions ====================================== */ /* ========================================================================== */ +/* size of the knobs [ ] array. Only knobs [0..1] are currently used. */ +const int NKnobs = 20; + +/* number of output statistics. Only stats [0..6] are currently used. */ +const int NStats = 20; + /* Indices into knobs and stats array. */ enum KnobsStatsIndex { /* knobs [0] and stats [0]: dense row knob and output statistic. */ @@ -132,7 +133,7 @@ enum ColumnStatus { // == Row and Column structures == template -struct colamd_col +struct ColStructure { IndexType start ; /* index for A of first row in this column, or Dead */ /* if column is dead */ @@ -163,10 +164,20 @@ struct colamd_col IndexType hash_next ; /* next column, if col is in a hash list */ } shared4 ; + inline bool is_dead() const { return start < Alive; } + + inline bool is_alive() const { return start >= Alive; } + + inline bool is_dead_principal() const { return start == DeadPrincipal; } + + inline void kill_principal() { start = DeadPrincipal; } + + inline void kill_non_principal() { start = DeadNonPrincipal; } + }; template -struct Colamd_Row +struct RowStructure { IndexType start ; /* index for A of first col in this row */ IndexType length ; /* number of principal columns in this row */ @@ -181,54 +192,14 @@ struct Colamd_Row IndexType first_column ;/* first column in row (used in garbage collection) */ } shared2 ; + inline bool is_dead() const { return shared2.mark < Alive; } + + inline bool is_alive() const { return shared2.mark >= Alive; } + + inline void kill() { shared2.mark = Dead; } + }; -/* Methods for row and column status update and checking. 
*/ -template -bool row_is_marked_dead(const IndexType row_mark) { - return row_mark < Alive; -} - -template -bool row_is_dead(const Colamd_Row* row, const IndexType r) { - return row_is_marked_dead(row[r].shared2.mark); -} - -template -bool row_is_alive(const Colamd_Row* row, const IndexType r) { - return row[r].shared2.mark >= Alive; -} - -template -void kill_row(Colamd_Row* row, const IndexType r) { - row[r].shared2.mark = Dead; -} - -template -bool col_is_dead(const colamd_col* col, const IndexType c) { - return col[c].start < Alive; -} - -template -bool col_is_alive(const colamd_col* col, const IndexType c) { - return col[c].start >= Alive; -} - -template -bool col_is_dead_principal(const colamd_col* col, const IndexType c) { - return col[c].start == DeadPrincipal; -} - -template -void kill_principal_col(colamd_col* col, const IndexType c) { - col[c].start = DeadPrincipal; -} - -template -void kill_non_principal_col(colamd_col* col, const IndexType c) { - col[c].start = DeadNonPrincipal; -} - /* ========================================================================== */ /* === Colamd recommended memory size ======================================= */ /* ========================================================================== */ @@ -249,33 +220,33 @@ void kill_non_principal_col(colamd_col* col, const IndexType c) { */ template inline IndexType colamd_c(IndexType n_col) -{ return IndexType( ((n_col) + 1) * sizeof (colamd_col) / sizeof (IndexType) ) ; } +{ return IndexType( ((n_col) + 1) * sizeof (ColStructure) / sizeof (IndexType) ) ; } template inline IndexType colamd_r(IndexType n_row) -{ return IndexType(((n_row) + 1) * sizeof (Colamd_Row) / sizeof (IndexType)); } +{ return IndexType(((n_row) + 1) * sizeof (RowStructure) / sizeof (IndexType)); } // Prototypes of non-user callable routines template -static IndexType init_rows_cols (IndexType n_row, IndexType n_col, Colamd_Row Row [], colamd_col col [], IndexType A [], IndexType p [], IndexType stats[ColamdStats] ); +static IndexType init_rows_cols (IndexType n_row, IndexType n_col, RowStructure Row [], ColStructure col [], IndexType A [], IndexType p [], IndexType stats[NStats] ); template -static void init_scoring (IndexType n_row, IndexType n_col, Colamd_Row Row [], colamd_col Col [], IndexType A [], IndexType head [], double knobs[ColamdKnobs], IndexType *p_n_row2, IndexType *p_n_col2, IndexType *p_max_deg); +static void init_scoring (IndexType n_row, IndexType n_col, RowStructure Row [], ColStructure Col [], IndexType A [], IndexType head [], double knobs[NKnobs], IndexType *p_n_row2, IndexType *p_n_col2, IndexType *p_max_deg); template -static IndexType find_ordering (IndexType n_row, IndexType n_col, IndexType Alen, Colamd_Row Row [], colamd_col Col [], IndexType A [], IndexType head [], IndexType n_col2, IndexType max_deg, IndexType pfree); +static IndexType find_ordering (IndexType n_row, IndexType n_col, IndexType Alen, RowStructure Row [], ColStructure Col [], IndexType A [], IndexType head [], IndexType n_col2, IndexType max_deg, IndexType pfree); template -static void order_children (IndexType n_col, colamd_col Col [], IndexType p []); +static void order_children (IndexType n_col, ColStructure Col [], IndexType p []); template -static void detect_super_cols (colamd_col Col [], IndexType A [], IndexType head [], IndexType row_start, IndexType row_length ) ; +static void detect_super_cols (ColStructure Col [], IndexType A [], IndexType head [], IndexType row_start, IndexType row_length ) ; template -static IndexType 
garbage_collection (IndexType n_row, IndexType n_col, Colamd_Row Row [], colamd_col Col [], IndexType A [], IndexType *pfree) ; +static IndexType garbage_collection (IndexType n_row, IndexType n_col, RowStructure Row [], ColStructure Col [], IndexType A [], IndexType *pfree) ; template -static inline IndexType clear_mark (IndexType n_row, Colamd_Row Row [] ) ; +static inline IndexType clear_mark (IndexType n_row, RowStructure Row [] ) ; /* === No debugging ========================================================= */ @@ -303,7 +274,7 @@ static inline IndexType clear_mark (IndexType n_row, Colamd_Row Row * \return recommended value of Alen for use by colamd */ template -inline IndexType colamd_recommended ( IndexType nnz, IndexType n_row, IndexType n_col) +inline IndexType recommended ( IndexType nnz, IndexType n_row, IndexType n_col) { if ((nnz) < 0 || (n_row) < 0 || (n_col) < 0) return (-1); @@ -332,7 +303,7 @@ inline IndexType colamd_recommended ( IndexType nnz, IndexType n_row, IndexType * \param knobs parameter settings for colamd */ -static inline void colamd_set_defaults(double knobs[ColamdKnobs]) +static inline void set_defaults(double knobs[NKnobs]) { /* === Local variables ================================================== */ @@ -342,12 +313,12 @@ static inline void colamd_set_defaults(double knobs[ColamdKnobs]) { return ; /* no knobs to initialize */ } - for (i = 0 ; i < ColamdKnobs ; i++) + for (i = 0 ; i < NKnobs ; i++) { knobs [i] = 0 ; } - knobs [DenseRow] = 0.5 ; /* ignore rows over 50% dense */ - knobs [DenseCol] = 0.5 ; /* ignore columns over 50% dense */ + knobs [Colamd::DenseRow] = 0.5 ; /* ignore rows over 50% dense */ + knobs [Colamd::DenseCol] = 0.5 ; /* ignore columns over 50% dense */ } /** @@ -368,7 +339,7 @@ static inline void colamd_set_defaults(double knobs[ColamdKnobs]) * \param stats colamd output statistics and error codes */ template -static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType *A, IndexType *p, double knobs[ColamdKnobs], IndexType stats[ColamdStats]) +static bool compute_ordering(IndexType n_row, IndexType n_col, IndexType Alen, IndexType *A, IndexType *p, double knobs[NKnobs], IndexType stats[NStats]) { /* === Local variables ================================================== */ @@ -377,13 +348,13 @@ static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType * IndexType Row_size ; /* size of Row [], in integers */ IndexType Col_size ; /* size of Col [], in integers */ IndexType need ; /* minimum required length of A */ - Colamd_Row *Row ; /* pointer into A of Row [0..n_row] array */ - colamd_col *Col ; /* pointer into A of Col [0..n_col] array */ + Colamd::RowStructure *Row ; /* pointer into A of Row [0..n_row] array */ + Colamd::ColStructure *Col ; /* pointer into A of Col [0..n_col] array */ IndexType n_col2 ; /* number of non-dense, non-empty columns */ IndexType n_row2 ; /* number of non-dense, non-empty rows */ IndexType ngarbage ; /* number of garbage collections performed */ IndexType max_deg ; /* maximum row degree */ - double default_knobs [ColamdKnobs] ; /* default knobs array */ + double default_knobs [NKnobs] ; /* default knobs array */ /* === Check the input arguments ======================================== */ @@ -393,40 +364,40 @@ static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType * COLAMD_DEBUG0 (("colamd: stats not present\n")) ; return (false) ; } - for (i = 0 ; i < ColamdStats ; i++) + for (i = 0 ; i < NStats ; i++) { stats [i] = 0 ; } - stats [Status] = Ok 
; - stats [Info1] = -1 ; - stats [Info2] = -1 ; + stats [Colamd::Status] = Colamd::Ok ; + stats [Colamd::Info1] = -1 ; + stats [Colamd::Info2] = -1 ; if (!A) /* A is not present */ { - stats [Status] = ErrorANotPresent ; + stats [Colamd::Status] = Colamd::ErrorANotPresent ; COLAMD_DEBUG0 (("colamd: A not present\n")) ; return (false) ; } if (!p) /* p is not present */ { - stats [Status] = ErrorPNotPresent ; + stats [Colamd::Status] = Colamd::ErrorPNotPresent ; COLAMD_DEBUG0 (("colamd: p not present\n")) ; return (false) ; } if (n_row < 0) /* n_row must be >= 0 */ { - stats [Status] = ErrorNrowNegative ; - stats [Info1] = n_row ; + stats [Colamd::Status] = Colamd::ErrorNrowNegative ; + stats [Colamd::Info1] = n_row ; COLAMD_DEBUG0 (("colamd: nrow negative %d\n", n_row)) ; return (false) ; } if (n_col < 0) /* n_col must be >= 0 */ { - stats [Status] = ErrorNcolNegative ; - stats [Info1] = n_col ; + stats [Colamd::Status] = Colamd::ErrorNcolNegative ; + stats [Colamd::Info1] = n_col ; COLAMD_DEBUG0 (("colamd: ncol negative %d\n", n_col)) ; return (false) ; } @@ -434,16 +405,16 @@ static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType * nnz = p [n_col] ; if (nnz < 0) /* nnz must be >= 0 */ { - stats [Status] = ErrorNnzNegative ; - stats [Info1] = nnz ; + stats [Colamd::Status] = Colamd::ErrorNnzNegative ; + stats [Colamd::Info1] = nnz ; COLAMD_DEBUG0 (("colamd: number of entries negative %d\n", nnz)) ; return (false) ; } if (p [0] != 0) { - stats [Status] = ErrorP0Nonzero ; - stats [Info1] = p [0] ; + stats [Colamd::Status] = Colamd::ErrorP0Nonzero ; + stats [Colamd::Info1] = p [0] ; COLAMD_DEBUG0 (("colamd: p[0] not zero %d\n", p [0])) ; return (false) ; } @@ -452,7 +423,7 @@ static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType * if (!knobs) { - colamd_set_defaults (default_knobs) ; + set_defaults (default_knobs) ; knobs = default_knobs ; } @@ -465,20 +436,20 @@ static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType * if (need > Alen) { /* not enough space in array A to perform the ordering */ - stats [Status] = ErrorATooSmall ; - stats [Info1] = need ; - stats [Info2] = Alen ; + stats [Colamd::Status] = Colamd::ErrorATooSmall ; + stats [Colamd::Info1] = need ; + stats [Colamd::Info2] = Alen ; COLAMD_DEBUG0 (("colamd: Need Alen >= %d, given only Alen = %d\n", need,Alen)); return (false) ; } Alen -= Col_size + Row_size ; - Col = (colamd_col *) &A [Alen] ; - Row = (Colamd_Row *) &A [Alen + Col_size] ; + Col = (ColStructure *) &A [Alen] ; + Row = (RowStructure *) &A [Alen + Col_size] ; /* === Construct the row and column data structures ===================== */ - if (!Eigen::internal::init_rows_cols (n_row, n_col, Row, Col, A, p, stats)) + if (!Colamd::init_rows_cols (n_row, n_col, Row, Col, A, p, stats)) { /* input matrix is invalid */ COLAMD_DEBUG0 (("colamd: Matrix invalid\n")) ; @@ -487,23 +458,23 @@ static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType * /* === Initialize scores, kill dense rows/columns ======================= */ - Eigen::internal::init_scoring (n_row, n_col, Row, Col, A, p, knobs, + Colamd::init_scoring (n_row, n_col, Row, Col, A, p, knobs, &n_row2, &n_col2, &max_deg) ; /* === Order the supercolumns =========================================== */ - ngarbage = Eigen::internal::find_ordering (n_row, n_col, Alen, Row, Col, A, p, + ngarbage = Colamd::find_ordering (n_row, n_col, Alen, Row, Col, A, p, n_col2, max_deg, 2*nnz) ; /* === Order the non-principal columns 
================================== */ - Eigen::internal::order_children (n_col, Col, p) ; + Colamd::order_children (n_col, Col, p) ; /* === Return statistics in stats ======================================= */ - stats [DenseRow] = n_row - n_row2 ; - stats [DenseCol] = n_col - n_col2 ; - stats [DefragCount] = ngarbage ; + stats [Colamd::DenseRow] = n_row - n_row2 ; + stats [Colamd::DenseCol] = n_col - n_col2 ; + stats [Colamd::DefragCount] = ngarbage ; COLAMD_DEBUG0 (("colamd: done.\n")) ; return (true) ; } @@ -514,7 +485,6 @@ static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType * /* There are no user-callable routines beyond this point in the file */ - /* ========================================================================== */ /* === init_rows_cols ======================================================= */ /* ========================================================================== */ @@ -534,11 +504,11 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ IndexType n_row, /* number of rows of A */ IndexType n_col, /* number of columns of A */ - Colamd_Row Row [], /* of size n_row+1 */ - colamd_col Col [], /* of size n_col+1 */ + RowStructure Row [], /* of size n_row+1 */ + ColStructure Col [], /* of size n_col+1 */ IndexType A [], /* row indices of A, of size Alen */ IndexType p [], /* pointers to columns in A, of size n_col+1 */ - IndexType stats [ColamdStats] /* colamd statistics */ + IndexType stats [NStats] /* colamd statistics */ ) { /* === Local variables ================================================== */ @@ -561,9 +531,9 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ if ((Col [col].length) < 0) // extra parentheses to work-around gcc bug 10200 { /* column pointers must be non-decreasing */ - stats [Status] = ErrorColLengthNegative ; - stats [Info1] = col ; - stats [Info2] = Col [col].length ; + stats [Colamd::Status] = Colamd::ErrorColLengthNegative ; + stats [Colamd::Info1] = col ; + stats [Colamd::Info2] = Col [col].length ; COLAMD_DEBUG0 (("colamd: col %d length %d < 0\n", col, Col [col].length)) ; return (false) ; } @@ -600,10 +570,10 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ /* make sure row indices within range */ if (row < 0 || row >= n_row) { - stats [Status] = ErrorRowIndexOutOfBounds ; - stats [Info1] = col ; - stats [Info2] = row ; - stats [Info3] = n_row ; + stats [Colamd::Status] = Colamd::ErrorRowIndexOutOfBounds ; + stats [Colamd::Info1] = col ; + stats [Colamd::Info2] = row ; + stats [Colamd::Info3] = n_row ; COLAMD_DEBUG0 (("colamd: row %d col %d out of bounds\n", row, col)) ; return (false) ; } @@ -612,10 +582,10 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ { /* row index are unsorted or repeated (or both), thus col */ /* is jumbled. This is a notice, not an error condition. 
*/ - stats [Status] = OkButJumbled ; - stats [Info1] = col ; - stats [Info2] = row ; - (stats [Info3]) ++ ; + stats [Colamd::Status] = Colamd::OkButJumbled ; + stats [Colamd::Info1] = col ; + stats [Colamd::Info2] = row ; + (stats [Colamd::Info3]) ++ ; COLAMD_DEBUG1 (("colamd: row %d col %d unsorted/duplicate\n",row,col)); } @@ -750,11 +720,11 @@ static void init_scoring IndexType n_row, /* number of rows of A */ IndexType n_col, /* number of columns of A */ - Colamd_Row Row [], /* of size n_row+1 */ - colamd_col Col [], /* of size n_col+1 */ + RowStructure Row [], /* of size n_row+1 */ + ColStructure Col [], /* of size n_col+1 */ IndexType A [], /* column form and row form of A */ IndexType head [], /* of size n_col+1 */ - double knobs [ColamdKnobs],/* parameters */ + double knobs [NKnobs],/* parameters */ IndexType *p_n_row2, /* number of non-dense, non-empty rows */ IndexType *p_n_col2, /* number of non-dense, non-empty columns */ IndexType *p_max_deg /* maximum row degree */ @@ -781,8 +751,8 @@ static void init_scoring /* === Extract knobs ==================================================== */ - dense_row_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [DenseRow] * n_col), n_col)) ; - dense_col_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [DenseCol] * n_row), n_row)) ; + dense_row_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [Colamd::DenseRow] * n_col), n_col)) ; + dense_col_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [Colamd::DenseCol] * n_row), n_row)) ; COLAMD_DEBUG1 (("colamd: densecount: %d %d\n", dense_row_count, dense_col_count)) ; max_deg = 0 ; n_col2 = n_col ; @@ -799,7 +769,7 @@ static void init_scoring { /* this is a empty column, kill and order it last */ Col [c].shared2.order = --n_col2 ; - kill_principal_col(Col, c) ; + Col[c].kill_principal() ; } } COLAMD_DEBUG1 (("colamd: null columns killed: %d\n", n_col - n_col2)) ; @@ -810,7 +780,7 @@ static void init_scoring for (c = n_col-1 ; c >= 0 ; c--) { /* skip any dead columns */ - if (col_is_dead(Col, c)) + if (Col[c].is_dead()) { continue ; } @@ -826,7 +796,7 @@ static void init_scoring { Row [*cp++].shared1.degree-- ; } - kill_principal_col(Col, c) ; + Col[c].kill_principal() ; } } COLAMD_DEBUG1 (("colamd: Dense and null columns killed: %d\n", n_col - n_col2)) ; @@ -840,7 +810,7 @@ static void init_scoring if (deg > dense_row_count || deg == 0) { /* kill a dense or empty row */ - kill_row(Row, r) ; + Row[r].kill() ; --n_row2 ; } else @@ -862,7 +832,7 @@ static void init_scoring for (c = n_col-1 ; c >= 0 ; c--) { /* skip dead column */ - if (col_is_dead(Col, c)) + if (Col[c].is_dead()) { continue ; } @@ -875,7 +845,7 @@ static void init_scoring /* get a row */ row = *cp++ ; /* skip if dead */ - if (row_is_dead(Row, row)) + if (Row[row].is_dead()) { continue ; } @@ -894,7 +864,7 @@ static void init_scoring /* and have already been killed) */ COLAMD_DEBUG2 (("Newly null killed: %d\n", c)) ; Col [c].shared2.order = --n_col2 ; - kill_principal_col(Col, c) ; + Col[c].kill_principal() ; } else { @@ -927,7 +897,7 @@ static void init_scoring for (c = n_col-1 ; c >= 0 ; c--) { /* only add principal columns to degree lists */ - if (col_is_alive(Col, c)) + if (Col[c].is_alive()) { COLAMD_DEBUG4 (("place %d score %d minscore %d ncol %d\n", c, Col [c].shared2.score, min_score, n_col)) ; @@ -988,8 +958,8 @@ static IndexType find_ordering /* return the number of garbage collections */ IndexType n_row, /* number of rows of A */ IndexType n_col, /* number of columns 
of A */ IndexType Alen, /* size of A, 2*nnz + n_col or larger */ - Colamd_Row Row [], /* of size n_row+1 */ - colamd_col Col [], /* of size n_col+1 */ + RowStructure Row [], /* of size n_row+1 */ + ColStructure Col [], /* of size n_col+1 */ IndexType A [], /* column form and row form of A */ IndexType head [], /* of size n_col+1 */ IndexType n_col2, /* Remaining columns to order */ @@ -1035,7 +1005,7 @@ static IndexType find_ordering /* return the number of garbage collections */ /* === Initialization and clear mark ==================================== */ max_mark = INT_MAX - n_col ; /* INT_MAX defined in */ - tag_mark = Eigen::internal::clear_mark (n_row, Row) ; + tag_mark = Colamd::clear_mark (n_row, Row) ; min_score = 0 ; ngarbage = 0 ; COLAMD_DEBUG1 (("colamd: Ordering, n_col2=%d\n", n_col2)) ; @@ -1066,7 +1036,7 @@ static IndexType find_ordering /* return the number of garbage collections */ Col [next_col].shared3.prev = Empty ; } - COLAMD_ASSERT (col_is_alive(Col, pivot_col)) ; + COLAMD_ASSERT (Col[pivot_col].is_alive()) ; COLAMD_DEBUG3 (("Pivot col: %d\n", pivot_col)) ; /* remember score for defrag check */ @@ -1085,12 +1055,12 @@ static IndexType find_ordering /* return the number of garbage collections */ needed_memory = numext::mini(pivot_col_score, n_col - k) ; if (pfree + needed_memory >= Alen) { - pfree = Eigen::internal::garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ; + pfree = Colamd::garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ; ngarbage++ ; /* after garbage collection we will have enough */ COLAMD_ASSERT (pfree + needed_memory < Alen) ; /* garbage collection has wiped out the Row[].shared2.mark array */ - tag_mark = Eigen::internal::clear_mark (n_row, Row) ; + tag_mark = Colamd::clear_mark (n_row, Row) ; } @@ -1113,9 +1083,9 @@ static IndexType find_ordering /* return the number of garbage collections */ { /* get a row */ row = *cp++ ; - COLAMD_DEBUG4 (("Pivot col pattern %d %d\n", row_is_alive(Row, row), row)) ; + COLAMD_DEBUG4 (("Pivot col pattern %d %d\n", Row[row].is_alive(), row)) ; /* skip if row is dead */ - if (row_is_dead(Row, row)) + if (Row[row].is_dead()) { continue ; } @@ -1127,7 +1097,7 @@ static IndexType find_ordering /* return the number of garbage collections */ col = *rp++ ; /* add the column, if alive and untagged */ col_thickness = Col [col].shared1.thickness ; - if (col_thickness > 0 && col_is_alive(Col, col)) + if (col_thickness > 0 && Col[col].is_alive()) { /* tag column in pivot row */ Col [col].shared1.thickness = -col_thickness ; @@ -1154,7 +1124,7 @@ static IndexType find_ordering /* return the number of garbage collections */ /* may be killing an already dead row */ row = *cp++ ; COLAMD_DEBUG3 (("Kill row in pivot col: %d\n", row)) ; - kill_row(Row, row) ; + Row[row].kill() ; } /* === Select a row index to use as the new pivot row =============== */ @@ -1206,7 +1176,7 @@ static IndexType find_ordering /* return the number of garbage collections */ while (rp < rp_end) { col = *rp++ ; - COLAMD_ASSERT (col_is_alive(Col, col) && col != pivot_col) ; + COLAMD_ASSERT (Col[col].is_alive() && col != pivot_col) ; COLAMD_DEBUG3 (("Col: %d\n", col)) ; /* clear tags used to construct pivot row pattern */ @@ -1243,12 +1213,12 @@ static IndexType find_ordering /* return the number of garbage collections */ { /* get a row */ row = *cp++ ; - row_mark = Row [row].shared2.mark ; /* skip if dead */ - if (row_is_marked_dead (row_mark)) + if (Row[row].is_dead()) { continue ; } + row_mark = Row [row].shared2.mark ; COLAMD_ASSERT (row != 
pivot_row) ; set_difference = row_mark - tag_mark ; /* check if the row has been seen yet */ @@ -1264,7 +1234,7 @@ static IndexType find_ordering /* return the number of garbage collections */ if (set_difference == 0) { COLAMD_DEBUG3 (("aggressive absorption. Row: %d\n", row)) ; - kill_row(Row, row) ; + Row[row].kill() ; } else { @@ -1286,7 +1256,7 @@ static IndexType find_ordering /* return the number of garbage collections */ { /* get a column */ col = *rp++ ; - COLAMD_ASSERT (col_is_alive(Col, col) && col != pivot_col) ; + COLAMD_ASSERT (Col[col].is_alive() && col != pivot_col) ; hash = 0 ; cur_score = 0 ; cp = &A [Col [col].start] ; @@ -1301,12 +1271,12 @@ static IndexType find_ordering /* return the number of garbage collections */ /* get a row */ row = *cp++ ; COLAMD_ASSERT(row >= 0 && row < n_row) ; - row_mark = Row [row].shared2.mark ; /* skip if dead */ - if (row_is_marked_dead (row_mark)) + if (Row [row].is_dead()) { continue ; } + row_mark = Row [row].shared2.mark ; COLAMD_ASSERT (row_mark > tag_mark) ; /* compact the column */ *new_cp++ = row ; @@ -1327,7 +1297,7 @@ static IndexType find_ordering /* return the number of garbage collections */ { COLAMD_DEBUG4 (("further mass elimination. Col: %d\n", col)) ; /* nothing left but the pivot row in this column */ - kill_principal_col(Col, col) ; + Col[col].kill_principal() ; pivot_row_degree -= Col [col].shared1.thickness ; COLAMD_ASSERT (pivot_row_degree >= 0) ; /* order it */ @@ -1368,7 +1338,7 @@ static IndexType find_ordering /* return the number of garbage collections */ /* save hash function in Col [col].shared3.hash */ Col [col].shared3.hash = (IndexType) hash ; - COLAMD_ASSERT (col_is_alive(Col, col)) ; + COLAMD_ASSERT (Col[col].is_alive()) ; } } @@ -1378,11 +1348,11 @@ static IndexType find_ordering /* return the number of garbage collections */ COLAMD_DEBUG3 (("** Supercolumn detection phase. **\n")) ; - Eigen::internal::detect_super_cols (Col, A, head, pivot_row_start, pivot_row_length) ; + Colamd::detect_super_cols (Col, A, head, pivot_row_start, pivot_row_length) ; /* === Kill the pivotal column ====================================== */ - kill_principal_col(Col, pivot_col) ; + Col[pivot_col].kill_principal() ; /* === Clear mark =================================================== */ @@ -1390,7 +1360,7 @@ static IndexType find_ordering /* return the number of garbage collections */ if (tag_mark >= max_mark) { COLAMD_DEBUG2 (("clearing tag_mark\n")) ; - tag_mark = Eigen::internal::clear_mark (n_row, Row) ; + tag_mark = Colamd::clear_mark (n_row, Row) ; } /* === Finalize the new pivot row, and column scores ================ */ @@ -1406,7 +1376,7 @@ static IndexType find_ordering /* return the number of garbage collections */ { col = *rp++ ; /* skip dead columns */ - if (col_is_dead(Col, col)) + if (Col[col].is_dead()) { continue ; } @@ -1497,7 +1467,7 @@ static inline void order_children /* === Parameters ======================================================= */ IndexType n_col, /* number of columns of A */ - colamd_col Col [], /* of size n_col+1 */ + ColStructure Col [], /* of size n_col+1 */ IndexType p [] /* p [0 ... 
n_col-1] is the column permutation*/ ) { @@ -1514,14 +1484,14 @@ static inline void order_children { /* find an un-ordered non-principal column */ COLAMD_ASSERT (col_is_dead(Col, i)) ; - if (!col_is_dead_principal(Col, i) && Col [i].shared2.order == Empty) + if (!Col[i].is_dead_principal() && Col [i].shared2.order == Empty) { parent = i ; /* once found, find its principal parent */ do { parent = Col [parent].shared1.parent ; - } while (!col_is_dead_principal(Col, parent)) ; + } while (!Col[parent].is_dead_principal()) ; /* now, order all un-ordered non-principal columns along path */ /* to this parent. collapse tree at the same time */ @@ -1597,7 +1567,7 @@ static void detect_super_cols ( /* === Parameters ======================================================= */ - colamd_col Col [], /* of size n_col+1 */ + ColStructure Col [], /* of size n_col+1 */ IndexType A [], /* row indices of A */ IndexType head [], /* head of degree lists and hash buckets */ IndexType row_start, /* pointer to set of columns to check */ @@ -1627,7 +1597,7 @@ static void detect_super_cols while (rp < rp_end) { col = *rp++ ; - if (col_is_dead(Col, col)) + if (Col[col].is_dead()) { continue ; } @@ -1653,7 +1623,7 @@ static void detect_super_cols for (super_c = first_col ; super_c != Empty ; super_c = Col [super_c].shared4.hash_next) { - COLAMD_ASSERT (col_is_alive(Col, super_c)) ; + COLAMD_ASSERT (Col [super_c].is_alive()) ; COLAMD_ASSERT (Col [super_c].shared3.hash == hash) ; length = Col [super_c].length ; @@ -1666,7 +1636,7 @@ static void detect_super_cols c != Empty ; c = Col [c].shared4.hash_next) { COLAMD_ASSERT (c != super_c) ; - COLAMD_ASSERT (col_is_alive(Col, c)) ; + COLAMD_ASSERT (Col[c].is_alive()) ; COLAMD_ASSERT (Col [c].shared3.hash == hash) ; /* not identical if lengths or scores are different */ @@ -1684,8 +1654,8 @@ static void detect_super_cols for (i = 0 ; i < length ; i++) { /* the columns are "clean" (no dead rows) */ - COLAMD_ASSERT (ROW_IS_ALIVE (*cp1)) ; - COLAMD_ASSERT (ROW_IS_ALIVE (*cp2)) ; + COLAMD_ASSERT ( cp1->is_alive() ); + COLAMD_ASSERT ( cp2->is_alive() ); /* row indices will same order for both supercols, */ /* no gather scatter necessary */ if (*cp1++ != *cp2++) @@ -1707,7 +1677,7 @@ static void detect_super_cols Col [super_c].shared1.thickness += Col [c].shared1.thickness ; Col [c].shared1.parent = super_c ; - kill_non_principal_col(Col, c) ; + Col[c].kill_non_principal() ; /* order c later, in order_children() */ Col [c].shared2.order = Empty ; /* remove c from hash bucket */ @@ -1750,8 +1720,8 @@ static IndexType garbage_collection /* returns the new value of pfree */ IndexType n_row, /* number of rows */ IndexType n_col, /* number of columns */ - Colamd_Row Row [], /* row info */ - colamd_col Col [], /* column info */ + RowStructure Row [], /* row info */ + ColStructure Col [], /* column info */ IndexType A [], /* A [0 ... Alen-1] holds the matrix */ IndexType *pfree /* &A [0] ... 
pfree is in use */ ) @@ -1770,7 +1740,7 @@ static IndexType garbage_collection /* returns the new value of pfree */ pdest = &A[0] ; for (c = 0 ; c < n_col ; c++) { - if (col_is_alive(Col, c)) + if (Col[c].is_alive()) { psrc = &A [Col [c].start] ; @@ -1781,7 +1751,7 @@ static IndexType garbage_collection /* returns the new value of pfree */ for (j = 0 ; j < length ; j++) { r = *psrc++ ; - if (row_is_alive(Row, r)) + if (Row[r].is_alive()) { *pdest++ = r ; } @@ -1794,22 +1764,22 @@ static IndexType garbage_collection /* returns the new value of pfree */ for (r = 0 ; r < n_row ; r++) { - if (row_is_alive(Row, r)) + if (Row[r].is_alive()) { if (Row [r].length == 0) { - /* this row is of zero length. cannot compact it, so kill it */ - COLAMD_DEBUG3 (("Defrag row kill\n")) ; - kill_row(Row, r) ; + /* this row is of zero length. cannot compact it, so kill it */ + COLAMD_DEBUG3 (("Defrag row kill\n")) ; + Row[r].kill() ; } else { - /* save first column index in Row [r].shared2.first_column */ - psrc = &A [Row [r].start] ; - Row [r].shared2.first_column = *psrc ; - COLAMD_ASSERT (row_is_alive(Row, r)) ; - /* flag the start of the row with the one's complement of row */ - *psrc = ones_complement(r) ; + /* save first column index in Row [r].shared2.first_column */ + psrc = &A [Row [r].start] ; + Row [r].shared2.first_column = *psrc ; + COLAMD_ASSERT (Row[r].is_alive()) ; + /* flag the start of the row with the one's complement of row */ + *psrc = ones_complement(r) ; } } @@ -1829,7 +1799,7 @@ static IndexType garbage_collection /* returns the new value of pfree */ COLAMD_ASSERT (r >= 0 && r < n_row) ; /* restore first column index */ *psrc = Row [r].shared2.first_column ; - COLAMD_ASSERT (row_is_alive(Row, r)) ; + COLAMD_ASSERT (Row[r].is_alive()) ; /* move and compact the row */ COLAMD_ASSERT (pdest <= psrc) ; @@ -1838,7 +1808,7 @@ static IndexType garbage_collection /* returns the new value of pfree */ for (j = 0 ; j < length ; j++) { c = *psrc++ ; - if (col_is_alive(Col, c)) + if (Col[c].is_alive()) { *pdest++ = c ; } @@ -1870,7 +1840,7 @@ static inline IndexType clear_mark /* return the new value for tag_mark */ /* === Parameters ======================================================= */ IndexType n_row, /* number of rows in A */ - Colamd_Row Row [] /* Row [0 ... n_row-1].shared2.mark is set to zero */ + RowStructure Row [] /* Row [0 ... 
n_row-1].shared2.mark is set to zero */ ) { /* === Local variables ================================================== */ @@ -1879,7 +1849,7 @@ static inline IndexType clear_mark /* return the new value for tag_mark */ for (r = 0 ; r < n_row ; r++) { - if (row_is_alive(Row, r)) + if (Row[r].is_alive()) { Row [r].shared2.mark = 0 ; } @@ -1887,6 +1857,7 @@ static inline IndexType clear_mark /* return the new value for tag_mark */ return (1) ; } +} // namespace Colamd } // namespace internal #endif diff --git a/Eigen/src/OrderingMethods/Ordering.h b/Eigen/src/OrderingMethods/Ordering.h index 10ba6b464..c57897014 100644 --- a/Eigen/src/OrderingMethods/Ordering.h +++ b/Eigen/src/OrderingMethods/Ordering.h @@ -129,17 +129,17 @@ class COLAMDOrdering StorageIndex n = StorageIndex(mat.cols()); StorageIndex nnz = StorageIndex(mat.nonZeros()); // Get the recommended value of Alen to be used by colamd - StorageIndex Alen = internal::colamd_recommended(nnz, m, n); + StorageIndex Alen = internal::Colamd::recommended(nnz, m, n); // Set the default parameters - double knobs [ColamdKnobs]; - StorageIndex stats [ColamdStats]; - internal::colamd_set_defaults(knobs); + double knobs [internal::Colamd::NKnobs]; + StorageIndex stats [internal::Colamd::NStats]; + internal::Colamd::set_defaults(knobs); IndexVector p(n+1), A(Alen); for(StorageIndex i=0; i <= n; i++) p(i) = mat.outerIndexPtr()[i]; for(StorageIndex i=0; i < nnz; i++) A(i) = mat.innerIndexPtr()[i]; // Call Colamd routine to compute the ordering - StorageIndex info = internal::colamd(m, n, Alen, A.data(), p.data(), knobs, stats); + StorageIndex info = internal::Colamd::compute_ordering(m, n, Alen, A.data(), p.data(), knobs, stats); EIGEN_UNUSED_VARIABLE(info); eigen_assert( info && "COLAMD failed " ); From c694be1214d99c3cc0431c719c110d10cf64a7ec Mon Sep 17 00:00:00 2001 From: Alberto Luaces Date: Tue, 23 Jul 2019 09:24:06 +0000 Subject: [PATCH 27/30] Fixed Tensor documentation formatting. --- unsupported/Eigen/CXX11/src/Tensor/README.md | 158 ++++++++++--------- 1 file changed, 80 insertions(+), 78 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md index 006f35b23..1f4cf272b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/README.md +++ b/unsupported/Eigen/CXX11/src/Tensor/README.md @@ -1630,81 +1630,81 @@ dimension in RowMajor layout. 
For example, given the following input tensor: - Eigen::Tensor tensor(3,4); - tensor.setValues({{0.0f, 1.0f, 2.0f, 3.0f}, - {4.0f, 5.0f, 6.0f, 7.0f}, - {8.0f, 9.0f, 10.0f, 11.0f}}); + Eigen::Tensor tensor(3,4); + tensor.setValues({{0.0f, 1.0f, 2.0f, 3.0f}, + {4.0f, 5.0f, 6.0f, 7.0f}, + {8.0f, 9.0f, 10.0f, 11.0f}}); - cout << "tensor: " << endl << tensor << endl; -=> -tensor: - 0 1 2 3 - 4 5 6 7 - 8 9 10 11 + cout << "tensor: " << endl << tensor << endl; + => + tensor: + 0 1 2 3 + 4 5 6 7 + 8 9 10 11 Six 2x2 patches can be extracted and indexed using the following code: - Eigen::Tensor patch; - Eigen::array patch_dims; - patch_dims[0] = 2; - patch_dims[1] = 2; - patch = tensor.extract_patches(patch_dims); - for (int k = 0; k < 6; ++k) { - cout << "patch index: " << k << endl; - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 2; ++j) { - if (DataLayout == ColMajor) { - cout << patch(i, j, k) << " "; - } else { - cout << patch(k, i, j) << " "; - } + Eigen::Tensor patch; + Eigen::array patch_dims; + patch_dims[0] = 2; + patch_dims[1] = 2; + patch = tensor.extract_patches(patch_dims); + for (int k = 0; k < 6; ++k) { + cout << "patch index: " << k << endl; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + if (DataLayout == ColMajor) { + cout << patch(i, j, k) << " "; + } else { + cout << patch(k, i, j) << " "; + } + } + cout << endl; } - cout << endl; } - } This code results in the following output when the data layout is ColMajor: -patch index: 0 -0 1 -4 5 -patch index: 1 -4 5 -8 9 -patch index: 2 -1 2 -5 6 -patch index: 3 -5 6 -9 10 -patch index: 4 -2 3 -6 7 -patch index: 5 -6 7 -10 11 + patch index: 0 + 0 1 + 4 5 + patch index: 1 + 4 5 + 8 9 + patch index: 2 + 1 2 + 5 6 + patch index: 3 + 5 6 + 9 10 + patch index: 4 + 2 3 + 6 7 + patch index: 5 + 6 7 + 10 11 This code results in the following output when the data layout is RowMajor: (NOTE: the set of patches is the same as in ColMajor, but are indexed differently). 
-patch index: 0 -0 1 -4 5 -patch index: 1 -1 2 -5 6 -patch index: 2 -2 3 -6 7 -patch index: 3 -4 5 -8 9 -patch index: 4 -5 6 -9 10 -patch index: 5 -6 7 -10 11 + patch index: 0 + 0 1 + 4 5 + patch index: 1 + 1 2 + 5 6 + patch index: 2 + 2 3 + 6 7 + patch index: 3 + 4 5 + 8 9 + patch index: 4 + 5 6 + 9 10 + patch index: 5 + 6 7 + 10 11 ### ` extract_image_patches(const Index patch_rows, const Index patch_cols, const Index row_stride, const Index col_stride, const PaddingType padding_type)` @@ -1736,28 +1736,30 @@ sizes: *) columns: 5 *) batch: 7 - Tensor tensor(2,3,5,7); - Tensor tensor_row_major = tensor.swap_layout(); + Tensor tensor(2,3,5,7); + Tensor tensor_row_major = tensor.swap_layout(); 2x2 image patches can be extracted and indexed using the following code: *) 2D patch: ColMajor (patch indexed by second-to-last dimension) - Tensor twod_patch; - twod_patch = tensor.extract_image_patches<2, 2>(); - // twod_patch.dimension(0) == 2 - // twod_patch.dimension(1) == 2 - // twod_patch.dimension(2) == 2 - // twod_patch.dimension(3) == 3*5 - // twod_patch.dimension(4) == 7 + + Tensor twod_patch; + twod_patch = tensor.extract_image_patches<2, 2>(); + // twod_patch.dimension(0) == 2 + // twod_patch.dimension(1) == 2 + // twod_patch.dimension(2) == 2 + // twod_patch.dimension(3) == 3*5 + // twod_patch.dimension(4) == 7 *) 2D patch: RowMajor (patch indexed by the second dimension) - Tensor twod_patch_row_major; - twod_patch_row_major = tensor_row_major.extract_image_patches<2, 2>(); - // twod_patch_row_major.dimension(0) == 7 - // twod_patch_row_major.dimension(1) == 3*5 - // twod_patch_row_major.dimension(2) == 2 - // twod_patch_row_major.dimension(3) == 2 - // twod_patch_row_major.dimension(4) == 2 + + Tensor twod_patch_row_major; + twod_patch_row_major = tensor_row_major.extract_image_patches<2, 2>(); + // twod_patch_row_major.dimension(0) == 7 + // twod_patch_row_major.dimension(1) == 3*5 + // twod_patch_row_major.dimension(2) == 2 + // twod_patch_row_major.dimension(3) == 2 + // twod_patch_row_major.dimension(4) == 2 ## Special Operations From 8e7e3d9bc85152654ab27fbbaecb3ea1397c3ae7 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 3 Sep 2019 13:09:03 +0200 Subject: [PATCH 28/30] Makes Scalar/RealScalar typedefs public in Pardiso's wrappers (see PR 688) --- Eigen/src/PardisoSupport/PardisoSupport.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/Eigen/src/PardisoSupport/PardisoSupport.h b/Eigen/src/PardisoSupport/PardisoSupport.h index 07006b5c4..f89b79bd5 100644 --- a/Eigen/src/PardisoSupport/PardisoSupport.h +++ b/Eigen/src/PardisoSupport/PardisoSupport.h @@ -386,14 +386,15 @@ class PardisoLU : public PardisoImpl< PardisoLU > { protected: typedef PardisoImpl Base; - typedef typename Base::Scalar Scalar; - typedef typename Base::RealScalar RealScalar; using Base::pardisoInit; using Base::m_matrix; friend class PardisoImpl< PardisoLU >; public: + typedef typename Base::Scalar Scalar; + typedef typename Base::RealScalar RealScalar; + using Base::compute; using Base::solve; @@ -441,14 +442,14 @@ class PardisoLLT : public PardisoImpl< PardisoLLT > { protected: typedef PardisoImpl< PardisoLLT > Base; - typedef typename Base::Scalar Scalar; - typedef typename Base::RealScalar RealScalar; using Base::pardisoInit; using Base::m_matrix; friend class PardisoImpl< PardisoLLT >; public: + typedef typename Base::Scalar Scalar; + typedef typename Base::RealScalar RealScalar; typedef typename Base::StorageIndex StorageIndex; enum { UpLo = _UpLo }; using 
Base::compute; @@ -504,14 +505,14 @@ class PardisoLDLT : public PardisoImpl< PardisoLDLT > { protected: typedef PardisoImpl< PardisoLDLT > Base; - typedef typename Base::Scalar Scalar; - typedef typename Base::RealScalar RealScalar; using Base::pardisoInit; using Base::m_matrix; friend class PardisoImpl< PardisoLDLT >; public: + typedef typename Base::Scalar Scalar; + typedef typename Base::RealScalar RealScalar; typedef typename Base::StorageIndex StorageIndex; using Base::compute; enum { UpLo = Options&(Upper|Lower) }; From f68f2bba09b556d98e314600676304193e60cfcb Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Tue, 3 Sep 2019 11:08:09 -0700 Subject: [PATCH 29/30] TensorMap constness should not change underlying storage constness --- unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index b28cd822f..172a6bab8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -136,10 +136,10 @@ template class MakePoin EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StoragePointerType data() { return m_data; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE PointerConstType data() const { return m_data; } + EIGEN_STRONG_INLINE StoragePointerType data() const { return m_data; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const + EIGEN_STRONG_INLINE StorageRefType operator()(const array& indices) const { // eigen_assert(checkIndexRange(indices)); if (PlainObjectType::Options&RowMajor) { @@ -152,14 +152,14 @@ template class MakePoin } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()() const + EIGEN_STRONG_INLINE StorageRefType operator()() const { EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE) return m_data[0]; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const + EIGEN_STRONG_INLINE StorageRefType operator()(Index index) const { eigen_internal_assert(index >= 0 && index < size()); return m_data[index]; @@ -167,7 +167,7 @@ template class MakePoin #if EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const + EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex, IndexTypes... 
otherIndices) const { EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(internal::all((Eigen::NumTraits::highest() >= otherIndices)...)); @@ -181,7 +181,7 @@ template class MakePoin } #else EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1) const { if (PlainObjectType::Options&RowMajor) { const Index index = i1 + i0 * m_dimensions[1]; @@ -192,7 +192,7 @@ template class MakePoin } } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2) const { if (PlainObjectType::Options&RowMajor) { const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0); @@ -203,7 +203,7 @@ template class MakePoin } } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3) const { if (PlainObjectType::Options&RowMajor) { const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)); @@ -214,7 +214,7 @@ template class MakePoin } } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const { if (PlainObjectType::Options&RowMajor) { const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0))); From a8d264fa9c56e42f77e2129d4e504f5c854821c2 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Tue, 3 Sep 2019 11:38:39 -0700 Subject: [PATCH 30/30] Add test for const TensorMap underlying data mutation --- unsupported/test/cxx11_tensor_map.cpp | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/unsupported/test/cxx11_tensor_map.cpp b/unsupported/test/cxx11_tensor_map.cpp index dc8532f5c..4d4f68911 100644 --- a/unsupported/test/cxx11_tensor_map.cpp +++ b/unsupported/test/cxx11_tensor_map.cpp @@ -288,6 +288,30 @@ static void test_0d_const_tensor() VERIFY_IS_EQUAL(scalar4(), 13); } +static void test_0d_const_tensor_map() +{ + Tensor scalar1; + Tensor scalar2; + + const TensorMap > scalar3(scalar1.data()); + const TensorMap > scalar4(scalar2.data()); + + // Although TensorMap is constant, we still can write to the underlying + // storage, because we map over non-constant Tensor. + scalar3() = 7; + scalar4() = 13; + + VERIFY_IS_EQUAL(scalar1(), 7); + VERIFY_IS_EQUAL(scalar2(), 13); + + // Pointer to the underlying storage is also non-const. + scalar3.data()[0] = 8; + scalar4.data()[0] = 14; + + VERIFY_IS_EQUAL(scalar1(), 8); + VERIFY_IS_EQUAL(scalar2(), 14); +} + EIGEN_DECLARE_TEST(cxx11_tensor_map) { CALL_SUBTEST(test_0d()); @@ -299,4 +323,5 @@ EIGEN_DECLARE_TEST(cxx11_tensor_map) CALL_SUBTEST(test_casting()); CALL_SUBTEST(test_0d_const_tensor()); + CALL_SUBTEST(test_0d_const_tensor_map()); }
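
A minimal standalone sketch of the semantics pinned down by the last two patches (an illustration only, not part of the patch series; it assumes the patched TensorMap accessors above and, for the read-only remark, that constness can instead be placed on the mapped type): a const TensorMap built over a non-const Tensor behaves like a "T* const" pointer, so the map object itself cannot be re-seated, but the mapped elements remain writable through operator() and data().

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <cassert>

    int main() {
      Eigen::Tensor<int, 2> storage(2, 2);
      storage.setZero();

      // The map is const; the mapped data is not.
      const Eigen::TensorMap<Eigen::Tensor<int, 2> > view(storage.data(), 2, 2);

      view(0, 0) = 7;      // operator() const returns a writable reference after the patch
      view.data()[3] = 9;  // data() const returns a non-const pointer after the patch

      assert(storage(0, 0) == 7);
      assert(storage(1, 1) == 9);  // linear index 3 is element (1,1) of a 2x2 tensor

      // To obtain a genuinely read-only view, put the constness on the mapped
      // type itself, e.g. Eigen::TensorMap<const Eigen::Tensor<int, 2> >
      // (assumption: the const-element form is what read-only maps use).
      return 0;
    }

In other words, constness of the view object is orthogonal to constness of the viewed scalars, which is exactly what test_0d_const_tensor_map() checks for the 0-dimensional case.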