From 6f5b126e6d23f1339d15b26fe87916132397d619 Mon Sep 17 00:00:00 2001
From: Mark D Ryan <mark.d.ryan@intel.com>
Date: Tue, 31 Jul 2018 09:33:37 +0100
Subject: [PATCH 1/2] Fix tensor contraction for AVX512 machines

This patch modifies the TensorContraction class to ensure that the kc_ field is
always a multiple of the packet_size, if the packet_size is > 8.  Without this
change spatial convolutions in Tensorflow do not work properly as the code that
re-arranges the input matrices can assert if kc_ is not a multiple of the
packet_size.  This leads to a unit test failure,
//tensorflow/python/kernel_tests:conv_ops_test, on AVX512 builds of tensorflow.
---
 .../Eigen/CXX11/src/Tensor/TensorContractionBlocking.h        | 4 ++++
 1 file changed, 4 insertions(+)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
index 71fd19774..c51f3f8dd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
@@ -51,6 +51,10 @@ class TensorContractionBlocking {
     else {
       computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, nc_, mc_, num_threads);
     }
+
+    const int rhs_packet_size = internal::packet_traits<RhsScalar>::size;
+    kc_ = (rhs_packet_size <= 8 || kc_ <= rhs_packet_size) ?
+      kc_ : (kc_ / rhs_packet_size) * rhs_packet_size;
   }
 
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; }

From 36f8f6d0be1543e12c87c6f33df46fe7bcecab87 Mon Sep 17 00:00:00 2001
From: Mark D Ryan <mark.d.ryan@intel.com>
Date: Wed, 5 Dec 2018 12:29:03 +0100
Subject: [PATCH 2/2] Fix evalShardedByInnerDim for AVX512 builds

evalShardedByInnerDim ensures that the values it passes for start_k and
end_k to evalGemmPartialWithoutOutputKernel are multiples of 8 as the kernel
does not work correctly when the values of k are not multiples of the
packet_size.  While this precaution works for AVX builds, it is insufficient
for AVX512 builds where the maximum packet size is 16.  The result is slightly
incorrect float32 contractions on AVX512 builds.

This commit fixes the problem by ensuring that k is always a multiple of
the packet_size if the packet_size is > 8.
---
 .../CXX11/src/Tensor/TensorContractionThreadPool.h   | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 24ba3e431..3946e2fc4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -788,9 +788,11 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     const Index m = this->m_i_size;
     const Index n = this->m_j_size;
     const Index k = this->m_k_size;
-    // The underlying GEMM kernel assumes that k is a multiple of 8 and
-    // subtle breakage occurs if this is violated.
-    Index block_size = 8 * divup<Index>(k, 8 * num_threads);
+    const Index packet_size = internal::packet_traits<RhsScalar>::size;
+    const Index kmultiple = packet_size <= 8 ? 8 : packet_size;
+    // The underlying GEMM kernel assumes that k is a multiple of
+    // the packet size and subtle breakage occurs if this is violated.
+    Index block_size = kmultiple * divup<Index>(k, kmultiple * num_threads);
     Index num_blocks = divup<Index>(k, block_size);
     // we use 'result' for the first block's partial result.
     MaxSizeVector<Scalar*> block_buffers(num_blocks - 1);
@@ -805,9 +807,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     Index start = 0;
     for (Index blocks_left = num_blocks; blocks_left > 0; --blocks_left) {
       // The underlying GEMM kernel assumes that k is a multiple of packet size
-      // (currently largest packet size is 8) and subtle breakage occurs if
+      // (currently largest packet size is 16) and subtle breakage occurs if
       // this is violated.
-      block_size = 8 * divup<Index>(k - start, 8 * blocks_left);
+      block_size = kmultiple * divup<Index>(k - start, kmultiple * blocks_left);
       Scalar* buf;
       if (start == 0) {
         buf = result;