Fix tensor contraction for AVX512 machines

This patch modifies the TensorContraction class to ensure that the kc_ field is always a multiple of the packet_size, if the packet_size is > 8. Without this change spatial convolutions in Tensorflow do not work properly as the code that re-arranges the input matrices can assert if kc_ is not a multiple of the packet_size. This leads to a unit test failure, //tensorflow/python/kernel_tests:conv_ops_test, on AVX512 builds of tensorflow.
2025-05-21 12:07:36 +08:00 · 2018-07-31 09:33:37 +01:00 · 2018-07-31 09:33:37 +01:00 · 6f5b126e6d
commit 6f5b126e6d
parent 77b447c24e
1 changed files with 4 additions and 0 deletions
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
@ -51,6 +51,10 @@ class TensorContractionBlocking {
    else {
      computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, nc_, mc_, num_threads);
    }
+
+    const int rhs_packet_size = internal::packet_traits<RhsScalar>::size;
+    kc_ = (rhs_packet_size <= 8 || kc_ <= rhs_packet_size) ?
+      kc_ : (kc_ / rhs_packet_size) * rhs_packet_size;
  }

  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; }