Support multiple contraction kernel types in TensorContractionThreadPool

Eugene Zhulenev 2018-09-26 11:08:47 -07:00
parent 0a3356f4ec
commit 71cd3fbd6a
6 changed files with 484 additions and 20 deletions

cmake/FindMkldnn.cmake Normal file

@@ -0,0 +1,18 @@
# Intel mkl-dnn support.
# Link: https://github.com/intel/mkl-dnn
if (MKLDNN)
set(MKLDNN_FIND_QUIETLY TRUE)
set(MKLDNN_INCLUDES ${MKLDNN}/include)
set(MKLDNN_LIBRARIES ${MKLDNN}/lib)
endif (MKLDNN)
find_path(MKLDNN
NAMES
mkldnn.h
PATHS
$ENV{MKLDNNDIR}/include
${INCLUDE_INSTALL_DIR}
)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(MKLDNN DEFAULT_MSG
MKLDNN)
mark_as_advanced(MKLDNN)

unsupported/Eigen/CXX11/Tensor

@@ -75,6 +75,10 @@ typedef unsigned __int64 uint64_t;
#include "libxsmm.h"
#endif
#if defined(EIGEN_USE_MKLDNN)
#include "mkldnn.h"
#endif

#ifdef EIGEN_USE_THREADS
#include "ThreadPool"
#endif
@@ -121,6 +125,7 @@ typedef unsigned __int64 uint64_t;
#include "src/Tensor/TensorArgMax.h"
#include "src/Tensor/TensorConcatenation.h"
#include "src/Tensor/TensorContractionMapper.h"
#include "src/Tensor/TensorContractionMkldnn.h"
#include "src/Tensor/TensorContractionBlocking.h"
#include "src/Tensor/TensorContraction.h"
#include "src/Tensor/TensorContractionThreadPool.h"

unsupported/Eigen/CXX11/src/Tensor/TensorContractionMkldnn.h Normal file

@@ -0,0 +1,116 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MKLDNN_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MKLDNN_H
#if defined(EIGEN_USE_MKLDNN)
// Support for MklDnn sgemm kernel in Tensor contractions:
//
// 1. Prepare packed Lhs/Rhs blocks from tensor expressions using
// DataMapper (see TensorContractionInputMapper).
// 2. Invoke gemm kernel with packed blocks (replacement for default
// gebp_kernel).
namespace Eigen {
namespace internal {
template <typename Scalar, typename StorageIndex, typename DataMapper,
int StorageOrder>
struct mkldnn_gemm_pack;
// mkldnn_gemm_pack for ColMajor storage order.
template <typename Scalar, typename StorageIndex, typename DataMapper>
struct mkldnn_gemm_pack<Scalar, StorageIndex, DataMapper,
/*StorageOrder*/ ColMajor> {
typedef typename internal::packet_traits<Scalar>::type Packet;
typedef typename DataMapper::LinearMapper LinearMapper;
enum { PacketSize = internal::packet_traits<Scalar>::size };
EIGEN_DONT_INLINE
void operator()(Scalar *block, const DataMapper &data_mapper,
StorageIndex rows, StorageIndex cols) {
const StorageIndex unrolled_rows =
(rows / (4 * PacketSize)) * (4 * PacketSize);
const StorageIndex vectorized_rows = (rows / PacketSize) * PacketSize;
for (StorageIndex col = 0; col < cols; ++col) {
LinearMapper lm = data_mapper.getLinearMapper(0, col);
// Give the compiler a good opportunity to unroll the loop.
for (StorageIndex i = 0; i < unrolled_rows; i += 4 * PacketSize) {
for (StorageIndex j = 0; j < 4; ++j) {
const Packet p = lm.template loadPacket<Packet>(i + j * PacketSize);
internal::pstoreu(block + j * PacketSize, p);
}
block += 4 * PacketSize;
}
// Process remaining rows with packets.
for (StorageIndex i = unrolled_rows; i < vectorized_rows;
i += PacketSize) {
const Packet p = lm.template loadPacket<Packet>(i);
internal::pstoreu(block, p);
block += PacketSize;
}
// Finalize with coefficients.
for (StorageIndex i = vectorized_rows; i < rows; ++i) {
*block = lm(i);
++block;
}
}
}
};
template <typename Scalar, typename StorageIndex, typename OutputMapper,
bool ConjugateLhs = false, bool ConjugateRhs = false>
struct mkldnn_gemm_kernel;
// mkldnn_gemm_kernel for floats defined as a thin layer on top of mkldnn_sgemm.
template <typename StorageIndex, typename OutputMapper, bool ConjugateLhs,
bool ConjugateRhs>
struct mkldnn_gemm_kernel</*Scalar*/ float, StorageIndex, OutputMapper,
ConjugateLhs, ConjugateRhs> {
EIGEN_DONT_INLINE
void operator()(const OutputMapper &output, const float *blockA,
const float *blockB, const StorageIndex rows,
const StorageIndex depth, const StorageIndex cols,
float alpha) {
static const int max_index = (std::numeric_limits<int>::max)();
eigen_assert(max_index > rows);
eigen_assert(max_index > cols);
eigen_assert(max_index > depth);
eigen_assert(max_index > output.stride());
const int m = static_cast<int>(rows);
const int n = static_cast<int>(cols);
const int k = static_cast<int>(depth);
const char transposeA = ConjugateLhs ? 'Y' : 'N';
const char transposeB = ConjugateRhs ? 'Y' : 'N';
const int ldA = ConjugateLhs ? k : m;
const int ldB = ConjugateRhs ? n : k;
const int ldC = static_cast<int>(output.stride());
const float beta = 1.0;
mkldnn_status_t st = mkldnn_sgemm(&transposeA, &transposeB, &m, &n, &k,
&alpha, blockA, &ldA, blockB, &ldB, &beta,
const_cast<float*>(output.data()), &ldC);
eigen_assert(st == 0);
}
};
} // namespace internal
} // namespace Eigen
#endif // EIGEN_USE_MKLDNN
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MKLDNN_H
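A minimal usage sketch (not part of the commit) of the two-step flow described in the header comment: pack column-major Lhs/Rhs blocks with mkldnn_gemm_pack, then multiply them with mkldnn_gemm_kernel, which forwards to mkldnn_sgemm. It mirrors what the unit test at the end of this commit does; the function name and buffer layout are illustrative, and it assumes EIGEN_USE_MKLDNN is defined and mkl-dnn is available. Since beta is hard-coded to 1, the kernel accumulates into the output, so the output buffer must be initialized first.

// Sketch only, not part of the commit. Requires the mkl-dnn headers/library and
// the unsupported/ Eigen directory on the include path.
#define EIGEN_USE_MKLDNN
#include <vector>
#include <unsupported/Eigen/CXX11/Tensor>

using Eigen::Index;
using Eigen::internal::blas_data_mapper;
using Eigen::internal::mkldnn_gemm_kernel;
using Eigen::internal::mkldnn_gemm_pack;

// Computes out (m x n, column-major) += lhs (m x k) * rhs (k x n) for float buffers.
void sgemm_block_sketch(float* lhs, float* rhs, float* out, Index m, Index k, Index n) {
  typedef blas_data_mapper<float, Index, Eigen::ColMajor> DataMapper;
  typedef mkldnn_gemm_pack<float, Index, DataMapper, Eigen::ColMajor> GemmPack;
  typedef mkldnn_gemm_kernel<float, Index, DataMapper> GemmKernel;

  // Step 1: pack Lhs/Rhs blocks through a DataMapper (here a plain
  // blas_data_mapper; in the contraction it is an input-mapper SubMapper).
  std::vector<float> packed_lhs(m * k), packed_rhs(k * n);
  GemmPack pack;
  pack(packed_lhs.data(), DataMapper(lhs, m), /*rows=*/m, /*cols=*/k);
  pack(packed_rhs.data(), DataMapper(rhs, k), /*rows=*/k, /*cols=*/n);

  // Step 2: invoke the mkldnn_sgemm-backed kernel with the packed blocks.
  DataMapper output(out, m);  // used as the OutputMapper
  GemmKernel gemm;
  gemm(output, packed_lhs.data(), packed_rhs.data(), /*rows=*/m, /*depth=*/k,
       /*cols=*/n, /*alpha=*/1.0f);
}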

unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h

@@ -15,6 +15,177 @@
namespace Eigen {
namespace internal {
// WARNING: In this code we assume that Lhs and Rhs tensor expressions are in
// ColMajor storage order. This property is guaranteed by the
// TensorContractionOp evaluator. TensorContractionKernel specifies how we pack
// blocks of Lhs and Rhs tensor expressions, and how we invoke matrix
// multiplication for these blocks. Default tensor contraction uses
// gemm_pack_rhs, gemm_pack_lhs and gebp_kernel from Eigen Core (see
// GeneralBlockPanelKernel.h for details).
//
// By specializing contraction kernels we can use other low level libraries to
// perform matrix multiplication, and still rely on the Eigen thread pool
// evaluator for scaling. The assumption is that a custom gemm kernel does not
// use its own threading for parallelization.
//
// - ResScalar/LhsScalar/RhsScalar - scalar type for the result of
// multiplication, lhs tensor and rhs tensor respectively.
//
// - StorageIndex - index type for the tensor expressions. In practice it is
// almost always Eigen::Index.
//
// - OutputMapper provides access to the memory of the output matrix. In
// practice it's always a column major blas_data_mapper (it must be of
// ResScalar type).
//
// - LhsMapper/RhsMapper, similarly to blas_data_mapper, provide a two
// dimensional view into the Lhs/Rhs tensor expressions. In practice it's
// TensorContractionInputMapper, or some specialization of it based on the
// type of the tensor expression (e.g. TensorImagePatchOp has an optimized
// input mapper).
//
// TODO(ezhulenev): Use TensorContractionKernel in default tensor contraction
// evaluator.
template<typename ResScalar, typename LhsScalar, typename RhsScalar,
typename StorageIndex, typename OutputMapper, typename LhsMapper,
typename RhsMapper>
struct TensorContractionKernel {
typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
typedef internal::gemm_pack_lhs<LhsScalar, StorageIndex,
typename LhsMapper::SubMapper,
Traits::mr, Traits::LhsProgress,
typename Traits::LhsPacket4Packing, ColMajor>
LhsPacker;
typedef internal::gemm_pack_rhs<RhsScalar, StorageIndex,
typename RhsMapper::SubMapper, Traits::nr,
ColMajor>
RhsPacker;
typedef internal::gebp_kernel<LhsScalar, RhsScalar, StorageIndex,
OutputMapper, Traits::mr, Traits::nr,
/*ConjugateLhs*/ false, /*ConjugateRhs*/ false>
GebpKernel;
EIGEN_DONT_INLINE
static void packLhs(LhsScalar* lhsBlock,
const typename LhsMapper::SubMapper& data_mapper,
const StorageIndex depth, const StorageIndex rows) {
LhsPacker()(lhsBlock, data_mapper, depth, rows);
}
EIGEN_DONT_INLINE
static void packRhs(RhsScalar* rhsBlock,
const typename RhsMapper::SubMapper& data_mapper,
const StorageIndex depth, const StorageIndex cols) {
RhsPacker()(rhsBlock, data_mapper, depth, cols);
}
EIGEN_DONT_INLINE
static void invoke(const OutputMapper& output_mapper,
const LhsScalar* lhsBlock, const RhsScalar* rhsBlock,
const StorageIndex rows, const StorageIndex depth,
const StorageIndex cols, const ResScalar alpha) {
GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha,
/*strideA*/ -1, /*strideB*/ -1,
/*offsetA*/ 0, /*offsetB*/ 0);
}
};
// Some tensor contraction kernels might rely on gemm libraries that are
// optimized for specific dimension sizes. By default Eigen picks block sizes
// that fit the working set in the L1/L2 caches; by specializing this class we
// can refine that choice and round the sizes up to work well with the
// underlying gemm library.
// TODO(ezhulenev): Move it to TensorContractionBlocking, or keep separate?
template<typename ResScalar, typename LhsScalar, typename RhsScalar,
typename StorageIndex>
struct TensorContractionKernelBlocking {
static void refine(const StorageIndex /*m*/,
const StorageIndex /*n*/,
const StorageIndex /*k*/,
StorageIndex* /*bm*/,
StorageIndex* /*bn*/,
StorageIndex* /*bk*/) {
// By default we do nothing and stick to the block sizes picked by Eigen.
}
};
#if defined(EIGEN_USE_MKLDNN)
// If all scalar types in tensor contraction are floats, we can use mkldnn gemm
// as our low level kernel.
template<typename StorageIndex, typename OutputMapper, typename LhsMapper,
typename RhsMapper>
struct TensorContractionKernel<float, float, float, StorageIndex, OutputMapper,
LhsMapper, RhsMapper> {
// For now mkldnn has only mkldnn_sgemm (gemm for floats).
typedef float Scalar;
typedef typename internal::gebp_traits<Scalar, Scalar> Traits;
typedef internal::mkldnn_gemm_pack<Scalar, StorageIndex,
typename LhsMapper::SubMapper, ColMajor>
LhsPacker;
typedef internal::mkldnn_gemm_pack<Scalar, StorageIndex,
typename RhsMapper::SubMapper, ColMajor>
RhsPacker;
typedef internal::mkldnn_gemm_kernel<Scalar, StorageIndex, OutputMapper>
GemmKernel;
EIGEN_DONT_INLINE
static void packLhs(Scalar* lhsBlock,
const typename LhsMapper::SubMapper& data_mapper,
StorageIndex depth, StorageIndex rows) {
LhsPacker()(lhsBlock, data_mapper, rows, depth);
}
EIGEN_DONT_INLINE
static void packRhs(Scalar* rhsBlock,
const typename RhsMapper::SubMapper& data_mapper,
const StorageIndex depth, const StorageIndex cols) {
RhsPacker()(rhsBlock, data_mapper, depth, cols);
}
EIGEN_DONT_INLINE
static void invoke(const OutputMapper& output_mapper, const Scalar* lhsBlock,
const Scalar* rhsBlock, const StorageIndex rows,
const StorageIndex depth, const StorageIndex cols,
const Scalar alpha) {
GemmKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha);
}
};
// For mkldnn_sgemm, having the right dimensions (especially for small
// matrices) is more important than fitting the whole working set in the
// L1/L2 caches.
template<typename StorageIndex>
struct TensorContractionKernelBlocking<float, float, float, StorageIndex> {
// Mkldnn Avx/Avx2/Avx512 unroll factors are: 8/16/48. We pick the largest.
static const StorageIndex kUnrollM = 48;
// Mkldnn Avx/Avx2/Avx512 unroll factors are: 6/6/8. We pick the smallest
// number divisible by both of them.
static const StorageIndex kUnrollN = 24;
static void refine(const StorageIndex m,
const StorageIndex n,
const StorageIndex /*k*/,
StorageIndex* bm,
StorageIndex* bn,
StorageIndex* /*bk*/) {
// TODO(ezhulenev): There is probably a better way to pick block sizes.
*bm = (std::min)(m, Eigen::divup(*bm, kUnrollM) * kUnrollM);
*bn = (std::min)(n, Eigen::divup(*bn, kUnrollN) * kUnrollN);
// Stick with default bk.
}
};
#endif // EIGEN_USE_MKLDNN
} // namespace internal
template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> :
public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> > {
@@ -175,6 +346,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
bn = blocking.nc();
bk = blocking.kc();
}
// Refine blocking choice to work well with contraction kernel.
internal::TensorContractionKernelBlocking<Scalar, LhsScalar, RhsScalar,
Index>::refine(m, n, k, &bm,
&bn, &bk);

// Number of kernels for each dimension.
Index nm0 = divup(m, bm);
@@ -242,17 +417,12 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
contract_t, internal::packet_traits<RhsScalar>::size,
rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned>
RhsMapper;
- typedef internal::gemm_pack_lhs<LhsScalar, Index,
- typename LhsMapper::SubMapper, Traits::mr,
- Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor>
- LhsPacker;
- typedef internal::gemm_pack_rhs<
- RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor>
- RhsPacker;
typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
- typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
- Traits::mr, Traits::nr, false, false>
- GebpKernel;
+ typedef internal::TensorContractionKernel<
+ Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper>
+ TensorContractionKernel;

Context(const Self* self, int num_threads, Scalar* buffer, Index tm, Index tn,
Index tk, Index bm, Index bn, Index bk, Index nm, Index nn, Index nk,
@@ -434,8 +604,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
void pack_lhs(Index m, Index k) {
const Index mend = m * gm_ + gm(m);
for (Index m1 = m * gm_; m1 < mend; m1++)
- LhsPacker()(packed_lhs_[k % (P - 1)][m1],
- lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1));
+ TensorContractionKernel::packLhs(packed_lhs_[k % (P - 1)][m1],
+ lhs_.getSubMapper(m1 * bm_, k * bk_),
+ bk(k), bm(m1));

if (!parallel_pack_ && shard_by_col_) {
signal_packing(k);
@@ -458,8 +629,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
// deadlocks.
memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar));
}
- RhsPacker()(packed_rhs_[k % (P - 1)][n1],
- rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1));
+ TensorContractionKernel::packRhs(packed_rhs_[k % (P - 1)][n1],
+ rhs_.getSubMapper(k * bk_, n1 * bn_),
+ bk(k), bn(n1));
}

if (parallel_pack_ || shard_by_col_) {
@@ -480,9 +652,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
for (Index n1 = n * gn_; n1 < nend; n1++) {
for (Index m1 = m * gm_; m1 < mend; m1++) {
const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_);
- GebpKernel()(output_mapper, packed_lhs_[k % (P - 1)][m1],
- packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
- Scalar(1), -1, -1, 0, 0);
+ TensorContractionKernel::invoke(
+ output_mapper, packed_lhs_[k % (P - 1)][m1],
+ packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), Scalar(1));

// We are done with the last task for the [m1, n1] block.
if (k + 1 == nk_) {
@@ -495,9 +667,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
for (Index m1 = m * gm_; m1 < mend; m1++)
for (Index n1 = n * gn_; n1 < nend; n1++) {
const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_);
- GebpKernel()(output_mapper, packed_lhs_[k % (P - 1)][m1],
- packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
- Scalar(1), -1, -1, 0, 0);
+ TensorContractionKernel::invoke(
+ output_mapper, packed_lhs_[k % (P - 1)][m1],
+ packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), Scalar(1));

// We are done with the last task for the [m1, n1] block.
if (k + 1 == nk_) {
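For reference (again, not part of the commit), the contract between the thread-pool evaluator and a TensorContractionKernel specialization is exactly the three static functions used above: pack_lhs calls packLhs per (m1, k) block, pack_rhs calls packRhs per (k, n1) block, and kernel() calls invoke per (m1, n1, k) triple, accumulating into the output. The hypothetical specialization below shows the shape of such a plug-in kernel for double (a type the mkldnn path does not cover), using plain loops and plain column-major packed blocks; it is an illustration of the interface, not code from Eigen or this commit.

// Sketch: a naive TensorContractionKernel specialization for double,
// illustrating the packLhs/packRhs/invoke contract. A real specialization
// would forward to an optimized gemm, as the mkldnn one does for float.
namespace Eigen {
namespace internal {

template <typename StorageIndex, typename OutputMapper, typename LhsMapper,
          typename RhsMapper>
struct TensorContractionKernel<double, double, double, StorageIndex,
                               OutputMapper, LhsMapper, RhsMapper> {
  typedef double Scalar;

  // Pack the Lhs block as a plain column-major (rows x depth) matrix.
  static void packLhs(Scalar* lhsBlock,
                      const typename LhsMapper::SubMapper& data_mapper,
                      StorageIndex depth, StorageIndex rows) {
    for (StorageIndex d = 0; d < depth; ++d)
      for (StorageIndex r = 0; r < rows; ++r) *lhsBlock++ = data_mapper(r, d);
  }

  // Pack the Rhs block as a plain column-major (depth x cols) matrix.
  static void packRhs(Scalar* rhsBlock,
                      const typename RhsMapper::SubMapper& data_mapper,
                      StorageIndex depth, StorageIndex cols) {
    for (StorageIndex c = 0; c < cols; ++c)
      for (StorageIndex d = 0; d < depth; ++d) *rhsBlock++ = data_mapper(d, c);
  }

  // output += alpha * lhsBlock * rhsBlock, accumulating across the k blocks.
  static void invoke(const OutputMapper& output_mapper, const Scalar* lhsBlock,
                     const Scalar* rhsBlock, StorageIndex rows,
                     StorageIndex depth, StorageIndex cols, Scalar alpha) {
    for (StorageIndex j = 0; j < cols; ++j)
      for (StorageIndex i = 0; i < rows; ++i) {
        Scalar sum = 0;
        for (StorageIndex d = 0; d < depth; ++d)
          sum += lhsBlock[i + d * rows] * rhsBlock[d + j * depth];
        output_mapper(i, j) += alpha * sum;
      }
  }
};

}  // namespace internal
}  // namespace Eigen

As for blocking, the mkldnn TensorContractionKernelBlocking specialization above simply rounds the evaluator's block sizes up to the unroll factors: for example bm = 100 becomes min(m, divup(100, 48) * 48) = min(m, 144), while bk is left untouched.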

unsupported/test/CMakeLists.txt

@@ -23,6 +23,17 @@ else(XSMM_FOUND)
ei_add_property(EIGEN_MISSING_BACKENDS "Xsmm, ")
endif(XSMM_FOUND)
find_package(Mkldnn)
if(MKLDNN_FOUND)
add_definitions("-DEIGEN_USE_MKLDNN")
include_directories(${MKLDNN_INCLUDES})
link_directories(${MKLDNN_LIBRARIES})
set(EXTERNAL_LIBS ${EXTERNAL_LIBS} mkldnn)
ei_add_property(EIGEN_TESTED_BACKENDS "Mkldnn, ")
else(MKLDNN_FOUND)
ei_add_property(EIGEN_MISSING_BACKENDS "Mkldnn, ")
endif(MKLDNN_FOUND)
find_package(GoogleHash)
if(GOOGLEHASH_FOUND)
add_definitions("-DEIGEN_GOOGLEHASH_SUPPORT")
@@ -190,6 +201,7 @@ if(EIGEN_TEST_CXX11)
ei_add_test(cxx11_tensor_index_list)
ei_add_test(cxx11_tensor_mixed_indices)
ei_add_test(cxx11_tensor_contraction)
ei_add_test(cxx11_tensor_contraction_mkldnn)
ei_add_test(cxx11_tensor_convolution)
ei_add_test(cxx11_tensor_expr)
ei_add_test(cxx11_tensor_fixed_size)

unsupported/test/cxx11_tensor_contraction_mkldnn.cpp Normal file

@@ -0,0 +1,141 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#include "main.h"
// Nothing to test here if we do not have mkldnn enabled.
#if defined(EIGEN_USE_MKLDNN)
#include <Eigen/CXX11/Tensor>
using Eigen::array;
using Eigen::ColMajor;
using Eigen::Tensor;
using Eigen::Index;
using Eigen::internal::blas_data_mapper;
using Eigen::internal::mkldnn_gemm_kernel;
using Eigen::internal::mkldnn_gemm_pack;
template <int NumDims>
static array<Index, NumDims> RandomDims(int min_dim = 1, int max_dim = 20) {
array<Index, NumDims> dims;
for (int i = 0; i < NumDims; ++i) {
dims[i] = internal::random<int>(min_dim, max_dim);
}
return dims;
}
// Packing with mkldnn_gemm_pack is the same as taking a slice of a
// 2-dimensional Tensor.
template <typename Scalar>
static void test_mkldnn_gemm_pack() {
static const int Options = 0 | ColMajor;
typedef blas_data_mapper<Scalar, Index, ColMajor> DataMapper;
typedef mkldnn_gemm_pack<Scalar, Index, DataMapper, ColMajor> MkldnnGemmPack;
typedef Tensor<Scalar, 2, Options, Index> Tensor2d;
array<Index, 2> dims = RandomDims<2>(1, 500);
// Create a tensor initialized with random data.
Tensor2d src(dims);
src.setRandom();
// Pick a random slice of src tensor.
array<Index, 2> slice_start = RandomDims<2>(0, 250);
array<Index, 2> slice_size = RandomDims<2>(100, 500);
// Make sure that slice start + size do not overflow tensor dims.
for (int i = 0; i < 2; ++i) {
slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
}
// Prepare tensors for packing and slicing results.
Tensor2d pack_dst(slice_size[0], slice_size[1]);
Tensor2d slice_dst(slice_size[0], slice_size[1]);
// Pack memory using mkldnn_gemm_pack.
DataMapper data_mapper(src.data(), dims[0]);
MkldnnGemmPack gemm_pack;
gemm_pack(pack_dst.data(),
data_mapper.getSubMapper(slice_start[0], slice_start[1]),
slice_size[0], slice_size[1]);
// Slice the source tensor.
slice_dst = src.slice(slice_start, slice_size);
// Verify that dst tensors are equal.
VERIFY_IS_EQUAL(pack_dst.dimensions().TotalSize(),
slice_dst.dimensions().TotalSize());
for (Index i = 0; i < pack_dst.dimensions().TotalSize(); ++i) {
Scalar packed = pack_dst.coeff(i);
Scalar sliced = slice_dst.coeff(i);
VERIFY_IS_EQUAL(packed, sliced);
}
}
template <typename Scalar>
static void test_mkldnn_gemm_kernel() {
static const int Options = 0 | ColMajor;
typedef Tensor<Scalar, 2, Options, Index> Tensor2d;
int m = internal::random<int>(1, 100);
int n = internal::random<int>(1, 100);
int k = internal::random<int>(1, 100);
Tensor2d lhs(m, k);
lhs.setRandom();
Tensor2d rhs(k, n);
rhs.setRandom();
// Compute matmul with mkldnn gemm kernel.
typedef blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
typedef mkldnn_gemm_kernel<Scalar, Index, OutputMapper, ColMajor>
MkldnnGemmKernel;
Tensor2d mkldnn_result(m, n);
mkldnn_result.setZero();
OutputMapper output_mapper(mkldnn_result.data(), m);
MkldnnGemmKernel gemm_kernel;
gemm_kernel(output_mapper, lhs.data(), rhs.data(), m, k, n, /*alpha*/ 1.0);
// Compute matmul with Eigen::Matrix.
typedef Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor> Matrix;
typedef Map<Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor> > MatrixMap;
MatrixMap lhs_mat(lhs.data(), m, k);
MatrixMap rhs_mat(rhs.data(), k, n);
Matrix matmul_result(m, n);
matmul_result.setZero();
matmul_result = lhs_mat * rhs_mat;
static const float error_threshold = 1e-4f;
// Verify that results are equal.
for (Index i = 0; i < m * n; ++i) {
Scalar gemm = mkldnn_result(i);
Scalar matmul = matmul_result(i % m, i / m);
if ((std::abs)(gemm) > error_threshold &&
(std::abs)(matmul) > error_threshold) {
if (!Eigen::internal::isApprox(gemm, matmul, error_threshold))
std::cout << "gemm=" << gemm << " matmul=" << matmul << std::endl;
VERIFY(Eigen::internal::isApprox(gemm, matmul, error_threshold));
}
}
}
EIGEN_DECLARE_TEST(cxx11_tensor_contraction_mkldnn) {
CALL_SUBTEST(test_mkldnn_gemm_pack<float>());
CALL_SUBTEST(test_mkldnn_gemm_kernel<float>());
}
#else
EIGEN_DECLARE_TEST(cxx11_tensor_contraction_mkldnn) {}
#endif // EIGEN_USE_MKLDNN